diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index bdedd570f..e69d7b878 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -114,7 +114,7 @@ jobs: cd tests nohup bash -c 'while true ; do sleep 5 ; ../yoda/docker/run-cronjob.sh copytovault >> ../copytovault.log 2>&1 ; ../yoda/docker/run-cronjob.sh publication >> ../publication.log 2>&1 ; done' & test -d mycache || mkdir -p mycache - python3 -m pytest --skip-ui --intake --datarequest --deposit -o cache_dir=mycache --environment environments/docker.json + python3 -m pytest --skip-ui --datarequest --deposit -o cache_dir=mycache --environment environments/docker.json cat ../copytovault.log cat ../publication.log diff --git a/.github/workflows/api-documentation.yml b/.github/workflows/api-documentation.yml index ce1bf53dd..80a3c1e3a 100644 --- a/.github/workflows/api-documentation.yml +++ b/.github/workflows/api-documentation.yml @@ -55,13 +55,11 @@ jobs: export PYTHONPATH="${PYTHONPATH}:." python tools/api/generate-openapi.py rules_uu --module datarequest > build/api_datarequest.json python tools/api/generate-openapi.py rules_uu --module deposit > build/api_deposit.json - python tools/api/generate-openapi.py rules_uu --module intake > build/api_intake.json - name: Validate Yoda module API documentation run: | openapi-spec-validator build/api_datarequest.json openapi-spec-validator build/api_deposit.json - openapi-spec-validator build/api_intake.json - name: Deploy 🚀 uses: JamesIves/github-pages-deploy-action@releases/v3 diff --git a/__init__.py b/__init__.py index 90707339d..92c63a291 100644 --- a/__init__.py +++ b/__init__.py @@ -56,10 +56,6 @@ # Import certain modules only when enabled. from .util.config import config -if config.enable_intake: - from intake import * - from intake_vault import * - if config.enable_datarequest: from datarequest import * diff --git a/intake.py b/intake.py deleted file mode 100644 index d304a6e7a..000000000 --- a/intake.py +++ /dev/null @@ -1,924 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake module.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import fnmatch -import itertools -import time -import traceback - -import genquery - -import intake_dataset -import intake_lock -import intake_scan -from util import * - - -__all__ = ['api_intake_list_studies', - 'api_intake_list_dm_studies', - 'api_intake_count_total_files', - 'api_intake_list_unrecognized_files', - 'api_intake_list_datasets', - 'api_intake_scan_for_datasets', - 'api_intake_lock_dataset', - 'api_intake_unlock_dataset', - 'api_intake_dataset_get_details', - 'api_intake_dataset_add_comment', - 'api_intake_report_vault_dataset_counts_per_study', - 'api_intake_report_vault_aggregated_info', - 'api_intake_report_export_study_data', - 'rule_intake_scan_for_datasets'] - -INTAKE_FILE_EXCLUSION_PATTERNS = ['*.abc', '*.PNG'] -""" List of file patterns not to take into account within INTAKE module.""" - - -@api.make() -def api_intake_list_studies(ctx): - """Get list of all studies current user is involved in. 
- - :param ctx: Combined type of a callback and rei struct - - :returns: List of studies - - """ - groups = [] - user_name = user.name(ctx) - user_zone = user.zone(ctx) - - iter = genquery.row_iterator( - "USER_GROUP_NAME", - "USER_NAME = '" + user_name + "' AND USER_ZONE = '" + user_zone + "'", - genquery.AS_LIST, ctx - ) - - for row in iter: - if row[0].startswith('grp-intake-'): - groups.append(row[0][11:]) - elif row[0].startswith('intake-'): - groups.append(row[0][7:]) - - groups.sort() - return groups - - -@api.make() -def api_intake_list_dm_studies(ctx): - """Return list of studies current user is datamanager of. - - :param ctx: Combined type of a callback and rei struct - - :returns: List of dm studies - """ - datamanager_groups = [] - user_name = user.name(ctx) - user_zone = user.zone(ctx) - - iter = genquery.row_iterator( - "USER_GROUP_NAME", - "USER_NAME = '" + user_name + "' AND USER_ZONE = '" + user_zone + "'", - genquery.AS_LIST, ctx - ) - - for row in iter: - study = '' - if row[0].startswith('grp-intake-'): - study = row[0][11:] - elif row[0].startswith('intake-'): - study = row[0][7:] - - if study: - # Is a member of this study ... check whether member of corresponding datamanager group - iter2 = genquery.row_iterator( - "USER_NAME", - "USER_TYPE = 'rodsgroup' AND USER_NAME like 'grp-datamanager-" + study + "'", - genquery.AS_LIST, ctx - ) - for row2 in iter2: - datamanager_group = row2[0] - if user.is_member_of(ctx, datamanager_group): - datamanager_groups.append(study) - - return datamanager_groups - - -@api.make() -def api_intake_count_total_files(ctx, coll): - """Get the total count of all files in collection - . - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to count all datasets - - :returns: Total file count - """ - main_collection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME", - "COLL_NAME = '" + coll + "'", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME", - "COLL_NAME like '" + coll + "/%'", - genquery.AS_LIST, ctx - ) - - count = 0 - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS) - if not exclusion_matched: - count += 1 - - return count - - -@api.make() -def api_intake_list_unrecognized_files(ctx, coll): - """Get list of all unrecognized files for given path including relevant metadata. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to list all unrecognized files - - :returns: List of unrecognized files - """ - # check permissions - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if user.is_member_of(ctx, group): - pass - elif user.is_member_of(ctx, datamanager_group): - pass - else: - return {} - - # Include coll name as equal names do occur and genquery delivers distinct results. 
- main_collection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'unrecognized'", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME", - "COLL_NAME like '" + coll + "/%' AND META_DATA_ATTR_NAME = 'unrecognized'", - genquery.AS_LIST, ctx - ) - - files = [] - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - # Check whether object type is within exclusion pattern - exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS) - if not exclusion_matched: - # Error is hardcoded! (like in the original) and initialize attributes already as empty strings. - file_data = {"name": row[1], - "path": row[0], - "date": time.strftime('%Y-%m-%d', time.localtime(int(row[2]))), - "creator": row[3], - "error": 'Experiment type, wave or pseudocode is missing from path', - "experiment_type": '', - "pseudocode": '', - "wave": '', - "version": ''} - - # per data object get relevant metadata (experiment type, version, wave, pseudocode) if present - iter2 = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + row[0] + "' AND DATA_NAME = '" + row[1] + "' AND META_DATA_ATTR_NAME in ('experiment_type', 'pseudocode', 'wave', 'version')", - genquery.AS_LIST, ctx - ) - for row2 in iter2: - file_data[row2[0]] = row2[1] - - files.append(file_data) - - return files - - -@api.make() -def api_intake_list_datasets(ctx, coll): - """Get list of datasets for given path. - - A dataset is distinguished by attribute name 'dataset_toplevel' which can either reside on a collection or a data object. - That is why 2 separate queries have to be performed. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to list all datasets - - :returns: list of datasets - """ - datasets = [] - - # 1) Query for datasets distinguished by collections - c_main_collection_iterator = genquery.row_iterator( - "META_COLL_ATTR_VALUE, COLL_NAME", - "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - c_subcollection_iterator = genquery.row_iterator( - "META_COLL_ATTR_VALUE, COLL_NAME", - "COLL_NAME LIKE '" + coll + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator): - dataset = get_dataset_details(ctx, row[0], row[1]) - datasets.append(dataset) - - # 2) Query for datasets distinguished dataobjects - d_main_collection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE, COLL_NAME", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - d_subcollection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE, COLL_NAME", - "COLL_NAME LIKE '" + coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator): - dataset = get_dataset_details(ctx, row[0], row[1]) - datasets.append(dataset) - - return datasets - - -def get_dataset_details(ctx, dataset_id, path): - """Get details of dataset based on dataset identifier. 
- - :param ctx: Combined type of a callback and rei struct - :param dataset_id: Identifier of dataset - :param path: Path to dataset - - :returns: Dict holding objects for the dataset - """ - # Inialise all attributes - dataset = {"dataset_id": dataset_id, - "path": path} - - # Parse dataset_id to get WEPV-items individually - dataset_parts = dataset_id.split('\t') - dataset['wave'] = dataset_parts[0] - dataset['experiment_type'] = dataset_parts[1] - dataset['pseudocode'] = dataset_parts[2] - dataset['version'] = dataset_parts[3] - dataset['datasetStatus'] = 'scanned' - dataset['datasetCreateName'] = '==UNKNOWN==' - dataset['datasetCreateDate'] = 0 - dataset['datasetCreateDateFormatted'] = '' - dataset['datasetErrors'] = 0 - dataset['datasetWarnings'] = 0 - dataset['datasetComments'] = 0 - dataset['objects'] = 0 - dataset['objectErrors'] = 0 - dataset['objectWarnings'] = 0 - - tl_info = get_dataset_toplevel_objects(ctx, path, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if is_collection: - """ dataset is based on a collection """ - tl_collection = tl_objects[0] - iter = genquery.row_iterator( - "COLL_NAME, COLL_OWNER_NAME, COLL_CREATE_TIME", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - dataset['datasetCreateName'] = row[1] - dataset['datasetCreateDate'] = int(row[2]) - dataset['datasetCreateDateFormatted'] = time.strftime('%Y-%m-%d', time.localtime(int(row[2]))) - dataset['datasetCreatedByWhen'] = row[1] + ':' + row[2] - - iter = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_NAME, count(META_COLL_ATTR_VALUE)", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'dataset_error': - dataset['datasetErrors'] += int(row[2]) - if row[1] == 'dataset_warning': - dataset['datasetWarnings'] += int(row[2]) - if row[1] == 'comment': - dataset['datasetComments'] += int(row[2]) - if row[1] == 'to_vault_freeze': - dataset['datasetStatus'] = 'frozen' - if row[1] == 'to_vault_lock': - dataset['datasetStatus'] = 'locked' - - iter = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'object_count': - dataset['objects'] += int(row[2]) - if row[1] == 'object_errors': - dataset['objectErrors'] += int(row[2]) - if row[1] == 'object_warnings': - dataset['objectWarnings'] += int(row[2]) - else: - # Dataset is based on a dataobject - # Step through all data objects as found in tl_objects - objects = 0 - object_errors = 0 - object_warnings = 0 - for tl_object in tl_objects: - - # split tl_object - tlo = pathutil.chop(tl_object) - parent = tlo[0] - base_name = tlo[1] - - objects += 1 - if objects == 1: - iter = genquery.row_iterator( - "DATA_OWNER_NAME, DATA_CREATE_TIME", - "COLL_NAME = '" + parent + "' and DATA_NAME = '" + base_name + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - dataset['datasetCreateName'] = row[0] - dataset['datasetCreateDate'] = int(row[1]) - dataset['datasetCreateDateFormatted'] = time.strftime('%Y-%m-%d', time.localtime(int(row[1]))) - dataset['datasetCreatedByWhen'] = row[0] + ':' + row[1] - - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + parent + "' and DATA_NAME = '" + base_name + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[0] == 'error': - object_errors += 1 - if row[0] == 'warning': - object_warnings += 1 - if objects == 1: - # 
Only look at these items when objects==1 as they are added to each toplevel object present - if row[0] == 'dataset_error': - dataset['datasetErrors'] += 1 - if row[0] == 'dataset_warning': - dataset['datasetWarnings'] += 1 - if row[0] == 'comment': - dataset['datasetComments'] += 1 - if row[0] == 'to_vault_freeze': - dataset['datasetStatus'] = 'frozen' - if row[0] == 'to_vault_lock': - dataset['datasetStatus'] = 'locked' - dataset['objects'] = objects - dataset['objectErrors'] = object_errors - dataset['objectWarnings'] = object_warnings - - return dataset - - -def get_dataset_toplevel_objects(ctx, root, dataset_id): - """Returns dict with toplevel object paths and whether is collection based dataset. - - If is a collection - only one object is returned (collection path). - If not a collection- all objects are returned with full object path. - - :param ctx: Combined type of a callback and rei struct - :param root: Path within which to search for datasets (e.g. an intake group collection) - :param dataset_id: Identifier of the dataset - - :returns: Dict holding top-level object paths for the dataset (in the 'objects' key) and a boolean value which - says whether it is a collection-based dataset (in the 'is_collection' key) - """ - c_main_collection_iterator = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME = '" + root + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' " - "AND META_COLL_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - c_subcollection_iterator = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME LIKE '" + root + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' " - "AND META_COLL_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator): - return {'is_collection': True, - 'objects': [row[0]]} - - # For dataobject situation gather all object path strings as a list - d_main_collection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + root + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - d_subcollection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME LIKE '" + root + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - objects = [] - for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator): - objects.append(row[1] + '/' + row[0]) - return {'is_collection': False, - 'objects': objects} - - -@api.make() -def api_intake_scan_for_datasets(ctx, coll): - """The toplevel of a dataset can be determined by attribute 'dataset_toplevel' - and can either be a collection or a data_object. 
- - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: indication correct - """ - - if _intake_check_authorized_to_scan(ctx, coll): - try: - _intake_scan_for_datasets(ctx, coll) - except Exception: - log.write(ctx, "Intake scan (API) failed with the following exception: " + traceback.format_exc()) - return {"proc_status": "NOK", "error_msg": "Error during scanning process"} - else: - return {"proc_status": "NOK", "error_msg": "No permissions to scan collection"} - - return {"proc_status": "OK"} - - -@rule.make(inputs=[0], outputs=[1]) -def rule_intake_scan_for_datasets(ctx, coll): - """The toplevel of a dataset can be determined by attribute 'dataset_toplevel' - and can either be a collection or a data_object. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: 0=correct, 1=insufficient rights, 2=error during scanning process - """ - if not collection.exists(ctx, coll): - return "Non existing collection: " + coll - if _intake_check_authorized_to_scan(ctx, coll): - try: - _intake_scan_for_datasets(ctx, coll, tl_datasets_log_target='stdout') - except Exception: - log.write(ctx, "Intake scan (rule) failed with the following exception: " + traceback.format_exc()) - return "Error scanning for datasets for collection: " + coll - else: - return "Insufficient permissions for collection: " + coll - - return 0 - - -def _intake_check_authorized_to_scan(ctx, coll): - """Checks that user is authorized to scan intake group, either as - a data manager or as an intake group member. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: boolean - whether user is authorized - """ - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - return True - else: - log.write(ctx, "No permissions to scan collection") - return False - - -def _intake_scan_for_datasets(ctx, coll, tl_datasets_log_target=''): - """Internal function for actually running intake scan - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - :param tl_datasets_log_target: If in ['stdout', 'serverLog'] logging of toplevel datasets will take place to the specified target - - """ - scope = {"wave": "", - "experiment_type": "", - "pseudocode": ""} - found_datasets = [] - found_datasets = intake_scan.intake_scan_collection(ctx, coll, scope, False, found_datasets) - - if tl_datasets_log_target in ['stdout', 'serverLog']: - for subscope in found_datasets: - try: - version = subscope['version'] - except KeyError: - version = 'Raw' - ctx.writeLine(tl_datasets_log_target, ("Found dataset toplevel collection: " - + "W<" + subscope['wave'] - + "> E<" + subscope['experiment_type'] - + "> P<" + subscope['pseudocode'] - + "> V<" + version - + "> D<" + subscope['directory'] - + ">")) - - intake_scan.intake_check_datasets(ctx, coll) - - -@api.make() -def api_intake_lock_dataset(ctx, path, dataset_ids): - """Lock datasets as an indication it can be 'frozen' for it to progress to vault. 
- - Lock = datamanager only - - :param ctx: Combined type of a callback and rei struct - :param path: Collection for which to lock a specific dataset id - :param dataset_ids: Comma separated identifiers of datasets to be locked - - :returns: indication correct - """ - # check permissions - datamanager only - parts = path.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to lock dataset") - return {"proc_status": "NOK", - "error_msg": "No permissions to lock dataset(s)", - "error_dataset_ids": []} - - error_dataset_ids = [] - for dataset_id in dataset_ids.split(','): - # error_dataset_ids.append(dataset_id) - try: - intake_lock.intake_dataset_lock(ctx, path, dataset_id) - except Exception: - error_dataset_ids.append(dataset_id) - - if error_dataset_ids: - return {"proc_status": "NOK", - "error_msg": "Something went wrong locking datasets", - "error_dataset_ids": error_dataset_ids} - - return {"proc_status": "OK"} - - -@api.make() -def api_intake_unlock_dataset(ctx, path, dataset_ids): - """Unlock a dataset to remove the indication so it can be 'frozen' for it to progress to vault - - Unlock = datamanager only - - :param ctx: Combined type of a callback and rei struct - :param path: Collection for which to lock a specific dataset id - :param dataset_ids: Comma separated identifiers of datasets to be locked - - :returns: indication correct - """ - # check permissions - datamanager only - parts = path.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to unlock dataset(s)") - return {"proc_status": "NOK", - "error_msg": "No permissions to unlock dataset", - "error_dataset_ids": []} - - error_dataset_ids = [] - for dataset_id in dataset_ids.split(','): - # error_dataset_ids.append(dataset_id) - try: - intake_lock.intake_dataset_unlock(ctx, path, dataset_id) - except Exception: - error_dataset_ids.append(dataset_id) - - if error_dataset_ids: - return {"proc_status": "NOK", - "error_msg": "Something went wrong unlocking datasets", - "error_dataset_ids": error_dataset_ids} - - return {"proc_status": "OK"} - - -@api.make() -def api_intake_dataset_add_comment(ctx, study_id, dataset_id, comment): - """Add a comment to a dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param study_id: Id of the study given dataset belongs to - :param dataset_id: Identifier of the dataset to add a comment to - :param comment: Comment as added by user - - :returns: indication correct - """ - coll = '/' + user.zone(ctx) + '/home/' + study_id - - # check permissions - can be researcher or datamanager - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - log.write(ctx, "No permissions to scan collection") - return {} - - tl_info = get_dataset_toplevel_objects(ctx, coll, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if not is_collection and len(tl_objects) == 0: - return {"proc_status": "NOK", - "error_msg": "Dataset does not exist"} - - timestamp = int(time.time()) # int(datetime.timestamp(datetime.now())) - comment_data = user.name(ctx) + ':' + str(timestamp) + ':' + comment - - for tl in tl_objects: - if is_collection: - avu.associate_to_coll(ctx, tl, 'comment', comment_data) - else: - avu.associate_to_data(ctx, tl, 'comment', comment_data) - - return {'user': user.name(ctx), 'timestamp': time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(timestamp)), 'comment': comment} - - -@api.make() -def api_intake_dataset_get_details(ctx, coll, dataset_id): - """Get all details for a dataset (errors/warnings, scanned by who/when, comments, file tree). - - 1) Errors/warnings - 2) Comments - 3) Tree view of files within dataset. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to start from - :param dataset_id: Identifier of the dataset to get details for - - :returns: dictionary with all dataset data - """ - # check permissions - can be researcher or datamanager - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - log.write(ctx, "No permissions to scan collection") - return {} - - tl_info = get_dataset_toplevel_objects(ctx, coll, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - scanned = '' - comments = [] - dataset_warnings = [] - dataset_errors = [] - files = {} - for tl in tl_objects: - if is_collection: - coll = tl - # Dataset based on a collection - iter = genquery.row_iterator( - "META_COLL_ATTR_VALUE, META_COLL_ATTR_NAME, order_asc(META_COLL_MODIFY_TIME)", - "COLL_NAME = '{}' and META_COLL_ATTR_NAME in ('dataset_error', 'dataset_warning', 'comment')".format(coll), - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'dataset_error': - dataset_errors.append(row[0]) - elif row[1] == 'dataset_warning': - dataset_warnings.append(row[0]) - else: - comments.append(row[0]) - - # Scanned by/when - iter = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "META_DATA_ATTR_NAME = 'scanned' AND COLL_NAME = '{}'".format(coll), - genquery.AS_LIST, ctx - ) - for row in iter: - scanned = row[0] - break - - break - else: - # Dataset is based on a data object - parts = pathutil.chop(tl) - coll = parts[0] - file = parts[1] - iter = genquery.row_iterator( - "META_DATA_ATTR_VALUE, META_DATA_ATTR_NAME, order_asc(META_DATA_MODIFY_TIME)", - "COLL_NAME = '{}' AND DATA_NAME = '{}' and META_DATA_ATTR_NAME in ('dataset_error','dataset_warning','comment', 'scanned')".format(coll, file), - genquery.AS_LIST, ctx - ) - for row in 
iter: - if row[1] == 'dataset_error': - dataset_errors.append(row[0]) - elif row[1] == 'dataset_warning': - dataset_warnings.append(row[0]) - elif row[1] == 'scanned': - scanned = row[0] - else: - comments.append(row[0]) - - # do it only once - all data is gathered in the first run - break - - level = '0' - files = coll_objects(ctx, level, coll, dataset_id) - - if len(scanned.split(':')) != 2: - # Retrieve scannedby/when information in a different way - dataset = get_dataset_details(ctx, dataset_id, coll) - scanned = dataset.get('datasetCreatedByWhen', "unknown") - - return {"files": files, - # "is_collection": is_collection, - # "tlobj": tl_objects, - "scanned": scanned, - "comments": comments, - "dataset_warnings": dataset_warnings, - "dataset_errors": dataset_errors} - - -def coll_objects(ctx, level, coll, dataset_id): - """Recursive function to pass entire folder/file structure in such that frontend - can do something useful with it including errors/warnings on object level - - :param ctx: Combined type of a callback and rei struct - :param level: Level in hierarchy (tree) - :param coll: Collection to collect - :param dataset_id: id of the dataset involved - - :returns: Tree of collections and files - """ - # First get the sub collections - counter = 0 - files = {} - - # COLLECTIONS - iter = genquery.row_iterator( - "COLL_NAME, COLL_ID", - "COLL_PARENT_NAME = '{}' AND META_COLL_ATTR_NAME = 'dataset_id' AND META_COLL_ATTR_VALUE = '{}'".format(coll, dataset_id), - genquery.AS_LIST, ctx - ) - for row in iter: - # files(pathutil.basename(row[0])) - node = {} - node['name'] = pathutil.basename(row[0]) - node['isFolder'] = True - node['parent_id'] = level - warnings = [] - errors = [] - # Per collection add errors/warnings from scan process - iter2 = genquery.row_iterator( - "META_COLL_ATTR_VALUE, META_COLL_ATTR_NAME", - "META_COLL_ATTR_NAME in ('warning', 'error') AND COLL_ID = '{}'".format(row[1]), - genquery.AS_LIST, ctx - ) - for row2 in iter2: - if row[1] == 'error': - errors.append(row2[0]) - else: - warnings.append(row2[0]) - node['errors'] = errors - node['warnings'] = warnings - - files[level + "." + str(counter)] = node - - files.update(coll_objects(ctx, level + "." + str(counter), row[0], dataset_id)) - - counter += 1 - - # DATA OBJECTS - iter = genquery.row_iterator( - "DATA_NAME, DATA_ID", - "COLL_NAME = '{}' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '{}'".format(coll, dataset_id), - genquery.AS_LIST, ctx - ) - for row in iter: - node = {} - node['name'] = row[0] - node['isFolder'] = False - node['parent_id'] = level - # Per data object add errors/warnings from scan process - iter2 = genquery.row_iterator( - "META_DATA_ATTR_VALUE, META_DATA_ATTR_NAME", - "META_DATA_ATTR_NAME in ('warning', 'error') AND DATA_ID = '{}'".format(row[1]), - genquery.AS_LIST, ctx - ) - warnings = [] - errors = [] - for row2 in iter2: - if row2[1] == 'error': - errors.append(row2[0]) - else: - warnings.append(row2[0]) - node['errors'] = errors - node['warnings'] = warnings - - files[level + "." + str(counter)] = node - - counter += 1 - - return files - - -# Reporting / export functions -@api.make() -def api_intake_report_vault_dataset_counts_per_study(ctx, study_id): - """Get the count of datasets wave/experimenttype. - - In the vault a dataset is always located in a folder. - Therefore, looking at the folders only is enough. 
- - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id - - :returns: Dictionary with relevant aggregated counts - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions for reporting functionality") - return {} - - return intake_dataset.intake_youth_dataset_counts_per_study(ctx, study_id) - - -@api.make() -def api_intake_report_vault_aggregated_info(ctx, study_id): - """Collects the following information for Raw, Processed datasets. - Including a totalisation of this all (Raw/processed is kept in VERSION). - - -Total datasets - -Total files - -Total file size - -File size growth in a month - -Datasets growth in a month - -Pseudocodes (distinct) - - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id - - :returns: Dictionary with data for analysis - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions for reporting functionality") - return {} - - return intake_dataset.vault_aggregated_info(ctx, study_id) - - -@api.make() -def api_intake_report_export_study_data(ctx, study_id): - """Find all datasets in the vault for $studyID. - - Include file count and total file size as well as dataset meta data version, experiment type, pseudocode and wave - - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id to get a report from - - :returns: Study report - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to export data for this study") - return {} - - return intake_dataset.intake_report_export_study_data(ctx, study_id) - - -def intake_group_to_datamanager_group(intake_group): - """Determines the name of the data manager group of a particular intake group. - - :param intake_group: name of intake group - - :returns: name of datamanager group - - :raises ValueError: if provided group name is not a valid intake group name - """ - if intake_group.startswith("grp-intake-"): - return intake_group.replace("-intake-", "-datamanager-", 1) - elif intake_group.startswith("intake-"): - return intake_group.replace("intake-", "grp-datamanager-", 1) - else: - raise ValueError("Unexpected intake group format for group " + intake_group) diff --git a/intake_checksums.py b/intake_checksums.py deleted file mode 100644 index 669d06afe..000000000 --- a/intake_checksums.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake checksums.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools - -import genquery - -from util import * - - -def chop_checksum(checksum): - """Chop iRODS checksum in checksum type and checksum string. - - Checksum format is ({type}:){checksum}, if type is missing then it is "md5". - - :param checksum: iRODS checksum string - :returns: type checksum - """ - checksum_split = checksum.split(":") - - if len(checksum_split) > 1: - type = checksum_split[0] - checksum = checksum_split[1] - - return type, checksum - - -def intake_generate_dataset_checksums(ctx, dataset_path, checksum_file): - """"Generate data object with all checksums of a dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param dataset_path: Root collection of dataset to be indexed - :param checksum_file: Data object to write checksums to - """ - q_root = genquery.row_iterator("COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE", - "COLL_NAME = '{}'".format(dataset_path), - genquery.AS_LIST, ctx) - - q_sub = genquery.row_iterator("COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE", - "COLL_NAME like '{}/%'".format(dataset_path), - genquery.AS_LIST, ctx) - - # Create checksums file. - checksums = "" - for row in itertools.chain(q_root, q_sub): - type, checksum = chop_checksum(row[2]) - checksums += "{} {} {} {}/{}\n".format(type, checksum, row[3], row[0], row[1]) - - # Write checksums file. - data_object.write(ctx, checksum_file, checksums) diff --git a/intake_dataset.py b/intake_dataset.py deleted file mode 100644 index d8417fc71..000000000 --- a/intake_dataset.py +++ /dev/null @@ -1,284 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake datasets.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools - -import genquery - -from util import * - - -def intake_report_export_study_data(ctx, study_id): - """ Get the information for the export functionality - - Retrieved metadata for a study: - - dataset_date_created - - wave - - version - - experiment_type - - pseudocode - - number of files - - total file size - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - :returns: returns datasets - """ - zone = user.zone(ctx) - - main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - " = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME like '/{}/home/grp-vault-{}/%' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - datasets = {} - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - path = row[0] - try: - datasets[path][row[2]] = row[3] - except KeyError: - datasets[path] = {row[2]: row[3]} - - real_datasets = {} - for set_path in datasets: - if 'dataset_date_created' in datasets[set_path]: - real_datasets[set_path] = datasets[set_path] - # collect total file size and total amount of files - real_datasets[set_path]['totalFileSize'] = 0 - real_datasets[set_path]['totalFiles'] = 0 - - # get the filesize and file count - stat_main_collection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)", - "COLL_NAME = '{}'".format(set_path), - genquery.AS_LIST, ctx) - - stat_subcollection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)", - "COLL_NAME like '{}/%'".format(set_path), - genquery.AS_LIST, ctx) - - for row in itertools.chain(stat_main_collection_iterator, stat_subcollection_iterator): - real_datasets[set_path]['totalFiles'] = int(row[0]) / 2 - totalFileSize = 0 - if row[1]: - totalFileSize = int(row[1]) - real_datasets[set_path]['totalFileSize'] = totalFileSize / 2 - - return real_datasets - - -def intake_youth_get_datasets_in_study(ctx, study_id): - """Get the of datasets (with relevant metadata) in a study. 
- - Retrieved metadata: - - 'dataset_id' - - 'dataset_date_created' - - 'wave' - - 'version' - - 'experiment_type' - - 'pseudocode' - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier of study - - :returns: Dict with datasets and relevant metadata. - """ - zone = user.zone(ctx) - - main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME LIKE '/{}/home/grp-vault-{}/*' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - datasets = {} - - # Construct all datasets. - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - dataset = row[0] - attribute_name = row[2] - attribute_value = row[3] - - if attribute_name in ['dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode']: - if attribute_name in ['version', 'experiment_type']: - val = attribute_value.lower() - else: - val = attribute_value - try: - datasets[dataset][attribute_name] = val - except KeyError: - datasets[dataset] = {attribute_name: val} - - return datasets - - -def intake_youth_dataset_counts_per_study(ctx, study_id): - """"Get the counts of datasets wave/experimenttype. - - In the vault a dataset is always located in a folder. - Therefore, looking at the folders only is enough. - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - - :returns: Dict with counts of datasets wave/experimenttype - """ - datasets = intake_youth_get_datasets_in_study(ctx, study_id) - - dataset_type_counts = {} - # Loop through datasets and count wave and experimenttype. - for dataset in datasets: - # Meta attribute 'dataset_date_created' defines that a folder holds a complete set. - if 'dataset_date_created' in datasets[dataset]: - type = datasets[dataset]['experiment_type'] - wave = datasets[dataset]['wave'] - version = datasets[dataset]['version'] - - try: - dataset_type_counts[type][wave][version] += 1 - except KeyError: - if type not in dataset_type_counts: - dataset_type_counts[type] = {wave: {version: 1}} - elif wave not in dataset_type_counts[type]: - dataset_type_counts[type][wave] = {version: 1} - else: - dataset_type_counts[type][wave][version] = 1 - - return dataset_type_counts - - -def vault_aggregated_info(ctx, study_id): - """Collects aggregated information for raw and processed datasets. - - Collects the following information for RAW and PROCESSED datasets. 
- Including a totalisation of this all (raw/processed is kept in VERSION) - - Total datasets - - Total files - - Total file size - - File size growth in a month - - Datasets growth in a month - - Pseudocodes (distinct) - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - - :returns: Dict with aggregated information for raw and processed datasets - """ - datasets = intake_youth_get_datasets_in_study(ctx, study_id) - - dataset_count = {'raw': 0, 'processed': 0} - dataset_growth = {'raw': 0, 'processed': 0} - dataset_file_count = {'raw': 0, 'processed': 0} - dataset_file_size = {'raw': 0, 'processed': 0} - dataset_file_growth = {'raw': 0, 'processed': 0} - dataset_pseudocodes = {'raw': [], 'processed': []} - - # Determine full last month reference point - import time - from datetime import datetime, date, timedelta - - last_day_of_prev_month = date.today().replace(day=1) - timedelta(days=1) - month = int(last_day_of_prev_month.strftime("%m")) - year = int(last_day_of_prev_month.strftime("%Y")) - - last_month = int(time.time() - int(datetime(year, month, int(date.today().strftime("%d")), 0, 0, 0).strftime('%s'))) - - dataset_paths = [] - for dataset in datasets: - # Meta attribute 'dataset_date_created' defines that a folder holds a complete set. - if 'dataset_date_created' in datasets[dataset]: - dataset_paths.append(dataset) - - if datasets[dataset]['version'].lower() == 'raw': - version = 'raw' - else: - version = 'processed' - - # if version in ['raw', 'processed']: - dataset_count[version] += 1 - - try: - date_created = int(datasets[dataset]['dataset_date_created']) - except Exception: - # This is nonsense and arose from an erroneous situation - date_created = last_month - - if date_created - last_month >= 0: - dataset_growth[version] += 1 - - try: - pseudocode = datasets[dataset]['pseudocode'] - if pseudocode not in dataset_pseudocodes[version]: - dataset_pseudocodes[version].append(pseudocode) - except KeyError: - continue - - zone = user.zone(ctx) - main_collection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME", - "COLL_NAME = '/{}/home/grp-vault-{}'".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME", - "COLL_NAME like '/{}/home/grp-vault-{}/%'".format(zone, study_id), - genquery.AS_LIST, ctx) - - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - coll_name = row[1] - data_size = int(row[2]) - coll_create_time = int(row[3]) - - # Check whether the file is part of a dataset. - part_of_dataset = False - for dataset in dataset_paths: - if dataset in coll_name: - part_of_dataset = True - break - - # File is part of dataset. 
- if part_of_dataset: - # version = datasets[dataset]['version'] - - if datasets[dataset]['version'].lower() == 'raw': - version = 'raw' - else: - version = 'processed' - - dataset_file_count[version] += 1 - dataset_file_size[version] += data_size - - if coll_create_time - last_month >= 0: - dataset_file_growth[version] += data_size - - return { - 'total': { - 'totalDatasets': dataset_count['raw'] + dataset_count['processed'], - 'totalFiles': dataset_file_count['raw'] + dataset_file_count['processed'], - 'totalFileSize': dataset_file_size['raw'] + dataset_file_size['processed'], - 'totalFileSizeMonthGrowth': dataset_file_growth['raw'] + dataset_file_growth['processed'], - 'datasetsMonthGrowth': dataset_growth['raw'] + dataset_growth['processed'], - 'distinctPseudoCodes': len(dataset_pseudocodes['raw']) + len(dataset_pseudocodes['processed']), - }, - 'raw': { - 'totalDatasets': dataset_count['raw'], - 'totalFiles': dataset_file_count['raw'], - 'totalFileSize': dataset_file_size['raw'], - 'totalFileSizeMonthGrowth': dataset_file_growth['raw'], - 'datasetsMonthGrowth': dataset_growth['raw'], - 'distinctPseudoCodes': len(dataset_pseudocodes['raw']), - }, - 'notRaw': { - 'totalDatasets': dataset_count['processed'], - 'totalFiles': dataset_file_count['processed'], - 'totalFileSize': dataset_file_size['processed'], - 'totalFileSizeMonthGrowth': dataset_file_growth['processed'], - 'datasetsMonthGrowth': dataset_growth['processed'], - 'distinctPseudoCodes': len(dataset_pseudocodes['processed']), - }, - } diff --git a/intake_lock.py b/intake_lock.py deleted file mode 100644 index d31c202ce..000000000 --- a/intake_lock.py +++ /dev/null @@ -1,203 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake locking.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import time - -import genquery - -import intake -from util import * - - -def intake_dataset_treewalk_change_status(ctx, collection, status, timestamp, remove): - """Treewalk dataset collection and change status. - - :param ctx: Combined type of a callback and rei struct - :param collection: Will change every time as it represents every collection that has to be processed - :param status: Status to set on dataset objects - :param timestamp: Timestamp of status change - :param remove: Boolean, set or remove status - """ - # 1. Change status on this collection. - if remove: - try: - avu.rmw_from_coll(ctx, collection, status, "%") - except msi.Error as e: - log.write(ctx, 'ERROR REMOVE') - log.write(ctx, e) - else: - log.write(ctx, 'step1 . set_on_col') - avu.set_on_coll(ctx, collection, status, timestamp) - - # 2. Change status on data objects located directly within the collection. - data_objects = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '{}'".format(collection), - genquery.AS_LIST, ctx - ) - - for row in data_objects: - if remove: - avu.rmw_from_data(ctx, "{}/{}".format(collection, row[0]), status, "%") - else: - log.write(ctx, 'step2 . set_on_data') - avu.set_on_data(ctx, "{}/{}".format(collection, row[0]), status, timestamp) - - # 3. Loop through subcollections. - subcollections = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '{}'".format(collection), - genquery.AS_LIST, ctx - ) - - for row in subcollections: - intake_dataset_treewalk_change_status(ctx, row[0], status, timestamp, remove) - - -def intake_dataset_change_status(ctx, object, is_collection, dataset_id, status, timestamp, remove): - """Change status on dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param object: Will change every time as it represents every object of the dataset - :param is_collection: Indicator if dataset is within a collection - :param dataset_id: Dataset identifier - :param status: Status to set on dataset objects - :param timestamp: Timestamp of status change - :param remove: Boolean, set or remove status - """ - # Is dataset a collection? - if is_collection: - # Recursively change the status on all objects in the dataset - intake_dataset_treewalk_change_status(ctx, object, status, timestamp, remove) - else: - # Dataset is not a collection, find all the dataset objects. - data_objects = genquery.row_iterator("DATA_NAME", - "COLL_NAME = '{}' AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = '{}'".format(object, dataset_id), - genquery.AS_LIST, ctx) - - # Change dataset status on all objects. - for row in data_objects: - if remove: - avu.rmw_from_data(ctx, "{}/{}".format(object, row[0]), status, "%") - else: - avu.set_on_data(ctx, "{}/{}".format(object, row[0]), status, timestamp) - - -def intake_dataset_lock(ctx, collection, dataset_id): - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - log.write(ctx, tl_info) - - if not is_collection and len(tl_objects) == 0: - raise Exception("Dataset \"{}\" in collection {} not found".format(collection, dataset_id)) - - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_lock", timestamp, False) - else: - # Dataset based on - for tl_object in tl_objects: - avu.set_on_data(ctx, tl_object, "to_vault_lock", timestamp) - - -def intake_dataset_unlock(ctx, collection, dataset_id): - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if not is_collection and len(tl_objects) == 0: - raise Exception("Dataset \"{}\" in collection {} not found".format(collection, dataset_id)) - - # It is possible that the status of the dataset status has moved on. 
- if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_lock", timestamp, True) - else: - # Dataset based on data objects - for tl_object in tl_objects: - avu.rmw_from_data(ctx, tl_object, "to_vault_lock", "%") - - -def intake_dataset_freeze(ctx, collection, dataset_id): - # timestamp = str(int(time.time())) - # top_collection = "" - # is_collection = "" - # ctx.uuYcDatasetGetTopLevel(collection, dataset_id, top_collection, is_collection) - - # intake_dataset_change_status(ctx, top_collection, is_collection, dataset_id, "to_vault_freeze", timestamp, False) - - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - log.write(ctx, tl_info) - - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_freeze", timestamp, False) - else: - # Dataset based on - for tl_object in tl_objects: - avu.set_on_data(ctx, tl_object, "to_vault_freeze", timestamp) - - -def intake_dataset_melt(ctx, collection, dataset_id): - # timestamp = str(int(time.time())) - # top_collection = "" - # is_collection = "" - # ctx.uuYcDatasetGetTopLevel(collection, dataset_id, top_collection, is_collection) - - # intake_dataset_change_status(ctx, top_collection, is_collection, dataset_id, "to_vault_freeze", timestamp, True) - - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # It is possible that the status of the dataset status has moved on. - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_freeze", timestamp, True) - else: - # Dataset based on data objects - for tl_object in tl_objects: - avu.rmw_from_data(ctx, tl_object, "to_vault_freeze", "%") - - -def intake_dataset_object_get_status(ctx, path): - """Get the status of an object in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path of dataset object - - :returns: Tuple booleans indicating if the object is locked or frozen - """ - locked = False - frozen = False - - if collection.exists(ctx, path): - attribute_names = genquery.row_iterator("META_COLL_ATTR_NAME", - "COLL_NAME = '{}'".format(path), - genquery.AS_LIST, ctx) - else: - coll_name, data_name = pathutil.chop(path) - attribute_names = genquery.row_iterator("META_DATA_ATTR_NAME", - "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(coll_name, data_name), - genquery.AS_LIST, ctx) - - for row in attribute_names: - attribute_name = row[0] - if attribute_name in ["to_vault_lock", "to_vault_freeze"]: - locked = True - - if attribute_name == "to_vault_freeze": - frozen = True - break - - return locked, frozen diff --git a/intake_scan.py b/intake_scan.py deleted file mode 100644 index ba024c4cc..000000000 --- a/intake_scan.py +++ /dev/null @@ -1,462 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake scanning.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools -import time - -import genquery - -import intake -from intake_utils import dataset_parse_id, intake_scan_get_metadata_update -from util import * - - -def intake_scan_collection(ctx, root, scope, in_dataset, found_datasets): - """Recursively scan a directory in a Youth Cohort intake. 
- - :param ctx: Combined type of a callback and rei struct - :param root: the directory to scan - :param scope: a scoped kvlist buffer - :param in_dataset: whether this collection is within a dataset collection - :param found_datasets: collection of subscopes that were found in order to report toplevel datasets in the scanning process - - :returns: Found datasets - """ - - # Loop until pseudocode, experiment type and wave are complete. - # But the found values can be overwritten when deeper levels are found. - - # Scan files under root - iter = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + root + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - path = row[1] + '/' + row[0] - - # Determene lock state for object (no collectoin - locked_state = object_is_locked(ctx, path, False) - - if locked_state['locked'] or locked_state['frozen']: - continue - - remove_dataset_metadata(ctx, path, False) - scan_mark_scanned(ctx, path, False) - - parent_in_dataset = in_dataset - metadata_update = intake_scan_get_metadata_update(ctx, path, False, in_dataset, scope) - - if metadata_update["in_dataset"]: - apply_dataset_metadata(ctx, path, metadata_update["new_metadata"], False) - if not parent_in_dataset: - # We found a top-level dataset data object. - found_datasets.append(metadata_update["new_metadata"]) - else: - apply_partial_metadata(ctx, metadata_update["new_metadata"], path, False) - avu.set_on_data(ctx, path, "unrecognized", "Experiment type, wave or pseudocode missing from path") - - # Scan collections under root - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '" + root + "'", - genquery.AS_LIST, ctx - ) - counter = 0 - for row in iter: - path = row[0] - counter = counter + 1 - dirname = pathutil.basename(path) - - if dirname != '/': - # get locked /frozen status - locked_state = object_is_locked(ctx, path, True) - - if locked_state['locked'] or locked_state['frozen']: - continue - - remove_dataset_metadata(ctx, path, True) - scan_mark_scanned(ctx, path, True) - - parent_in_dataset = in_dataset - metadata_update = intake_scan_get_metadata_update(ctx, path, True, in_dataset, scope) - - if metadata_update["in_dataset"]: - apply_dataset_metadata(ctx, path, metadata_update["new_metadata"], True) - if not parent_in_dataset: - # We found a new top-level dataset data object. 
- found_datasets.append(metadata_update["new_metadata"]) - else: - apply_partial_metadata(ctx, metadata_update["new_metadata"], path, True) - - found_datasets = intake_scan_collection(ctx, - path, - metadata_update["new_metadata"], - parent_in_dataset or metadata_update["in_dataset"], - found_datasets) - - return found_datasets - - -def object_is_locked(ctx, path, is_collection): - """Returns whether given object in path (collection or dataobject) is locked or frozen - - :param ctx: Combined type of a callback and rei struct - :param path: Path to object or collection - :param is_collection: Whether path contains a collection or data object - - :returns: Returns locked state - """ - locked_state = {"locked": False, - "frozen": False} - - if is_collection: - iter = genquery.row_iterator( - "META_COLL_ATTR_NAME", - "COLL_NAME = '" + path + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[0] in ['to_vault_lock', 'to_vault_freeze']: - locked_state['locked'] = True - if row[0] == 'to_vault_freeze': - locked_state['frozen'] = True - else: - parent_coll = pathutil.dirname(path) - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME", - "COLL_NAME = '" + parent_coll + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", - genquery.AS_LIST, ctx - ) - # return locked_state - for row in iter: - if row[0] in ['to_vault_lock', 'to_vault_freeze']: - locked_state['locked'] = True - if row[0] == 'to_vault_freeze': - locked_state['frozen'] = True - - return locked_state - - -def remove_dataset_metadata(ctx, path, is_collection): - """Remove all intake metadata from dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path to collection or data object - :param is_collection: Whether is a collection or data object - """ - intake_metadata = ["wave", - "experiment_type", - "pseudocode", - "version", - "dataset_id", - "dataset_toplevel", - "error", - "warning", - "dataset_error", - "dataset_warning", - "unrecognized", - "object_count", - "object_errors", - "object_warnings"] - intake_metadata_set = set(intake_metadata) - - # Add the following two lines to remove accumulated metadata during testing. - # "comment" - # "scanned"] - - if is_collection: - iter = genquery.row_iterator( - "COLL_ID, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + path + "'", - genquery.AS_LIST, ctx - ) - else: - iter = genquery.row_iterator( - "DATA_ID, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + pathutil.dirname(path) + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", - genquery.AS_LIST, ctx - ) - - for _row in iter: - metadata_name = _row[1] - if metadata_name in intake_metadata_set: - if is_collection: - try: - avu.rmw_from_coll(ctx, path, metadata_name, '%') - except Exception as e: - log.write(ctx, "Warning: unable to remove metadata attr {} from {}".format(metadata_name, path)) - log.write(ctx, "Removing metadata failed with exception {}".format(str(e))) - else: - try: - avu.rmw_from_data(ctx, path, metadata_name, '%') - except Exception as e: - log.write(ctx, "Warning: unable to remove metadata attr {} from {}".format(metadata_name, path)) - log.write(ctx, "Removing metadata failed with exception {}".format(str(e))) - - -def scan_mark_scanned(ctx, path, is_collection): - """Sets the username of the scanner and a timestamp as metadata on the scanned object. - - :param ctx: Combined type of a callback and rei struct - :param path: Path on which to add scan indication to - :param is_collection: Is scanned object a collection? 
- """ - timestamp = int(time.time()) - user_and_timestamp = user.name(ctx) + ':' + str(timestamp) # str(datetime.date.today()) - - if is_collection: - avu.set_on_coll(ctx, path, 'scanned', user_and_timestamp) - else: - avu.set_on_data(ctx, path, 'scanned', user_and_timestamp) - - -def apply_dataset_metadata(ctx, path, scope, is_collection): - """Apply dataset metadata to an object in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path to the object - :param scope: A scanner scope containing WEPV values - :param is_collection: Whether the object is a collection - """ - for key in scope: - if scope[key]: - if is_collection: - avu.set_on_coll(ctx, path, key, scope[key]) - else: - avu.set_on_data(ctx, path, key, scope[key]) - - -def apply_partial_metadata(ctx, scope, path, is_collection): - """Apply any available id component metadata to the given object. - - To be called only for objects outside datasets. When inside a dataset - (or at a dataset toplevel), use intake_apply_dataset_metadata() instead. - - :param ctx: Combined type of a callback and rei struct - :param scope: A scanner scope containing some WEPV values - :param path: Path to the object - :param is_collection: Whether the object is a collection - """ - keys = ['wave', 'experiment_type', 'pseudocode', 'version'] - for key in keys: - if key in scope: - if scope[key]: - if is_collection: - avu.set_on_coll(ctx, path, key, scope[key]) - else: - avu.set_on_data(ctx, path, key, scope[key]) - - -def dataset_add_error(ctx, top_levels, is_collection_toplevel, text, suppress_duplicate_avu_error=False): - """Add a dataset error to all given dataset toplevels. - - :param ctx: Combined type of a callback and rei struct - :param top_levels: A list of toplevel datasets - :param is_collection_toplevel: Indication of whether it is a collection or object - :param text: Error text - :param suppress_duplicate_avu_error: If an AVU already exists, suppress the irods-error. Allow for this situation - - :raises Exception: Raises exception when associating error to collection or data object fails - """ - for tl in top_levels: - if is_collection_toplevel: - try: - avu.associate_to_coll(ctx, tl, "dataset_error", text) - except msi.Error as e: - # iRODS errorcode 809000 (CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME) - if suppress_duplicate_avu_error and str(e).find("809000") > -1: - log.write(ctx, "Trying to associate dataset_error already present on collection: {}".format(tl)) - log.write(ctx, "Suppress error handling for AVU: dataset_error - {}".format(text)) - else: - raise Exception(e) - else: - try: - avu.associate_to_data(ctx, tl, "dataset_error", text) - except msi.Error as e: - # iRODS errorcode 809000 (CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME) - if suppress_duplicate_avu_error and str(e).find("809000") > -1: - log.write(ctx, "Trying to associate dataset_error already present on data object: {}".format(tl)) - log.write(ctx, "Suppress error handling for AVU: dataset_error - {}".format(text)) - else: - raise Exception(e) - - -def dataset_get_ids(ctx, coll): - """Find dataset ids under collection. 
- :param ctx: Combined type of a callback and rei struct - :param coll: Collection name for which to find dataset-ids - :returns: Returns a set of dataset ids - """ - data_ids = set() - - # Get distinct data_ids - main_collection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'dataset_id' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "COLL_NAME LIKE '" + coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' ", - genquery.AS_LIST, ctx - ) - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - if row[0]: - data_ids.add(row[0]) - - return data_ids - - -def intake_check_datasets(ctx, root): - """Run checks on all datasets under root. - - :param ctx: Combined type of a callback and rei struct - :param root: The collection to get datasets for - """ - dataset_ids = dataset_get_ids(ctx, root) - for dataset_id in dataset_ids: - intake_check_dataset(ctx, root, dataset_id) - - -def intake_check_dataset(ctx, root, dataset_id): - """Run checks on the dataset specified by the given dataset id. - - This function adds object counts and error counts to top-level objects within the dataset. - For historical reasons, it also adds a warning count, which is always 0. - - :param ctx: Combined type of a callback and rei struct - :param root: Collection name - :param dataset_id: Dataset identifier - """ - tl_info = intake.get_dataset_toplevel_objects(ctx, root, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # Check validity of wav - waves = ["20w", "30w", "0m", "5m", "10m", "3y", "6y", "9y", "12y", "15y"] - components = dataset_parse_id(dataset_id) - if components['wave'] not in waves: - dataset_add_error(ctx, tl_objects, is_collection, "The wave '" + components['wave'] + "' is not in the list of accepted waves") - - # check presence of wave, pseudo-ID and experiment - if '' in [components['wave'], components['experiment_type'], components['pseudocode']]: - # Suppress error handing and continue normal processing should a situation arise where Wepv missing is already present on the dataobject/collection - dataset_add_error(ctx, tl_objects, is_collection, "Wave, experiment type or pseudo-ID missing", True) - - for tl in tl_objects: - # Save the aggregated counts of #objects, #warnings, #errors on object level - - count = get_aggregated_object_count(ctx, dataset_id, tl) - if is_collection: - avu.set_on_coll(ctx, tl, "object_count", str(count)) - else: - avu.set_on_data(ctx, tl, "object_count", str(count)) - - count = get_aggregated_object_error_count(ctx, tl) - if is_collection: - avu.set_on_coll(ctx, tl, "object_errors", str(count)) - else: - avu.set_on_data(ctx, tl, "object_errors", str(count)) - - count = 0 - if is_collection: - avu.set_on_coll(ctx, tl, "object_warnings", str(count)) - else: - avu.set_on_data(ctx, tl, "object_warnings", str(count)) - - -def get_rel_paths_objects(ctx, root, dataset_id): - """Get a list of relative paths to all data objects in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param root: Root path of the dataset - :param dataset_id: Dataset identifier - - :returns: List of objects of relative object paths (e.g. file1.dat, some-subdir/file2.dat...) 
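The per-dataset checks above amount to two validations on the parsed dataset id: the wave must be one of ten accepted values, and wave, experiment type and pseudocode must all be non-empty. A standalone sketch of those rules, assuming a plain dict of parsed components (the helper name and return shape are illustrative):

ACCEPTED_WAVES = ["20w", "30w", "0m", "5m", "10m", "3y", "6y", "9y", "12y", "15y"]

def dataset_check_errors(components):
    """Return the error texts that the removed intake_check_dataset() would attach."""
    errors = []
    if components.get("wave") not in ACCEPTED_WAVES:
        errors.append("The wave '" + components.get("wave", "") + "' is not in the list of accepted waves")
    if "" in [components.get("wave", ""), components.get("experiment_type", ""), components.get("pseudocode", "")]:
        errors.append("Wave, experiment type or pseudo-ID missing")
    return errors

# An unknown wave and a missing pseudocode trigger both checks.
print(dataset_check_errors({"wave": "2y", "experiment_type": "echo", "pseudocode": ""}))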
- """ - tl_info = intake.get_dataset_toplevel_objects(ctx, root, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - rel_path_objects = [] - - # get the correct parent_collection - try: - if is_collection: - parent_coll = tl_objects[0] - else: - parent_coll = pathutil.dirname(tl_objects[0]) - except Exception: - parent_coll = '/' - - main_collection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + parent_coll + "' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME LIKE '" + parent_coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - # Add objects including relative paths - rel_path_objects.append(row[1][len(parent_coll):] + '/' + row[0]) - - return rel_path_objects - - -def get_aggregated_object_count(ctx, dataset_id, tl_collection): - """Return total amounts of objects. - - :param ctx: Combined type of a callback and rei struct - :param dataset_id: Dataset id - :param tl_collection: Collection name of top level - - :returns: Aggregated object count - """ - main_collection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME = '" + tl_collection + "' AND META_DATA_ATTR_NAME = 'dataset_id' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME like '" + tl_collection + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - return len(list(main_collection_iterator) + list(subcollection_iterator)) - - -def get_aggregated_object_error_count(ctx, tl_collection): - """Return total amount of object errors. - - :param ctx: Combined type of a callback and rei struct - :param tl_collection: Collection name of top level - - :returns: Total amount of object errors - """ - main_collection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME = '" + tl_collection + "' AND META_DATA_ATTR_NAME = 'error' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME like '" + tl_collection + "/%' AND META_DATA_ATTR_NAME = 'error' ", - genquery.AS_LIST, ctx - ) - - return len(list(main_collection_iterator) + list(subcollection_iterator)) diff --git a/intake_utils.py b/intake_utils.py deleted file mode 100644 index ff90cf7f6..000000000 --- a/intake_utils.py +++ /dev/null @@ -1,204 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Utility functions for the intake module. These are in a separate file so that - we can test the main logic without having iRODS-related dependencies in the way.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import os -import re - - -def intake_tokens_identify_dataset(tokens): - """Check whether the tokens gathered so far are sufficient for identifying a dataset. 
- - :param tokens: A dictionary of tokens - - :returns: Returns whether a dataset is identified - """ - required = ['wave', 'experiment_type', 'pseudocode'] # version is optional - - missing = 0 - for req_token in required: - # required tokens must be present and must have a value - if req_token not in tokens or tokens[req_token] == "": - missing = missing + 1 - - return (missing == 0) - - -def intake_ensure_version_present(ctx, metadata): - """Adds a version attribute with a default value to metadata if it is not yet present. - - :param ctx: Combined type of a callback and rei struct - :param metadata: Dictionary with intake module metadata - """ - if "version" not in metadata: - metadata["version"] = "Raw" - - -def intake_extract_tokens_from_name(ctx, path, scoped_buffer): - """Extract one or more tokens from a file / directory name and add dataset information as metadata. - :param ctx: Combined type of a callback and rei struct - :param path: Full path of the data object or collection - :param scoped_buffer: Holds dataset buffer with prefilled keys - :returns: Returns extended scope buffer - """ - basename = os.path.basename(path) - name_without_ext = os.path.splitext(basename)[0] - parts = re.split("[_-]", name_without_ext) - for part in parts: - scoped_buffer.update(intake_extract_tokens(ctx, part)) - return scoped_buffer - - -def intake_extract_tokens(ctx, string): - """Extract tokens from a string and return as dict. - - :param ctx: Combined type of a callback and rei struct - :param string: Token of which to be determined whether experiment type, version etc - - :returns: Returns found kv's - """ - exp_types = ["pci", - "echo", - "facehouse", - "faceemo", - "coherence", - "infprogap", - "infsgaze", - "infpop", - # "mriinhibition", - # "mriemotion", - # "mockinhibition", - "chprogap", - "chantigap", - "chsgaze", - "pciconflict", - "pcivacation", - "peabody", - "discount", - "cyberball", - "trustgame", - "other", - # MRI: - "inhibmockbehav", - "inhibmribehav", - "emotionmribehav", - "emotionmriscan", - "anatomymriscan", - "restingstatemriscan", - "dtiamriscan", - "dtipmriscan", - "mriqcreport", - "mriqceval", - "vasmri", - "vasmock", - # - "looklisten", - "handgame", - "infpeabody", - "delaygratification", - "dtimriscan", - "inhibmriscan", - # 16-Apr-2019 fbyoda email request new exp type: - "chdualet", - # 15-Feb-2021 fbyoda email request new exp type: - "functionalmriscan", - "infdualet", - "vrbartbehav", - "infssat"] - - str_lower = string.lower() - str_upper = string.upper() - str_for_pseudocode_test = string.split('.')[0] - str_for_version_test = string.translate(None, ".") - - foundKVs = {} - if re.match('^[0-9]{1,2}[wmy]$', str_lower) is not None: - # String contains a wave. - # Wave validity is checked later on in the dataset checks. - foundKVs["wave"] = str_lower - elif re.match('^[bap][0-9]{5}$', str_for_pseudocode_test.lower()) is not None: - # String contains a pseudocode. - foundKVs["pseudocode"] = str_upper[0:len(str_for_pseudocode_test)] - elif re.match('^[Vv][Ee][Rr][A-Z][a-zA-Z0-9-]*$', str_for_version_test) is not None: - foundKVs["version"] = string[3:len(string)] - elif str_lower in exp_types: - foundKVs["experiment_type"] = str_lower - - return foundKVs - - -def intake_scan_get_metadata_update(ctx, path, is_collection, in_dataset, parent_metadata): - """Determine metadata to be updated for a particular collection or data object, based - on its name and parent metadata. 
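The token extraction removed above classifies each underscore- or hyphen-separated part of a name (extension stripped) with regular expressions: a wave such as 10m, a pseudocode such as B00001, a version prefixed with "Ver", or a known experiment type. The deleted code runs on Python 2 only (str.translate(None, ".") does not work on Python 3 strings); a minimal Python 3 sketch of the same classification, with the experiment-type list abbreviated for brevity:

import re

EXPERIMENT_TYPES = {"pci", "echo", "chantigap", "discount"}  # abbreviated for the example

def classify_token(token):
    """Classify one name part as wave, pseudocode, version or experiment type."""
    lowered = token.lower()
    pseudocode_part = token.split(".")[0]
    version_part = token.replace(".", "")  # Python 3 equivalent of str.translate(None, ".")
    if re.match(r"^[0-9]{1,2}[wmy]$", lowered):
        return {"wave": lowered}
    if re.match(r"^[bap][0-9]{5}$", pseudocode_part.lower()):
        return {"pseudocode": token.upper()[0:len(pseudocode_part)]}
    if re.match(r"^[Vv][Ee][Rr][A-Z][a-zA-Z0-9-]*$", version_part):
        return {"version": token[3:]}
    if lowered in EXPERIMENT_TYPES:
        return {"experiment_type": lowered}
    return {}

tokens = {}
for part in re.split("[_-]", "B00001_10m_echo_VerA"):
    tokens.update(classify_token(part))
print(tokens)  # {'pseudocode': 'B00001', 'wave': '10m', 'experiment_type': 'echo', 'version': 'A'}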
- - This function is separate from the function that actually performs the updates, so - that we can test the logic separately. - - :param ctx: Combined type of a callback and rei struct - :param path: Full path of the data object or collection - :param is_collection: true if it's a collection, false if it's a data object - :param in_dataset: true if the parent already has complete WEP(V) attributes. Otherwise false. - :param parent_metadata: dict containing the intake module metadata of the parent collection ( if any) - - :returns: Returns a dictionary with the following keys / values: - new_metadata: dictionary of new metadata to apply to this data object or collection - in_dataset: true if current object (along with values passed from parents) has complete WEP(V) values. - otherwise false. - """ - - local_metadata = parent_metadata.copy() - - result = {"new_metadata": local_metadata, "in_dataset": in_dataset} - - if in_dataset: - # If we already are in a dataset, we get all the metadata from the parent. We - # cannot override attributes in this case. However we need to remove the top-level - # attribute, because the present object is within in a dataset, and thus not a top-level - # data object. - if "dataset_toplevel" in local_metadata: - del [local_metadata["dataset_toplevel"]] - else: - intake_extract_tokens_from_name(ctx, path, local_metadata) - if intake_tokens_identify_dataset(local_metadata): - intake_ensure_version_present(ctx, local_metadata) - local_metadata["directory"] = path if is_collection else os.path.dirname(path) - local_metadata["dataset_id"] = dataset_make_id(local_metadata) - local_metadata["dataset_toplevel"] = dataset_make_id(local_metadata) - result["in_dataset"] = True - else: - # result["in_dataset"] is already set to false - pass - - return result - - -def dataset_make_id(scope): - """Construct a dataset based on WEPV and directory. - - :param scope: Create a dataset id - - :returns: Dataset identifier - """ - return scope['wave'] + '\t' + scope['experiment_type'] + '\t' + scope['pseudocode'] + '\t' + scope['version'] + '\t' + scope['directory'] - - -def dataset_parse_id(dataset_id): - """Parse a dataset into its consructive data. - - :param dataset_id: Dataset identifier - - :returns: Dataset as a dict - """ - dataset_parts = dataset_id.split('\t') - dataset = {} - dataset['wave'] = dataset_parts[0] - dataset['experiment_type'] = dataset_parts[1] - dataset['pseudocode'] = dataset_parts[2] - dataset['version'] = dataset_parts[3] - dataset['directory'] = dataset_parts[4] - - return dataset diff --git a/intake_vault.py b/intake_vault.py deleted file mode 100644 index bc0e85258..000000000 --- a/intake_vault.py +++ /dev/null @@ -1,412 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake vault.""" - -__copyright__ = 'Copyright (c) 2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools -import time - -import genquery - -import intake -import intake_lock -import intake_scan -from util import * - -__all__ = ['rule_intake_to_vault'] - - -@rule.make(inputs=range(2), outputs=range(2, 2)) -def rule_intake_to_vault(ctx, intake_root, vault_root): - # 1. add to_vault_freeze metadata lock to the dataset - # 2. check that dataset does not yet exist in the vault - # 3. copy dataset to vault with its metadata - # 4. 
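A dataset id in the removed module is simply the WEPV values plus the directory joined by tab characters, and dataset_parse_id() is the exact inverse of dataset_make_id(). A small round-trip sketch with illustrative values:

FIELDS = ["wave", "experiment_type", "pseudocode", "version", "directory"]

def make_id(scope):
    # Tab-separated WEPV + directory, as in the removed dataset_make_id().
    return "\t".join(scope[field] for field in FIELDS)

def parse_id(dataset_id):
    # Exact inverse, as in the removed dataset_parse_id().
    return dict(zip(FIELDS, dataset_id.split("\t")))

scope = {"wave": "3y", "experiment_type": "discount", "pseudocode": "B00000",
         "version": "Raw", "directory": "/tempZone/home/grp-intake-initial/dir"}
assert parse_id(make_id(scope)) == scope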
remove dataset from intake - # upon any error: - # - delete partial data from vault - # - add error to intake dataset metadata - # - remove locks on intake dataset (to_vault_freeze, to_vault_lock) - - # note that we have to allow for multiple types of datasets: - # type A: a single toplevel collection with a tree underneath - # type B: one or more datafiles located within the same collection - # processing varies slightly between them, so process each type in turn - # - - # status: 0 is success, nonzero is error - status = 0 - # counter of datasets moved to the vault area - datasets_moved = 0 - - # TYPE A: - c_main_collection_iterator = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_VALUE", - "META_COLL_ATTR_NAME = 'dataset_toplevel' AND COLL_NAME = '" + intake_root + "'", - genquery.AS_LIST, ctx) - - for row in itertools.chain(c_main_collection_iterator): - toplevel_collection = row[0] - dataset_id = row[1] - # Get status ( locked / frozen ) - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, True) - if locked_state['locked']: - # Freeze the dataset - intake_lock.intake_dataset_freeze(ctx, toplevel_collection, dataset_id) - - # Dataset frozen, now move to vault and remove from intake area - status = dataset_collection_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root) - if status == 0: - datasets_moved += 1 - - # TYPE B: - d_main_collection_iterator = genquery.row_iterator( - "COLL_NAME, META_DATA_ATTR_VALUE", - "META_DATA_ATTR_NAME = 'dataset_toplevel' AND COLL_NAME = '" + intake_root + "'", - genquery.AS_LIST, ctx) - - for row in itertools.chain(d_main_collection_iterator): - toplevel_collection = row[0] - dataset_id = row[1] - # check if to_vault_lock exists on all the dataobjects of this dataset - all_locked = True - iter2 = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx) - - for row2 in iter2: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection + '/' + row2[0], False) - all_locked = all_locked and locked_state['locked'] - if not all_locked: - break - - if all_locked: - # Freeze the dataset - intake_lock.intake_dataset_freeze(ctx, toplevel_collection, dataset_id) - - # Dataset frozen, now move to fault and remove from intake area - status = dataset_objects_only_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root) - if status == 0: - datasets_moved += 1 - - if datasets_moved: - log.write(ctx, "Datasets moved to the vault: " + str(datasets_moved)) - - return 0 - - -def dataset_collection_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root): - """Move intake datasets consisting of collections to the vault - - :param ctx: Combined type of a callback and rei struct - :param toplevel_collection: Toplevel collection - :param dataset_id: Identifier of dataset - :param vault_root: Root path of vault - - :returns: Status - """ - status = 0 - if vault_dataset_exists(ctx, vault_root, dataset_id): - # duplicate dataset, signal error and throw out of vault queue - log.write(ctx, "INFO: version already exists in vault: " + dataset_id) - message = "Duplicate dataset, version already exists in vault" - intake_scan.dataset_add_error(ctx, [toplevel_collection], True, message) - intake_lock.intake_dataset_melt(ctx, toplevel_collection, dataset_id) - intake_lock.intake_dataset_unlock(ctx, toplevel_collection, dataset_id) - return 1 - - # Dataset does not exist - move 
from research to vault area - vault_path = get_dataset_path(vault_root, dataset_id) - - vault_parent = pathutil.chop(vault_path)[0] - try: - collection.create(ctx, vault_parent, "1") - except Exception: - log.write(ctx, "ERROR: parent collection could not be created " + vault_parent) - return 2 - - # variable for treewalk interface - buffer = {} - buffer["source"] = toplevel_collection - buffer["destination"] = vault_path - - status = vault_tree_walk_collection(ctx, toplevel_collection, buffer, vault_walk_ingest_object) - - # reset buffer - buffer = {} - if status == 0: - # stamp the vault dataset collection with additional metadata - avu.set_on_coll(ctx, vault_path, "dataset_date_created", str(int(time.time()))) - - # and finally remove the dataset original in the intake area - try: - collection.remove(ctx, toplevel_collection) - except Exception: - log.write(ctx, "ERROR: unable to remove intake collection " + toplevel_collection) - return 3 - else: - # move failed (partially), cleanup vault - # NB: keep the dataset in the vault queue so we can retry some other time - log.write("ERROR: Ingest failed for " + dataset_id + ", error = " + status) - status = vault_tree_walk_collection(ctx, vault_path, buffer, vault_walk_remove_object) - - return status - - -def dataset_objects_only_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root): - """Move intake datasets consisting of data objects to the vault - - :param ctx: Combined type of a callback and rei struct - :param toplevel_collection: Toplevel collection - :param dataset_id: Identifier of dataset - :param vault_root: Root path of vault - - :returns: Status - """ - status = 0 - if vault_dataset_exists(ctx, vault_root, dataset_id): - # duplicate dataset, signal error and throw out of vault queue - log.write(ctx, "INFO: version already exists in vault: " + dataset_id) - message = "Duplicate dataset, version already exists in vault" - - tl_info = intake.get_dataset_toplevel_objects(ctx, toplevel_collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # dataset_add_error(ctx, tl_objects, is_collection, "The wave '" + components['wave'] + "' is not in the list of accepted waves") - - intake_scan.dataset_add_error(ctx, tl_objects, is_collection, message) - intake_lock.intake_dataset_melt(ctx, toplevel_collection, dataset_id) - intake_lock.intake_dataset_unlock(ctx, toplevel_collection, dataset_id) - return 1 - - # Dataset does not exist - move it from research to vault space - # new dataset(version) we can safely ingest into vault - vault_path = get_dataset_path(vault_root, dataset_id) - - # create path to and including the toplevel collection (will create in-between levels) - try: - collection.create(ctx, vault_path, "1") - except Exception: - log.write(ctx, "ERROR: parent collection could not be created " + vault_path) - return 2 - - # stamp the vault dataset collection with default metadata - try: - vault_dataset_add_default_metadata(ctx, vault_path, dataset_id) - except Exception: - log.write(ctx, "ERROR: default metadata could not be added to " + vault_path) - return 3 - - # copy data objects to the vault - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - intake_path = toplevel_collection + '/' + row[0] - - status = vault_ingest_object(ctx, intake_path, False, vault_path + "/" + row[0]) - if 
status: - break - - # data ingested, what's left is to delete the original in intake area - # this will also melt/unfreeze etc because metadata is removed too - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - intake_path = toplevel_collection + "/" + row[0] - # Now remove data object in intake - try: - data_object.remove(ctx, intake_path, force=True) - except Exception: - log.write(ctx, "ERROR: unable to remove intake object " + intake_path) - # error occurred during ingest, cleanup vault area and relay the error to user - # NB: keep the dataset in the vault queue so we can retry some other time - log.write(ctx, "ERROR: Ingest failed for *datasetId error = *status") - - # reset buffer interface - buffer = {} - status = vault_tree_walk_collection(ctx, vault_path, buffer, vault_walk_remove_object) - - # Finally return status - return status - - -def vault_ingest_object(ctx, object_path, is_collection, vault_path): - # from the original object only the below list is copied to the vault object, other info is ignored - copied_metadata = ["wave", "experiment_type", "pseudocode", "version", - "error", "warning", "comment", "dataset_error", - "dataset_warning", "datasetid"] - - if not is_collection: - # first chksum the original file then use it to verify the vault copy - try: - ctx.msiDataObjChksum(object_path, "forceChksum=", 0) - ctx.msiDataObjCopy(object_path, vault_path, 'verifyChksum=', 0) - except msi.Error: - return 1 - - coll, dataname = pathutil.chop(object_path) - - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND DATA_NAME = '" + dataname + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - if row[0] in copied_metadata: - avu.set_on_data(ctx, vault_path, row[0], row[1]) - - # add metadata found in system info - iter = genquery.row_iterator( - "DATA_OWNER_NAME, DATA_OWNER_ZONE, DATA_CREATE_TIME", - "COLL_NAME = '" + coll + "' AND DATA_NAME = '" + dataname + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - avu.set_on_data(ctx, vault_path, "submitted_by=", row[0] + '#' + row[1]) - avu.set_on_data(ctx, vault_path, "submitted_date", row[2]) - else: - # CREATE COLLECTION - try: - collection.create(ctx, vault_path, "1") - except msi.Error: - return 1 - - iter = genquery.row_iterator( - "META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + object_path + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - if row[0] in copied_metadata: - avu.set_on_coll(ctx, vault_path, row[0], row[1]) - - # add metadata found in system info - iter = genquery.row_iterator( - "COLL_OWNER_NAME, COLL_OWNER_ZONE, COLL_CREATE_TIME", - "COLL_NAME = '" + object_path + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - avu.set_on_coll(ctx, vault_path, "submitted_by=", row[0] + '#' + row[1]) - avu.set_on_coll(ctx, vault_path, "submitted_date", row[2]) - - return 0 - - -def vault_walk_remove_object(ctx, item_parent, item_name, is_collection): - status = 0 - try: - if is_collection: - collection.remove(ctx, item_parent + '/' + item_name) - else: - data_object.remove(ctx, item_parent + '/' + item_name, force=True) - except Exception: - status = 1 - - return status - - -def vault_walk_ingest_object(ctx, item_parent, item_name, is_collection, buffer): - source_path = item_parent + '/' + item_name - dest_path = buffer["destination"] - if source_path 
!= buffer["source"]: - # rewrite path to copy objects that are located underneath the toplevel collection - source_length = len(source_path) - relative_path = source_path[(len(buffer["source"]) + 1): source_length] - dest_path = buffer["destination"] + '/' + relative_path - - return vault_ingest_object(ctx, source_path, is_collection, dest_path) - - -def vault_tree_walk_collection(ctx, path, buffer, rule_to_process): - """Walk a subtree and perform 'rule_to_process' per item. - - :param ctx: Combined type of a callback and rei struct - :param path: Path of collection to treewalk - :param buffer: Exclusively to be used by the rule we will can - :param rule_to_process: Name of the rule to be executed in the context of a tree-item - - :returns: Error status - """ - parent_collection, collection = pathutil.chop(path) - - error = 0 - # first deal with any subcollections within this collection - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '" + path + "' ", - genquery.AS_LIST, ctx) - for row in iter: - error = vault_tree_walk_collection(ctx, row[0], buffer, rule_to_process) - if error: - break - - # when done then process the dataobjects directly located within this collection - if error == 0: - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + path + "' ", - genquery.AS_LIST, ctx) - for row in iter: - error = rule_to_process(ctx, path, row[0], False, buffer) - if error: - break - - # and lastly process the collection itself - if error == 0: - error = rule_to_process(ctx, parent_collection, collection, True, buffer) - - return error - - -def vault_dataset_add_default_metadata(ctx, vault_path, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # my_date = datetime.now() - # id_components["dataset_date_created"] = my_date.strftime('%Y-%m-%dT%H:%M:%S.%f%z') - id_components["dataset_date_created"] = str(int(time.time())) - - keys = ["wave", "experiment_type", "pseudocode", "version", "dataset_date_created"] - for key in keys: - try: - avu.set_on_data(ctx, vault_path, key, id_components[key]) - except Exception: - avu.set_on_coll(ctx, vault_path, key, id_components[key]) - - -def vault_dataset_exists(ctx, vault_root, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # Beware! extra 'ver' before version from original code: *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - wepv = id_components["wave"] + "_" + id_components["experiment_type"] + "_" + id_components["pseudocode"] + "_ver" + id_components["version"] - dataset_path = vault_root + '/' + id_components["wave"] + "/" + id_components["experiment_type"] + "/" + id_components["pseudocode"] + "/" + wepv - - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME = '" + dataset_path + "' ", - genquery.AS_LIST, ctx) - - for _row in iter: - return True - - return False - - -def get_dataset_path(root, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # Beware! 
extra 'ver' before version from original code: *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - wepv = id_components["wave"] + "_" + id_components["experiment_type"] + "_" + id_components["pseudocode"] + "_ver" + id_components["version"] - - return root + '/' + id_components["wave"] + "/" + id_components["experiment_type"] + "/" + id_components["pseudocode"] + "/" + wepv diff --git a/policies_intake.py b/policies_intake.py index 5b490e63e..159ddbca9 100644 --- a/policies_intake.py +++ b/policies_intake.py @@ -1,15 +1,54 @@ # -*- coding: utf-8 -*- -"""iRODS policy implementations.""" +"""Policies for intake.""" -__copyright__ = 'Copyright (c) 2021, Utrecht University' +__copyright__ = 'Copyright (c) 2021-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import genquery -import intake_scan from util import * +def object_is_locked(ctx, path, is_collection): + """Returns whether given object in path (collection or dataobject) is locked or frozen + + :param ctx: Combined type of a callback and rei struct + :param path: Path to object or collection + :param is_collection: Whether path contains a collection or data object + + :returns: Returns locked state + """ + locked_state = {"locked": False, + "frozen": False} + + if is_collection: + iter = genquery.row_iterator( + "META_COLL_ATTR_NAME", + "COLL_NAME = '" + path + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + if row[0] in ['to_vault_lock', 'to_vault_freeze']: + locked_state['locked'] = True + if row[0] == 'to_vault_freeze': + locked_state['frozen'] = True + else: + parent_coll = pathutil.dirname(path) + iter = genquery.row_iterator( + "META_DATA_ATTR_NAME", + "COLL_NAME = '" + parent_coll + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", + genquery.AS_LIST, ctx + ) + # return locked_state + for row in iter: + if row[0] in ['to_vault_lock', 'to_vault_freeze']: + locked_state['locked'] = True + if row[0] == 'to_vault_freeze': + locked_state['frozen'] = True + + return locked_state + + def is_data_in_locked_dataset(ctx, actor, path): """ Check whether given data object is within a locked dataset """ dataset_id = '' @@ -64,7 +103,7 @@ def is_data_in_locked_dataset(ctx, actor, path): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: @@ -117,7 +156,7 @@ def is_coll_in_locked_dataset(ctx, actor, coll): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: @@ -169,7 +208,7 @@ def coll_in_path_of_locked_dataset(ctx, actor, coll): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: diff --git a/rules_uu.cfg.template b/rules_uu.cfg.template index 50a1b863d..8524920ff 
100644 --- a/rules_uu.cfg.template +++ b/rules_uu.cfg.template @@ -36,7 +36,6 @@ eus_api_tls_verify = enable_deposit = enable_open_search = -enable_intake = enable_datarequest = yoda_portal_fqdn = diff --git a/setup.cfg b/setup.cfg index ab6ee494a..a9ef75804 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,4 +5,4 @@ strictness=short docstring_style=sphinx max-line-length=127 exclude=__init__.py,tools,tests/env/ -application-import-names=avu,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils +application-import-names=avu,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils diff --git a/tests/conftest.py b/tests/conftest.py index a7c983b15..a13b3b9da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,6 @@ datarequest = False deposit = False -intake = False archive = False smoke = False skip_api = False @@ -45,7 +44,6 @@ def pytest_addoption(parser): parser.addoption("--datarequest", action="store_true", default=False, help="Run datarequest tests") parser.addoption("--deposit", action="store_true", default=False, help="Run deposit tests") - parser.addoption("--intake", action="store_true", default=False, help="Run intake tests") parser.addoption("--archive", action="store_true", default=False, help="Run vault archive tests") parser.addoption("--no-env-csrf", action="store_true", default=False, help="Do not get CSRF token from environment (this is enabled by default for smoke tests)") parser.addoption("--smoke", action="store_true", default=False, help="Run Smoke tests") @@ -59,7 +57,6 @@ def pytest_addoption(parser): def pytest_configure(config): config.addinivalue_line("markers", "datarequest: Run datarequest tests") config.addinivalue_line("markers", "deposit: Run deposit tests") - config.addinivalue_line("markers", "intake: Run intake tests") config.addinivalue_line("markers", "archive: Run vault archive tests") config.addinivalue_line("markers", "all: Run all tests") config.addinivalue_line("markers", "ui: UI test") @@ -86,10 +83,9 @@ def pytest_configure(config): global verbose_test verbose_test = config.getoption("--verbose-test") - global datarequest, deposit, intake, archive, smoke, run_all, skip_api, skip_ui, no_env_csrf + global datarequest, deposit, archive, smoke, run_all, skip_api, skip_ui, no_env_csrf datarequest = config.getoption("--datarequest") deposit = config.getoption("--deposit") - intake = config.getoption("--intake") archive = config.getoption("--archive") smoke = 
config.getoption("--smoke") skip_ui = config.getoption("--skip-ui") @@ -109,7 +105,6 @@ def pytest_configure(config): if run_all: datarequest = True deposit = True - intake = True archive = True # Store cookies for each user. @@ -131,10 +126,6 @@ def pytest_bdd_apply_tag(tag, function): marker = pytest.mark.skip(reason="Skip deposit") marker(function) return True - elif tag == 'intake' and not intake: - marker = pytest.mark.skip(reason="Skip intake") - marker(function) - return True elif tag == 'archive' and not archive: marker = pytest.mark.skip(reason="Skip vault archive") marker(function) diff --git a/tests/features/api/api_intake.feature b/tests/features/api/api_intake.feature deleted file mode 100644 index 7f3e1e007..000000000 --- a/tests/features/api/api_intake.feature +++ /dev/null @@ -1,195 +0,0 @@ -@api @intake -Feature: Intake API - - Scenario Outline: Find all studies a user is involved with - Given user is authenticated - And the Yoda intake list studies API is queried - Then the response status code is "200" - And study is returned - - Examples: - | user | study | - | researcher | initial | - | researcher | test | - | datamanager | initial | - | datamanager | test | - - - Scenario Outline: Find all studies a user is datamanager of - Given user is authenticated - And the Yoda intake list datamanager studies API is queried - Then the response status code is "200" - And study is returned - - Examples: - | user | study | - | datamanager | initial | - | datamanager | test | - - - Scenario Outline: Get the total count of all files in a collection - Given user is authenticated - And the Yoda intake count total files API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Get list of all unrecognized and unscanned files - Given user is authenticated - And the Yoda intake list unrecognized files API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/yoda/grp-intake-initial | - | researcher | /tempZone/yoda/grp-intake-initial | - - - Scenario Outline: Get list of all datasets - Given user is authenticated - And the Yoda intake list datasets API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Scan for and recognize datasets in study intake area - Given user is authenticated - And the Yoda intake scan for datasets API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Lock dataset in study intake area - Given user is authenticated - And the Yoda intake lock API is queried with dataset id and collection - Then the response status code is "200" - # And ... 
- - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot lock non-existent dataset - Given user is authenticated - And the Yoda intake lock API is queried with dataset id and collection - # Errors during locking individual datasets do not result in an error status code. This test - # codifies current behaviour of this API endpoint. - Then the response status code is "200" - And the result is equivalent to {"error_dataset_ids": ["3y\ndiscount\nB99999\nRaw"], "error_msg": "Something went wrong locking datasets", "proc_status": "NOK"} - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B99999*Raw | - - - Scenario Outline: Unlock dataset in study intake area - Given user is authenticated - And the Yoda intake unlock API is queried with dataset id and collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot unlock non-existent dataset - Given user is authenticated - And the Yoda intake unlock API is queried with dataset id and collection - # Errors during unlocking individual datasets do not result in an error status code. This test - # codifies current behaviour of this API endpoint. - Then the response status code is "200" - And the result is equivalent to {"error_dataset_ids": ["3y\ndiscount\nB99999\nRaw"], "error_msg": "Something went wrong unlocking datasets", "proc_status": "NOK"} - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B99999*Raw | - - - Scenario Outline: Get all details for a dataset - Given user is authenticated - And the Yoda intake dataset get details API is queried with dataset id and collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Add a comment to a dataset - Given user is authenticated - And the Yoda intake dataset add comment API is queried with dataset id , study id and comment - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | comment | dataset_id | - | datamanager | grp-intake-initial | comment1 | 3y*discount*B00000*Raw | - | researcher | grp-intake-initial | comment2 | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot add comment to nonexistent dataset - Given user is authenticated - And the Yoda intake dataset add comment API is queried with dataset id , study id and comment - # Adding a comment to a nonexistent dataset currently does not result in an error status code. This test - # codifies current behaviour of this API endpoint. 
- Then the response status code is "200" - And the result is equivalent to {"error_msg": "Dataset does not exist", "proc_status": "NOK"} - - Examples: - | user | study_id | comment | dataset_id | - | datamanager | grp-intake-initial | comment1 | 3y*discount*B99999*Raw | - - - Scenario Outline: Get vault dataset related counts for reporting for a study - Given user is authenticated - And the Yoda intake report vault dataset counts per study API is queried with study id - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | - | datamanager | grp-intake-initial | - - - Scenario Outline: Get aggregated vault dataset info for reporting for a study - Given user is authenticated - And the Yoda intake report vault aggregated info API is queried with study id - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | - | datamanager | grp-intake-initial | - - - Scenario Outline: Get vault data for export of a study - Given user is authenticated - And the Yoda intake report export study data API is queried with study id - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | - | datamanager | grp-intake-initial | diff --git a/tests/features/api/api_resources.feature b/tests/features/api/api_resources.feature index 52ab3356e..ba3c35432 100644 --- a/tests/features/api/api_resources.feature +++ b/tests/features/api/api_resources.feature @@ -21,7 +21,7 @@ Feature: Resources API And only 1 group is found Examples: - | user | group | + | user | group | | researcher | research-core-1 | | datamanager | research-core-1 | @@ -40,7 +40,6 @@ Feature: Resources API | datamanager | deposit-pilot1 | - @intake Scenario Outline: Get paginated result when searching for one specific intake / grp group Given user is authenticated And the Yoda resources API is queried for a paginated range of research groups filtered on group @@ -82,36 +81,35 @@ Feature: Resources API | datamanager | deposit-pilot | - @intake Scenario Outline: Get a full year of storage data for intake group Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | | researcher | research-initial | | datamanager | research-initial | - + @deposit Scenario Outline: Get a full year of differentiated storage data starting from current month and look back one year Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | | researcher | research-deposit-test | | datamanager | research-deposit-test | - @intake + Scenario Outline: Get a full year of differentiated storage data starting from current month and look back one year Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | @@ -136,8 +134,8 @@ Feature: Resources API Scenario Outline: Collect storage stats for all twelve months based upon categories a user is datamanager of Given user is authenticated And the Yoda 
resources monthly category stats API is queried - Then the response status code is "200" - And storage data for export is found + Then the response status code is "200" + And storage data for export is found Examples: | user | @@ -152,7 +150,7 @@ Feature: Resources API And group data are sorted by in order Examples: - | user | sort_on | sort_order | + | user | sort_on | sort_order | | researcher | name | asc | | researcher | name | desc | | researcher | size | asc | diff --git a/tests/features/ui/ui_intake.feature b/tests/features/ui/ui_intake.feature deleted file mode 100644 index fea264427..000000000 --- a/tests/features/ui/ui_intake.feature +++ /dev/null @@ -1,43 +0,0 @@ -@ui @intake -Feature: Intake UI - - @fail - Scenario: Intake scan only and find datasets and unrecognized files - Given user datamanager is logged in - And module "intake" is shown - When activate study "test" - And total datasets is "0" - When activate study "initial" - And total datasets is "0" - And unscanned files are present - When scanned for datasets - Then scan button is disabled - When scanning for datasets is successful - And total datasets is "3" - And unrecognized files are present - - When click for details of first dataset row - - When add "COMMENTS" to comment field and press comment button - - When check first dataset for locking - And lock and unlock buttons are "enabled" - - When uncheck first dataset for locking - And lock and unlock buttons are "disabled" - - When check all datasets for locking - - Then click lock button - And wait for all datasets to be in locked state successfully - And wait for all datasets to be in frozen state - And wait for frozen sets to be added to vault - - Scenario: Intake reporting - Given user datamanager is logged in - And module "intake" is shown - - When open intake reporting area - When check reporting result - When export all data and download file - When return to intake area diff --git a/tests/features/ui/ui_statistics.feature b/tests/features/ui/ui_statistics.feature index 516e7d47f..843880d77 100644 --- a/tests/features/ui/ui_statistics.feature +++ b/tests/features/ui/ui_statistics.feature @@ -28,7 +28,6 @@ Feature: Statistics UI | datamanager | deposit-pilot | - @intake Scenario Outline: Viewing storage details of a intake / grp group Given user is logged in And module "stats" is shown @@ -55,7 +54,6 @@ Feature: Statistics UI | datamanager | test-automation | - @intake Scenario Outline: Viewing intake category storage details as a technicaladmin or datamanager Given user is logged in When module "stats" is shown diff --git a/tests/step_defs/api/test_api_intake.py b/tests/step_defs/api/test_api_intake.py deleted file mode 100644 index b26223004..000000000 --- a/tests/step_defs/api/test_api_intake.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding=utf-8 -"""Intake API feature tests.""" - -__copyright__ = 'Copyright (c) 2020-2022, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import json - -from deepdiff import DeepDiff -from pytest_bdd import ( - given, - parsers, - scenarios, - then, -) - -from conftest import api_request - -scenarios('../../features/api/api_intake.feature') - - -@given('the Yoda intake list studies API is queried', target_fixture="api_response") -def api_intake_list_studies(user): - return api_request( - user, - "intake_list_studies", - {} - ) - - -@given('the Yoda intake list datamanager studies API is queried', target_fixture="api_response") -def api_intake_list_dm_studies(user): - return api_request( - user, - 
"intake_list_dm_studies", - {} - ) - - -@given(parsers.parse("the Yoda intake count total files API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_count_total_files(user, collection): - return api_request( - user, - "intake_count_total_files", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake list unrecognized files API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_list_unrecognized_files(user, collection): - return api_request( - user, - "intake_list_unrecognized_files", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake list datasets API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_list_datasets(user, collection): - return api_request( - user, - "intake_list_datasets", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake scan for datasets API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_scan_for_datasets(user, collection): - return api_request( - user, - "intake_scan_for_datasets", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake lock API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_lock_dataset(user, dataset_id, collection): - return api_request( - user, - "intake_lock_dataset", - {"path": collection, "dataset_ids": dataset_id.replace("*", "\n")} - ) - - -@given(parsers.parse("the Yoda intake unlock API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_unlock_dataset(user, dataset_id, collection): - return api_request( - user, - "intake_unlock_dataset", - {"path": collection, "dataset_ids": dataset_id.replace("*", "\n")} - ) - - -@given(parsers.parse("the Yoda intake dataset get details API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_dataset_get_details(user, dataset_id, collection): - return api_request( - user, - "intake_dataset_get_details", - {"coll": collection, "dataset_id": dataset_id.replace("*", "\t")} - ) - - -@given(parsers.parse("the Yoda intake dataset add comment API is queried with dataset id {dataset_id}, study id {study_id} and comment {comment}"), target_fixture="api_response") -def api_intake_dataset_add_comment(user, dataset_id, study_id, comment): - return api_request( - user, - "intake_dataset_add_comment", - {"study_id": study_id, "dataset_id": dataset_id.replace("*", "\n"), "comment": comment} - ) - - -@given(parsers.parse("the Yoda intake report vault dataset counts per study API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_vault_dataset_counts_per_study(user, study_id): - return api_request( - user, - "intake_report_vault_dataset_counts_per_study", - {"study_id": study_id} - ) - - -@given(parsers.parse("the Yoda intake report vault aggregated info API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_vault_aggregated_info(user, study_id): - return api_request( - user, - "intake_report_vault_aggregated_info", - {"study_id": study_id} - ) - - -@given(parsers.parse("the Yoda intake report export study data API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_export_study_data(user, study_id): - return api_request( - user, - "intake_report_export_study_data", - 
{"study_id": study_id} - ) - - -@then(parsers.parse("study {study} is returned")) -def study_returned(api_response, study): - _, body = api_response - - assert study in body['data'] - - -@then('debug') -def debug(api_response): - _, body = api_response - - assert 0, body - - -@then(parsers.parse("the result is equivalent to {result}")) -def result_equivalent_to(api_response, result): - _, body = api_response - - assert DeepDiff(json.loads(result), body['data']) == {} diff --git a/tests/step_defs/ui/test_ui_intake.py b/tests/step_defs/ui/test_ui_intake.py deleted file mode 100644 index d6d0f226d..000000000 --- a/tests/step_defs/ui/test_ui_intake.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding =utf-8 -"""Vault UI feature tests.""" - -__copyright__ = 'Copyright (c) 2020-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import time - -from pytest_bdd import ( - parsers, - scenarios, - then, - when, -) - -scenarios('../../features/ui/ui_intake.feature') - - -# GENERIC FUNCTIONS -def get_unscanned_from_error_area_text(browser): - # Unrecognised and unscanned (17) files or Unrecognised (12) and unscanned (-) files - error_area_text = browser.find_by_id('scan_result_text') - parts = error_area_text.value.split(' and ') - s = parts[1] - return s[s.find("(") + 1:s.find(")")] - - -def get_unrecognized_from_error_area_text(browser): - error_area_text = browser.find_by_id('scan_result_text') - parts = error_area_text.value.split(' and ') - s = parts[0] - first_bracket = s.find("(") - if first_bracket == -1: - return "0" - return s[first_bracket + 1:s.find(")")] - - -# SCENARIO 1 -@when(parsers.parse('activate study "{study}"')) -def ui_intake_activate_study(browser, study): - dropdown = browser.find_by_id('dropdown-select-study') - dropdown.click() - table = browser.find_by_id('select-study') - rows = table.find_by_tag('tr') - for row in rows: - if row.has_class('ta-' + study): - row.find_by_tag('td').click() - return True - assert False - - -@when(parsers.parse('total datasets is "{dataset_count}"')) -def ui_intake_total_dataset_count(browser, dataset_count): - dataset_count_area = browser.find_by_id('datatable_info') - if dataset_count == '0': - assert dataset_count_area.value == 'No datasets present' - else: - assert dataset_count_area.value == "Total datasets: " + dataset_count - - -@when('unscanned files are present') # ben ik hier niet de prerequisite aan het testen??? 
-def ui_intake_unscanned_files_present(browser): - assert int(get_unscanned_from_error_area_text(browser)) > 0 - - -@when('scanned for datasets') -def ui_intake_scanned_for_datasets(browser): - browser.find_by_id('btn-start-scan').click() - - -@then('scan button is disabled') -def ui_intake_scan_button_is_disabled(browser): - assert browser.find_by_id('btn-start-scan').has_class('disabled') - - -@when('scanning for datasets is successful') -def ui_intake_scanning_is_successful(browser): - assert browser.is_text_present('Successfully scanned for datasets.', wait_time=20) - - -@when('unrecognized files are present') -def ui_intake_unrecognized_files_are_present(browser): - assert int(get_unrecognized_from_error_area_text(browser)) > 0 - - -@when('click for details of first dataset row') -def ui_intake_click_for_details_of_first_dataset_row(browser): - browser.find_by_id('datatable')[0].click() - - -@when(parsers.parse('add "{comments}" to comment field and press comment button')) -def ui_intake_add_comments_to_dataset(browser, comments): - browser.find_by_name('comments').fill(comments) - browser.find_by_css(".btn-add-comment").click() - - -@when('check first dataset for locking') -def ui_check_first_dataset_for_locking(browser): - browser.find_by_css('.cbDataSet')[0].click() - - -@when(parsers.parse('lock and unlock buttons are "{enabled_state}"')) -def ui_intake_lock_and_unlock_buttons_are(browser, enabled_state): - if enabled_state == 'enabled': - assert not browser.find_by_id('btn-unlock').has_class('disabled') - assert not browser.find_by_id('btn-lock').has_class('disabled') - else: - assert browser.find_by_id('btn-unlock').has_class('disabled') - assert browser.find_by_id('btn-lock').has_class('disabled') - - -@when('uncheck first dataset for locking') -def ui_uncheck_first_dataset_for_locking(browser): - # if not checkbox.is_selected() meenemen hier - browser.find_by_css('.cbDataSet')[0].click() - - -@when('check all datasets for locking') -def ui_check_all_datasets_for_locking(browser): - browser.find_by_css('.control-all-cbDataSets').click() - - -@then('click lock button') -def ui_intake_click_lock_button(browser): - browser.find_by_id("btn-lock").click() - - -@then('wait for all datasets to be in locked state successfully') -def ui_intake_wait_all_datasets_in_locked_state(browser): - assert browser.is_text_present('Successfully locked the selected dataset(s).', wait_time=30) - - assert len(browser.find_by_css('.datasetstatus_locked', wait_time=30)) == 2 - - -@then('wait for all datasets to be in frozen state') -def ui_intake_wait_all_datasets_in_frozen_state(browser): - i = 0 - no_more_locked_datasets_present = False - while i < 20: - time.sleep(20) - browser.visit(browser.url) - # if there are no longer datasets in locked state -> frozen or error - if len(browser.find_by_css('.datasetstatus_locked', wait_time=5)) == 0: # .datasetstatus_frozen - no_more_locked_datasets_present = True - # either datasets are frozen now. 
Or have been marked errorenous - break - i = i + 1 - assert no_more_locked_datasets_present - - -@then('wait for frozen sets to be added to vault') -def ui_intake_wait_frozen_datasets_to_vault(browser): - # When all frozen datasets have been moved to the vault only 1 will remain with dataset_status_scanned - i = 0 - no_more_frozen_datasets_present = False - while i < 20: - time.sleep(20) - browser.visit(browser.url) - # if there are no longer datasets in locked state -> frozen or error - if len(browser.find_by_css('.datasetstatus_scanned', wait_time=5)) == 3: # .datasetstatus_frozen - no_more_frozen_datasets_present = True - # either datasets are frozen now. Or have been marked errorenous - break - i = i + 1 - assert no_more_frozen_datasets_present - - -# SCENARIO 2 -@when('open intake reporting area') -def ui_intake_open_intake_reporting_area(browser): - browser.find_by_css('.btn-goto-reports').click() - - -@when('check reporting result') -def ui_intake_check_reporting_result(browser): - # classes are part of rows in result table. - assert len(browser.find_by_css('.dataset-type-counts-raw')) > 0 - assert len(browser.find_by_css('.dataset-type-counts-processed')) == 0 - assert len(browser.find_by_css('.dataset-aggregated-version-raw')) > 0 - assert len(browser.find_by_css('.dataset-aggregated-version-processed')) > 0 - assert len(browser.find_by_css('.dataset-aggregated-version-total')) > 0 - - -@when('export all data and download file') -def ui_intake_export_all_data_and_download_file(browser): - browser.find_by_css('.btn-export-data').click() - - -@when('return to intake area') -def ui_intake_return_to_intake_area(browser): - browser.find_by_css('.btn-goto-intake').click() diff --git a/tools/api/generate-openapi.py b/tools/api/generate-openapi.py index c3393aafc..b922b86e1 100755 --- a/tools/api/generate-openapi.py +++ b/tools/api/generate-openapi.py @@ -282,7 +282,7 @@ def gen_fn_spec(name, fn): name = re.sub('^api_', '', name) if core: - modules = ['datarequest', 'deposit', 'intake'] + modules = ['datarequest', 'deposit'] if name.startswith(tuple(modules)): continue diff --git a/tools/intake/ExportDatasetErrorsAndWarnings.r b/tools/intake/ExportDatasetErrorsAndWarnings.r deleted file mode 100644 index ce2bedf1a..000000000 --- a/tools/intake/ExportDatasetErrorsAndWarnings.r +++ /dev/null @@ -1,115 +0,0 @@ -# Date: 2019-01-16 -# Functionality: -# Find files within the dynamic area of an intake study that have errors and/or warnings at file level. -# A check for errors/warnings is performed ONLY on file level. -# Errors that can be found on dataset-toplevel or on collection level within a dataset, are NOT reported - -# Parameters: -# - Study: Name of the study the export has to search - -# Run with DatasetErrorsAndWarnins.sh script to have the export added to a csv file. - -ExportDatasetErrorsAndWarnings { - ## OVERRULE PARAMS FOR NOW as I wasn't able to add multiple input params -# *studyParam="test"; - - # Possibly use uuClientFullName as user, or $userNameClienterNameClient; ???????????????????????? 
- # writeLine("stdout", "uuClientFullName: " ++ uuClientFullName); - - - # Initialisation of variables based on command line parameters -# *user="datamanager"; - *user = uuClientFullName - *study = *studyParam; - *datamanagerGroup = 'grp-datamanager-' ++ *study; - *studyFolder = "/" ++ $rodsZoneClient ++ "/" ++ 'home/grp-intake-' ++ *studyParam; - - # Check whether user is a datamanager for the study involved - *isDatamanager = false; - foreach (*row in - SELECT USER_NAME - WHERE USER_TYPE = 'rodsgroup' - AND USER_NAME = *datamanagerGroup ) { - - uuGroupUserExists(*datamanagerGroup, *user, true, *membership) - if (*membership) { - *isDatamanager = true; - } - } - - if (!*isDatamanager) { - writeLine("stdout", 'Not the datamanager of current group'); - succeed; # the journey ends here - } - - - # Setup list of dataset ids that are later used to find data objects having this dataset_id's - *datasetList = list(); - foreach(*row in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE WHERE COLL_NAME like '*studyFolder%%' AND META_DATA_ATTR_NAME='dataset_toplevel') { - msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *datasetId); - *datasetList = cons(*datasetId, *datasetList); - } - - foreach(*row in SELECT COLL_ID, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE WHERE COLL_NAME like '*studyFolder%%' AND META_COLL_ATTR_NAME='dataset_toplevel') { - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *datasetId); - *datasetList = cons(*datasetId, *datasetList); - } - - # Write header row for the export table - writeLine('stdout', "Wave,Experiment type,Pseudocode,Version,Bestand,Errors,Warnings"); - - # At first find datasets, designated by presence of metadata attribute 'dataset_toplevel'. - # The value of the datasetId is combination of wepv and path to make it unique. - foreach(*datasetId in *datasetList) { - # Collect all data objects with a given datasetId - # And per data object find out whether it contains errors or warnings in its metadata - foreach(*row2 in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, DATA_NAME, DATA_ID, COLL_NAME WHERE META_DATA_ATTR_VALUE='*datasetId' AND META_DATA_ATTR_NAME='dataset_id') { - msiGetValByKey(*row2, "DATA_NAME", *dataName); - msiGetValByKey(*row2, "COLL_NAME", *collName); - msiGetValByKey(*row2, "DATA_ID", *dataId); - - # Given 1 object step thtough all its metadata attributes. - - msiString2KeyValPair("", *kvp); - - # build list of all attributes that are involved - *attrList = list('wave', 'experiment_type', 'pseudocode', 'version', 'error', 'warning'); - # initialize all attributes to empty strings - foreach (*attr in *attrList) { - *kvp."*attr" = ''; - } - - foreach(*row3 in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE WHERE DATA_ID=*dataId ) { - msiGetValByKey(*row3, "META_DATA_ATTR_NAME", *attrName); - msiGetValByKey(*row3, "META_DATA_ATTR_VALUE", *attrValue); - - foreach (*attr in *attrList) { - #writeLine('stdout', 'attrLIST: ' ++ *attr); - if (*attrName==*attr) { - if (*attr=='error' || *attr=='warning') { # must be concatination as there can be more errors/warnings on 1 data object - if (strlen(*kvp."*attr")>0) { - *kvp."*attr" = *kvp."*attr" ++ ' - ' ++ *attrValue; - } - else { - *kvp."*attr" = *attrValue; - } - } - else { - *kvp."*attr" = *attrValue; - } - } - } - } - # Add data object to file - only if errors or warnins present. 
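The loop above collects the tracked attributes of one data object into *kvp, concatenating repeated error and warning values, and the statement just below writes a CSV row only when an error or warning was found. A rough stand-alone Python equivalent of that aggregation (illustration only; summarise_object and write_report are invented names, and the per-object metadata rows are plain (name, value) tuples rather than a genquery result):

import csv
import sys

TRACKED = ('wave', 'experiment_type', 'pseudocode', 'version', 'error', 'warning')


def summarise_object(avu_rows):
    """Collapse one data object's AVUs into a dict; join repeated errors/warnings with ' - '."""
    kvp = {attr: '' for attr in TRACKED}
    for name, value in avu_rows:
        if name not in TRACKED:
            continue
        if name in ('error', 'warning') and kvp[name]:
            kvp[name] += ' - ' + value
        else:
            kvp[name] = value
    return kvp


def write_report(objects, out=sys.stdout):
    """objects: iterable of (path, avu_rows); only objects with errors or warnings are reported."""
    writer = csv.writer(out)
    writer.writerow(['Wave', 'Experiment type', 'Pseudocode', 'Version', 'Bestand', 'Errors', 'Warnings'])
    for path, avu_rows in objects:
        kvp = summarise_object(avu_rows)
        if kvp['error'] or kvp['warning']:
            writer.writerow([kvp['wave'], kvp['experiment_type'], kvp['pseudocode'],
                             kvp['version'], path, kvp['error'], kvp['warning']])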
- if (strlen(*kvp.'error')>0 || strlen(*kvp.'warning')>0) { - *dataPath = *collName ++ '/' ++ *dataName; - writeLine('stdout', *kvp."wave" ++ "," ++ *kvp."experiment_type" ++ "," ++ *kvp."pseudocode"++ "," ++ *kvp."version" ++ "," ++ *dataPath ++ "," ++ *kvp."error" ++ "," ++ *kvp."warning"); - } - } - } -} - - -input *studyParam="test" -output ruleExecOut - diff --git a/tools/intake/ExportDatasetErrorsAndWarnings.sh b/tools/intake/ExportDatasetErrorsAndWarnings.sh deleted file mode 100755 index e63ddce41..000000000 --- a/tools/intake/ExportDatasetErrorsAndWarnings.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -# /Date: 2019-01-16 -# /Functionality: -# /Find files within the dynamic area of an intake study that have errors and/or warnings at file level. -# /A check for errors/warnings is performed ONLY on file level. -# /Errors that can be found on dataset-toplevel or on collection level within a dataset, are NOT reported - -# /Parameters: -# /Study: Name of the study the export has to search - -# /Run with DatasetErrorsAndWarnins.sh script to have the export added to a csv file. - -irule -r irods_rule_engine_plugin-irods_rule_language-instance -F ExportDatasetErrorsAndWarnings.r "*studyParam='$1'" > DatasetErrorsAndWarnings.csv diff --git a/tools/intake/collCopyPseudo.r b/tools/intake/collCopyPseudo.r deleted file mode 100644 index 6264a99f6..000000000 --- a/tools/intake/collCopyPseudo.r +++ /dev/null @@ -1,31 +0,0 @@ -#Author Harm de Raaff -#Date: 2019-01-16 - -collCopyPseudo { - #changes YYYY-MM-DD.hh:mm:ss into seconds since epoch format - msiHumanToSystemTime(*datefrom, *datefrom) - msiHumanToSystemTime(*datetill, *datetill) - - # pseudocodes are passes as a comma-separated list. - *pseudoList = split(*pseudoCodes,','); - - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - foreach(*pc in *pseudoList) { - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-vault-%' - AND META_COLL_ATTR_NAME = 'pseudocode' - AND META_COLL_ATTR_VALUE = *pc - AND COLL_CREATE_TIME between *datefrom *datetill - #datefrom must be the same amount of digits as datetill - #wont be a problem if chosing times from yodas existence till future - ) { - *name=*row2.COLL_NAME; - writeLine('stdout', *name); - } - } - } -} - -input *pseudoCodes="", *datefrom="", *datetill="" -output ruleExecOut diff --git a/tools/intake/collCopyPseudo.sh b/tools/intake/collCopyPseudo.sh deleted file mode 100755 index a6a752a07..000000000 --- a/tools/intake/collCopyPseudo.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# \author Niek Bats -# \date 2019-01-19 -# \file collCopyPseudo.sh -# \brief copies all collections which matches pseudocodes as passed in a file ($3) and in between datefrom ($4) and datetill ($5) to a folder ($1) -# \how to use store the .sh file and .r file to your linux folder and make it the current directory (using cd) -# \ if you want to copy the collections to your linux subfolder, specify iget ($2). The folder ($1) is created in your current linux folder. -# \ if you want to copy the collections to a yoda subfolder, specify icp ($2) instead. The folder ($1) should be preceeded by the yoda -# \ group-folder (e.g. research-copiedcollections/pseudocodelist1, the folder pseudocodelist1 is created by the script) -# \copyright Copyright (c) 2018, Utrecht University. All rights reserved -# \dependencies requires login on an irods user (e.g. 
datamanager) with execution right to this script and permission to execute user icommands -# \usage bash randomCollCopy.sh - -#invalid input handling - -if [[ $1 = "" || $2 = "" || $3 = "" || $4 = "" || $5 = "" ]] ; then - echo "the usage of this script is: " - echo "bash randomCollCopy.sh " - echo "where folder, howtoCopy is text. dateFrom and dateTill is text in YYYY-MM-DD.HH:mm:ss format" - echo "folder is the created subfolder, when using iget. For icp, the folder to be created should be preceeded by the yoda research-name " - echo "e.g. 'research-copiedcollections/pseudocodelist1' and you must be a user of research-copiedcollection." - exit 1 -fi - -#convert input params to named variables for readability also insta docu of what they are -folder="$1" #is text -copyHow="$2" #iget or icp -pseudocodeCsvFile="$3" #is filename of file holding pseudocodes -dateFrom="$4" #is text in YYYY-MM-DD.HH:mm:ss format -dateTill="$5" #is text in YYYY-MM-DD.HH:mm:ss format - -if [[ $copyHow != "iget" && $copyHow != "icp" ]] ; then - echo "Your copy method is not correct. It must either be 'iget' or 'icp'" - echo "Now it is $copyHow" - exit 1 -fi - -#Collect comma separated pseudocodes from file -pseudoCodes=`cat $pseudocodeCsvFile` -echo "pseudocodes: $pseudoCodes" - -#run rule put output in an array -read -ra array <<< $(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F collCopyPseudo.r "'$pseudoCodes'" "'$dateFrom'" "'$dateTill'") - -#if array is empty give notice and exit -if [ ${#array[@]} -eq 0 ]; then - echo "couldnt find any collections matching your parameters at the moment" - echo "possible causes there arent any matches, the servers are down or you dont have a connection" - exit 1 -fi - -#make folder -if [[ "$copyHow" == "iget" ]] ; then - mkdir "$folder" - cd "$folder" -fi -if [[ "$copyHow" == "icp" ]] ; then - imkdir ../"$folder" - icd ../"$folder" -fi - - -echo "Copy selection: " -for item in ${array[@]} -do - echo "$item" - - if [[ "$copyHow" == "iget" ]] ; then - iget -r "$item" - fi - if [[ "$copyHow" == "icp" ]] ; then - icp -r "$item" . 
- fi -done - diff --git a/tools/intake/intakeDataCheck.sh b/tools/intake/intakeDataCheck.sh deleted file mode 100644 index e9bcb8e60..000000000 --- a/tools/intake/intakeDataCheck.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-intake-folder using specified parameter(s) - -#input check and build query -if [[ "$1" != "" ]] #if no wave dont do anything -then - query="like '%/grp-intake-%' AND DATA_PATH like '%$1%'" - if [[ "$2" != "" ]] - then - query="$query AND DATA_PATH like '%$2%'" - if [[ "$3" != "" ]] - then - query="$query AND DATA_PATH like '%$3%'" - fi - elif [[ "$3" != "" ]] - then - exit 1 - fi - -echo $query - -#icommand format query is in printf format -output=$(iquest ""%s";%s" "SELECT DATA_PATH, DATA_SIZE WHERE DATA_PATH $query") - -#echo $output - -printf ""Filepath/name";"filesize"\n" > outputIntake.csv -printf "$output" >> outputIntake.csv - -fi diff --git a/tools/intake/randomCollCopy.r b/tools/intake/randomCollCopy.r deleted file mode 100644 index 2ab5c9e4c..000000000 --- a/tools/intake/randomCollCopy.r +++ /dev/null @@ -1,36 +0,0 @@ -#Author Niek Bats -#Date: 2019-01-16 - -randomCollCopy { - #changes YYYY-MM-DD.hh:mm:ss into seconds since epoch format - msiHumanToSystemTime(*datefrom, *datefrom) - msiHumanToSystemTime(*datetill, *datetill) - - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-vault-%' - AND META_COLL_ATTR_NAME = 'wave' - AND META_COLL_ATTR_VALUE = *wave - # AND COLL_CREATE_TIME between *datefrom *datetill - #datefrom must be the same amount of digits as datetill - #wont be a problem if chosing times from yodas existence till future - ) { - *name=*row2.COLL_NAME; - foreach(*row3 in SELECT COLL_CREATE_TIME - WHERE COLL_NAME = *name - AND META_COLL_ATTR_NAME = 'experiment_type' - AND META_COLL_ATTR_VALUE = *experiment - ) { - *collCreateTime=int(*row3.COLL_CREATE_TIME); - writeLine("stdout", "*name"); - - # test if already present in list - we do not want multiples. - } - } - } -} - -input *wave="", *experiment="", *datefrom="", *datetill="" -output ruleExecOut - diff --git a/tools/intake/randomCollCopy.sh b/tools/intake/randomCollCopy.sh deleted file mode 100755 index 9d6ac9dfc..000000000 --- a/tools/intake/randomCollCopy.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# \author Niek Bats -# \date 2019-01-16 -# \file randomCollCopy.sh -# \brief copies random collections which matches selected wave ($3) experiment ($4) in between datefrom ($5) and datetill ($6) to a folder ($1) -# \ with a maximum $6 collections, if specified. -# \how to use store the .sh file and .r file to your linux folder and make it the current directory (using cd) -# \ if you want to copy the collections to your linux subfolder, specify iget ($2). The folder ($1) is created in your current linux folder. -# \ if you want to copy the collections to a yoda subfolder, specify icp ($2) instead. The folder ($1) should be preceeded by the yoda -# \ group-folder (e.g. research-collection/30w-pci, the folder 30w-pci is created by the script) -# \ will be created and the collections copied -# \copyright Copyright (c) 2018, Utrecht University. All rights reserved -# \dependencies requires login on an irods user (e.g. 
datamanager) with execution right to this script and permission to execute user icommands -# \usage bash randomCollCopy.sh <(optional) amount> - -#invalid input handling - -if [[ $1 = "" || $2 = "" || $3 = "" || $4 = "" || $5 = "" || $6 = "" ]] || [[ ! $7 -gt 0 && ! $7 = "" ]] ; then -#[[ ! $6 -gt 0 ]] check if = a number and more then 0 - echo "the usage of this script is: " - echo "bash randomCollCopy.sh <(optional) amount>" - echo "where folder, wave, experimentType is text. dateFrom and dateTill is text in YYYY-MM-DD.HH:mm:ss format and amount is an number" - echo "folder is the created subfolder, when using iget. For icp, the folder to be created should be preceeded by the yoda research-name" - echo "e.g. 'research-copiedcollection/30w-pci' and you should be a user of research-copiedcollection." - exit 1 -fi - -#convert input params to named variables for readability also insta docu of what they are -folder="$1" #is text -copyHow="$2" #iget or icp -wave="$3" #is text -experimentType="$4" #is text -dateFrom="$5" #is text in YYYY-MM-DD.HH:mm:ss format -dateTill="$6" #is text in YYYY-MM-DD.HH:mm:ss format -amount=10 #is a positive number default=10 -if [[ $7 != "" ]] ; then - amount="$7" -fi - -if [[ $copyHow != "iget" && $copyHow != "icp" ]] ; then - echo "Your copy method is not correct. It must either be 'iget' or 'icp'" - echo "Now it is $copyHow" - exit 1 -fi - -#run rule put output in an array -read -ra array <<< $(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F randomCollCopy.r "'$wave'" "'$experimentType'" "'$dateFrom'" "'$dateTill'") - -#if array is empty give notice and exit -if [ ${#array[@]} -eq 0 ]; then - echo "couldnt find any collections matching your parameters at the moment" - echo "possible causes there arent any matches, the servers are down or you dont have a connection" - exit 1 -fi - -echo "Selecting $amount items from following list: " -for item in ${array[@]} -do - echo "$item" -done - -#make folder -if [[ "$copyHow" == "iget" ]] ; then - mkdir "$folder" - cd "$folder" -fi -if [[ "$copyHow" == "icp" ]] ; then - imkdir ../"$folder" - icd ../"$folder" - fi - -echo "selected: " -#make loop to select amount collections from array -for (( i=0; i<$amount; i++ )); -do - #select a random collection from list - - if [[ ${#array[@]} -ne 0 ]] ; then - randomNr=$(( RANDOM % ${#array[@]} )) - #echo which one is copied and copy - echo "${array[$randomNr]}" - if [[ "$copyHow" == "iget" ]] ; then - iget -r "${array[$randomNr]}" - fi - if [[ "$copyHow" == "icp" ]] ; then - icp -r "${array[$randomNr]}" . 
- fi - - #remove from list - unset array[$randomNr] - array=( "${array[@]}" ) - fi -done diff --git a/tools/intake/vaultedDataCheck.sh b/tools/intake/vaultedDataCheck.sh deleted file mode 100644 index 97da7c2f8..000000000 --- a/tools/intake/vaultedDataCheck.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-vault-folder using specified parameter(s) - -#input check -if("$1" == "") do #if no wave kill script - exit 1 -done - -#build iquest query -query="%" -for arg in "$@" #add per argument: "$argument/" -do - if [ "$arg" != "" ] - then - query="$query$arg/" - fi -done - -query="$query%" - -#icommand format query is in printf format -output=$(iquest ""%s";%s" "SELECT DATA_PATH, DATA_SIZE WHERE DATA_PATH like '$query'") - -printf ""Filepath/name";"filesize"\n" > outputVault.csv -printf "$output" >> outputVault.csv diff --git a/tools/intake/youthIntakeCheck.r b/tools/intake/youthIntakeCheck.r deleted file mode 100644 index 585e92dcc..000000000 --- a/tools/intake/youthIntakeCheck.r +++ /dev/null @@ -1,91 +0,0 @@ -#Author Niek Bats - -youthIntakeCheck { - *intakeOrVault="intake"; #intake vault - - #non empty *wave, *experiment and *pseudocode - if ((*wave != "") && (*experiment != "") && (*pseudocode != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - - foreach(*row4 in SELECT DATA_SIZE - WHERE DATA_NAME = *nameExtension - AND COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'pseudocode' - AND META_DATA_ATTR_VALUE = *pseudocode) { - *size=*row4.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - } - - #non empty *wave and *experiment - else if ((*wave != "") && (*experiment != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME, DATA_SIZE - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - *size=*row3.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - - #non empty wave pseudocode is empty - else if (*wave != "" && *pseudocode == "") then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME, DATA_NAME, DATA_SIZE - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME ='wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - *nameExtension=*row2.DATA_NAME; - *size=*row2.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - - else { - writeLine("stdout", 
"Invalid input"); - } -} - -input *wave="", *experiment="", *pseudocode="" -output ruleExecOut \ No newline at end of file diff --git a/tools/intake/youthIntakeCheck.sh b/tools/intake/youthIntakeCheck.sh deleted file mode 100644 index bb24b4821..000000000 --- a/tools/intake/youthIntakeCheck.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-intake-folder using specified parameter(s) - -output=$(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F youthIntakeCheck.r "'$1'" "'$2'" "'$3'") -#echo $output -if [[ "$output" == "" ]] -then - echo "no results with parameters $1 $2 $3" - echo "please note that files have to be scanned to be found" - -elif [[ $output == "Invalid input" ]] -then - echo "$output" - -else - outputFile="intake-$1" - if [[ "$2" != "" ]] - then - outputFile="$outputFile-$2" - fi - if [[ "$3" != "" ]] - then - outputFile="$outputFile-$3" - fi - outputFile="$outputFile.csv" - - printf "\"Filepath\";\"name\";\"extension\";\"filesize\"\n" > "$outputFile" - printf "$output" >> "$outputFile" -fi diff --git a/tools/intake/youthVaultCheck.r b/tools/intake/youthVaultCheck.r deleted file mode 100644 index 1b2dbbdb1..000000000 --- a/tools/intake/youthVaultCheck.r +++ /dev/null @@ -1,91 +0,0 @@ -#Author Niek Bats - -youthVaultCheck { - *intakeOrVault="vault"; #intake vault - - #non empty *wave, *experiment and *pseudocode - if ((*wave != "") && (*experiment != "") && (*pseudocode != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - - foreach(*row4 in SELECT DATA_SIZE - WHERE DATA_NAME = *nameExtension - AND COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'pseudocode' - AND META_DATA_ATTR_VALUE = *pseudocode) { - *size=*row4.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - } - - #non empty *wave and *experiment - else if ((*wave != "") && (*experiment != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME, DATA_SIZE - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - *size=*row3.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - - #non empty wave pseudocode is empty - else if (*wave != "" && *pseudocode == "") then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME, DATA_NAME, DATA_SIZE - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME ='wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - *nameExtension=*row2.DATA_NAME; - 
*size=*row2.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - - else { - writeLine("stdout", "Invalid input"); - } -} - -input *wave="", *experiment="", *pseudocode="" -output ruleExecOut \ No newline at end of file diff --git a/tools/intake/youthVaultCheck.sh b/tools/intake/youthVaultCheck.sh deleted file mode 100644 index 2f4d9c6c7..000000000 --- a/tools/intake/youthVaultCheck.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-vault-folder using specified parameter(s) - -output=$(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F youthVaultCheck.r "'$1'" "'$2'" "'$3'") -#echo $output -if [[ "$output" == "" ]] -then - echo "no results with parameters $1 $2 $3" - -elif [[ $output == "Invalid input" ]] -then - echo "$output" - -else - outputFile="vault-$1" - if [[ "$2" != "" ]] - then - outputFile="$outputFile-$2" - fi - if [[ "$3" != "" ]] - then - outputFile="$outputFile-$3" - fi - outputFile="$outputFile.csv" - - printf "\"Filepath\";\"name\";\"extension\";\"filesize\"\n" > "$outputFile" - printf "$output" >> "$outputFile" -fi diff --git a/tools/job_scan.r b/tools/job_scan.r deleted file mode 100644 index dd9b2377c..000000000 --- a/tools/job_scan.r +++ /dev/null @@ -1,43 +0,0 @@ -# \file -# \brief job -# \author Ton Smeele, Sietse Snel -# \copyright Copyright (c) 2015-2021, Utrecht university. All rights reserved -# \license GPLv3, see LICENSE -# -# This file can be executed manually or scheduled e.g. once a day. -# It scans an intake collection for datasets and checks the sets, if no collection -# is provided, it will scan a predefined list on intake groups (*groupList) -# -# Prerequisite: the irods user should have write access on the collection and its objects -# -# - - -uuYcRunIntakeScan { - *collectionList = list(); - # intake areas can be added to the group list as needed - *groupList = list('youth'); - *zone = $rodsZoneClient; - - if ( *intakeRoot == 'dummy' ) { - foreach (*grp in *groupList) { - *root = "/*zone/home/grp-intake-*grp"; - *collectionList = cons( *root, *collectionList); - } - } - else { - *collectionList = cons (*intakeRoot, *collectionList); - } - - foreach (*coll in *collectionList) { - writeLine("stdout","Running intake scan for *coll ..."); - *status = "0"; - rule_intake_scan_for_datasets(*coll, *status); - if (*status == "0" ) then *result = "ok" else *result = "ERROR (*status)"; - writeLine("stdout","RunIntakeScan for *intakeRoot result = *result"); - } - -} - -input *intakeRoot='dummy' -output ruleExecOut diff --git a/unit-tests/test_intake.py b/unit-tests/test_intake.py deleted file mode 100644 index 43d737dcc..000000000 --- a/unit-tests/test_intake.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Unit tests for the intake module -""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import os -import sys -from unittest import TestCase - -sys.path.append('..') - -from intake_utils import dataset_make_id, dataset_parse_id, intake_extract_tokens, intake_extract_tokens_from_name, intake_scan_get_metadata_update, intake_tokens_identify_dataset - - -class IntakeTest(TestCase): - - def test_intake_tokens_identify_dataset(self): - empty_data = dict() - self.assertFalse(intake_tokens_identify_dataset(empty_data)) - missing_key_data = {"wave": "1", "pseudocode": 
"2"} - self.assertFalse(intake_tokens_identify_dataset(missing_key_data)) - missing_value_data = {"wave": "1", "pseudocode": "2", "experiment_type": ""} - self.assertFalse(intake_tokens_identify_dataset(missing_value_data)) - complete_data = {"wave": "1", "pseudocode": "2", "experiment_type": "3"} - self.assertTrue(intake_tokens_identify_dataset(complete_data)) - - def test_intake_extract_tokens(self): - no_token_data = intake_extract_tokens(None, "") - self.assertEquals(len(no_token_data), 0) - wave_data = intake_extract_tokens(None, "20w") - self.assertEquals(len(wave_data), 1) - self.assertEquals(wave_data["wave"], "20w") - et_data = intake_extract_tokens(None, "chantigap") - self.assertEquals(len(et_data), 1) - self.assertEquals(et_data["experiment_type"], "chantigap") - pseudocode_data = intake_extract_tokens(None, "B12345") - self.assertEquals(len(pseudocode_data), 1) - self.assertEquals(pseudocode_data["pseudocode"], "B12345") - version_data = intake_extract_tokens(None, "VerABC") - self.assertEquals(len(version_data), 1) - self.assertEquals(version_data["version"], "ABC") - - def test_intake_extract_tokens_from_name(self): - buffer = dict() - output = intake_extract_tokens_from_name(None, "20w_chantigap_B12345_VerABC.txt", buffer) - self.assertEquals(len(output), 4) - self.assertEquals(output["wave"], "20w") - self.assertEquals(output["experiment_type"], "chantigap") - self.assertEquals(output["version"], "ABC") - self.assertEquals(output["pseudocode"], "B12345") - - def test_intake_scan_get_metadata_update_coll_in_dataset(self): - parent_path = "/foo/bar/chantigap_10w_B12345" - path = parent_path + "/chantigap_20w_B12346" - complete_metadata = {"wave": "1", - "pseudocode": "2", - "experiment_type": "3", - "version": "Raw", - "directory": parent_path, - "dataset_id": "4", - "dataset_toplevel": "5"} - - output = intake_scan_get_metadata_update(None, path, True, True, complete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 6) - self.assertEquals(output["new_metadata"]["directory"], parent_path) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "2") - self.assertEquals(output["new_metadata"]["experiment_type"], "3") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["dataset_id"], "4") - self.assertTrue("dataset_toplevel" not in output["new_metadata"]) - - def test_intake_scan_get_metadata_update_coll_out_dataset_complete(self): - incomplete_metadata = {"wave": "1", "pseudocode": "2"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_B12346" - output = intake_scan_get_metadata_update(None, path, True, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 7) - self.assertEquals(output["new_metadata"]["directory"], path) - self.assertEquals(output["new_metadata"]["dataset_toplevel"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["dataset_id"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - self.assertEquals(output["new_metadata"]["experiment_type"], "chantigap") - - def test_intake_scan_get_metadata_update_coll_out_dataset_incomplete(self): - incomplete_metadata = {"wave": "1"} - path = "/foo/bar/chantigap_10w_B12345/B12346" 
- output = intake_scan_get_metadata_update(None, path, True, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], False) - self.assertEquals(len(output["new_metadata"]), 2) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - - def test_intake_scan_get_metadata_update_do_in_dataset(self): - complete_metadata = {"wave": "1", - "pseudocode": "2", - "experiment_type": "3", - "version": "Raw", - "dataset_id": "4", - "dataset_toplevel": "5", - "directory": "6"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_20w_B12346.txt" - output = intake_scan_get_metadata_update(None, path, False, True, complete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 6) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "2") - self.assertEquals(output["new_metadata"]["experiment_type"], "3") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["dataset_id"], "4") - self.assertTrue("dataset_toplevel" not in output["new_metadata"]) - - def test_intake_scan_get_metadata_update_do_out_dataset_complete(self): - incomplete_metadata = {"wave": "1", "pseudocode": "2"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_B12346.txt" - coll = os.path.dirname(path) - output = intake_scan_get_metadata_update(None, path, False, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 7) - self.assertEquals(output["new_metadata"]["directory"], coll) - self.assertEquals(output["new_metadata"]["dataset_id"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["dataset_toplevel"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - self.assertEquals(output["new_metadata"]["experiment_type"], "chantigap") - - def test_intake_scan_get_metadata_update_do_out_dataset_incomplete(self): - incomplete_metadata = {"wave": "1"} - path = "/foo/bar/chantigap_10w_B12345/B12346.txt" - output = intake_scan_get_metadata_update(None, path, False, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], False) - self.assertEquals(len(output["new_metadata"]), 2) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - - def test_dataset_make_id(self): - input = {"wave": "20w", - "experiment_type": "echo", - "pseudocode": "B12345", - "version": "Raw", - "directory": "/foo/bar/baz"} - self.assertEquals(dataset_make_id(input), - "20w\techo\tB12345\tRaw\t/foo/bar/baz") - - def test_dataset_parse_id(self): - input = "20w\techo\tB12345\tRaw\t/foo/bar/baz" - output = dataset_parse_id(input) - self.assertEquals(output.get("wave"), "20w") - self.assertEquals(output.get("experiment_type"), "echo") - self.assertEquals(output.get("pseudocode"), "B12345") - self.assertEquals(output.get("version"), "Raw") - self.assertEquals(output.get("directory"), "/foo/bar/baz") diff --git a/unit-tests/unit_tests.py b/unit-tests/unit_tests.py index 3bd9d873e..8af940d91 100644 --- a/unit-tests/unit_tests.py +++ b/unit-tests/unit_tests.py @@ -6,7 +6,6 @@ from unittest import makeSuite, TestSuite from test_group_import import GroupImportTest -from test_intake import 
IntakeTest from test_policies import PoliciesTest from test_revisions import RevisionTest from test_schema_transformations import CorrectifyIsniTest, CorrectifyOrcidTest, CorrectifyScopusTest @@ -21,7 +20,6 @@ def suite(): test_suite.addTest(makeSuite(CorrectifyOrcidTest)) test_suite.addTest(makeSuite(CorrectifyScopusTest)) test_suite.addTest(makeSuite(GroupImportTest)) - test_suite.addTest(makeSuite(IntakeTest)) test_suite.addTest(makeSuite(PoliciesTest)) test_suite.addTest(makeSuite(RevisionTest)) test_suite.addTest(makeSuite(UtilMiscTest)) diff --git a/util/config.py b/util/config.py index 731d131d9..4827294c1 100644 --- a/util/config.py +++ b/util/config.py @@ -99,7 +99,6 @@ def __repr__(self): enable_deposit=False, enable_open_search=False, enable_inactivity_notification=False, - enable_intake=False, enable_datarequest=False, enable_data_package_archive=False, enable_data_package_download=False, diff --git a/uuGroupPolicies.r b/uuGroupPolicies.r index 7bf09aec9..c2cea29a3 100644 --- a/uuGroupPolicies.r +++ b/uuGroupPolicies.r @@ -59,14 +59,14 @@ uuGroupPreSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *p uuGetBaseGroup(*groupName, *baseName); if (*baseName == *groupName) { # Do not allow creating a standalone "read-" or "vault-" group. - # There must always be a corresponding "intake-" or "research-" group. + # There must always be a corresponding "research-" group. fail; } uuGroupUserIsManager(*baseName, uuClientFullName, *isManagerInBaseGroup); if (!*isManagerInBaseGroup) { # Only allow creation of a read or vault group if the creator is a - # manager in the base group. (research or intake). + # manager in the research group. fail; } @@ -469,14 +469,14 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic # taken after automatic creation of vault groups. } else { - # This is a group manager managed group (i.e. 'research-', 'deposit-','grp-', 'intake-', 'priv-', 'datamanager-'). + # This is a group manager managed group (i.e. 'research-', 'deposit-', 'priv-', 'datamanager-'). # Add the creator as a member. errorcode(msiSudoGroupMemberAdd(*groupName, uuClientFullName, "")); - # Perform group prefix-dependent actions (e.g. create vaults for intake/research groups). + # Perform group prefix-dependent actions (e.g. create vaults for research groups). - if (*groupName like regex "(intake|research)-.*") { + if (*groupName like regex "research-.*") { # Create a corresponding RO group. uuChop(*groupName, *_, *baseName, "-", true); *roGroupName = "read-*baseName"; @@ -502,7 +502,7 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic } else if (*groupName like "datamanager-*") { # Give the newly created datamanager group read access to all - # existing intake/research home dirs and vaults in its category. + # existing research home dirs and vaults in its category. *category = *policyKv."category"; foreach ( @@ -513,9 +513,9 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic AND META_USER_ATTR_NAME = 'category' AND META_USER_ATTR_VALUE = '*category' ) { - # Filter down to intake/research groups and get their vault groups. + # Filter down to research groups and get their vault groups. 
*catGroup = *row."USER_GROUP_NAME"; - if (*catGroup like regex "(intake|research)-.*") { + if (*catGroup like regex "research-.*") { *aclKv."forGroup" = *catGroup; msiSudoObjAclSet("recursive", "read", *groupName, "/$rodsZoneClient/home/*catGroup", *aclKv); @@ -548,7 +548,7 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic # Set group manager-managed group metadata. # - # Note: Setting the category of an intake/research group will trigger + # Note: Setting the category of an research group will trigger # an ACL change: The datamanager group in the category, if it exists # will get read access to this group an its accompanying vault. # See uuPostSudoObjMetaSet. diff --git a/uuGroupPolicyChecks.r b/uuGroupPolicyChecks.r index 54d08ee04..374b84e44 100644 --- a/uuGroupPolicyChecks.r +++ b/uuGroupPolicyChecks.r @@ -38,7 +38,7 @@ uuUserNameIsValid(*name) # # Group names must: # -# - be prefixed with 'intake-' or 'research-' or 'deposit-' +# - be prefixed with 'research-' or 'deposit-' # - contain only lowercase characters, numbers and hyphens # - not start or end with a hyphen # @@ -49,7 +49,7 @@ uuUserNameIsValid(*name) # \param[in] name # uuGroupNameIsValid(*name) - = *name like regex ``(intake|research|deposit)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; + = *name like regex ``(research|deposit)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; uuGroupNameIsDatamanager(*name) = *name like regex ``(datamanager)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; @@ -198,7 +198,7 @@ uuGroupPolicyCanGroupAdd(*actor, *groupName, *category, *subcategory, *expiratio uuChop(*groupName, *prefix, *base, "-", true); - # For research and intake groups: Make sure their ro and + # For research groups: Make sure their ro and # vault groups do not exist yet. *roName = "read-*base"; uuGroupExists(*roName, *roExists); @@ -274,7 +274,7 @@ uuGroupPolicyCanGroupAdd(*actor, *groupName, *category, *subcategory, *expiratio *reason = "You must have priv-group-add and priv-cat-add to add a datamanger group" } } else { - *reason = "Group names must start with one of 'intake-', 'research-', 'deposit-', or 'datamanager-' and may only contain lowercase letters (a-z) and hyphens (-)."; + *reason = "Group names must start with one of 'research-', 'deposit-', or 'datamanager-' and may only contain lowercase letters (a-z) and hyphens (-)."; } } else { *reason = "You cannot create groups because you are not a member of the priv-group-add group."; diff --git a/uuLock.r b/uuLock.r deleted file mode 100644 index 2f16e4820..000000000 --- a/uuLock.r +++ /dev/null @@ -1,140 +0,0 @@ -# \file uuLock.r -# \brief Locking functions. -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht University. All rights reserved. -# \license GPLv3, see LICENSE. - -# \brief Obtain a lock on a collection. -# -# \param[in] collection name of the collection to be locked -# \param[out] status 0 = locked, nonzero = lock failed (e.g. in use) -# -uuLock(*collection, *status) { - msiGetIcatTime(*dateTime, "unix"); - *lockId = $userNameClient ++ ":" ++ *dateTime; - # let everyone know we need a lock - # NB: a race condition could happen when another process owned by - # the same user requests a lock at the very same second. 
- # to minimize the risk we include username in the lockid - msiString2KeyValPair("uuLockRequest=*lockId",*kvLockRequest); - msiAssociateKeyValuePairsToObj(*kvLockRequest, *collection, "-C"); - # check upstream and on collection itself if lock (request) exists - *path = ""; - *lockFound = false; - foreach (*segment in split(*collection, '/')) { - *path = "*path/*segment"; - if (*path != *collection) { - uuLockExists(*path, *lockFound); - if (*lockFound) { - break; - } - } else { - # TODO check collection itself yet ignore our own request - foreach (*row in SELECT META_COLL_ATTR_NAME,META_COLL_ATTR_VALUE - WHERE COLL_NAME = *collection - AND META_COLL_ATTR_NAME LIKE "uuLock%" - ) { - msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key); - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value); - if ("*key=*value" != "uuLockRequest=*lockId"){ - *lockFound = true; - } - } - } - } - if (!*lockFound) { - # also check downstream if other have (requested) a lock - # we can check all subcollections in one go - foreach (*rows in SELECT META_COLL_ATTR_NAME,COLL_NAME - WHERE COLL_PARENT_NAME LIKE '*collection%' - AND META_COLL_ATTR_NAME LIKE 'uuLock%' - ){ - # SELECT does not support 'OR' construct, therefore we need to - # check and ignore collections that start with similar prefix - # yet are in a different tree - # e.g. /zone/home/col/col2 and /zone/home/cola/col2 - # both cases col2 appears to have parent "col%" - msiGetValByKey(*rows, "COLL_NAME", *thisCollection); - if (*thisCollection like "*collection/\*") { - # we have an existing lock - *lockFound = true; - break; - } - } - } - if (*lockFound) { - *status = 1; - # retract our lock request, someone else got a lock - msiRemoveKeyValuePairsFromObj(*kvLockRequest, *collection, "-C"); - } else { - # change our request into a real lock - msiString2KeyValPair("uuLocked=*lockId",*kvLock); - msiAssociateKeyValuePairsToObj(*kvLock, *collection, "-C"); - msiRemoveKeyValuePairsFromObj(*kvLockRequest, *collection, "-C"); - *status = 0; - } -} - -# -# \brief uuUnlock unlocks a collection -# -# \param[in] collection name of the collection to unlock -uuUnlock(*collection) { - # NB: always succeeds regardless if lock actually exists - foreach (*rows in SELECT META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*collection' - AND META_COLL_ATTR_NAME = 'uuLocked' - ){ - # should return max 1 row, otherwise we have multiple locks?? - msiGetValByKey(*rows,"META_COLL_ATTR_VALUE",*lockValue); - msiString2KeyValPair("uuLocked=*lockValue",*kvLocked); - msiRemoveKeyValuePairsFromObj(*kvLocked, *collection, "-C") - } -} - -# \brief See if a collection has a lock on it. 
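The lock values written by these rules are plain strings of the form "<username>:<epoch seconds>", stored as uuLockRequest / uuLocked AVUs, and uuLockExists below treats entries older than one week as expired. A minimal pure-Python sketch of that convention (make_lock_id, lock_user, lock_timestamp and lock_expired are invented names):

import time

LOCK_MAX_AGE = 7 * 86400  # locks and lock requests are considered expired after one week


def make_lock_id(username):
    """Compose a lock value as '<username>:<epoch seconds>'."""
    return '{}:{}'.format(username, int(time.time()))


def lock_user(lock_id):
    """Username part of a lock value (everything before the first ':')."""
    return lock_id.split(':', 1)[0]


def lock_timestamp(lock_id):
    """Timestamp part of a lock value, as seconds since the epoch."""
    return int(lock_id.split(':', 1)[1])


def lock_expired(lock_id, now=None):
    """True when the lock (request) is older than LOCK_MAX_AGE."""
    now = int(time.time()) if now is None else now
    return lock_timestamp(lock_id) + LOCK_MAX_AGE < now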
-# -# \param[in] collection name of the collection -# \param[out] isLocked true if collection has a lock(request) -# -uuLockExists(*collection, *isLocked) { - # NB: reports true for both existing locks and lock requests - *isLocked = false; - msiGetIcatTime(*currentTime, "unix"); - foreach (*row in SELECT META_COLL_ATTR_NAME,META_COLL_ATTR_VALUE - WHERE COLL_NAME = *collection - AND META_COLL_ATTR_NAME LIKE "uuLock%" - ) { - # rows found means there is an existing lock (request) - # our last hope is that this is an expired request that we can ignore - msiGetValByKey(*row,"META_COLL_ATTR_NAME",*lockKey); - msiGetValByKey(*row,"META_COLL_ATTR_VALUE",*lockValue); - *lockTime = double(uuLockGetDateTime(*lockValue)); - if ( - ((*lockTime + 7 * 86400 ) < *currentTime) - # remove locks/requests after expire time of 1 week - # && (*lockKey == "lockRequest") - ) { - # cleanup lock requests older than 5 minutes - msiString2KeyValPair("*lockKey=*lockValue",*kvExpiredLock); - msiRemoveKeyValuePairsFromObj(*kvExpiredLock, *collection, "-C"); - } else { - # there is a valid existing lock - *isLocked = true; - } - } -} - -# \brief Function to get the username part of a lock. -# -# \param[in] lock name of the lock -# \return username -# -uuLockGetUser(*lock) = substr(*lock, 0, strlen(*lock) - strlen(triml(*lock,":")) -1); - -# \brief Function to get the datestamp part of a lock. -# -# \param[in] lock name of the lock -# \return datetimestamp (in seconds since epoch) -# -uuLockGetDateTime(*lock) = triml(*lock,":"); diff --git a/yc2Vault.r b/yc2Vault.r deleted file mode 100644 index ce9331e6a..000000000 --- a/yc2Vault.r +++ /dev/null @@ -1,388 +0,0 @@ -# \file -# \brief move selected datasets from intake area to the vault area -# this rule is to be executed by a background process with write access to vault -# and read access to the intake area -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht university. 
All rights reserved -# \license GPLv3, see LICENSE -# -#test { -# *intakeRoot = '/nluu1ot/home/grp-intake-youth'; -# *vaultRoot = '/nluu1ot/home/grp-vault-youth'; -# uuYc2Vault(*intakeRoot, *vaultRoot, *status); -# writeLine("serverLog","result status of yc2Vault is *status"); -#} - - -# \brief -# -# \param[in] path pathname of the tree-item -# \param[in] name segment of path, name of collection or data object -# \param[in] isCol true if the object is a collection, otherwise false -# \param[in,out] buffer -# -#uuTreeMyRule(*parent, *objectName, *isCol, *buffer) { -# writeLine("serverLog","parent = *parent"); -# writeLine("serverLog","name = *objectName"); -# writeLine("serverLog","isCol = *isCol"); -# writeLine("serverLog","buffer[path]= " ++ *buffer."path"); -# if (*isCol) { -# *buffer."path" = *buffer."path"++"="; -# } -#} - - - - -uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *datasetPath) { - uuYcDatasetParseId(*datasetId, *datasetComponents); - *wave = *datasetComponents."wave"; - *experimentType = *datasetComponents."experiment_type"; - *pseudocode = *datasetComponents."pseudocode"; - *version = *datasetComponents."version"; - *sep = "_"; - *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - *datasetPath = "*vaultRoot/*wave/*experimentType/*pseudocode/*wepv"; -} - -uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists) { - *exists = false; - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *datasetPath); - foreach (*row in SELECT COLL_NAME WHERE COLL_NAME = '*datasetPath') { - *exists = true; - break; - } -} - - -uuYcVaultDatasetAddMeta(*vaultPath, *datasetId) { - uuYcDatasetParseId(*datasetId, *datasetComponents); - *wave = *datasetComponents."wave"; - *experimentType = *datasetComponents."experiment_type"; - *pseudocode = *datasetComponents."pseudocode"; - *version = *datasetComponents."version"; - msiGetIcatTime(*date, "unix"); - msiAddKeyVal(*kv, "wave", *wave); - msiAddKeyVal(*kv, "experiment_type", *experimentType); - msiAddKeyVal(*kv, "pseudocode", *pseudocode); - msiAddKeyVal(*kv, "version", *version); - msiAddKeyVal(*kv, "dataset_date_created", *date); - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); -} - -uuYcVaultWalkRemoveObject(*itemParent, *itemName, *itemIsCollection, *buffer, *status) { -# writeLine("serverLog", "...removing *itemParent/*itemName"); - if (*itemIsCollection) { - msiRmColl("*itemParent/*itemName", "forceFlag=", *status); - } else { - msiDataObjUnlink("objPath=*itemParent/*itemName++++forceFlag=", *status); - } -} - - -uuYcVaultIngestObject(*objectPath, *isCollection, *vaultPath, *status) { - # from the original object only the below list '*copiedMetadata' of metadata keys - # is copied to the vault object, other info is ignored - *copiedMetadata = list("wave", "experiment_type", "pseudocode", "version", - "error", "warning", "comment", "dataset_error", - "dataset_warning", "datasetid"); - *status = 0; - if (*isCollection) { - msiCollCreate(*vaultPath, "1", *status); - if (*status == 0) { - foreach (*row in SELECT META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*objectPath' - ) { - msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key); - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value); - msiString2KeyValPair("*key=*value",*kv); - # add relevant kvlist to vault collection object - foreach (*meta in *copiedMetadata) { - if (*key == *meta) { - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); - } - } - } - foreach (*row in SELECT COLL_OWNER_NAME, COLL_OWNER_ZONE, COLL_CREATE_TIME - WHERE 
COLL_NAME = '*objectPath' - ) { - msiGetValByKey(*row, "COLL_OWNER_NAME", *ownerName); - msiGetValByKey(*row, "COLL_OWNER_ZONE", *ownerZone); - msiGetValByKey(*row, "COLL_CREATE_TIME", *createTime); - msiString2KeyValPair("submitted_by=*ownerName#*ownerZone",*kvSubmittedBy); - msiString2KeyValPair("submitted_date=*createTime",*kvSubmittedDate); - msiAssociateKeyValuePairsToObj(*kvSubmittedBy, *vaultPath, "-C"); - msiAssociateKeyValuePairsToObj(*kvSubmittedDate, *vaultPath, "-C"); - } - } - } else { # its not a collection but a data object - # first chksum the original file, then use it to verify the vault copy - msiDataObjChksum(*objectPath, "forceChksum=", *checksum); - msiDataObjCopy(*objectPath, *vaultPath, "verifyChksum=", *status); - if (*status == 0) { - uuChopPath(*objectPath, *collection, *dataName); - foreach (*row in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE - WHERE COLL_NAME = '*collection' - AND DATA_NAME = '*dataName' - ) { - msiGetValByKey(*row, "META_DATA_ATTR_NAME", *key); - msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *value); - # add relevant kvlist to vault collection object - msiString2KeyValPair("*key=*value",*kv); - foreach (*meta in *copiedMetadata) { - if (*key == *meta) { - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-d"); - } - } - } - # add metadata found in system info - foreach (*row in SELECT DATA_OWNER_NAME, DATA_OWNER_ZONE, DATA_CREATE_TIME - WHERE COLL_NAME = '*collection' - AND DATA_NAME = '*dataName' - ) { - msiGetValByKey(*row, "DATA_OWNER_NAME", *ownerName); - msiGetValByKey(*row, "DATA_OWNER_ZONE", *ownerZone); - msiGetValByKey(*row, "DATA_CREATE_TIME", *createTime); - msiString2KeyValPair("submitted_by=*ownerName#*ownerZone",*kvSubmittedBy); - msiString2KeyValPair("submitted_date=*createTime",*kvSubmittedDate); - msiAssociateKeyValuePairsToObj(*kvSubmittedBy, *vaultPath, "-d"); - msiAssociateKeyValuePairsToObj(*kvSubmittedDate, *vaultPath, "-d"); - # Skip duplicas - break; - } - } - } -} - - - -uuYcVaultWalkIngestObject(*itemParent, *itemName, *itemIsCollection, *buffer, *status) { - *sourcePath = "*itemParent/*itemName"; - *destPath = *buffer."destination"; # top level destination is specified - if (*sourcePath != *buffer."source") { - # rewrite path to copy objects that are located underneath the toplevel collection - *sourceLength = strlen(*sourcePath); - *relativePath = substr(*sourcePath, strlen(*buffer."source") + 1, *sourceLength); - *destPath = *buffer."destination" ++ "/" ++ *relativePath; - } -# writeLine("serverLog","VLT from = *sourcePath"); -# writeLine("serverLog","VLT to = *destPath"); - uuYcVaultIngestObject(*sourcePath, *itemIsCollection, *destPath, *status); -} - - -uuYcDatasetCollectionMove2Vault(*intakeRoot,*topLevelCollection, *datasetId, *vaultRoot, *status) { - writeLine("serverLog","\nmoving dataset-typeA *datasetId from *topLevelCollection to vault"); - *status = 0; - uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists); - if (!*exists) { - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *vaultPath); - # create the in-between levels of the path to the toplevel collection - uuChopPath(*vaultPath, *vaultParent, *vaultCollection); - msiCollCreate(*vaultParent, "1", *status); -# writeLine("serverLog","VAULT: dataset created *datasetId status=*status path=*vaultPath"); - if (*status == 0) { - # copy the dataset tree to the vault - uuChopPath(*topLevelCollection, *intakeParent, *intakeCollection); - *buffer."source" = *topLevelCollection; - *buffer."destination" = *vaultPath; -# writeLine("serverLog","VAULT: source = 
*topLevelCollection"); -# writeLine("serverLog","VAULT: dest = *vaultPath"); - uuTreeWalk( - "forward", - *topLevelCollection, - "uuYcVaultWalkIngestObject", - *buffer, - *status - ); - uuKvClear(*buffer); - if (*status == 0) { - # stamp the vault dataset collection with additional metadata - msiGetIcatTime(*date, "unix"); - msiAddKeyVal(*kv, "dataset_date_created", *date); - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); - # and finally remove the dataset original in the intake area - msiRmColl(*topLevelCollection, "forceFlag=", *error); -# uuTreeWalk( -# "reverse", -# *topLevelCollection, -# "uuYcVaultWalkRemoveObject", -# *buffer, -# *error -# ); - if (*error != 0) { - writeLine("serverLog", - "ERROR: unable to remove intake collection *topLevelCollection"); - } - } else { - # move failed (partially), cleanup vault - # NB: keep the dataset in the vault queue so we can retry some other time - writeLine("serverLog","ERROR: Ingest failed for *datasetId error = *status"); - uuTreeWalk("reverse", *vaultPath, "uuYcVaultWalkRemoveObject", *buffer, *error); - } - - } - } else { - writeLine("serverLog","INFO: version already exists in vault: *datasetId"); - # duplicate dataset, signal error and throw out of vault queue - *message = "Duplicate dataset, version already exists in vault"; - uuYcDatasetErrorAdd(*intakeRoot, *datasetId,*message); - uuYcDatasetMelt(*topLevelCollection, *datasetId, *status); - uuYcDatasetUnlock(*topLevelCollection, *datasetId, *status); - *status = 1; # duplicate dataset version error - } -} - -uuYcDatasetObjectsOnlyMove2Vault(*intakeRoot, *topLevelCollection, *datasetId, *vaultRoot, *status) { - writeLine("serverLog","\nmoving dataset-typeB *datasetId from *topLevelCollection to vault"); - uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists); - if (!*exists) { - # new dataset(version) we can safely ingest into vault - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *vaultPath); - # create path to and including the toplevel collection (will create in-between levels) - msiCollCreate(*vaultPath, "1", *status); -# writeLine("serverLog","VAULT: dataset created *datasetId status=*status path=*vaultPath"); - if (*status == 0) { - # stamp the vault dataset collection with default metadata - uuYcVaultDatasetAddMeta(*vaultPath, *datasetId); - # copy data objects to the vault - foreach (*dataRow in SELECT DATA_NAME - WHERE COLL_NAME = '*topLevelCollection' - AND META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - ) { - msiGetValByKey(*dataRow, "DATA_NAME", *dataName); - *intakePath = "*topLevelCollection/*dataName"; - uuYcVaultIngestObject(*intakePath, false, "*vaultPath/*dataName", *status); - if (*status != 0) { - break; - } - } - if (*status == 0) { - # data ingested, what's left is to delete the original in intake area - # this will also melt/unfreeze etc because metadata is removed too - foreach (*dataRow in SELECT DATA_NAME - WHERE COLL_NAME = '*topLevelCollection' - AND META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - ) { - msiGetValByKey(*dataRow, "DATA_NAME", *dataName); - *intakePath = "*topLevelCollection/*dataName"; -# writeLine("serverLog","removing intake file: *intakePath"); - msiDataObjUnlink("objPath=*intakePath++++forceFlag=", *error); - if (*error != 0) { - writeLine("serverLog","ERROR: unable to remove intake object *intakePath"); - } - } - } else { - # error occurred during ingest, cleanup vault area and relay the error to user - # NB: keep the dataset in the vault queue so we can 
retry some other time
-                writeLine("serverLog","ERROR: Ingest failed for *datasetId error = *status");
-                *buffer = "required yet dummy parameter";
-                uuTreeWalk("reverse", *vaultPath, "uuYcVaultWalkRemoveObject", *buffer, *error);
-            }
-        }
-    } else {
-        # duplicate dataset, signal error and throw out of vault queue
-        writeLine("serverLog","INFO: version already exists in vault: *datasetId");
-        *message = "Duplicate dataset, version already exists in vault";
-        uuYcDatasetErrorAdd(*intakeRoot, *datasetId,*message);
-        uuYcDatasetMelt(*topLevelCollection, *datasetId, *status);
-        uuYcDatasetUnlock(*topLevelCollection, *datasetId, *status);
-        *status = 1; # duplicate dataset version error
-    }
-}
-
-
-# \brief move all locked datasets to the vault
-#
-# \param[in] intakeCollection pathname root of intake area
-# \param[in] vaultCollection pathname root of vault area
-# \param[out] status result of operation either "ok" or "error"
-#
-uuYc2Vault(*intakeRoot, *vaultRoot, *status) {
-    # 1. add to_vault_freeze metadata lock to the dataset
-    # 2. check that dataset does not yet exist in the vault
-    # 3. copy dataset to vault with its metadata
-    # 4. remove dataset from intake
-    # upon any error:
-    # - delete partial data from vault
-    # - add error to intake dataset metadata
-    # - remove locks on intake dataset (to_vault_freeze, to_vault_lock)
-    *status = 0; # 0 is success, nonzero is error
-    *datasets_moved = 0;
-
-    # note that we have to allow for multiple types of datasets:
-    # type A: a single toplevel collection with a tree underneath
-    # type B: one or more data files located within the same collection
-    # processing varies slightly between them, so process each type in turn
-    #
-    # TYPE A:
-    foreach (*row in SELECT COLL_NAME, META_COLL_ATTR_VALUE
-                     WHERE META_COLL_ATTR_NAME = 'dataset_toplevel'
-                     AND COLL_NAME like '*intakeRoot/%') {
-        msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-        msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *datasetId);
-        uuYcObjectIsLocked(*topLevelCollection, true, *locked, *frozen);
-        if (*locked) {
-            uuYcDatasetFreeze(*topLevelCollection, *datasetId, *status);
-            if (*status == 0) {
-                # dataset frozen; now move to vault and remove from intake area
-                uuYcDatasetCollectionMove2Vault(
-                    *intakeRoot,
-                    *topLevelCollection,
-                    *datasetId,
-                    *vaultRoot,
-                    *status
-                );
-                if (*status == 0) {
-                    *datasets_moved = *datasets_moved + 1;
-                }
-            }
-        }
-    }
-    # TYPE B:
-    foreach (*row in SELECT COLL_NAME, META_DATA_ATTR_VALUE
-                     WHERE META_DATA_ATTR_NAME = 'dataset_toplevel'
-                     AND COLL_NAME like '*intakeRoot%'
-# fixme: skip collnames that are not in the same tree yet share the prefix
-            ) {
-
-        msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-        msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *datasetId);
-        # check if to_vault_lock exists on all the dataobjects of this dataset
-        *allLocked = true;
-        foreach (*dataRow in SELECT DATA_NAME
-                             WHERE COLL_NAME = '*topLevelCollection'
-                             AND META_DATA_ATTR_NAME = 'dataset_toplevel'
-                             AND META_DATA_ATTR_VALUE = '*datasetId'
-                ) {
-            msiGetValByKey(*dataRow, "DATA_NAME", *dataName);
-            uuYcObjectIsLocked("*topLevelCollection/*dataName", false, *locked, *frozen);
-            *allLocked = *allLocked && *locked;
-        }
-        if (*allLocked) {
-            uuYcDatasetFreeze(*topLevelCollection, *datasetId, *status);
-            if (*status == 0) {
-                # dataset frozen, now move to fault and remove from intake area
-                uuYcDatasetObjectsOnlyMove2Vault(
-                    *intakeRoot,
-                    *topLevelCollection,
-                    *datasetId,
-                    *vaultRoot,
-                    *status
-                );
-                if (*status == 0) {
-                    *datasets_moved = *datasets_moved + 1;
-                }
-            }
-        }
-    }
-    if (*datasets_moved > 0) {
-        writeLine("serverLog","\nmoved in total *datasets_moved dataset(s) to the vault");
-    }
-}
-
-#input null
-#output ruleExecOut
diff --git a/ycDataset.r b/ycDataset.r
deleted file mode 100644
index 4cb09dab7..000000000
--- a/ycDataset.r
+++ /dev/null
@@ -1,175 +0,0 @@
-# \file
-# \brief Youth Cohort - Dataset related functions.
-# \author Chris Smeele
-# \copyright Copyright (c) 2015, Utrecht University. All rights reserved.
-# \license GPLv3, see LICENSE
-
-# \brief Generate a dataset identifier based on WEPV values.
-#
-# \param[in] idComponents a kvList containing WEPV values
-# \param[out] id a dataset id string
-#
-uuYcDatasetMakeId(*idComponents, *id){
-    *id =
-           *idComponents."wave"
-        ++ "\t" ++ *idComponents."experiment_type"
-        ++ "\t" ++ *idComponents."pseudocode"
-        ++ "\t" ++ *idComponents."version"
-        ++ "\t" ++ *idComponents."directory";
-}
-
-# \brief Parse a dataset identifier and return WEPV values.
-#
-# \param[in] id a dataset id string
-# \param[out] idComponents a kvList containing WEPV values
-#
-uuYcDatasetParseId(*id, *idComponents){
-    *idParts = split(*id, "\t");
-    *idComponents."wave" = elem(*idParts, 0);
-    *idComponents."experiment_type" = elem(*idParts, 1);
-    *idComponents."pseudocode" = elem(*idParts, 2);
-    *idComponents."version" = elem(*idParts, 3);
-    *idComponents."directory" = elem(*idParts, 4);
-}
-
-# \brief Find dataset ids under *root.
-#
-# \param[in] root
-# \param[out] ids a list of dataset ids
-#
-uuYcDatasetGetIds(*root, *ids) {
-    *idsString = "";
-    foreach (*item in SELECT META_DATA_ATTR_VALUE WHERE COLL_NAME = "*root" AND META_DATA_ATTR_NAME = 'dataset_id') {
-        # Datasets directly under *root need to be checked for separately due to limitations on the general query system.
-        if (strlen(*idsString) > 0) {
-            *idsString = *idsString ++ "\n";
-        }
-        *idsString = *idsString ++ *item."META_DATA_ATTR_VALUE";
-    }
-    foreach (*item in SELECT META_DATA_ATTR_VALUE WHERE COLL_NAME LIKE "*root/%" AND META_DATA_ATTR_NAME = 'dataset_id') {
-        if (strlen(*idsString) > 0) {
-            *idsString = *idsString ++ "\n";
-        }
-        *idsString = *idsString ++ *item."META_DATA_ATTR_VALUE";
-    }
-    *ids = split(*idsString, "\n");
-}
-
-# \brief Get a list of toplevel objects that belong to the given dataset id.
-#
-# \param[in] root
-# \param[in] id
-# \param[out] objects a list of toplevel object paths
-# \param[out] isCollection whether this dataset consists of a single toplevel collection
-#
-uuYcDatasetGetToplevelObjects(*root, *id, *objects, *isCollection) {
-    *isCollection = false;
-
-    *objectsString = "";
-    foreach (*item in SELECT COLL_NAME WHERE COLL_NAME LIKE "*root/%" AND META_COLL_ATTR_NAME = 'dataset_toplevel' AND META_COLL_ATTR_VALUE = "*id") {
-        *isCollection = true;
-        *objectsString = *item."COLL_NAME";
-    }
-    if (!*isCollection) {
-        foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME = "*root" AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = "*id") {
-            # Datasets directly under *root need to be checked for separately due to limitations on the general query system.
-            if (strlen(*objectsString) > 0) {
-                *objectsString = *objectsString ++ "\n";
-            }
-            *objectsString = *objectsString ++ *item."COLL_NAME" ++ "/" ++ *item."DATA_NAME";
-        }
-        foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME LIKE "*root/%" AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = "*id") {
-            if (strlen(*objectsString) > 0) {
-                *objectsString = *objectsString ++ "\n";
-            }
-            *objectsString = *objectsString ++ *item."COLL_NAME" ++ "/" ++ *item."DATA_NAME";
-        }
-    }
-    *objects = split(*objectsString, "\n");
-    #writeLine("stdout", "Got dataset toplevel objects for <*id>: *objectsString");
-}
-
-# \brief Get a list of relative paths to all data objects in a dataset.
-#
-# \param[in] root
-# \param[in] id
-# \param[out] objects a list of relative object paths (e.g. file1.dat, some-subdir/file2.dat...)
-#
-uuYcDatasetGetDataObjectRelPaths(*root, *id, *objects) {
-
-    uuYcDatasetGetToplevelObjects(*root, *id, *toplevelObjects, *isCollection);
-
-    # NOTE: This will crash when an invalid dataset id is provided.
-    if (*isCollection) {
-        *parentCollection = elem(*toplevelObjects, 0);
-    } else {
-        uuChopPath(elem(*toplevelObjects, 0), *dataObjectParent, *dataObjectName);
-        *parentCollection = *dataObjectParent;
-    }
-
-    *objectsString = "";
-    foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME = "*parentCollection" AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = "*id") {
-        # Datasets directly under *root need to be checked for separately due to limitations on the general query system.
-        if (strlen(*objectsString) > 0) {
-            *objectsString = *objectsString ++ "\n";
-        }
-        *objectsString = *objectsString ++ *item."DATA_NAME";
-    }
-    foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME LIKE "*parentCollection/%" AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = "*id") {
-        if (strlen(*objectsString) > 0) {
-            *objectsString = *objectsString ++ "\n";
-        }
-        *objectsString = *objectsString
-            ++ substr(*item."COLL_NAME", strlen(*parentCollection)+1, strlen(*item."COLL_NAME"))
-            ++ "/"
-            ++ *item."DATA_NAME";
-    }
-    *objects = split(*objectsString, "\n");
-}
-
-# \brief Check if a dataset id is locked.
-#
-# \param[in] root
-# \param[in] id
-# \param[out] isLocked
-# \param[out] isFrozen
-#
-uuYcDatasetIsLocked(*root, *id, *isLocked, *isFrozen) {
-    uuYcDatasetGetToplevelObjects(*root, *id, *toplevelObjects, *isCollection);
-
-    *isLocked = false;
-    *isFrozen = false;
-    foreach (*item in *toplevelObjects) {
-        uuYcObjectIsLocked(*item, *isCollection, *isLocked, *isFrozen);
-        if (*isLocked || *isFrozen) {
-            break;
-        }
-    }
-}
-
-
-# \brief Adds an error to the dataset specified by *datasetId.
-#
-# \param[in] root
-# \param[in] datasetId
-# \param[in] message
-#
-uuYcDatasetErrorAdd(*root, *datasetId, *message) {
-
-    uuYcDatasetGetToplevelObjects(*root, *datasetId, *toplevelObjects, *isCollection);
-
-    foreach (*toplevel in *toplevelObjects) {
-        msiAddKeyVal(*kv, "dataset_error", "*message");
-        # note that we want to silently ignore any duplicates of the message (using errorcode)
-        errorcode(msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollection then "-C" else "-d"));
-
-        # This does not work for some reason.
-        #uuSetMetaData(
-        #    *toplevel,
-        #    "comment",
-        #    *comment,
-        #    if *isCollection then "-C" else "-d"
-        #);
-    }
-}
-
diff --git a/ycDatasetGetToplevel.r b/ycDatasetGetToplevel.r
deleted file mode 100644
index 54817e1fe..000000000
--- a/ycDatasetGetToplevel.r
+++ /dev/null
@@ -1,76 +0,0 @@
-# \file
-# \brief dataset lookup function
-# \author Ton Smeele
-# \copyright Copyright (c) 2015, Utrecht university. All rights reserved
-# \license GPLv3, see LICENSE
-#
-
-#test {
-# uuYcDatasetGetTopLevel("/tsm/home/rods", "x", *collection, *isCol);
-# writeLine("stdout","coll = *collection and isCol = *isCol");
-#}
-
-
-# \brief uuYcDatasetGetTopLevel retrieves the collection path and dataset type for a dataset
-#
-# \param[in] rootcollection path of a tree to search for the dataset
-# \param[in] datasetid unique identifier of the dataset
-# \param[out] topLevelCollection collection that has the dataset
-#             if dataset is not found an empty string is returned
-# \param[out] topLevelIsCollection type of dataset: true = collection false = data objects
-#
-uuYcDatasetGetTopLevel(*rootCollection, *datasetId, *topLevelCollection, *topLevelIsCollection) {
-    # datasets can be
-    # A) one collection with a subtree
-    # B) one or more data objects located (possibly with other objects) in same collection
-    *topLevelIsCollection = false;
-    *topLevelCollection = "";
-    # try to find a collection. note we will expect 0 or 1 rows:
-    foreach (*row in SELECT COLL_NAME
-             WHERE META_COLL_ATTR_NAME = 'dataset_toplevel'
-             AND META_COLL_ATTR_VALUE = '*datasetId'
-             AND COLL_NAME LIKE '*rootCollection/%'
-            ) {
-        *topLevelIsCollection = true;
-        msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-    }
-    if (! *topLevelIsCollection) {
-        # also try the root itself
-        foreach (*row in SELECT COLL_NAME
-                 WHERE META_COLL_ATTR_NAME = 'dataset_toplevel'
-                 AND META_COLL_ATTR_VALUE = '*datasetId'
-                 AND COLL_NAME = '*rootCollection'
-                ) {
-            *topLevelIsCollection = true;
-            msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-        }
-    }
-    if (! *topLevelIsCollection) {
-        # apparently not a collection, let's search for data objects instead
-        foreach (*row in SELECT COLL_NAME,DATA_NAME
-                 WHERE META_DATA_ATTR_NAME = 'dataset_toplevel'
-                 AND META_DATA_ATTR_VALUE = '*datasetId'
-                 AND COLL_NAME LIKE '*rootCollection/%'
-                ) {
-            msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-            break;
-        }
-        if (*topLevelCollection == "") {
-            # not found yet, maybe data object(s) in the rootcollection itself?
-
-            foreach (*row in SELECT COLL_NAME,DATA_NAME
-                     WHERE META_DATA_ATTR_NAME = 'dataset_toplevel'
-                     AND META_DATA_ATTR_VALUE = '*datasetId'
-                     AND COLL_NAME = '*rootCollection'
-                    ) {
-                msiGetValByKey(*row, "COLL_NAME", *topLevelCollection);
-                break;
-            }
-        } else {
-            # dataset not found!
-        }
-    }
-}
-
-#input null
-#output ruleExecOut
diff --git a/ycDatasetLock.r b/ycDatasetLock.r
deleted file mode 100644
index f0497d59b..000000000
--- a/ycDatasetLock.r
+++ /dev/null
@@ -1,253 +0,0 @@
-# \file
-# \brief lock/freeze and unlock/unfreeze datasets within a collection
-# \author Ton Smeele
-# \copyright Copyright (c) 2015, Utrecht university. All rights reserved
-# \license GPLv3, see LICENSE
-#
-
-#test {
-#*collection = "/nluu1ot/home/ton";
-#*datasetId = "y";
-#uuYcDatasetLock(*collection, *datasetId, *result);
-#writeLine("stdout","lock result = *result");
-#uuYcDatasetFreeze(*collection, *datasetId, *result);
-#writeLine("stdout","freeze result = *result");
-#uuYcObjectIsLocked("*collection/Newfile.txt",false, *locked, *frozen);
-#writeLine("stdout","locked = *locked and frozen = *frozen");
-
-#uuYcDatasetUnlock(*collection, *datasetId, *result);
-#writeLine("stdout","unlock result = *result");
-#uuYcDatasetMelt(*collection, *datasetId, *result);
-#writeLine("stdout","melt result = *result");
-#uuYcDatasetUnlock(*collection, *datasetId, *result);
-#writeLine("stdout","unlock result = *result");
-#}
-
-uuYcDatasetLockChangeObject(*parentCollection, *objectName, *isCollection,
-                            *lockName, *lockIt, *dateTime,*result) {
-    *objectType = "-d";
-    *path = "*parentCollection/*objectName";
-    if (*isCollection) {
-        *objectType = "-C";
-        *collection = *objectName;
-    }
-    if (*lockIt) {
-        msiString2KeyValPair("*lockName=*dateTime",*kvPair);
-        *result = errorcode(msiSetKeyValuePairsToObj(*kvPair, *path, *objectType));
-    } else { # unlock it
-        #
-        # if the lock is of type to_vault_lock this operation is
-        # disallowed if the object also has a to_vault_freeze lock
-        uuYcObjectIsLocked(*path,*isCollection,*locked,*frozen);
-        *allowed = (*lockName == "to_vault_freeze") || !*frozen;
-        if (*allowed) {
-            *result = 0;
-            # in order to remove the key we need to lookup its value(s)
-            if (*isCollection) {
-                # remove lock from collection
-                foreach (*row in SELECT META_COLL_ATTR_VALUE
-                         WHERE COLL_NAME = '*path'
-                         AND META_COLL_ATTR_NAME = '*lockName') {
-                    msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value);
-                    msiString2KeyValPair("*lockName=*value", *kvPair);
-                    *result = errorcode(
-                        msiRemoveKeyValuePairsFromObj(*kvPair, *path, "-C")
-                    );
-                    if (*result != 0) {
-                        break;
-                    }
-                }
-            } else {
-                # remove lock from data object
-                foreach (*row in SELECT META_DATA_ATTR_VALUE
-                         WHERE DATA_NAME = '*objectName'
-                         AND COLL_NAME = '*parentCollection'
-                         AND META_DATA_ATTR_NAME = '*lockName'
-                        ) {
-                    msiGetValByKey(*row,"META_DATA_ATTR_VALUE",*value);
-                    msiString2KeyValPair("*lockName=*value",*kvPair);
-                    *result = errorcode(
-                        msiRemoveKeyValuePairsFromObj(
-                            *kvPair,
-                            "*parentCollection/*objectName",
-                            "-d"
-                        )
-                    );
-                    if (*result != 0) {
-                        break;
-                    }
-                }
-            } # end else remove lock from dataobject
-        } else { # unlock not allowed
-            *result = -1;
-        }
-    }
-}
-
-uuYcDatasetWalkVaultLock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) {
-    msiGetIcatTime(*dateTime,"unix");
-    uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection,
-                                "to_vault_lock", true, *dateTime, *error);
-}
-
-uuYcDatasetWalkVaultUnlock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) {
-    msiGetIcatTime(*dateTime,"unix");
-    uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection,
-                                "to_vault_lock", false, *dateTime, *error);
-}
-
-uuYcDatasetWalkFreezeLock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) {
-    msiGetIcatTime(*dateTime,"unix");
-    uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection,
-                                "to_vault_freeze", true, *dateTime, *error);
-}
-
-
-uuYcDatasetWalkFreezeUnlock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) {
-    msiGetIcatTime(*dateTime,"unix");
-    uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection,
-                                "to_vault_freeze", false, *dateTime, *error);
-}
-
-
-uuYcDatasetLockChange(*rootCollection, *datasetId, *lockName, *lockIt, *status){
-    *status = -1;
-    *lock = "Unlock";
-    if (*lockIt) {
-        *lock = "Lock";
-    }
-    *lockProcedure = "Vault";
-    if (*lockName == "to_vault_freeze") {
-        *lockProcedure = "Freeze";
-    }
-    # find the toplevel collection for this dataset
-    uuYcDatasetGetTopLevel(*rootCollection, *datasetId, *collection, *isCollection);
-    if (*collection != "") {
-        # we found the dataset, now change the lock on each object
-        if (*isCollection) {
-            *buffer = "dummy";
-            uuTreeWalk("forward", *collection, "uuYcDatasetWalk*lockProcedure*lock", *buffer, *error);
-            *status = *error;
-#            if (*error == "0") {
-#                *status = 0;
-#            }
-        } else {
-            # dataset is not a collection, let's find the objects and make the change
-            msiGetIcatTime(*dateTime,"unix");
-            *status = 0;
-            foreach (*row in SELECT DATA_NAME
-                     WHERE COLL_NAME = '*collection'
-                     AND META_DATA_ATTR_NAME = 'dataset_toplevel'
-                     AND META_DATA_ATTR_VALUE = '*datasetId'
-                    ) {
-                msiGetValByKey(*row,"DATA_NAME",*dataName);
-                # now change it ....
-                uuYcDatasetLockChangeObject(
-                    *collection,
-                    *dataName,
-                    false,
-                    *lockName,
-                    *lockIt,
-                    *dateTime,
-                    *error);
-                if (*error != 0 ) {
-                    *status = *error;
-                    break;
-                }
-            }
-        }
-
-    } else {
-        # result is false "dataset not found"
-    }
-}
-
-
-# \brief uuYcDatasetLock locks (all objects of) a dataset
-#
-# \param[in] collection collection that may have datasets
-# \param[in] datasetId identifier to depict the dataset
-# \param[out] status 0 upon success, otherwise nonzero
-#
-uuYcDatasetLock(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId,"to_vault_lock", true, *status);
-}
-
-# \brief uuYcDatasetUnlock unlocks (all objects of) a dataset
-#
-# \param[in] collection collection that may have datasets
-# \param[in] datasetId identifier to depict the dataset
-# \param[out] result "true" upon success, otherwise "false"
-# \param[out] status 0 upon success, otherwise nonzero
-#
-uuYcDatasetUnlock(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId, "to_vault_lock", false, *status);
-}
-
-# \brief uuYcDatasetFreeze freeze-locks (all objects of) a dataset
-#
-# \param[in] collection collection that may have datasets
-# \param[in] datasetId identifier to depict the dataset
-# \param[out] status 0 upon success, otherwise nonzero
-#
-uuYcDatasetFreeze(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId,"to_vault_freeze", true, *status);
-}
-
-# \brief uuYcDatasetUnfreeze undo freeze-locks (all objects of) a dataset
-#
-# \param[in] collection collection that may have datasets
-# \param[in] datasetId identifier to depict the dataset
-# \param[out] status 0 upon success, otherwise nonzero
-#
-uuYcDatasetMelt(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId, "to_vault_freeze", false, *status);
-}
-
-# \brief uuYcObjectIsLocked query an object to see if it is locked
-#
-# \param[in] objectPath full path to collection of data object
-# \param[in] isCollection true if path references a collection
-# \param[out] locked true if the object is vault-locked
-# \param[out] frozen true if the object is vault-frozen
-
-uuYcObjectIsLocked(*objectPath, *isCollection, *locked, *frozen) {
-    *locked = false;
-    *frozen = false;
-    if (*isCollection) {
-        foreach (*row in SELECT META_COLL_ATTR_NAME
-                 WHERE COLL_NAME = '*objectPath'
-                ) {
-            msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key);
-            if ( *key == "to_vault_lock"
-                 || *key == "to_vault_freeze"
-               ) {
-                *locked = true;
-                if (*key == "to_vault_freeze") {
-                    *frozen = true;
-                    break;
-                }
-            }
-        }
-    } else {
-        uuChopPath(*objectPath, *parentCollection, *dataName);
-        foreach (*row in SELECT META_DATA_ATTR_NAME
-                 WHERE COLL_NAME = '*parentCollection'
-                 AND DATA_NAME = '*dataName'
-                ) {
-            msiGetValByKey(*row, "META_DATA_ATTR_NAME", *key);
-            if ( *key == "to_vault_lock"
-                 || *key == "to_vault_freeze"
-               ) {
-                *locked = true;
-                if (*key == "to_vault_freeze") {
-                    *frozen = true;
-                    break;
-                }
-            }
-        }
-    }
-}
-
-#input null
-#output ruleExecOut
diff --git a/ycModule.r b/ycModule.r
deleted file mode 100644
index e37ea9afa..000000000
--- a/ycModule.r
+++ /dev/null
@@ -1,194 +0,0 @@
-# \file ycModule.r
-# \brief Youth Cohort module
-# \copyright Copyright (c) 2016-2021, Utrecht University. All rights reserved.
-# \license GPLv3, see LICENSE
-
-
-# \brief (over)write data object with a list of vault object checksums
-#
-# \param[in] vaultRoot root collection to be indexed
-# \param[in] destinationObject dataobject that will be written to
-# \param[out] status 0 = success, nonzero is error
-uuYcGenerateDatasetsIndex(*vaultRoot, *destinationObject, *status) {
-    *status = 0;
-    msiDataObjCreate(*destinationObject, "forceFlag=", *FHANDLE);
-
-    foreach (*row in SELECT COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE
-             WHERE COLL_NAME = "*vaultRoot" ) {
-        *checksum = *row."DATA_CHECKSUM";
-        *name = *row."DATA_NAME";
-        *col = *row."COLL_NAME";
-        *size = *row."DATA_SIZE";
-        uuChopChecksum(*checksum, *type, *checksumOut);
-        *textLine = "*type *checksumOut *size *col/*name\n";
-        msiStrlen(*textLine, *length);
-        msiStrToBytesBuf(*textLine, *buffer);
-        msiDataObjWrite(*FHANDLE, *buffer, *bytesWritten);
-        if (int(*length) != *bytesWritten) then {
-            *status = 1;
-        }
-    }
-    foreach (*row in SELECT COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE
-             WHERE COLL_NAME like '*vaultRoot/%' ) {
-        *checksum = *row."DATA_CHECKSUM";
-        *name = *row."DATA_NAME";
-        *col = *row."COLL_NAME";
-        *size = *row."DATA_SIZE";
-        uuChopChecksum(*checksum, *type, *checksumOut);
-        *textLine = "*type *checksumOut *size *col/*name\n";
-        msiStrlen(*textLine, *length);
-        msiStrToBytesBuf(*textLine, *buffer);
-        msiDataObjWrite(*FHANDLE, *buffer, *bytesWritten);
-        if (int(*length) != *bytesWritten) then {
-            *status = 1;
-        }
-    }
-    msiDataObjClose(*FHANDLE, *status2);
-    *status;
-}
-
-# \brief Add a dataset warning to all given dataset toplevels.
-#
-# \param[in] toplevels
-# \param[in] isCollectionToplevel
-# \param[in] text
-#
-uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, *text) {
-    msiAddKeyVal(*kv, "dataset_warning", *text);
-
-    foreach (*toplevel in *toplevels) {
-        msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollectionToplevel then "-C" else "-d");
-    }
-}
-
-# \brief Add a dataset error to all given dataset toplevels.
-#
-# \param[in] toplevels
-# \param[in] isCollectionToplevel
-# \param[in] text
-#
-uuYcIntakeCheckAddDatasetError(*toplevels, *isCollectionToplevel, *text) {
-    msiAddKeyVal(*kv, "dataset_error", *text);
-
-    foreach (*toplevel in *toplevels) {
-        msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollectionToplevel then "-C" else "-d");
-    }
-}
-
-# Reusable check utilities {{{
-
-# \brief Check if a certain filename pattern has enough occurrences in a dataset.
-#
-# Adds a warning if the match count is out of range.
-#
-# NOTE: Currently, patterns must match the full relative object path.
-#       At the time of writing, Echo is the only experiment type we run this
-#       check for, and it is a flat dataset without subdirectories, so it makes
-#       no difference there.
-#
-# For other experiment types it may be desirable to match patterns with
-# basenames instead of paths. In this case the currently commented-out
-# code in this function can be used.
-#
-# \param[in] datasetParent either the dataset collection or the first parent of a data-object dataset toplevel
-# \param[in] toplevels a list of toplevel objects
-# \param[in] isCollectionToplevel
-# \param[in] objects a list of dataset object paths relative to the datasetParent parameter
-# \param[in] patternHuman a human-readable pattern (e.g.: 'I0000000.raw')
-# \param[in] patternRegex a regular expression that matches filenames (e.g.: 'I[0-9]{7}\.raw')
-# \param[in] min the minimum amount of occurrences. set to -1 to disable minimum check.
-# \param[in] max the maximum amount of occurrences. set to -1 to disable maximum check.
-#
-uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollectionToplevel, *objects, *patternHuman, *patternRegex, *min, *max) {
-    *count = 0;
-    foreach (*path in *objects) {
-        *name = *path;
-
-        #if (*path like "*/*") {
-        #    # We might want to match basenames instead of paths relative to the dataset root.
-        #    uuChopPath(*path, *parent, *name);
-        #} else {
-        #    *name = *path;
-        #}
-        if (*name like regex *patternRegex) {
-            *count = *count + 1;
-        }
-    }
-
-    if (*min != -1 && *count < *min) {
-        uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, "Expected at least *min files of type '*patternHuman', found *count");
-    }
-    if (*max != -1 && *count > *max) {
-        uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, "Expected at most *max files of type '*patternHuman', found *count");
-    }
-}
-
-# }}}
-# Generic checks {{{
-
-# \brief Check if a dataset's wave is a valid one.
-#
-# \param[in] root
-# \param[in] id the dataset id to check
-# \param[in] toplevels a list of toplevel objects for this dataset id
-# \param[in] isCollectionToplevel
-#
-uuYcIntakeCheckWaveValidity(*root, *id, *toplevels, *isCollectionToplevel) {
-    # Note: It might be cleaner to grab the wave metadata tag from the toplevel instead.
-    uuYcDatasetParseId(*id, *idComponents);
-    uuStrToLower(*idComponents."wave", *wave);
-
-    *waves = list(
-        "20w", "30w",
-        "0m", "5m", "10m",
-        "3y", "6y", "9y", "12y", "15y"
-    );
-
-    uuListContains(*waves, *wave, *waveIsValid);
-    if (!*waveIsValid) {
-        uuYcIntakeCheckAddDatasetError(*toplevels, *isCollectionToplevel, "The wave '*wave' is not in the list of accepted waves");
-    }
-}
-
-# \brief Run checks that must be applied to all datasets regardless of WEPV values.
-#
-# Call any generic checks you make in this function.
-#
-# \param[in] root
-# \param[in] id the dataset id to check
-# \param[in] toplevels a list of toplevel objects for this dataset id
-# \param[in] isCollection
-#
-uuYcIntakeCheckGeneric(*root, *id, *toplevels, *isCollection) {
-    uuYcIntakeCheckWaveValidity(*root, *id, *toplevels, *isCollection);
-}
-
-# }}}
-# Experiment type specific checks {{{
-# Echo {{{
-
-# \brief Run checks specific to the Echo experiment type.
-#
-# \param[in] root
-# \param[in] id the dataset id to check
-# \param[in] toplevels a list of toplevel objects for this dataset id
-# \param[in] isCollection
-#
-uuYcIntakeCheckEtEcho(*root, *id, *toplevels, *isCollection) {
-    if (*isCollection) {
-        *datasetParent = elem(*toplevels, 0);
-    } else {
-        uuChopPath(elem(*toplevels, 0), *dataObjectParent, *dataObjectName);
-        *datasetParent = *dataObjectParent;
-    }
-
-    uuYcDatasetGetDataObjectRelPaths(*root, *id, *objects);
-
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.index.jpg``, ``(.*/)?I[0-9]{7}\.index\.jpe?g``, 13, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.raw``, ``(.*/)?I[0-9]{7}\.raw``, 7, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.dcm``, ``(.*/)?I[0-9]{7}\.dcm``, 6, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.vol``, ``(.*/)?I[0-9]{7}\.vol``, 6, -1);
-}
-
-# }}}
-# }}}
diff --git a/ycUtil.r b/ycUtil.r
deleted file mode 100644
index 5fa8f4d19..000000000
--- a/ycUtil.r
+++ /dev/null
@@ -1,36 +0,0 @@
-# Youth cohort utility functions
-
-# \brief Clears a kv-list's contents.
-#
-# \param kvList
-#
-uuKvClear(*kvList) {
-    *kvList."." = ".";
-    foreach (*key in *kvList) {
-        *kvList.*key = ".";
-    }
-}
-
-uuYcObjectIsLocked(*objPath, *locked) {
-    msiGetObjType(*objPath, *objType);
-    *locked = false;
-    if (*objType == '-d') {
-        uuChopPath(*objPath, *collection, *dataName);
-        foreach (*row in SELECT META_DATA_ATTR_VALUE
-                 WHERE COLL_NAME = '*collection'
-                 AND DATA_NAME = '*dataName'
-                 AND META_DATA_ATTR_NAME = 'to_vault_lock'
-                ) {
-            *locked = true;
-            break;
-        }
-    } else {
-        foreach (*row in SELECT META_COLL_ATTR_VALUE
-                 WHERE COLL_NAME = '*objPath'
-                 AND META_COLL_ATTR_NAME = 'to_vault_lock'
-                ) {
-            *locked = true;
-            break;
-        }
-    }
-}