diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index 3c54adc84..e69d7b878 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -5,6 +5,7 @@ on: branches: - development - release-1.9 + - release-1.10 - "**-atr" # We can force an integration/API test run without opening a PR by pushing to a branch name that ends with "-atr" pull_request: @@ -38,6 +39,8 @@ jobs: run: | if [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.9" ]; then echo "branch=release-1.9" >> $GITHUB_OUTPUT + elif [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.10" ]; then + echo "branch=release-1.10" >> $GITHUB_OUTPUT else echo "branch=development" >> $GITHUB_OUTPUT fi @@ -111,7 +114,7 @@ jobs: cd tests nohup bash -c 'while true ; do sleep 5 ; ../yoda/docker/run-cronjob.sh copytovault >> ../copytovault.log 2>&1 ; ../yoda/docker/run-cronjob.sh publication >> ../publication.log 2>&1 ; done' & test -d mycache || mkdir -p mycache - python3 -m pytest --skip-ui --intake --datarequest --deposit -o cache_dir=mycache --environment environments/docker.json + python3 -m pytest --skip-ui --datarequest --deposit -o cache_dir=mycache --environment environments/docker.json cat ../copytovault.log cat ../publication.log diff --git a/.github/workflows/api-documentation.yml b/.github/workflows/api-documentation.yml index ce1bf53dd..80a3c1e3a 100644 --- a/.github/workflows/api-documentation.yml +++ b/.github/workflows/api-documentation.yml @@ -55,13 +55,11 @@ jobs: export PYTHONPATH="${PYTHONPATH}:." python tools/api/generate-openapi.py rules_uu --module datarequest > build/api_datarequest.json python tools/api/generate-openapi.py rules_uu --module deposit > build/api_deposit.json - python tools/api/generate-openapi.py rules_uu --module intake > build/api_intake.json - name: Validate Yoda module API documentation run: | openapi-spec-validator build/api_datarequest.json openapi-spec-validator build/api_deposit.json - openapi-spec-validator build/api_intake.json - name: Deploy 🚀 uses: JamesIves/github-pages-deploy-action@releases/v3 diff --git a/.github/workflows/build-push-image.yml b/.github/workflows/build-push-image.yml index d0a692e42..8b987d1bc 100644 --- a/.github/workflows/build-push-image.yml +++ b/.github/workflows/build-push-image.yml @@ -6,6 +6,7 @@ on: branches: - 'development' - 'release-1.9' + - 'release-1.10' jobs: push-image: diff --git a/__init__.py b/__init__.py index 722df1424..92c63a291 100644 --- a/__init__.py +++ b/__init__.py @@ -26,39 +26,36 @@ # Import all modules containing rules into the package namespace, # so that they become visible to iRODS. 
-from admin import * -from browse import * -from folder import * -from groups import * -from json_datacite import * -from json_landing_page import * -from mail import * -from meta import * -from meta_form import * -from provenance import * -from research import * -from resources import * -from schema import * -from schema_transformation import * -from schema_transformations import * -from vault import * -from datacite import * -from epic import * -from publication import * -from policies import * -from replication import * -from revisions import * -from settings import * -from notifications import * -from integration_tests import * +from admin import * +from browse import * +from folder import * +from groups import * +from json_datacite import * +from json_landing_page import * +from mail import * +from meta import * +from meta_form import * +from provenance import * +from research import * +from resources import * +from schema import * +from schema_transformation import * +from schema_transformations import * +from publication_troubleshoot import * +from vault import * +from datacite import * +from epic import * +from publication import * +from policies import * +from replication import * +from revisions import * +from settings import * +from notifications import * +from integration_tests import * # Import certain modules only when enabled. from .util.config import config -if config.enable_intake: - from intake import * - from intake_vault import * - if config.enable_datarequest: from datarequest import * diff --git a/folder.py b/folder.py index 6cb88a9dd..519069e18 100644 --- a/folder.py +++ b/folder.py @@ -205,7 +205,7 @@ def precheck_folder_secure(ctx, coll): found, last_run = get_last_run_time(ctx, coll) if (not correct_copytovault_start_status(ctx, coll) or not correct_copytovault_start_location(coll) - or not misc.last_run_time_acceptable(coll, found, last_run, config.vault_copy_backoff_time)): + or not misc.last_run_time_acceptable(found, last_run, config.vault_copy_backoff_time)): return False return True diff --git a/groups.py b/groups.py index f189bfe7a..f4dfbd69d 100644 --- a/groups.py +++ b/groups.py @@ -25,6 +25,7 @@ 'rule_group_remove_external_user', 'rule_group_check_external_user', 'rule_group_expiration_date_validate', + 'rule_user_exists', 'rule_group_user_exists', 'api_group_search_users', 'api_group_exists', @@ -56,18 +57,13 @@ def getGroupsData(ctx): attr = row[1] value = row[2] - # Create/update group with this information. - try: - group = groups[name] - except Exception: - group = { - "name": name, - "managers": [], - "members": [], - "read": [], - "invited": [] - } - groups[name] = group + group = groups.setdefault(name, { + "name": name, + "managers": [], + "members": [], + "read": [], + "invited": [] + }) if attr in ["schema_id", "data_classification", "category", "subcategory"]: group[attr] = value @@ -95,26 +91,17 @@ def getGroupsData(ctx): if name.startswith("read-"): # Match read-* group with research-* or initial-* group. name = name[5:] - try: - # Attempt to add to read list of research group. - group = groups["research-" + name] - group["read"].append(user) - except Exception: - try: - # Attempt to add to read list of initial group. - group = groups["initial-" + name] + for prefix in ("research-", "initial-"): + group = groups.get(prefix + name) + if group: group["read"].append(user) - except Exception: - pass + break elif not name.startswith("vault-"): - try: - # Ordinary group. 
- group = groups[name] + group = groups.get(name) + if group: group["members"].append(user) - except KeyError: - pass - # Third query: obtain list of invited SRAM users + # Third query: obtain list of invited SRAM users. if config.enable_sram: iter = genquery.row_iterator( "META_USER_ATTR_VALUE, USER_NAME, USER_ZONE", @@ -124,11 +111,9 @@ def getGroupsData(ctx): for row in iter: name = row[0] user = row[1] + "#" + row[2] - try: - group = groups[name] + group = groups.get(name) + if group: group["invited"].append(user) - except KeyError: - pass return groups.values() @@ -529,11 +514,11 @@ def api_group_process_csv(ctx, csv_header_and_data, allow_update, delete_users): return api.Error('errors', validation_errors) # Step 3: Create / update groups. - error = apply_data(ctx, data, allow_update, delete_users) - if len(error): - return api.Error('errors', [error]) + status_msg = apply_data(ctx, data, allow_update, delete_users) + if status_msg['status'] == 'error': + return api.Error('errors', [status_msg['message']]) - return api.Result.ok() + return api.Result.ok(info=[status_msg['message']]) def validate_data(ctx, data, allow_update): @@ -553,7 +538,7 @@ def validate_data(ctx, data, allow_update): for (category, subcategory, groupname, _managers, _members, _viewers, _schema_id, _expiration_date) in data: if group.exists(ctx, groupname) and not allow_update: - errors.append('Group "{}" already exists'.format(groupname)) + errors.append('Group "{}" already exists. It has not been updated.'.format(groupname)) # Is user admin or has category add privileges? if not (is_admin or can_add_category): @@ -575,11 +560,13 @@ def apply_data(ctx, data, allow_update, delete_users): :param allow_update: Allow updates in groups :param delete_users: Allow for deleting of users from groups - :returns: Errors if found any + :returns: Errors if found any, or message with actions if everything is successful """ for (category, subcategory, group_name, managers, members, viewers, schema_id, expiration_date) in data: new_group = False + users_added, users_removed = 0, 0 + message = '' log.write(ctx, 'CSV import - Adding and updating group: {}'.format(group_name)) @@ -590,10 +577,12 @@ def apply_data(ctx, data, allow_update, delete_users): if response: new_group = True + message += "Group '{}' created.".format(group_name) elif response.status == "error_group_exists" and allow_update: log.write(ctx, 'CSV import - WARNING: group "{}" not created, it already exists'.format(group_name)) + message += "Group '{}' already exists.".format(group_name) else: - return "Error while attempting to create group {}. Status/message: {} / {}".format(group_name, response.status, response.status_info) + return {status: 'error', message: "Error while attempting to create group {}. 
Status/message: {} / {}".format(group_name, response.status, response.status_info)} # Now add the users and set their role if other than member allusers = managers + members + viewers @@ -604,6 +593,7 @@ def apply_data(ctx, data, allow_update, delete_users): if response: currentrole = "normal" log.write(ctx, "CSV import - Notice: added user {} to group {}".format(username, group_name)) + users_added += 1 else: log.write(ctx, "CSV import - Warning: error occurred while attempting to add user {} to group {}".format(username, group_name)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) @@ -669,11 +659,21 @@ def apply_data(ctx, data, allow_update, delete_users): response = group_remove_user_from_group(ctx, username, usergroupname) if response: log.write(ctx, "CSV import - Removing user {} from group {}".format(username, usergroupname)) + users_removed += 1 else: log.write(ctx, "CSV import - Warning: error while attempting to remove user {} from group {}".format(username, usergroupname)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) - return '' + if users_added > 0: + message += ' Users added ({}).'.format(users_added) + if users_removed > 0: + message += ' Users removed ({}).'.format(users_removed) + + # If no users added, no users removed and not new group created. + if not users_added and not users_removed and not new_group: + message += ' No changes made.' + + return {"status": "ok", "message": message} def _are_roles_equivalent(a, b): @@ -705,6 +705,18 @@ def group_user_exists(ctx, group_name, username, include_readonly): return False +@rule.make(inputs=[0], outputs=[1]) +def rule_user_exists(ctx, username): + """Rule wrapper to check if a user exists. + + :param ctx: Combined type of a ctx and rei struct + :param username: User to check for existence + + :returns: Indicator if user exists + """ + return "true" if user.exists(ctx, username) else "false" + + def rule_group_user_exists(rule_args, callback, rei): """Check if a user is a member of the given group. 
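# --- Illustrative notes, not part of the patch --------------------------------------
# Two of the groups.py changes above, sketched for readability. Anything marked as
# hypothetical is not taken from the patch.
#
# 1) apply_data() now reports back to api_group_process_csv() with a status dict
#    (string keys "status" and "message" assumed), roughly:
#
#        result = apply_data(ctx, data, allow_update, delete_users)
#        if result["status"] == "error":
#            return api.Error("errors", [result["message"]])
#        return api.Result.ok(info=[result["message"]])
#
# 2) rule_user_exists exposes util.user.exists() to the iRODS rule engine. With
#    @rule.make(inputs=[0], outputs=[1]), rule argument 0 is the username and
#    argument 1 receives "true" or "false". A rule-language caller might look
#    roughly like this (rule name and user are hypothetical):
#
#        checkUser {
#            *exists = "";
#            rule_user_exists("alice#tempZone", *exists);
#            writeLine("stdout", *exists);   # prints "true" if the user exists
#        }
#        input null
#        output ruleExecOut
# -------------------------------------------------------------------------------------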
@@ -973,12 +985,15 @@ def group_create(ctx, group_name, category, subcategory, schema_id, expiration_d if not sram.sram_connect_service_collaboration(ctx, short_name): return api.Error('sram_error', 'Something went wrong connecting service to group "{}" in SRAM'.format(group_name)) + if group.exists(ctx, group_name): + return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) + response = ctx.uuGroupAdd(group_name, category, subcategory, schema_id, expiration_date, description, data_classification, co_identifier, '', '')['arguments'] status = response[8] message = response[9] if status == '0': return api.Result.ok() - elif status == '-1089000' or status == '-809000': + elif status == '-1089000' or status == '-809000' or status == '-806000': return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) else: return api.Error('policy_error', message) diff --git a/groups_import.py b/groups_import.py index 34d71d8b1..c1c876d68 100644 --- a/groups_import.py +++ b/groups_import.py @@ -142,7 +142,7 @@ def parse_csv_file(ctx): # Start processing the actual group data rows for line in lines: row_number += 1 - rowdata, error = process_csv_line(line) + rowdata, error = process_csv_line(ctx, line) if error is None: extracted_data.append(rowdata) diff --git a/intake.py b/intake.py deleted file mode 100644 index d304a6e7a..000000000 --- a/intake.py +++ /dev/null @@ -1,924 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake module.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import fnmatch -import itertools -import time -import traceback - -import genquery - -import intake_dataset -import intake_lock -import intake_scan -from util import * - - -__all__ = ['api_intake_list_studies', - 'api_intake_list_dm_studies', - 'api_intake_count_total_files', - 'api_intake_list_unrecognized_files', - 'api_intake_list_datasets', - 'api_intake_scan_for_datasets', - 'api_intake_lock_dataset', - 'api_intake_unlock_dataset', - 'api_intake_dataset_get_details', - 'api_intake_dataset_add_comment', - 'api_intake_report_vault_dataset_counts_per_study', - 'api_intake_report_vault_aggregated_info', - 'api_intake_report_export_study_data', - 'rule_intake_scan_for_datasets'] - -INTAKE_FILE_EXCLUSION_PATTERNS = ['*.abc', '*.PNG'] -""" List of file patterns not to take into account within INTAKE module.""" - - -@api.make() -def api_intake_list_studies(ctx): - """Get list of all studies current user is involved in. - - :param ctx: Combined type of a callback and rei struct - - :returns: List of studies - - """ - groups = [] - user_name = user.name(ctx) - user_zone = user.zone(ctx) - - iter = genquery.row_iterator( - "USER_GROUP_NAME", - "USER_NAME = '" + user_name + "' AND USER_ZONE = '" + user_zone + "'", - genquery.AS_LIST, ctx - ) - - for row in iter: - if row[0].startswith('grp-intake-'): - groups.append(row[0][11:]) - elif row[0].startswith('intake-'): - groups.append(row[0][7:]) - - groups.sort() - return groups - - -@api.make() -def api_intake_list_dm_studies(ctx): - """Return list of studies current user is datamanager of. 
- - :param ctx: Combined type of a callback and rei struct - - :returns: List of dm studies - """ - datamanager_groups = [] - user_name = user.name(ctx) - user_zone = user.zone(ctx) - - iter = genquery.row_iterator( - "USER_GROUP_NAME", - "USER_NAME = '" + user_name + "' AND USER_ZONE = '" + user_zone + "'", - genquery.AS_LIST, ctx - ) - - for row in iter: - study = '' - if row[0].startswith('grp-intake-'): - study = row[0][11:] - elif row[0].startswith('intake-'): - study = row[0][7:] - - if study: - # Is a member of this study ... check whether member of corresponding datamanager group - iter2 = genquery.row_iterator( - "USER_NAME", - "USER_TYPE = 'rodsgroup' AND USER_NAME like 'grp-datamanager-" + study + "'", - genquery.AS_LIST, ctx - ) - for row2 in iter2: - datamanager_group = row2[0] - if user.is_member_of(ctx, datamanager_group): - datamanager_groups.append(study) - - return datamanager_groups - - -@api.make() -def api_intake_count_total_files(ctx, coll): - """Get the total count of all files in collection - . - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to count all datasets - - :returns: Total file count - """ - main_collection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME", - "COLL_NAME = '" + coll + "'", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME", - "COLL_NAME like '" + coll + "/%'", - genquery.AS_LIST, ctx - ) - - count = 0 - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS) - if not exclusion_matched: - count += 1 - - return count - - -@api.make() -def api_intake_list_unrecognized_files(ctx, coll): - """Get list of all unrecognized files for given path including relevant metadata. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to list all unrecognized files - - :returns: List of unrecognized files - """ - # check permissions - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if user.is_member_of(ctx, group): - pass - elif user.is_member_of(ctx, datamanager_group): - pass - else: - return {} - - # Include coll name as equal names do occur and genquery delivers distinct results. - main_collection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'unrecognized'", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME", - "COLL_NAME like '" + coll + "/%' AND META_DATA_ATTR_NAME = 'unrecognized'", - genquery.AS_LIST, ctx - ) - - files = [] - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - # Check whether object type is within exclusion pattern - exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS) - if not exclusion_matched: - # Error is hardcoded! (like in the original) and initialize attributes already as empty strings. 
- file_data = {"name": row[1], - "path": row[0], - "date": time.strftime('%Y-%m-%d', time.localtime(int(row[2]))), - "creator": row[3], - "error": 'Experiment type, wave or pseudocode is missing from path', - "experiment_type": '', - "pseudocode": '', - "wave": '', - "version": ''} - - # per data object get relevant metadata (experiment type, version, wave, pseudocode) if present - iter2 = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + row[0] + "' AND DATA_NAME = '" + row[1] + "' AND META_DATA_ATTR_NAME in ('experiment_type', 'pseudocode', 'wave', 'version')", - genquery.AS_LIST, ctx - ) - for row2 in iter2: - file_data[row2[0]] = row2[1] - - files.append(file_data) - - return files - - -@api.make() -def api_intake_list_datasets(ctx, coll): - """Get list of datasets for given path. - - A dataset is distinguished by attribute name 'dataset_toplevel' which can either reside on a collection or a data object. - That is why 2 separate queries have to be performed. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection from which to list all datasets - - :returns: list of datasets - """ - datasets = [] - - # 1) Query for datasets distinguished by collections - c_main_collection_iterator = genquery.row_iterator( - "META_COLL_ATTR_VALUE, COLL_NAME", - "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - c_subcollection_iterator = genquery.row_iterator( - "META_COLL_ATTR_VALUE, COLL_NAME", - "COLL_NAME LIKE '" + coll + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator): - dataset = get_dataset_details(ctx, row[0], row[1]) - datasets.append(dataset) - - # 2) Query for datasets distinguished dataobjects - d_main_collection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE, COLL_NAME", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - d_subcollection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE, COLL_NAME", - "COLL_NAME LIKE '" + coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator): - dataset = get_dataset_details(ctx, row[0], row[1]) - datasets.append(dataset) - - return datasets - - -def get_dataset_details(ctx, dataset_id, path): - """Get details of dataset based on dataset identifier. 
- - :param ctx: Combined type of a callback and rei struct - :param dataset_id: Identifier of dataset - :param path: Path to dataset - - :returns: Dict holding objects for the dataset - """ - # Inialise all attributes - dataset = {"dataset_id": dataset_id, - "path": path} - - # Parse dataset_id to get WEPV-items individually - dataset_parts = dataset_id.split('\t') - dataset['wave'] = dataset_parts[0] - dataset['experiment_type'] = dataset_parts[1] - dataset['pseudocode'] = dataset_parts[2] - dataset['version'] = dataset_parts[3] - dataset['datasetStatus'] = 'scanned' - dataset['datasetCreateName'] = '==UNKNOWN==' - dataset['datasetCreateDate'] = 0 - dataset['datasetCreateDateFormatted'] = '' - dataset['datasetErrors'] = 0 - dataset['datasetWarnings'] = 0 - dataset['datasetComments'] = 0 - dataset['objects'] = 0 - dataset['objectErrors'] = 0 - dataset['objectWarnings'] = 0 - - tl_info = get_dataset_toplevel_objects(ctx, path, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if is_collection: - """ dataset is based on a collection """ - tl_collection = tl_objects[0] - iter = genquery.row_iterator( - "COLL_NAME, COLL_OWNER_NAME, COLL_CREATE_TIME", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - dataset['datasetCreateName'] = row[1] - dataset['datasetCreateDate'] = int(row[2]) - dataset['datasetCreateDateFormatted'] = time.strftime('%Y-%m-%d', time.localtime(int(row[2]))) - dataset['datasetCreatedByWhen'] = row[1] + ':' + row[2] - - iter = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_NAME, count(META_COLL_ATTR_VALUE)", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'dataset_error': - dataset['datasetErrors'] += int(row[2]) - if row[1] == 'dataset_warning': - dataset['datasetWarnings'] += int(row[2]) - if row[1] == 'comment': - dataset['datasetComments'] += int(row[2]) - if row[1] == 'to_vault_freeze': - dataset['datasetStatus'] = 'frozen' - if row[1] == 'to_vault_lock': - dataset['datasetStatus'] = 'locked' - - iter = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + tl_collection + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'object_count': - dataset['objects'] += int(row[2]) - if row[1] == 'object_errors': - dataset['objectErrors'] += int(row[2]) - if row[1] == 'object_warnings': - dataset['objectWarnings'] += int(row[2]) - else: - # Dataset is based on a dataobject - # Step through all data objects as found in tl_objects - objects = 0 - object_errors = 0 - object_warnings = 0 - for tl_object in tl_objects: - - # split tl_object - tlo = pathutil.chop(tl_object) - parent = tlo[0] - base_name = tlo[1] - - objects += 1 - if objects == 1: - iter = genquery.row_iterator( - "DATA_OWNER_NAME, DATA_CREATE_TIME", - "COLL_NAME = '" + parent + "' and DATA_NAME = '" + base_name + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - dataset['datasetCreateName'] = row[0] - dataset['datasetCreateDate'] = int(row[1]) - dataset['datasetCreateDateFormatted'] = time.strftime('%Y-%m-%d', time.localtime(int(row[1]))) - dataset['datasetCreatedByWhen'] = row[0] + ':' + row[1] - - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + parent + "' and DATA_NAME = '" + base_name + "' ", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[0] == 'error': - object_errors += 1 - if row[0] == 'warning': - object_warnings += 1 - if objects == 1: - # 
Only look at these items when objects==1 as they are added to each toplevel object present - if row[0] == 'dataset_error': - dataset['datasetErrors'] += 1 - if row[0] == 'dataset_warning': - dataset['datasetWarnings'] += 1 - if row[0] == 'comment': - dataset['datasetComments'] += 1 - if row[0] == 'to_vault_freeze': - dataset['datasetStatus'] = 'frozen' - if row[0] == 'to_vault_lock': - dataset['datasetStatus'] = 'locked' - dataset['objects'] = objects - dataset['objectErrors'] = object_errors - dataset['objectWarnings'] = object_warnings - - return dataset - - -def get_dataset_toplevel_objects(ctx, root, dataset_id): - """Returns dict with toplevel object paths and whether is collection based dataset. - - If is a collection - only one object is returned (collection path). - If not a collection- all objects are returned with full object path. - - :param ctx: Combined type of a callback and rei struct - :param root: Path within which to search for datasets (e.g. an intake group collection) - :param dataset_id: Identifier of the dataset - - :returns: Dict holding top-level object paths for the dataset (in the 'objects' key) and a boolean value which - says whether it is a collection-based dataset (in the 'is_collection' key) - """ - c_main_collection_iterator = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME = '" + root + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' " - "AND META_COLL_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - c_subcollection_iterator = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME LIKE '" + root + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' " - "AND META_COLL_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator): - return {'is_collection': True, - 'objects': [row[0]]} - - # For dataobject situation gather all object path strings as a list - d_main_collection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + root + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - d_subcollection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME LIKE '" + root + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx - ) - - objects = [] - for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator): - objects.append(row[1] + '/' + row[0]) - return {'is_collection': False, - 'objects': objects} - - -@api.make() -def api_intake_scan_for_datasets(ctx, coll): - """The toplevel of a dataset can be determined by attribute 'dataset_toplevel' - and can either be a collection or a data_object. 
- - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: indication correct - """ - - if _intake_check_authorized_to_scan(ctx, coll): - try: - _intake_scan_for_datasets(ctx, coll) - except Exception: - log.write(ctx, "Intake scan (API) failed with the following exception: " + traceback.format_exc()) - return {"proc_status": "NOK", "error_msg": "Error during scanning process"} - else: - return {"proc_status": "NOK", "error_msg": "No permissions to scan collection"} - - return {"proc_status": "OK"} - - -@rule.make(inputs=[0], outputs=[1]) -def rule_intake_scan_for_datasets(ctx, coll): - """The toplevel of a dataset can be determined by attribute 'dataset_toplevel' - and can either be a collection or a data_object. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: 0=correct, 1=insufficient rights, 2=error during scanning process - """ - if not collection.exists(ctx, coll): - return "Non existing collection: " + coll - if _intake_check_authorized_to_scan(ctx, coll): - try: - _intake_scan_for_datasets(ctx, coll, tl_datasets_log_target='stdout') - except Exception: - log.write(ctx, "Intake scan (rule) failed with the following exception: " + traceback.format_exc()) - return "Error scanning for datasets for collection: " + coll - else: - return "Insufficient permissions for collection: " + coll - - return 0 - - -def _intake_check_authorized_to_scan(ctx, coll): - """Checks that user is authorized to scan intake group, either as - a data manager or as an intake group member. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - - :returns: boolean - whether user is authorized - """ - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - return True - else: - log.write(ctx, "No permissions to scan collection") - return False - - -def _intake_scan_for_datasets(ctx, coll, tl_datasets_log_target=''): - """Internal function for actually running intake scan - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to scan for datasets - :param tl_datasets_log_target: If in ['stdout', 'serverLog'] logging of toplevel datasets will take place to the specified target - - """ - scope = {"wave": "", - "experiment_type": "", - "pseudocode": ""} - found_datasets = [] - found_datasets = intake_scan.intake_scan_collection(ctx, coll, scope, False, found_datasets) - - if tl_datasets_log_target in ['stdout', 'serverLog']: - for subscope in found_datasets: - try: - version = subscope['version'] - except KeyError: - version = 'Raw' - ctx.writeLine(tl_datasets_log_target, ("Found dataset toplevel collection: " - + "W<" + subscope['wave'] - + "> E<" + subscope['experiment_type'] - + "> P<" + subscope['pseudocode'] - + "> V<" + version - + "> D<" + subscope['directory'] - + ">")) - - intake_scan.intake_check_datasets(ctx, coll) - - -@api.make() -def api_intake_lock_dataset(ctx, path, dataset_ids): - """Lock datasets as an indication it can be 'frozen' for it to progress to vault. 
- - Lock = datamanager only - - :param ctx: Combined type of a callback and rei struct - :param path: Collection for which to lock a specific dataset id - :param dataset_ids: Comma separated identifiers of datasets to be locked - - :returns: indication correct - """ - # check permissions - datamanager only - parts = path.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to lock dataset") - return {"proc_status": "NOK", - "error_msg": "No permissions to lock dataset(s)", - "error_dataset_ids": []} - - error_dataset_ids = [] - for dataset_id in dataset_ids.split(','): - # error_dataset_ids.append(dataset_id) - try: - intake_lock.intake_dataset_lock(ctx, path, dataset_id) - except Exception: - error_dataset_ids.append(dataset_id) - - if error_dataset_ids: - return {"proc_status": "NOK", - "error_msg": "Something went wrong locking datasets", - "error_dataset_ids": error_dataset_ids} - - return {"proc_status": "OK"} - - -@api.make() -def api_intake_unlock_dataset(ctx, path, dataset_ids): - """Unlock a dataset to remove the indication so it can be 'frozen' for it to progress to vault - - Unlock = datamanager only - - :param ctx: Combined type of a callback and rei struct - :param path: Collection for which to lock a specific dataset id - :param dataset_ids: Comma separated identifiers of datasets to be locked - - :returns: indication correct - """ - # check permissions - datamanager only - parts = path.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to unlock dataset(s)") - return {"proc_status": "NOK", - "error_msg": "No permissions to unlock dataset", - "error_dataset_ids": []} - - error_dataset_ids = [] - for dataset_id in dataset_ids.split(','): - # error_dataset_ids.append(dataset_id) - try: - intake_lock.intake_dataset_unlock(ctx, path, dataset_id) - except Exception: - error_dataset_ids.append(dataset_id) - - if error_dataset_ids: - return {"proc_status": "NOK", - "error_msg": "Something went wrong unlocking datasets", - "error_dataset_ids": error_dataset_ids} - - return {"proc_status": "OK"} - - -@api.make() -def api_intake_dataset_add_comment(ctx, study_id, dataset_id, comment): - """Add a comment to a dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param study_id: Id of the study given dataset belongs to - :param dataset_id: Identifier of the dataset to add a comment to - :param comment: Comment as added by user - - :returns: indication correct - """ - coll = '/' + user.zone(ctx) + '/home/' + study_id - - # check permissions - can be researcher or datamanager - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - log.write(ctx, "No permissions to scan collection") - return {} - - tl_info = get_dataset_toplevel_objects(ctx, coll, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if not is_collection and len(tl_objects) == 0: - return {"proc_status": "NOK", - "error_msg": "Dataset does not exist"} - - timestamp = int(time.time()) # int(datetime.timestamp(datetime.now())) - comment_data = user.name(ctx) + ':' + str(timestamp) + ':' + comment - - for tl in tl_objects: - if is_collection: - avu.associate_to_coll(ctx, tl, 'comment', comment_data) - else: - avu.associate_to_data(ctx, tl, 'comment', comment_data) - - return {'user': user.name(ctx), 'timestamp': time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(timestamp)), 'comment': comment} - - -@api.make() -def api_intake_dataset_get_details(ctx, coll, dataset_id): - """Get all details for a dataset (errors/warnings, scanned by who/when, comments, file tree). - - 1) Errors/warnings - 2) Comments - 3) Tree view of files within dataset. - - :param ctx: Combined type of a callback and rei struct - :param coll: Collection to start from - :param dataset_id: Identifier of the dataset to get details for - - :returns: dictionary with all dataset data - """ - # check permissions - can be researcher or datamanager - parts = coll.split('/') - group = parts[3] - datamanager_group = intake_group_to_datamanager_group(group) - - if not (user.is_member_of(ctx, group) or user.is_member_of(ctx, datamanager_group)): - log.write(ctx, "No permissions to scan collection") - return {} - - tl_info = get_dataset_toplevel_objects(ctx, coll, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - scanned = '' - comments = [] - dataset_warnings = [] - dataset_errors = [] - files = {} - for tl in tl_objects: - if is_collection: - coll = tl - # Dataset based on a collection - iter = genquery.row_iterator( - "META_COLL_ATTR_VALUE, META_COLL_ATTR_NAME, order_asc(META_COLL_MODIFY_TIME)", - "COLL_NAME = '{}' and META_COLL_ATTR_NAME in ('dataset_error', 'dataset_warning', 'comment')".format(coll), - genquery.AS_LIST, ctx - ) - for row in iter: - if row[1] == 'dataset_error': - dataset_errors.append(row[0]) - elif row[1] == 'dataset_warning': - dataset_warnings.append(row[0]) - else: - comments.append(row[0]) - - # Scanned by/when - iter = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "META_DATA_ATTR_NAME = 'scanned' AND COLL_NAME = '{}'".format(coll), - genquery.AS_LIST, ctx - ) - for row in iter: - scanned = row[0] - break - - break - else: - # Dataset is based on a data object - parts = pathutil.chop(tl) - coll = parts[0] - file = parts[1] - iter = genquery.row_iterator( - "META_DATA_ATTR_VALUE, META_DATA_ATTR_NAME, order_asc(META_DATA_MODIFY_TIME)", - "COLL_NAME = '{}' AND DATA_NAME = '{}' and META_DATA_ATTR_NAME in ('dataset_error','dataset_warning','comment', 'scanned')".format(coll, file), - genquery.AS_LIST, ctx - ) - for row in 
iter: - if row[1] == 'dataset_error': - dataset_errors.append(row[0]) - elif row[1] == 'dataset_warning': - dataset_warnings.append(row[0]) - elif row[1] == 'scanned': - scanned = row[0] - else: - comments.append(row[0]) - - # do it only once - all data is gathered in the first run - break - - level = '0' - files = coll_objects(ctx, level, coll, dataset_id) - - if len(scanned.split(':')) != 2: - # Retrieve scannedby/when information in a different way - dataset = get_dataset_details(ctx, dataset_id, coll) - scanned = dataset.get('datasetCreatedByWhen', "unknown") - - return {"files": files, - # "is_collection": is_collection, - # "tlobj": tl_objects, - "scanned": scanned, - "comments": comments, - "dataset_warnings": dataset_warnings, - "dataset_errors": dataset_errors} - - -def coll_objects(ctx, level, coll, dataset_id): - """Recursive function to pass entire folder/file structure in such that frontend - can do something useful with it including errors/warnings on object level - - :param ctx: Combined type of a callback and rei struct - :param level: Level in hierarchy (tree) - :param coll: Collection to collect - :param dataset_id: id of the dataset involved - - :returns: Tree of collections and files - """ - # First get the sub collections - counter = 0 - files = {} - - # COLLECTIONS - iter = genquery.row_iterator( - "COLL_NAME, COLL_ID", - "COLL_PARENT_NAME = '{}' AND META_COLL_ATTR_NAME = 'dataset_id' AND META_COLL_ATTR_VALUE = '{}'".format(coll, dataset_id), - genquery.AS_LIST, ctx - ) - for row in iter: - # files(pathutil.basename(row[0])) - node = {} - node['name'] = pathutil.basename(row[0]) - node['isFolder'] = True - node['parent_id'] = level - warnings = [] - errors = [] - # Per collection add errors/warnings from scan process - iter2 = genquery.row_iterator( - "META_COLL_ATTR_VALUE, META_COLL_ATTR_NAME", - "META_COLL_ATTR_NAME in ('warning', 'error') AND COLL_ID = '{}'".format(row[1]), - genquery.AS_LIST, ctx - ) - for row2 in iter2: - if row[1] == 'error': - errors.append(row2[0]) - else: - warnings.append(row2[0]) - node['errors'] = errors - node['warnings'] = warnings - - files[level + "." + str(counter)] = node - - files.update(coll_objects(ctx, level + "." + str(counter), row[0], dataset_id)) - - counter += 1 - - # DATA OBJECTS - iter = genquery.row_iterator( - "DATA_NAME, DATA_ID", - "COLL_NAME = '{}' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '{}'".format(coll, dataset_id), - genquery.AS_LIST, ctx - ) - for row in iter: - node = {} - node['name'] = row[0] - node['isFolder'] = False - node['parent_id'] = level - # Per data object add errors/warnings from scan process - iter2 = genquery.row_iterator( - "META_DATA_ATTR_VALUE, META_DATA_ATTR_NAME", - "META_DATA_ATTR_NAME in ('warning', 'error') AND DATA_ID = '{}'".format(row[1]), - genquery.AS_LIST, ctx - ) - warnings = [] - errors = [] - for row2 in iter2: - if row2[1] == 'error': - errors.append(row2[0]) - else: - warnings.append(row2[0]) - node['errors'] = errors - node['warnings'] = warnings - - files[level + "." + str(counter)] = node - - counter += 1 - - return files - - -# Reporting / export functions -@api.make() -def api_intake_report_vault_dataset_counts_per_study(ctx, study_id): - """Get the count of datasets wave/experimenttype. - - In the vault a dataset is always located in a folder. - Therefore, looking at the folders only is enough. 
- - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id - - :returns: Dictionary with relevant aggregated counts - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions for reporting functionality") - return {} - - return intake_dataset.intake_youth_dataset_counts_per_study(ctx, study_id) - - -@api.make() -def api_intake_report_vault_aggregated_info(ctx, study_id): - """Collects the following information for Raw, Processed datasets. - Including a totalisation of this all (Raw/processed is kept in VERSION). - - -Total datasets - -Total files - -Total file size - -File size growth in a month - -Datasets growth in a month - -Pseudocodes (distinct) - - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id - - :returns: Dictionary with data for analysis - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions for reporting functionality") - return {} - - return intake_dataset.vault_aggregated_info(ctx, study_id) - - -@api.make() -def api_intake_report_export_study_data(ctx, study_id): - """Find all datasets in the vault for $studyID. - - Include file count and total file size as well as dataset meta data version, experiment type, pseudocode and wave - - :param ctx: Combined type of a callback and rei struct - :param study_id: Study id to get a report from - - :returns: Study report - """ - # check permissions - datamanager only - datamanager_group = "grp-datamanager-" + study_id - - if not user.is_member_of(ctx, datamanager_group): - log.write(ctx, "No permissions to export data for this study") - return {} - - return intake_dataset.intake_report_export_study_data(ctx, study_id) - - -def intake_group_to_datamanager_group(intake_group): - """Determines the name of the data manager group of a particular intake group. - - :param intake_group: name of intake group - - :returns: name of datamanager group - - :raises ValueError: if provided group name is not a valid intake group name - """ - if intake_group.startswith("grp-intake-"): - return intake_group.replace("-intake-", "-datamanager-", 1) - elif intake_group.startswith("intake-"): - return intake_group.replace("intake-", "grp-datamanager-", 1) - else: - raise ValueError("Unexpected intake group format for group " + intake_group) diff --git a/intake_checksums.py b/intake_checksums.py deleted file mode 100644 index 669d06afe..000000000 --- a/intake_checksums.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake checksums.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools - -import genquery - -from util import * - - -def chop_checksum(checksum): - """Chop iRODS checksum in checksum type and checksum string. - - Checksum format is ({type}:){checksum}, if type is missing then it is "md5". - - :param checksum: iRODS checksum string - :returns: type checksum - """ - checksum_split = checksum.split(":") - - if len(checksum_split) > 1: - type = checksum_split[0] - checksum = checksum_split[1] - - return type, checksum - - -def intake_generate_dataset_checksums(ctx, dataset_path, checksum_file): - """"Generate data object with all checksums of a dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param dataset_path: Root collection of dataset to be indexed - :param checksum_file: Data object to write checksums to - """ - q_root = genquery.row_iterator("COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE", - "COLL_NAME = '{}'".format(dataset_path), - genquery.AS_LIST, ctx) - - q_sub = genquery.row_iterator("COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE", - "COLL_NAME like '{}/%'".format(dataset_path), - genquery.AS_LIST, ctx) - - # Create checksums file. - checksums = "" - for row in itertools.chain(q_root, q_sub): - type, checksum = chop_checksum(row[2]) - checksums += "{} {} {} {}/{}\n".format(type, checksum, row[3], row[0], row[1]) - - # Write checksums file. - data_object.write(ctx, checksum_file, checksums) diff --git a/intake_dataset.py b/intake_dataset.py deleted file mode 100644 index d8417fc71..000000000 --- a/intake_dataset.py +++ /dev/null @@ -1,284 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake datasets.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools - -import genquery - -from util import * - - -def intake_report_export_study_data(ctx, study_id): - """ Get the information for the export functionality - - Retrieved metadata for a study: - - dataset_date_created - - wave - - version - - experiment_type - - pseudocode - - number of files - - total file size - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - :returns: returns datasets - """ - zone = user.zone(ctx) - - main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - " = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME like '/{}/home/grp-vault-{}/%' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - datasets = {} - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - path = row[0] - try: - datasets[path][row[2]] = row[3] - except KeyError: - datasets[path] = {row[2]: row[3]} - - real_datasets = {} - for set_path in datasets: - if 'dataset_date_created' in datasets[set_path]: - real_datasets[set_path] = datasets[set_path] - # collect total file size and total amount of files - real_datasets[set_path]['totalFileSize'] = 0 - real_datasets[set_path]['totalFiles'] = 0 - - # get the filesize and file count - stat_main_collection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)", - "COLL_NAME = '{}'".format(set_path), - genquery.AS_LIST, ctx) - - stat_subcollection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)", - "COLL_NAME like '{}/%'".format(set_path), - genquery.AS_LIST, ctx) - - for row in itertools.chain(stat_main_collection_iterator, stat_subcollection_iterator): - real_datasets[set_path]['totalFiles'] = int(row[0]) / 2 - totalFileSize = 0 - if row[1]: - totalFileSize = int(row[1]) - real_datasets[set_path]['totalFileSize'] = totalFileSize / 2 - - return real_datasets - - -def intake_youth_get_datasets_in_study(ctx, study_id): - """Get the of datasets (with relevant metadata) in a study. 
- - Retrieved metadata: - - 'dataset_id' - - 'dataset_date_created' - - 'wave' - - 'version' - - 'experiment_type' - - 'pseudocode' - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier of study - - :returns: Dict with datasets and relevant metadata. - """ - zone = user.zone(ctx) - - main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME LIKE '/{}/home/grp-vault-{}/*' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id), - genquery.AS_LIST, ctx) - - datasets = {} - - # Construct all datasets. - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - dataset = row[0] - attribute_name = row[2] - attribute_value = row[3] - - if attribute_name in ['dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode']: - if attribute_name in ['version', 'experiment_type']: - val = attribute_value.lower() - else: - val = attribute_value - try: - datasets[dataset][attribute_name] = val - except KeyError: - datasets[dataset] = {attribute_name: val} - - return datasets - - -def intake_youth_dataset_counts_per_study(ctx, study_id): - """"Get the counts of datasets wave/experimenttype. - - In the vault a dataset is always located in a folder. - Therefore, looking at the folders only is enough. - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - - :returns: Dict with counts of datasets wave/experimenttype - """ - datasets = intake_youth_get_datasets_in_study(ctx, study_id) - - dataset_type_counts = {} - # Loop through datasets and count wave and experimenttype. - for dataset in datasets: - # Meta attribute 'dataset_date_created' defines that a folder holds a complete set. - if 'dataset_date_created' in datasets[dataset]: - type = datasets[dataset]['experiment_type'] - wave = datasets[dataset]['wave'] - version = datasets[dataset]['version'] - - try: - dataset_type_counts[type][wave][version] += 1 - except KeyError: - if type not in dataset_type_counts: - dataset_type_counts[type] = {wave: {version: 1}} - elif wave not in dataset_type_counts[type]: - dataset_type_counts[type][wave] = {version: 1} - else: - dataset_type_counts[type][wave][version] = 1 - - return dataset_type_counts - - -def vault_aggregated_info(ctx, study_id): - """Collects aggregated information for raw and processed datasets. - - Collects the following information for RAW and PROCESSED datasets. 
- Including a totalisation of this all (raw/processed is kept in VERSION) - - Total datasets - - Total files - - Total file size - - File size growth in a month - - Datasets growth in a month - - Pseudocodes (distinct) - - :param ctx: Combined type of a callback and rei struct - :param study_id: Unique identifier op study - - :returns: Dict with aggregated information for raw and processed datasets - """ - datasets = intake_youth_get_datasets_in_study(ctx, study_id) - - dataset_count = {'raw': 0, 'processed': 0} - dataset_growth = {'raw': 0, 'processed': 0} - dataset_file_count = {'raw': 0, 'processed': 0} - dataset_file_size = {'raw': 0, 'processed': 0} - dataset_file_growth = {'raw': 0, 'processed': 0} - dataset_pseudocodes = {'raw': [], 'processed': []} - - # Determine full last month reference point - import time - from datetime import datetime, date, timedelta - - last_day_of_prev_month = date.today().replace(day=1) - timedelta(days=1) - month = int(last_day_of_prev_month.strftime("%m")) - year = int(last_day_of_prev_month.strftime("%Y")) - - last_month = int(time.time() - int(datetime(year, month, int(date.today().strftime("%d")), 0, 0, 0).strftime('%s'))) - - dataset_paths = [] - for dataset in datasets: - # Meta attribute 'dataset_date_created' defines that a folder holds a complete set. - if 'dataset_date_created' in datasets[dataset]: - dataset_paths.append(dataset) - - if datasets[dataset]['version'].lower() == 'raw': - version = 'raw' - else: - version = 'processed' - - # if version in ['raw', 'processed']: - dataset_count[version] += 1 - - try: - date_created = int(datasets[dataset]['dataset_date_created']) - except Exception: - # This is nonsense and arose from an erroneous situation - date_created = last_month - - if date_created - last_month >= 0: - dataset_growth[version] += 1 - - try: - pseudocode = datasets[dataset]['pseudocode'] - if pseudocode not in dataset_pseudocodes[version]: - dataset_pseudocodes[version].append(pseudocode) - except KeyError: - continue - - zone = user.zone(ctx) - main_collection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME", - "COLL_NAME = '/{}/home/grp-vault-{}'".format(zone, study_id), - genquery.AS_LIST, ctx) - - subcollection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME", - "COLL_NAME like '/{}/home/grp-vault-{}/%'".format(zone, study_id), - genquery.AS_LIST, ctx) - - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - coll_name = row[1] - data_size = int(row[2]) - coll_create_time = int(row[3]) - - # Check whether the file is part of a dataset. - part_of_dataset = False - for dataset in dataset_paths: - if dataset in coll_name: - part_of_dataset = True - break - - # File is part of dataset. 
- if part_of_dataset: - # version = datasets[dataset]['version'] - - if datasets[dataset]['version'].lower() == 'raw': - version = 'raw' - else: - version = 'processed' - - dataset_file_count[version] += 1 - dataset_file_size[version] += data_size - - if coll_create_time - last_month >= 0: - dataset_file_growth[version] += data_size - - return { - 'total': { - 'totalDatasets': dataset_count['raw'] + dataset_count['processed'], - 'totalFiles': dataset_file_count['raw'] + dataset_file_count['processed'], - 'totalFileSize': dataset_file_size['raw'] + dataset_file_size['processed'], - 'totalFileSizeMonthGrowth': dataset_file_growth['raw'] + dataset_file_growth['processed'], - 'datasetsMonthGrowth': dataset_growth['raw'] + dataset_growth['processed'], - 'distinctPseudoCodes': len(dataset_pseudocodes['raw']) + len(dataset_pseudocodes['processed']), - }, - 'raw': { - 'totalDatasets': dataset_count['raw'], - 'totalFiles': dataset_file_count['raw'], - 'totalFileSize': dataset_file_size['raw'], - 'totalFileSizeMonthGrowth': dataset_file_growth['raw'], - 'datasetsMonthGrowth': dataset_growth['raw'], - 'distinctPseudoCodes': len(dataset_pseudocodes['raw']), - }, - 'notRaw': { - 'totalDatasets': dataset_count['processed'], - 'totalFiles': dataset_file_count['processed'], - 'totalFileSize': dataset_file_size['processed'], - 'totalFileSizeMonthGrowth': dataset_file_growth['processed'], - 'datasetsMonthGrowth': dataset_growth['processed'], - 'distinctPseudoCodes': len(dataset_pseudocodes['processed']), - }, - } diff --git a/intake_lock.py b/intake_lock.py deleted file mode 100644 index d31c202ce..000000000 --- a/intake_lock.py +++ /dev/null @@ -1,203 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake locking.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import time - -import genquery - -import intake -from util import * - - -def intake_dataset_treewalk_change_status(ctx, collection, status, timestamp, remove): - """Treewalk dataset collection and change status. - - :param ctx: Combined type of a callback and rei struct - :param collection: Will change every time as it represents every collection that has to be processed - :param status: Status to set on dataset objects - :param timestamp: Timestamp of status change - :param remove: Boolean, set or remove status - """ - # 1. Change status on this collection. - if remove: - try: - avu.rmw_from_coll(ctx, collection, status, "%") - except msi.Error as e: - log.write(ctx, 'ERROR REMOVE') - log.write(ctx, e) - else: - log.write(ctx, 'step1 . set_on_col') - avu.set_on_coll(ctx, collection, status, timestamp) - - # 2. Change status on data objects located directly within the collection. - data_objects = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '{}'".format(collection), - genquery.AS_LIST, ctx - ) - - for row in data_objects: - if remove: - avu.rmw_from_data(ctx, "{}/{}".format(collection, row[0]), status, "%") - else: - log.write(ctx, 'step2 . set_on_data') - avu.set_on_data(ctx, "{}/{}".format(collection, row[0]), status, timestamp) - - # 3. Loop through subcollections. - subcollections = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '{}'".format(collection), - genquery.AS_LIST, ctx - ) - - for row in subcollections: - intake_dataset_treewalk_change_status(ctx, row[0], status, timestamp, remove) - - -def intake_dataset_change_status(ctx, object, is_collection, dataset_id, status, timestamp, remove): - """Change status on dataset. 
- - :param ctx: Combined type of a callback and rei struct - :param object: Will change every time as it represents every object of the dataset - :param is_collection: Indicator if dataset is within a collection - :param dataset_id: Dataset identifier - :param status: Status to set on dataset objects - :param timestamp: Timestamp of status change - :param remove: Boolean, set or remove status - """ - # Is dataset a collection? - if is_collection: - # Recursively change the status on all objects in the dataset - intake_dataset_treewalk_change_status(ctx, object, status, timestamp, remove) - else: - # Dataset is not a collection, find all the dataset objects. - data_objects = genquery.row_iterator("DATA_NAME", - "COLL_NAME = '{}' AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = '{}'".format(object, dataset_id), - genquery.AS_LIST, ctx) - - # Change dataset status on all objects. - for row in data_objects: - if remove: - avu.rmw_from_data(ctx, "{}/{}".format(object, row[0]), status, "%") - else: - avu.set_on_data(ctx, "{}/{}".format(object, row[0]), status, timestamp) - - -def intake_dataset_lock(ctx, collection, dataset_id): - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - log.write(ctx, tl_info) - - if not is_collection and len(tl_objects) == 0: - raise Exception("Dataset \"{}\" in collection {} not found".format(collection, dataset_id)) - - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_lock", timestamp, False) - else: - # Dataset based on - for tl_object in tl_objects: - avu.set_on_data(ctx, tl_object, "to_vault_lock", timestamp) - - -def intake_dataset_unlock(ctx, collection, dataset_id): - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - if not is_collection and len(tl_objects) == 0: - raise Exception("Dataset \"{}\" in collection {} not found".format(collection, dataset_id)) - - # It is possible that the status of the dataset status has moved on. 
- if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_lock", timestamp, True) - else: - # Dataset based on data objects - for tl_object in tl_objects: - avu.rmw_from_data(ctx, tl_object, "to_vault_lock", "%") - - -def intake_dataset_freeze(ctx, collection, dataset_id): - # timestamp = str(int(time.time())) - # top_collection = "" - # is_collection = "" - # ctx.uuYcDatasetGetTopLevel(collection, dataset_id, top_collection, is_collection) - - # intake_dataset_change_status(ctx, top_collection, is_collection, dataset_id, "to_vault_freeze", timestamp, False) - - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - log.write(ctx, tl_info) - - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_freeze", timestamp, False) - else: - # Dataset based on - for tl_object in tl_objects: - avu.set_on_data(ctx, tl_object, "to_vault_freeze", timestamp) - - -def intake_dataset_melt(ctx, collection, dataset_id): - # timestamp = str(int(time.time())) - # top_collection = "" - # is_collection = "" - # ctx.uuYcDatasetGetTopLevel(collection, dataset_id, top_collection, is_collection) - - # intake_dataset_change_status(ctx, top_collection, is_collection, dataset_id, "to_vault_freeze", timestamp, True) - - timestamp = str(int(time.time())) - - tl_info = intake.get_dataset_toplevel_objects(ctx, collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # It is possible that the status of the dataset status has moved on. - if is_collection: - intake_dataset_change_status(ctx, tl_objects[0], is_collection, dataset_id, "to_vault_freeze", timestamp, True) - else: - # Dataset based on data objects - for tl_object in tl_objects: - avu.rmw_from_data(ctx, tl_object, "to_vault_freeze", "%") - - -def intake_dataset_object_get_status(ctx, path): - """Get the status of an object in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path of dataset object - - :returns: Tuple booleans indicating if the object is locked or frozen - """ - locked = False - frozen = False - - if collection.exists(ctx, path): - attribute_names = genquery.row_iterator("META_COLL_ATTR_NAME", - "COLL_NAME = '{}'".format(path), - genquery.AS_LIST, ctx) - else: - coll_name, data_name = pathutil.chop(path) - attribute_names = genquery.row_iterator("META_DATA_ATTR_NAME", - "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(coll_name, data_name), - genquery.AS_LIST, ctx) - - for row in attribute_names: - attribute_name = row[0] - if attribute_name in ["to_vault_lock", "to_vault_freeze"]: - locked = True - - if attribute_name == "to_vault_freeze": - frozen = True - break - - return locked, frozen diff --git a/intake_scan.py b/intake_scan.py deleted file mode 100644 index ba024c4cc..000000000 --- a/intake_scan.py +++ /dev/null @@ -1,462 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake scanning.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools -import time - -import genquery - -import intake -from intake_utils import dataset_parse_id, intake_scan_get_metadata_update -from util import * - - -def intake_scan_collection(ctx, root, scope, in_dataset, found_datasets): - """Recursively scan a directory in a Youth Cohort intake. 
- - :param ctx: Combined type of a callback and rei struct - :param root: the directory to scan - :param scope: a scoped kvlist buffer - :param in_dataset: whether this collection is within a dataset collection - :param found_datasets: collection of subscopes that were found in order to report toplevel datasets in the scanning process - - :returns: Found datasets - """ - - # Loop until pseudocode, experiment type and wave are complete. - # But the found values can be overwritten when deeper levels are found. - - # Scan files under root - iter = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + root + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - path = row[1] + '/' + row[0] - - # Determene lock state for object (no collectoin - locked_state = object_is_locked(ctx, path, False) - - if locked_state['locked'] or locked_state['frozen']: - continue - - remove_dataset_metadata(ctx, path, False) - scan_mark_scanned(ctx, path, False) - - parent_in_dataset = in_dataset - metadata_update = intake_scan_get_metadata_update(ctx, path, False, in_dataset, scope) - - if metadata_update["in_dataset"]: - apply_dataset_metadata(ctx, path, metadata_update["new_metadata"], False) - if not parent_in_dataset: - # We found a top-level dataset data object. - found_datasets.append(metadata_update["new_metadata"]) - else: - apply_partial_metadata(ctx, metadata_update["new_metadata"], path, False) - avu.set_on_data(ctx, path, "unrecognized", "Experiment type, wave or pseudocode missing from path") - - # Scan collections under root - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '" + root + "'", - genquery.AS_LIST, ctx - ) - counter = 0 - for row in iter: - path = row[0] - counter = counter + 1 - dirname = pathutil.basename(path) - - if dirname != '/': - # get locked /frozen status - locked_state = object_is_locked(ctx, path, True) - - if locked_state['locked'] or locked_state['frozen']: - continue - - remove_dataset_metadata(ctx, path, True) - scan_mark_scanned(ctx, path, True) - - parent_in_dataset = in_dataset - metadata_update = intake_scan_get_metadata_update(ctx, path, True, in_dataset, scope) - - if metadata_update["in_dataset"]: - apply_dataset_metadata(ctx, path, metadata_update["new_metadata"], True) - if not parent_in_dataset: - # We found a new top-level dataset data object. 
- found_datasets.append(metadata_update["new_metadata"]) - else: - apply_partial_metadata(ctx, metadata_update["new_metadata"], path, True) - - found_datasets = intake_scan_collection(ctx, - path, - metadata_update["new_metadata"], - parent_in_dataset or metadata_update["in_dataset"], - found_datasets) - - return found_datasets - - -def object_is_locked(ctx, path, is_collection): - """Returns whether given object in path (collection or dataobject) is locked or frozen - - :param ctx: Combined type of a callback and rei struct - :param path: Path to object or collection - :param is_collection: Whether path contains a collection or data object - - :returns: Returns locked state - """ - locked_state = {"locked": False, - "frozen": False} - - if is_collection: - iter = genquery.row_iterator( - "META_COLL_ATTR_NAME", - "COLL_NAME = '" + path + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - if row[0] in ['to_vault_lock', 'to_vault_freeze']: - locked_state['locked'] = True - if row[0] == 'to_vault_freeze': - locked_state['frozen'] = True - else: - parent_coll = pathutil.dirname(path) - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME", - "COLL_NAME = '" + parent_coll + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", - genquery.AS_LIST, ctx - ) - # return locked_state - for row in iter: - if row[0] in ['to_vault_lock', 'to_vault_freeze']: - locked_state['locked'] = True - if row[0] == 'to_vault_freeze': - locked_state['frozen'] = True - - return locked_state - - -def remove_dataset_metadata(ctx, path, is_collection): - """Remove all intake metadata from dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path to collection or data object - :param is_collection: Whether is a collection or data object - """ - intake_metadata = ["wave", - "experiment_type", - "pseudocode", - "version", - "dataset_id", - "dataset_toplevel", - "error", - "warning", - "dataset_error", - "dataset_warning", - "unrecognized", - "object_count", - "object_errors", - "object_warnings"] - intake_metadata_set = set(intake_metadata) - - # Add the following two lines to remove accumulated metadata during testing. - # "comment" - # "scanned"] - - if is_collection: - iter = genquery.row_iterator( - "COLL_ID, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + path + "'", - genquery.AS_LIST, ctx - ) - else: - iter = genquery.row_iterator( - "DATA_ID, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + pathutil.dirname(path) + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", - genquery.AS_LIST, ctx - ) - - for _row in iter: - metadata_name = _row[1] - if metadata_name in intake_metadata_set: - if is_collection: - try: - avu.rmw_from_coll(ctx, path, metadata_name, '%') - except Exception as e: - log.write(ctx, "Warning: unable to remove metadata attr {} from {}".format(metadata_name, path)) - log.write(ctx, "Removing metadata failed with exception {}".format(str(e))) - else: - try: - avu.rmw_from_data(ctx, path, metadata_name, '%') - except Exception as e: - log.write(ctx, "Warning: unable to remove metadata attr {} from {}".format(metadata_name, path)) - log.write(ctx, "Removing metadata failed with exception {}".format(str(e))) - - -def scan_mark_scanned(ctx, path, is_collection): - """Sets the username of the scanner and a timestamp as metadata on the scanned object. - - :param ctx: Combined type of a callback and rei struct - :param path: Path on which to add scan indication to - :param is_collection: Is scanned object a collection? 
- """ - timestamp = int(time.time()) - user_and_timestamp = user.name(ctx) + ':' + str(timestamp) # str(datetime.date.today()) - - if is_collection: - avu.set_on_coll(ctx, path, 'scanned', user_and_timestamp) - else: - avu.set_on_data(ctx, path, 'scanned', user_and_timestamp) - - -def apply_dataset_metadata(ctx, path, scope, is_collection): - """Apply dataset metadata to an object in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param path: Path to the object - :param scope: A scanner scope containing WEPV values - :param is_collection: Whether the object is a collection - """ - for key in scope: - if scope[key]: - if is_collection: - avu.set_on_coll(ctx, path, key, scope[key]) - else: - avu.set_on_data(ctx, path, key, scope[key]) - - -def apply_partial_metadata(ctx, scope, path, is_collection): - """Apply any available id component metadata to the given object. - - To be called only for objects outside datasets. When inside a dataset - (or at a dataset toplevel), use intake_apply_dataset_metadata() instead. - - :param ctx: Combined type of a callback and rei struct - :param scope: A scanner scope containing some WEPV values - :param path: Path to the object - :param is_collection: Whether the object is a collection - """ - keys = ['wave', 'experiment_type', 'pseudocode', 'version'] - for key in keys: - if key in scope: - if scope[key]: - if is_collection: - avu.set_on_coll(ctx, path, key, scope[key]) - else: - avu.set_on_data(ctx, path, key, scope[key]) - - -def dataset_add_error(ctx, top_levels, is_collection_toplevel, text, suppress_duplicate_avu_error=False): - """Add a dataset error to all given dataset toplevels. - - :param ctx: Combined type of a callback and rei struct - :param top_levels: A list of toplevel datasets - :param is_collection_toplevel: Indication of whether it is a collection or object - :param text: Error text - :param suppress_duplicate_avu_error: If an AVU already exists, suppress the irods-error. Allow for this situation - - :raises Exception: Raises exception when associating error to collection or data object fails - """ - for tl in top_levels: - if is_collection_toplevel: - try: - avu.associate_to_coll(ctx, tl, "dataset_error", text) - except msi.Error as e: - # iRODS errorcode 809000 (CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME) - if suppress_duplicate_avu_error and str(e).find("809000") > -1: - log.write(ctx, "Trying to associate dataset_error already present on collection: {}".format(tl)) - log.write(ctx, "Suppress error handling for AVU: dataset_error - {}".format(text)) - else: - raise Exception(e) - else: - try: - avu.associate_to_data(ctx, tl, "dataset_error", text) - except msi.Error as e: - # iRODS errorcode 809000 (CATALOG_ALREADY_HAS_ITEM_BY_THAT_NAME) - if suppress_duplicate_avu_error and str(e).find("809000") > -1: - log.write(ctx, "Trying to associate dataset_error already present on data object: {}".format(tl)) - log.write(ctx, "Suppress error handling for AVU: dataset_error - {}".format(text)) - else: - raise Exception(e) - - -def dataset_get_ids(ctx, coll): - """Find dataset ids under collection. 
- :param ctx: Combined type of a callback and rei struct - :param coll: Collection name for which to find dataset-ids - :returns: Returns a set of dataset ids - """ - data_ids = set() - - # Get distinct data_ids - main_collection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'dataset_id' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "META_DATA_ATTR_VALUE", - "COLL_NAME LIKE '" + coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' ", - genquery.AS_LIST, ctx - ) - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - if row[0]: - data_ids.add(row[0]) - - return data_ids - - -def intake_check_datasets(ctx, root): - """Run checks on all datasets under root. - - :param ctx: Combined type of a callback and rei struct - :param root: The collection to get datasets for - """ - dataset_ids = dataset_get_ids(ctx, root) - for dataset_id in dataset_ids: - intake_check_dataset(ctx, root, dataset_id) - - -def intake_check_dataset(ctx, root, dataset_id): - """Run checks on the dataset specified by the given dataset id. - - This function adds object counts and error counts to top-level objects within the dataset. - For historical reasons, it also adds a warning count, which is always 0. - - :param ctx: Combined type of a callback and rei struct - :param root: Collection name - :param dataset_id: Dataset identifier - """ - tl_info = intake.get_dataset_toplevel_objects(ctx, root, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # Check validity of wav - waves = ["20w", "30w", "0m", "5m", "10m", "3y", "6y", "9y", "12y", "15y"] - components = dataset_parse_id(dataset_id) - if components['wave'] not in waves: - dataset_add_error(ctx, tl_objects, is_collection, "The wave '" + components['wave'] + "' is not in the list of accepted waves") - - # check presence of wave, pseudo-ID and experiment - if '' in [components['wave'], components['experiment_type'], components['pseudocode']]: - # Suppress error handing and continue normal processing should a situation arise where Wepv missing is already present on the dataobject/collection - dataset_add_error(ctx, tl_objects, is_collection, "Wave, experiment type or pseudo-ID missing", True) - - for tl in tl_objects: - # Save the aggregated counts of #objects, #warnings, #errors on object level - - count = get_aggregated_object_count(ctx, dataset_id, tl) - if is_collection: - avu.set_on_coll(ctx, tl, "object_count", str(count)) - else: - avu.set_on_data(ctx, tl, "object_count", str(count)) - - count = get_aggregated_object_error_count(ctx, tl) - if is_collection: - avu.set_on_coll(ctx, tl, "object_errors", str(count)) - else: - avu.set_on_data(ctx, tl, "object_errors", str(count)) - - count = 0 - if is_collection: - avu.set_on_coll(ctx, tl, "object_warnings", str(count)) - else: - avu.set_on_data(ctx, tl, "object_warnings", str(count)) - - -def get_rel_paths_objects(ctx, root, dataset_id): - """Get a list of relative paths to all data objects in a dataset. - - :param ctx: Combined type of a callback and rei struct - :param root: Root path of the dataset - :param dataset_id: Dataset identifier - - :returns: List of objects of relative object paths (e.g. file1.dat, some-subdir/file2.dat...) 
- """ - tl_info = intake.get_dataset_toplevel_objects(ctx, root, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - rel_path_objects = [] - - # get the correct parent_collection - try: - if is_collection: - parent_coll = tl_objects[0] - else: - parent_coll = pathutil.dirname(tl_objects[0]) - except Exception: - parent_coll = '/' - - main_collection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME = '" + parent_coll + "' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_NAME, COLL_NAME", - "COLL_NAME LIKE '" + parent_coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - for row in itertools.chain(main_collection_iterator, subcollection_iterator): - # Add objects including relative paths - rel_path_objects.append(row[1][len(parent_coll):] + '/' + row[0]) - - return rel_path_objects - - -def get_aggregated_object_count(ctx, dataset_id, tl_collection): - """Return total amounts of objects. - - :param ctx: Combined type of a callback and rei struct - :param dataset_id: Dataset id - :param tl_collection: Collection name of top level - - :returns: Aggregated object count - """ - main_collection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME = '" + tl_collection + "' AND META_DATA_ATTR_NAME = 'dataset_id' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME like '" + tl_collection + "/%' AND META_DATA_ATTR_NAME = 'dataset_id' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx - ) - - return len(list(main_collection_iterator) + list(subcollection_iterator)) - - -def get_aggregated_object_error_count(ctx, tl_collection): - """Return total amount of object errors. - - :param ctx: Combined type of a callback and rei struct - :param tl_collection: Collection name of top level - - :returns: Total amount of object errors - """ - main_collection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME = '" + tl_collection + "' AND META_DATA_ATTR_NAME = 'error' ", - genquery.AS_LIST, ctx - ) - - subcollection_iterator = genquery.row_iterator( - "DATA_ID", - "COLL_NAME like '" + tl_collection + "/%' AND META_DATA_ATTR_NAME = 'error' ", - genquery.AS_LIST, ctx - ) - - return len(list(main_collection_iterator) + list(subcollection_iterator)) diff --git a/intake_utils.py b/intake_utils.py deleted file mode 100644 index ff90cf7f6..000000000 --- a/intake_utils.py +++ /dev/null @@ -1,204 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Utility functions for the intake module. These are in a separate file so that - we can test the main logic without having iRODS-related dependencies in the way.""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import os -import re - - -def intake_tokens_identify_dataset(tokens): - """Check whether the tokens gathered so far are sufficient for identifying a dataset. 
- - :param tokens: A dictionary of tokens - - :returns: Returns whether a dataset is identified - """ - required = ['wave', 'experiment_type', 'pseudocode'] # version is optional - - missing = 0 - for req_token in required: - # required tokens must be present and must have a value - if req_token not in tokens or tokens[req_token] == "": - missing = missing + 1 - - return (missing == 0) - - -def intake_ensure_version_present(ctx, metadata): - """Adds a version attribute with a default value to metadata if it is not yet present. - - :param ctx: Combined type of a callback and rei struct - :param metadata: Dictionary with intake module metadata - """ - if "version" not in metadata: - metadata["version"] = "Raw" - - -def intake_extract_tokens_from_name(ctx, path, scoped_buffer): - """Extract one or more tokens from a file / directory name and add dataset information as metadata. - :param ctx: Combined type of a callback and rei struct - :param path: Full path of the data object or collection - :param scoped_buffer: Holds dataset buffer with prefilled keys - :returns: Returns extended scope buffer - """ - basename = os.path.basename(path) - name_without_ext = os.path.splitext(basename)[0] - parts = re.split("[_-]", name_without_ext) - for part in parts: - scoped_buffer.update(intake_extract_tokens(ctx, part)) - return scoped_buffer - - -def intake_extract_tokens(ctx, string): - """Extract tokens from a string and return as dict. - - :param ctx: Combined type of a callback and rei struct - :param string: Token of which to be determined whether experiment type, version etc - - :returns: Returns found kv's - """ - exp_types = ["pci", - "echo", - "facehouse", - "faceemo", - "coherence", - "infprogap", - "infsgaze", - "infpop", - # "mriinhibition", - # "mriemotion", - # "mockinhibition", - "chprogap", - "chantigap", - "chsgaze", - "pciconflict", - "pcivacation", - "peabody", - "discount", - "cyberball", - "trustgame", - "other", - # MRI: - "inhibmockbehav", - "inhibmribehav", - "emotionmribehav", - "emotionmriscan", - "anatomymriscan", - "restingstatemriscan", - "dtiamriscan", - "dtipmriscan", - "mriqcreport", - "mriqceval", - "vasmri", - "vasmock", - # - "looklisten", - "handgame", - "infpeabody", - "delaygratification", - "dtimriscan", - "inhibmriscan", - # 16-Apr-2019 fbyoda email request new exp type: - "chdualet", - # 15-Feb-2021 fbyoda email request new exp type: - "functionalmriscan", - "infdualet", - "vrbartbehav", - "infssat"] - - str_lower = string.lower() - str_upper = string.upper() - str_for_pseudocode_test = string.split('.')[0] - str_for_version_test = string.translate(None, ".") - - foundKVs = {} - if re.match('^[0-9]{1,2}[wmy]$', str_lower) is not None: - # String contains a wave. - # Wave validity is checked later on in the dataset checks. - foundKVs["wave"] = str_lower - elif re.match('^[bap][0-9]{5}$', str_for_pseudocode_test.lower()) is not None: - # String contains a pseudocode. - foundKVs["pseudocode"] = str_upper[0:len(str_for_pseudocode_test)] - elif re.match('^[Vv][Ee][Rr][A-Z][a-zA-Z0-9-]*$', str_for_version_test) is not None: - foundKVs["version"] = string[3:len(string)] - elif str_lower in exp_types: - foundKVs["experiment_type"] = str_lower - - return foundKVs - - -def intake_scan_get_metadata_update(ctx, path, is_collection, in_dataset, parent_metadata): - """Determine metadata to be updated for a particular collection or data object, based - on its name and parent metadata. 
- - This function is separate from the function that actually performs the updates, so - that we can test the logic separately. - - :param ctx: Combined type of a callback and rei struct - :param path: Full path of the data object or collection - :param is_collection: true if it's a collection, false if it's a data object - :param in_dataset: true if the parent already has complete WEP(V) attributes. Otherwise false. - :param parent_metadata: dict containing the intake module metadata of the parent collection ( if any) - - :returns: Returns a dictionary with the following keys / values: - new_metadata: dictionary of new metadata to apply to this data object or collection - in_dataset: true if current object (along with values passed from parents) has complete WEP(V) values. - otherwise false. - """ - - local_metadata = parent_metadata.copy() - - result = {"new_metadata": local_metadata, "in_dataset": in_dataset} - - if in_dataset: - # If we already are in a dataset, we get all the metadata from the parent. We - # cannot override attributes in this case. However we need to remove the top-level - # attribute, because the present object is within in a dataset, and thus not a top-level - # data object. - if "dataset_toplevel" in local_metadata: - del [local_metadata["dataset_toplevel"]] - else: - intake_extract_tokens_from_name(ctx, path, local_metadata) - if intake_tokens_identify_dataset(local_metadata): - intake_ensure_version_present(ctx, local_metadata) - local_metadata["directory"] = path if is_collection else os.path.dirname(path) - local_metadata["dataset_id"] = dataset_make_id(local_metadata) - local_metadata["dataset_toplevel"] = dataset_make_id(local_metadata) - result["in_dataset"] = True - else: - # result["in_dataset"] is already set to false - pass - - return result - - -def dataset_make_id(scope): - """Construct a dataset based on WEPV and directory. - - :param scope: Create a dataset id - - :returns: Dataset identifier - """ - return scope['wave'] + '\t' + scope['experiment_type'] + '\t' + scope['pseudocode'] + '\t' + scope['version'] + '\t' + scope['directory'] - - -def dataset_parse_id(dataset_id): - """Parse a dataset into its consructive data. - - :param dataset_id: Dataset identifier - - :returns: Dataset as a dict - """ - dataset_parts = dataset_id.split('\t') - dataset = {} - dataset['wave'] = dataset_parts[0] - dataset['experiment_type'] = dataset_parts[1] - dataset['pseudocode'] = dataset_parts[2] - dataset['version'] = dataset_parts[3] - dataset['directory'] = dataset_parts[4] - - return dataset diff --git a/intake_vault.py b/intake_vault.py deleted file mode 100644 index bc0e85258..000000000 --- a/intake_vault.py +++ /dev/null @@ -1,412 +0,0 @@ -# -*- coding: utf-8 -*- -"""Functions for intake vault.""" - -__copyright__ = 'Copyright (c) 2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import itertools -import time - -import genquery - -import intake -import intake_lock -import intake_scan -from util import * - -__all__ = ['rule_intake_to_vault'] - - -@rule.make(inputs=range(2), outputs=range(2, 2)) -def rule_intake_to_vault(ctx, intake_root, vault_root): - # 1. add to_vault_freeze metadata lock to the dataset - # 2. check that dataset does not yet exist in the vault - # 3. copy dataset to vault with its metadata - # 4. 
remove dataset from intake - # upon any error: - # - delete partial data from vault - # - add error to intake dataset metadata - # - remove locks on intake dataset (to_vault_freeze, to_vault_lock) - - # note that we have to allow for multiple types of datasets: - # type A: a single toplevel collection with a tree underneath - # type B: one or more datafiles located within the same collection - # processing varies slightly between them, so process each type in turn - # - - # status: 0 is success, nonzero is error - status = 0 - # counter of datasets moved to the vault area - datasets_moved = 0 - - # TYPE A: - c_main_collection_iterator = genquery.row_iterator( - "COLL_NAME, META_COLL_ATTR_VALUE", - "META_COLL_ATTR_NAME = 'dataset_toplevel' AND COLL_NAME = '" + intake_root + "'", - genquery.AS_LIST, ctx) - - for row in itertools.chain(c_main_collection_iterator): - toplevel_collection = row[0] - dataset_id = row[1] - # Get status ( locked / frozen ) - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, True) - if locked_state['locked']: - # Freeze the dataset - intake_lock.intake_dataset_freeze(ctx, toplevel_collection, dataset_id) - - # Dataset frozen, now move to vault and remove from intake area - status = dataset_collection_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root) - if status == 0: - datasets_moved += 1 - - # TYPE B: - d_main_collection_iterator = genquery.row_iterator( - "COLL_NAME, META_DATA_ATTR_VALUE", - "META_DATA_ATTR_NAME = 'dataset_toplevel' AND COLL_NAME = '" + intake_root + "'", - genquery.AS_LIST, ctx) - - for row in itertools.chain(d_main_collection_iterator): - toplevel_collection = row[0] - dataset_id = row[1] - # check if to_vault_lock exists on all the dataobjects of this dataset - all_locked = True - iter2 = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "'", - genquery.AS_LIST, ctx) - - for row2 in iter2: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection + '/' + row2[0], False) - all_locked = all_locked and locked_state['locked'] - if not all_locked: - break - - if all_locked: - # Freeze the dataset - intake_lock.intake_dataset_freeze(ctx, toplevel_collection, dataset_id) - - # Dataset frozen, now move to fault and remove from intake area - status = dataset_objects_only_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root) - if status == 0: - datasets_moved += 1 - - if datasets_moved: - log.write(ctx, "Datasets moved to the vault: " + str(datasets_moved)) - - return 0 - - -def dataset_collection_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root): - """Move intake datasets consisting of collections to the vault - - :param ctx: Combined type of a callback and rei struct - :param toplevel_collection: Toplevel collection - :param dataset_id: Identifier of dataset - :param vault_root: Root path of vault - - :returns: Status - """ - status = 0 - if vault_dataset_exists(ctx, vault_root, dataset_id): - # duplicate dataset, signal error and throw out of vault queue - log.write(ctx, "INFO: version already exists in vault: " + dataset_id) - message = "Duplicate dataset, version already exists in vault" - intake_scan.dataset_add_error(ctx, [toplevel_collection], True, message) - intake_lock.intake_dataset_melt(ctx, toplevel_collection, dataset_id) - intake_lock.intake_dataset_unlock(ctx, toplevel_collection, dataset_id) - return 1 - - # Dataset does not exist - move 
from research to vault area - vault_path = get_dataset_path(vault_root, dataset_id) - - vault_parent = pathutil.chop(vault_path)[0] - try: - collection.create(ctx, vault_parent, "1") - except Exception: - log.write(ctx, "ERROR: parent collection could not be created " + vault_parent) - return 2 - - # variable for treewalk interface - buffer = {} - buffer["source"] = toplevel_collection - buffer["destination"] = vault_path - - status = vault_tree_walk_collection(ctx, toplevel_collection, buffer, vault_walk_ingest_object) - - # reset buffer - buffer = {} - if status == 0: - # stamp the vault dataset collection with additional metadata - avu.set_on_coll(ctx, vault_path, "dataset_date_created", str(int(time.time()))) - - # and finally remove the dataset original in the intake area - try: - collection.remove(ctx, toplevel_collection) - except Exception: - log.write(ctx, "ERROR: unable to remove intake collection " + toplevel_collection) - return 3 - else: - # move failed (partially), cleanup vault - # NB: keep the dataset in the vault queue so we can retry some other time - log.write("ERROR: Ingest failed for " + dataset_id + ", error = " + status) - status = vault_tree_walk_collection(ctx, vault_path, buffer, vault_walk_remove_object) - - return status - - -def dataset_objects_only_move_2_vault(ctx, toplevel_collection, dataset_id, vault_root): - """Move intake datasets consisting of data objects to the vault - - :param ctx: Combined type of a callback and rei struct - :param toplevel_collection: Toplevel collection - :param dataset_id: Identifier of dataset - :param vault_root: Root path of vault - - :returns: Status - """ - status = 0 - if vault_dataset_exists(ctx, vault_root, dataset_id): - # duplicate dataset, signal error and throw out of vault queue - log.write(ctx, "INFO: version already exists in vault: " + dataset_id) - message = "Duplicate dataset, version already exists in vault" - - tl_info = intake.get_dataset_toplevel_objects(ctx, toplevel_collection, dataset_id) - is_collection = tl_info['is_collection'] - tl_objects = tl_info['objects'] - - # dataset_add_error(ctx, tl_objects, is_collection, "The wave '" + components['wave'] + "' is not in the list of accepted waves") - - intake_scan.dataset_add_error(ctx, tl_objects, is_collection, message) - intake_lock.intake_dataset_melt(ctx, toplevel_collection, dataset_id) - intake_lock.intake_dataset_unlock(ctx, toplevel_collection, dataset_id) - return 1 - - # Dataset does not exist - move it from research to vault space - # new dataset(version) we can safely ingest into vault - vault_path = get_dataset_path(vault_root, dataset_id) - - # create path to and including the toplevel collection (will create in-between levels) - try: - collection.create(ctx, vault_path, "1") - except Exception: - log.write(ctx, "ERROR: parent collection could not be created " + vault_path) - return 2 - - # stamp the vault dataset collection with default metadata - try: - vault_dataset_add_default_metadata(ctx, vault_path, dataset_id) - except Exception: - log.write(ctx, "ERROR: default metadata could not be added to " + vault_path) - return 3 - - # copy data objects to the vault - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - intake_path = toplevel_collection + '/' + row[0] - - status = vault_ingest_object(ctx, intake_path, False, vault_path + "/" + row[0]) - if 
status: - break - - # data ingested, what's left is to delete the original in intake area - # this will also melt/unfreeze etc because metadata is removed too - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + toplevel_collection + "' " - "AND META_DATA_ATTR_NAME = 'dataset_toplevel' " - "AND META_DATA_ATTR_VALUE = '" + dataset_id + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - intake_path = toplevel_collection + "/" + row[0] - # Now remove data object in intake - try: - data_object.remove(ctx, intake_path, force=True) - except Exception: - log.write(ctx, "ERROR: unable to remove intake object " + intake_path) - # error occurred during ingest, cleanup vault area and relay the error to user - # NB: keep the dataset in the vault queue so we can retry some other time - log.write(ctx, "ERROR: Ingest failed for *datasetId error = *status") - - # reset buffer interface - buffer = {} - status = vault_tree_walk_collection(ctx, vault_path, buffer, vault_walk_remove_object) - - # Finally return status - return status - - -def vault_ingest_object(ctx, object_path, is_collection, vault_path): - # from the original object only the below list is copied to the vault object, other info is ignored - copied_metadata = ["wave", "experiment_type", "pseudocode", "version", - "error", "warning", "comment", "dataset_error", - "dataset_warning", "datasetid"] - - if not is_collection: - # first chksum the original file then use it to verify the vault copy - try: - ctx.msiDataObjChksum(object_path, "forceChksum=", 0) - ctx.msiDataObjCopy(object_path, vault_path, 'verifyChksum=', 0) - except msi.Error: - return 1 - - coll, dataname = pathutil.chop(object_path) - - iter = genquery.row_iterator( - "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND DATA_NAME = '" + dataname + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - if row[0] in copied_metadata: - avu.set_on_data(ctx, vault_path, row[0], row[1]) - - # add metadata found in system info - iter = genquery.row_iterator( - "DATA_OWNER_NAME, DATA_OWNER_ZONE, DATA_CREATE_TIME", - "COLL_NAME = '" + coll + "' AND DATA_NAME = '" + dataname + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - avu.set_on_data(ctx, vault_path, "submitted_by=", row[0] + '#' + row[1]) - avu.set_on_data(ctx, vault_path, "submitted_date", row[2]) - else: - # CREATE COLLECTION - try: - collection.create(ctx, vault_path, "1") - except msi.Error: - return 1 - - iter = genquery.row_iterator( - "META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "COLL_NAME = '" + object_path + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - if row[0] in copied_metadata: - avu.set_on_coll(ctx, vault_path, row[0], row[1]) - - # add metadata found in system info - iter = genquery.row_iterator( - "COLL_OWNER_NAME, COLL_OWNER_ZONE, COLL_CREATE_TIME", - "COLL_NAME = '" + object_path + "' ", - genquery.AS_LIST, ctx) - - for row in iter: - avu.set_on_coll(ctx, vault_path, "submitted_by=", row[0] + '#' + row[1]) - avu.set_on_coll(ctx, vault_path, "submitted_date", row[2]) - - return 0 - - -def vault_walk_remove_object(ctx, item_parent, item_name, is_collection): - status = 0 - try: - if is_collection: - collection.remove(ctx, item_parent + '/' + item_name) - else: - data_object.remove(ctx, item_parent + '/' + item_name, force=True) - except Exception: - status = 1 - - return status - - -def vault_walk_ingest_object(ctx, item_parent, item_name, is_collection, buffer): - source_path = item_parent + '/' + item_name - dest_path = buffer["destination"] - if source_path 
!= buffer["source"]: - # rewrite path to copy objects that are located underneath the toplevel collection - source_length = len(source_path) - relative_path = source_path[(len(buffer["source"]) + 1): source_length] - dest_path = buffer["destination"] + '/' + relative_path - - return vault_ingest_object(ctx, source_path, is_collection, dest_path) - - -def vault_tree_walk_collection(ctx, path, buffer, rule_to_process): - """Walk a subtree and perform 'rule_to_process' per item. - - :param ctx: Combined type of a callback and rei struct - :param path: Path of collection to treewalk - :param buffer: Exclusively to be used by the rule we will can - :param rule_to_process: Name of the rule to be executed in the context of a tree-item - - :returns: Error status - """ - parent_collection, collection = pathutil.chop(path) - - error = 0 - # first deal with any subcollections within this collection - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_PARENT_NAME = '" + path + "' ", - genquery.AS_LIST, ctx) - for row in iter: - error = vault_tree_walk_collection(ctx, row[0], buffer, rule_to_process) - if error: - break - - # when done then process the dataobjects directly located within this collection - if error == 0: - iter = genquery.row_iterator( - "DATA_NAME", - "COLL_NAME = '" + path + "' ", - genquery.AS_LIST, ctx) - for row in iter: - error = rule_to_process(ctx, path, row[0], False, buffer) - if error: - break - - # and lastly process the collection itself - if error == 0: - error = rule_to_process(ctx, parent_collection, collection, True, buffer) - - return error - - -def vault_dataset_add_default_metadata(ctx, vault_path, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # my_date = datetime.now() - # id_components["dataset_date_created"] = my_date.strftime('%Y-%m-%dT%H:%M:%S.%f%z') - id_components["dataset_date_created"] = str(int(time.time())) - - keys = ["wave", "experiment_type", "pseudocode", "version", "dataset_date_created"] - for key in keys: - try: - avu.set_on_data(ctx, vault_path, key, id_components[key]) - except Exception: - avu.set_on_coll(ctx, vault_path, key, id_components[key]) - - -def vault_dataset_exists(ctx, vault_root, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # Beware! extra 'ver' before version from original code: *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - wepv = id_components["wave"] + "_" + id_components["experiment_type"] + "_" + id_components["pseudocode"] + "_ver" + id_components["version"] - dataset_path = vault_root + '/' + id_components["wave"] + "/" + id_components["experiment_type"] + "/" + id_components["pseudocode"] + "/" + wepv - - iter = genquery.row_iterator( - "COLL_NAME", - "COLL_NAME = '" + dataset_path + "' ", - genquery.AS_LIST, ctx) - - for _row in iter: - return True - - return False - - -def get_dataset_path(root, dataset_id): - id_components = intake_scan.dataset_parse_id(dataset_id) - # Beware! 
extra 'ver' before version from original code: *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - wepv = id_components["wave"] + "_" + id_components["experiment_type"] + "_" + id_components["pseudocode"] + "_ver" + id_components["version"] - - return root + '/' + id_components["wave"] + "/" + id_components["experiment_type"] + "/" + id_components["pseudocode"] + "/" + wepv diff --git a/integration_tests.py b/integration_tests.py index abfa859c9..f92f4c0dd 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -117,6 +117,27 @@ def _test_avu_rmw_collection(ctx, rmw_attributes): return result +def _test_avu_get_attr_val_of_coll(ctx, attr, value): + # Test getting the value of an attribute on a collection + tmp_coll = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz") + result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr) + collection.remove(ctx, tmp_coll) + return result + + +def _test_avu_get_attr_val_of_coll_exception(ctx): + # Test that getting a non existing attribute on a collection raises an exception (True for exception raised) + tmp_coll = _create_tmp_collection(ctx) + result = False + try: + result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo") + except Exception: + result = True + collection.remove(ctx, tmp_coll) + return result + + def _test_folder_set_retry_avus(ctx): tmp_coll = _create_tmp_collection(ctx) folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2) @@ -350,16 +371,16 @@ def _test_folder_secure_func(ctx, func): "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "", "size", 0, 3), "check": lambda x: x == 3}, {"name": "msvc.json_objops.add_notexist_empty", - "test": lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'add', 0), + "test": lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'add', 0), "check": lambda x: x == '{"e": "f"}'}, {"name": "msvc.json_objops.add_notexist_nonempty", - "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'add', 0), + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'add', 0), "check": lambda x: x == '{"a": "b", "e": "f"}'}, {"name": "msvc.json_objops.add_exist_nonempty", - "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'add', 0), + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'add', 0), "check": lambda x: x == '{"a": "b", "e": "g"}'}, {"name": "msvc.json_objops.get_exist", - "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", ""), 'get', 1), + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", ""), 'get', 1), "check": lambda x: str(x) == "(['c'], ['d'])"}, {"name": "msvc.json_objops.get_notexist", "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "e", ""), 'get', 1), @@ -482,6 +503,12 @@ def _test_folder_secure_func(ctx, func): "check": lambda x: (("aap", "noot", "mies") in x and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 )}, + {"name": "avu.get_attr_val_of_coll.exists.yes", + "test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"), + "check": lambda x: x == "bar"}, + {"name": "avu.get_attr_val_of_coll.exists.no", + "test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx), + "check": lambda x: x}, {"name": "avu.apply_atomic_operations.collection", "test": lambda ctx: 
_test_msvc_apply_atomic_operations_collection(ctx), "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, diff --git a/meta.py b/meta.py index 829f60dfa..797162ec5 100644 --- a/meta.py +++ b/meta.py @@ -13,6 +13,7 @@ import irods_types from deepdiff import DeepDiff +import meta_form import provenance import publication import schema as schema_ @@ -164,7 +165,7 @@ def is_json_metadata_valid(callback, :param metadata: Pre-parsed JSON object :param ignore_required: Ignore required fields - :returns: Boolean indicating if JSON metadata us valid + :returns: Boolean indicating if JSON metadata is valid """ try: return len(get_json_metadata_errors(callback, @@ -271,7 +272,7 @@ def collection_has_cloneable_metadata(callback, coll): @api.make() def api_meta_remove(ctx, coll): - """Remove a collection's metadata JSON, if it exist.""" + """Remove a collection's metadata JSON, if it exists.""" log.write(ctx, 'Remove metadata of coll {}'.format(coll)) try: @@ -790,3 +791,50 @@ def copy_user_metadata(ctx, source, target): log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target)) except Exception: log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target)) + + +def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout): + """Process a single data package to retrieve and validate that its metadata conforms to the schema. + + :param ctx: Combined type of a callback and rei struct + :param coll_name: String representing the data package collection path. + :param schema_cache: Dictionary storing schema blueprints, can be empty. + :param report_name: Name of report script (for logging) + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A dictionary result containing if schema matches and the schema short name. 
+ """ + metadata_path = get_latest_vault_metadata_path(ctx, coll_name) + + if not metadata_path: + log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout) + return None + + try: + metadata = jsonutil.read(ctx, metadata_path) + except Exception as exc: + log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout) + return None + + # Determine schema + schema_id = schema_.get_schema_id(ctx, metadata_path) + schema_shortname = schema_id.split("/")[-2] + + # Retrieve schema and cache it for future use + schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id) + if schema_shortname in schema_cache: + schema_contents = schema_cache[schema_shortname] + else: + schema_contents = jsonutil.read(ctx, schema_path) + schema_cache[schema_shortname] = schema_contents + + # Check whether metadata matches schema and log any errors + error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents) + match_schema = len(error_list) == 0 + if not match_schema: + errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list] + log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout) + + return {"schema": schema_shortname, "match_schema": match_schema} diff --git a/policies_intake.py b/policies_intake.py index 5b490e63e..159ddbca9 100644 --- a/policies_intake.py +++ b/policies_intake.py @@ -1,15 +1,54 @@ # -*- coding: utf-8 -*- -"""iRODS policy implementations.""" +"""Policies for intake.""" -__copyright__ = 'Copyright (c) 2021, Utrecht University' +__copyright__ = 'Copyright (c) 2021-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import genquery -import intake_scan from util import * +def object_is_locked(ctx, path, is_collection): + """Returns whether given object in path (collection or dataobject) is locked or frozen + + :param ctx: Combined type of a callback and rei struct + :param path: Path to object or collection + :param is_collection: Whether path contains a collection or data object + + :returns: Returns locked state + """ + locked_state = {"locked": False, + "frozen": False} + + if is_collection: + iter = genquery.row_iterator( + "META_COLL_ATTR_NAME", + "COLL_NAME = '" + path + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + if row[0] in ['to_vault_lock', 'to_vault_freeze']: + locked_state['locked'] = True + if row[0] == 'to_vault_freeze': + locked_state['frozen'] = True + else: + parent_coll = pathutil.dirname(path) + iter = genquery.row_iterator( + "META_DATA_ATTR_NAME", + "COLL_NAME = '" + parent_coll + "' AND DATA_NAME = '" + pathutil.basename(path) + "'", + genquery.AS_LIST, ctx + ) + # return locked_state + for row in iter: + if row[0] in ['to_vault_lock', 'to_vault_freeze']: + locked_state['locked'] = True + if row[0] == 'to_vault_freeze': + locked_state['frozen'] = True + + return locked_state + + def is_data_in_locked_dataset(ctx, actor, path): """ Check whether 
given data object is within a locked dataset """ dataset_id = '' @@ -64,7 +103,7 @@ def is_data_in_locked_dataset(ctx, actor, path): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: @@ -117,7 +156,7 @@ def is_coll_in_locked_dataset(ctx, actor, coll): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: @@ -169,7 +208,7 @@ def coll_in_path_of_locked_dataset(ctx, actor, coll): toplevel_is_collection = False if toplevel_collection: - locked_state = intake_scan.object_is_locked(ctx, toplevel_collection, toplevel_is_collection) + locked_state = object_is_locked(ctx, toplevel_collection, toplevel_is_collection) log.debug(ctx, locked_state) return (locked_state['locked'] or locked_state['frozen']) and not user.is_admin(ctx, actor) else: diff --git a/publication.py b/publication.py index 31e98e3cf..d97507197 100644 --- a/publication.py +++ b/publication.py @@ -4,6 +4,7 @@ __copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' +import re from datetime import datetime import genquery @@ -1314,7 +1315,7 @@ def process_republication(ctx, vault_package): return publication_state["status"] -@rule.make(inputs=range(4), outputs=range(4, 6)) +@rule.make(inputs=range(4)) def rule_update_publication(ctx, vault_package, update_datacite, update_landingpage, update_moai): """Rule interface for updating the publication of a vault package. 
@@ -1323,10 +1324,33 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp :param update_datacite: Flag that indicates updating DataCite :param update_landingpage: Flag that indicates updating landingpage :param update_moai: Flag that indicates updating MOAI (OAI-PMH) - - :returns: "OK" if all went ok """ - return update_publication(ctx, vault_package, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is no rodsadmin", True) + return + + log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True) + collections = genquery.row_iterator( + "COLL_NAME", + "COLL_NAME like '%%/home/vault-%%' " + "AND META_COLL_ATTR_NAME = '" + constants.UUORGMETADATAPREFIX + "vault_status' " + "AND META_COLL_ATTR_VALUE = '{}'".format(str(constants.vault_package_state.PUBLISHED)), + genquery.AS_LIST, + ctx + ) + + packages_found = False + for collection in collections: + coll_name = collection[0] + if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)): + packages_found = True + output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + log.write(ctx, coll_name + ': ' + output, True) + + if not packages_found: + log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True) + else: + log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True) def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False): diff --git a/publication_troubleshoot.py b/publication_troubleshoot.py new file mode 100644 index 000000000..6ceafe737 --- /dev/null +++ b/publication_troubleshoot.py @@ -0,0 +1,442 @@ +# -*- coding: utf-8 -*- +"""Functions and rules for troubleshooting published data packages.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +__all__ = [ + 'api_batch_troubleshoot_published_data_packages', + 'rule_batch_troubleshoot_published_data_packages' +] + +import json +from datetime import datetime + +import genquery +import requests +import urllib3 + +import datacite +from meta import vault_metadata_matches_schema +from publication import get_publication_config +from util import * + + +def find_full_package_path(ctx, package_name, write_stdout): + """ + Find the full path of a data package based on its short name. + + :param ctx: Combined type of a callback and rei struct + :param package_name: The short name of the data package to find. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: The full path of the data package if found, otherwise None. + """ + try: + query_condition = ( + "COLL_NAME like '%{}%'".format(package_name) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Return full package path if exists + for row in iter: + return row[0] + except Exception as e: + log.write(ctx, "find_full_package_path: An error occurred while executing the query: {}".format(e), write_stdout) + return None + + +def find_data_packages(ctx, write_stdout): + """ + Find all data packages in Retry, Unrecoverable and Unknown status by matching its AVU. 
+ + :param ctx: Combined type of a callback and rei struct + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A list of collection names that have not been processed successfully + """ + user_zone = user.zone(ctx) + + try: + # Get all the vault packages that have org_publication_status in metadata + query_condition = ( + "COLL_NAME like '/{}/home/vault-%' AND " + "META_COLL_ATTR_NAME = '{}publication_status'".format(user_zone, constants.UUORGMETADATAPREFIX) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Collecting only the collection names + return [row[0] for row in iter] + + except Exception as e: + log.write(ctx, "find_data_packages: An error occurred while executing the query: {}".format(e), write_stdout) + return [] + + +def check_print_data_package_system_avus(ctx, data_package, write_stdout): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + This also prints if there are any missing or unexpected results. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A 2-tuple containing boolean results of checking results + """ + extracted_avus = avu.of_coll(ctx, data_package) + results = misc.check_data_package_system_avus(extracted_avus) + + if not results["no_missing_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some missing AVUs in data package <{}> - {}".format(data_package, list(results["missing_avus"])), write_stdout) + + if not results["no_unexpected_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some unexpected AVUs in data package <{}> - {}".format(data_package, list(results["unexpected_avus"])), write_stdout) + + return (results["no_missing_avus"], results["no_unexpected_avus"]) + + +def check_one_datacite_doi_reg(ctx, data_package, doi_name, write_stdout): + try: + doi = get_val_for_attr_with_pub_prefix(ctx, data_package, doi_name) + except ValueError as e: + log.write(ctx, "check_datacite_doi_registration: Error while trying to get {} - {}".format(doi_name, e), write_stdout) + return False + + status_code = datacite.metadata_get(ctx, doi) + return status_code == 200 + + +def check_datacite_doi_registration(ctx, data_package, write_stdout): + """ + Check the registration status of both versionDOI and baseDOI with the DataCite API, + ensuring that both DOIs return a 200 status code, which indicates successful registration. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple of booleans indicating check success or not (base doi check may be None if not relevant). 
+ """ + version_doi_check = check_one_datacite_doi_reg(ctx, data_package, "versionDOI", write_stdout) + + previous_version = '' + try: + previous_version = get_val_for_attr_with_pub_prefix(ctx, data_package, "previous_version") + except Exception: + pass + + if previous_version: + base_doi_check = check_one_datacite_doi_reg(ctx, data_package, "baseDOI", write_stdout) + return version_doi_check, base_doi_check + + return (version_doi_check, None) + + +def get_val_for_attr_with_pub_prefix(ctx, data_package, attribute_suffix): + """ + Retrieves the value given the suffix of the attribute from a data package. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param attribute_suffix: Suffix of the attribute before adding prefix such as "org_publication_" + + :returns: Value of the attribute. + """ + attr = constants.UUORGMETADATAPREFIX + "publication_" + attribute_suffix + return avu.get_attr_val_of_coll(ctx, data_package, attr) + + +def get_landingpage_paths(ctx, data_package, write_stdout): + """Given a data package get what the path and remote url should be""" + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPagePath") + url = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPageUrl") + return file_path, url + + except Exception: + log.write(ctx, "get_landingpage_paths: Could not find landing page for data package: {}".format(data_package), write_stdout) + return '', '' + + +def compare_local_remote_landingpage(ctx, file_path, url, offline, api_call): + """ + Compares file contents between a file in irods and its remote version to verify their integrity. + + :param ctx: Combined type of a callback and rei struct + :param file_path: Path to file in irods + :param url: URL of file on remote + :param offline: Whether to skip requests.get call + :param api_call: Boolean representing whether was called by api and not a script + + :returns: True if the file contents match, False otherwise + """ + write_stdout = not api_call + # Local/irods file + if api_call: + # If called by technicaladmin, only check that the file exists since we don't have access to the contents + return data_object.exists(ctx, file_path) + else: + try: + local_data = data_object.read(ctx, file_path) + except Exception: + log.write(ctx, "compare_local_remote_landingpage: Local file not found at path {}.".format(file_path), write_stdout) + return False + + if offline: + return len(local_data) > 0 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "compare_local_remote_landingpage: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "compare_local_remote_landingpage: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "compare_local_remote_landingpage: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Set encoding to utf-8 for the response text (otherwise will not match local_data) + # response.text is then returned as unicode + response.encoding = 'utf-8' + local_data_uni = local_data.decode("utf-8") + + if local_data_uni == response.text: + return True + + log.write(ctx, "compare_local_remote_landingpage: File contents at irods path <{}> and remote landing page <{}> do not match.".format(file_path, url), write_stdout) + 
return False + + +def check_landingpage(ctx, data_package, offline, api_call): + """ + Checks the integrity of landing page by comparing the contents + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param offline: Whether to skip any checks that require external server access + :param api_call: Boolean of whether this is for an api call version of the troubleshooting script + + :returns: A tuple containing boolean results of checking + """ + irods_file_path, landing_page_url = get_landingpage_paths(ctx, data_package, not api_call) + if len(irods_file_path) == 0 or len(landing_page_url) == 0: + return False + + return compare_local_remote_landingpage(ctx, irods_file_path, landing_page_url, offline, api_call) + + +def check_combi_json(ctx, data_package, publication_config, offline, write_stdout): + """ + Checks the integrity of combi JSON by checking URL and existence of file. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param publication_config: Dictionary of publication config + :param offline: Whether to skip any checks that require external server access + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple containing boolean results of checking + """ + # Check that the combi json in irods exists + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "combiJsonPath") + except Exception: + pass + exists = data_object.exists(ctx, file_path) + if not exists: + log.write(ctx, "check_combi_json: combi JSON file in irods does not exist: {}".format(file_path), write_stdout) + return False + + if offline: + return True + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Get the version doi + version_doi = '' + try: + version_doi = get_val_for_attr_with_pub_prefix(ctx, data_package, "versionDOI") + except Exception: + pass + url = "https://{}/oai/oai?verb=GetRecord&metadataPrefix=oai_datacite&identifier=oai:{}".format(publication_config["publicVHost"], version_doi) + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "check_combi_json: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "check_combi_json: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "check_combi_json: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Look at the first few parts of the response for signs of error. 
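OAI-PMH reports a missing record inside an HTTP 200 response rather than through the status code, which is why the response body itself is inspected here. A condensed sketch of the same lookup (the helper name and timeout are assumptions; the URL template is taken from this function):

import requests


def combi_json_visible_in_oai(public_vhost, version_doi, timeout=30):
    """Return True when the OAI-PMH endpoint serves a record for this version DOI."""
    url = ("https://{}/oai/oai?verb=GetRecord&metadataPrefix=oai_datacite"
           "&identifier=oai:{}".format(public_vhost, version_doi))
    try:
        response = requests.get(url, verify=False, timeout=timeout)
    except requests.exceptions.RequestException:
        return False

    if response.status_code != 200:
        return False

    # A missing record comes back as a 200 response containing an idDoesNotExist error.
    return "idDoesNotExist" not in response.text[:5000]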
+ if "idDoesNotExist" in response.text[:5000]: + log.write(ctx, "check_combi_json: combiJson not found in oai for data package <{}>".format(data_package), write_stdout) + return False + + return True + + +def print_troubleshoot_result(ctx, data_package, result, datacite_check): + """Print the result of troubleshooting one package in human-friendly format""" + pass_all_tests = all(result.values()) + + log.write(ctx, "Results for: {}".format(data_package), True) + if pass_all_tests: + log.write(ctx, "Package passed all tests.", True) + else: + log.write(ctx, "Package FAILED one or more tests:", True) + log.write(ctx, "Schema matches: {}".format(result['schema_check']), True) + log.write(ctx, "All expected AVUs exist: {}".format(result['no_missing_AVUs_check']), True) + log.write(ctx, "No unexpected AVUs: {}".format(result['no_unexpected_AVUs_check']), True) + + if datacite_check: + log.write(ctx, "Version DOI matches: {}".format(result['versionDOI_check']), True) + if 'baseDOI_check' in result: + log.write(ctx, "Base DOI matches: {}".format(result['baseDOI_check']), True) + + log.write(ctx, "Landing page matches: {}".format(result['landingPage_check']), True) + log.write(ctx, "Combined JSON matches: {}".format(result['combiJson_check']), True) + + log.write(ctx, "", True) + + +def collect_troubleshoot_data_packages(ctx, requested_package, write_stdout): + data_packages = [] + + if requested_package == 'None': + # Retrieve all data packages + all_packages = find_data_packages(ctx, write_stdout) + if not all_packages: + log.write(ctx, "collect_troubleshoot_data_packages: No packages found.", write_stdout) + return None + + data_packages = all_packages + else: + # Get full path of the given package + full_package_path = find_full_package_path(ctx, requested_package, write_stdout) + + if not full_package_path: + log.write(ctx, "collect_troubleshoot_data_packages: Data package '{}' cannot be found.".format(requested_package), write_stdout) + return None + + data_packages.append(full_package_path) + + return data_packages + + +def batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, api_call, check_datacite): + """ + Troubleshoots published data packages. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + :param api_call: Boolean of whether this is run by a script or api test. + :param check_datacite: Boolean representing whether to do the datacite checks + + :returns: A dictionary of dictionaries providing the results of the job. 
+ """ + write_stdout = not api_call + # Check permissions - rodsadmin only + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is not rodsadmin", write_stdout) + return {} + + data_packages = collect_troubleshoot_data_packages(ctx, requested_package, write_stdout) + if not data_packages: + return {} + schema_cache = {} + results = {} + + # Troubleshooting + for data_package in data_packages: + log.write(ctx, "Troubleshooting data package: {}".format(data_package), write_stdout) + result = {} + # Cannot check the metadata as technicaladmin + if not api_call: + schema_check_dict = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-publications", write_stdout) + result['schema_check'] = schema_check_dict['match_schema'] if schema_check_dict else False + + result['no_missing_AVUs_check'], result['no_unexpected_AVUs_check'] = check_print_data_package_system_avus(ctx, data_package, write_stdout) + + # Only check datacite if enabled + if check_datacite: + result['versionDOI_check'], base_doi_check = check_datacite_doi_registration(ctx, data_package, write_stdout) + if base_doi_check is not None: + result['baseDOI_check'] = base_doi_check + + result['landingPage_check'] = check_landingpage(ctx, data_package, offline, api_call) + publication_config = get_publication_config(ctx) + result['combiJson_check'] = check_combi_json(ctx, data_package, publication_config, offline, write_stdout) + + results[data_package] = result + + if not api_call: + print_troubleshoot_result(ctx, data_package, result, check_datacite) + + if log_file: + log_loc = "/var/lib/irods/log/troubleshoot_publications.log" + with open(log_loc, "a") as writer: + writer.writelines("Batch run date and time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) + writer.writelines('\n') + writer.writelines("Troubleshooting data package: {}".format(data_package)) + writer.writelines('\n') + json.dump(result, writer) + writer.writelines('\n') + + return results + + +@api.make() +def api_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline): + """ + Wrapper for the batch script for troubleshooting published data packages. + Runs a subset of the tests since "technicaladmin" is usually more restricted than "rods". + + :param ctx: Combined type of a callback and rei struct + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + + :returns: A dictionary of dictionaries providing the results of the job. + """ + return batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, True, False) + + +@rule.make(inputs=[0, 1, 2, 3], outputs=[]) +def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, no_datacite): + """ + Troubleshoots published data packages. + + Prints results of the following checks: + 1. Metadata schema compliance. + 2. Presence and correctness of expected AVUs. + 3. Registration with Data Cite. + 4. File integrity of landing page and combi JSON files. + + Operates on either a single specified package or all published packages, depending on the input. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. 
+ :param log_file: A string boolean representing to write results in log. + :param offline: A string boolean representing whether to perform all checks without connecting to external servers. + :param no_datacite: A string boolean representing whether to skip the datacite checks + """ + offline = offline == "True" + log_file = log_file == "True" + check_datacite = no_datacite == "False" + + batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, False, check_datacite) diff --git a/revisions.py b/revisions.py index 07553652b..e1e3d4d8a 100644 --- a/revisions.py +++ b/revisions.py @@ -361,6 +361,10 @@ def rule_revision_batch(ctx, verbose, balance_id_min, balance_id_max, batch_size minimum_timestamp = int(time.time() - config.async_revision_delay_time) + # Remove revision creation AVUs from deleted data objects. + # This makes it easier to monitor the number of data objects waiting for revision creation. + remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose) + # Get list of up to batch size limit of data objects (in research space) scheduled for revision, taking into account # modification time. log.write(ctx, "verbose = {}".format(verbose)) @@ -1054,3 +1058,28 @@ def memory_limit_exceeded(rss_limit): """ rss_limit = int(rss_limit) return rss_limit and memory_rss_usage() > rss_limit + + +def remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose): + """ + Removes revision creation AVUs from deleted data objects [marked with 'org_revision_scheduled' metadata]. + + :param ctx: Combined type of a callback and rei struct + :param print_verbose: Whether to log verbose messages for troubleshooting (Boolean) + """ + revision_avu_name = constants.UUORGMETADATAPREFIX + "revision_scheduled" + + iter = genquery.row_iterator( + "COLL_NAME, DATA_NAME", + "COLL_NAME like '%{}/trash/home/%' AND META_DATA_ATTR_NAME = '{}'".format(user.zone(ctx), revision_avu_name), + genquery.AS_LIST, ctx + ) + + for coll_name, data_name in iter: + path = coll_name + '/' + data_name + try: + avu.rmw_from_data(ctx, path, revision_avu_name, "%") # use wildcard cause rm_from_data causes problems + if print_verbose: + log.write(ctx, 'Removed revision creation AVUs from data object: {}'.format(path)) + except Exception as e: + log.write(ctx, "Error processing data object {}: {}".format(path, str(e))) diff --git a/rules_uu.cfg.template b/rules_uu.cfg.template index 50a1b863d..8524920ff 100644 --- a/rules_uu.cfg.template +++ b/rules_uu.cfg.template @@ -36,7 +36,6 @@ eus_api_tls_verify = enable_deposit = enable_open_search = -enable_intake = enable_datarequest = yoda_portal_fqdn = diff --git a/schema_transformation.py b/schema_transformation.py index 35bc35dd9..77299ada3 100644 --- a/schema_transformation.py +++ b/schema_transformation.py @@ -19,7 +19,6 @@ import session_vars import meta -import meta_form import schema import schema_transformations from util import * @@ -66,6 +65,7 @@ def api_transform_metadata(ctx, coll, keep_metadata_backup=True): execute_transformation(ctx, metadata_path, transform, keep_metadata_backup) else: return api.Error('no_metadata', 'No metadata file found') + return None def get(ctx, metadata_path, metadata=None): @@ -197,7 +197,7 @@ def rule_batch_transform_vault_metadata(rule_args, callback, rei): vault_package = '/'.join(path_parts[:5]) metadata_path = meta.get_latest_vault_metadata_path(callback, vault_package) log.write(callback, "[METADATA] Checking whether metadata needs to be transformed: " + metadata_path) - if metadata_path 
!= '': + if metadata_path != '': transform = get(callback, metadata_path) if transform is not None: log.write(callback, "[METADATA] Executing transformation for: " + metadata_path) @@ -376,9 +376,7 @@ def html(f): re.split('\n{2,}', f.__doc__))) # Remove docstring. - description = re.sub('((:param).*)|((:returns:).*)', ' ', description) - - return description + return re.sub('((:param).*)|((:returns:).*)', ' ', description) @rule.make(inputs=[], outputs=[0]) @@ -394,8 +392,8 @@ def rule_batch_vault_metadata_schema_report(ctx): the metadata matches the JSON schema). match_schema only has a meaning if a metadata schema could be found. """ - results = dict() - schema_cache = dict() + results = {} + schema_cache = {} # Find all vault collections iter = genquery.row_iterator( @@ -405,41 +403,13 @@ def rule_batch_vault_metadata_schema_report(ctx): genquery.AS_LIST, ctx) for row in iter: - coll_name = row[0] - metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name) - - if metadata_path == '' or metadata_path is None: - log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found." - % (coll_name)) - continue - try: - metadata = jsonutil.read(ctx, metadata_path) - except Exception as exc: - log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s." - % (coll_name, metadata_path, str(exc))) + coll_name = row[0] + result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True) + if result: + results[coll_name] = result + except Exception as e: + log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e))) continue - # Determine schema - schema_id = schema.get_schema_id(ctx, metadata_path) - schema_shortname = schema_id.split("/")[-2] - - # Retrieve schema and cache it for future use - schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id) - if schema_shortname in schema_cache: - schema_contents = schema_cache[schema_shortname] - else: - schema_contents = jsonutil.read(ctx, schema_path) - schema_cache[schema_shortname] = schema_contents - - # Check whether metadata matches schema and log any errors - error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents) - match_schema = len(error_list) == 0 - if not match_schema: - log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" % - (metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]))) - - # Update results - results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema} - return json.dumps(results) diff --git a/schema_transformations.py b/schema_transformations.py index af6d14ebb..5e6bd9ad9 100644 --- a/schema_transformations.py +++ b/schema_transformations.py @@ -6,6 +6,8 @@ import re +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_researcher_id, correctify_scopus + import meta from util import * @@ -128,21 +130,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in creator.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. 
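Each identifier scheme in this transformation follows the same validate-then-correct pattern: leave well-formed values alone, try the matching correctify_* helper otherwise, and warn when no automatic repair is possible. A simplified restatement of that pattern (the helper name fix_identifier and the plain-print fallback are illustrative, not part of the rules):

import re

from schema_transformations_utils import correctify_orcid

# Same ORCID pattern as used in the transformation; a raw string avoids escape warnings.
ORCID_URL_RE = r"^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$"


def fix_identifier(value, pattern, correctify):
    """Return a corrected identifier, or the original value when no repair is possible."""
    if re.search(pattern, value):
        return value                  # Already well formed, leave untouched.
    corrected = correctify(value)
    if corrected is None:
        print("Warning: could not correct identifier %s, it needs to be fixed manually." % value)
        return value
    return corrected


# fix_identifier('0000-0002-1825-0097', ORCID_URL_RE, correctify_orcid)
# returns 'https://orcid.org/0000-0002-1825-0097'.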
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." % (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. + if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -164,21 +189,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in contributor.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. 
if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." % (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. + if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -702,36 +750,3 @@ def get(src_id, dst_id): x = transformations.get(src_id) return None if x is None else x.get(dst_id) - - -def correctify_orcid(org_orcid): - """Correct illformatted ORCID.""" - # Get rid of all spaces. - orcid = org_orcid.replace(' ', '') - - # Upper-case X. - orcid = org_orcid.replace('x', 'X') - - # The last part should hold a valid id like eg: 1234-1234-1234-123X. - # If not, it is impossible to correct it to the valid orcid format - orcs = orcid.split('/') - if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): - # Return original value. - return org_orcid - - return "https://orcid.org/{}".format(orcs[-1]) - - -def correctify_researcher_id(org_researcher_id): - """Correct illformatted ResearcherID.""" - # Get rid of all spaces. 
- researcher_id = org_researcher_id.replace(' ', '') - - # The last part should hold a valid id like eg: A-1234-1234 - # If not, it is impossible to correct it to the valid ResearcherID format - orcs = researcher_id.split('/') - if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): - # Return original value. - return org_researcher_id - - return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/schema_transformations_utils.py b/schema_transformations_utils.py new file mode 100644 index 000000000..d5cf58f68 --- /dev/null +++ b/schema_transformations_utils.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +"""JSON schema transformation utility functions.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import re + + +def correctify_orcid(org_orcid): + """Correct illformatted ORCID.""" + # Get rid of all spaces. + orcid = org_orcid.replace(' ', '') + + # Upper-case X. + orcid = orcid.replace('x', 'X') + + # The last part should hold a valid id like eg: 1234-1234-1234-123X. + # If not, it is impossible to correct it to the valid orcid format + orcs = orcid.split('/') + if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): + return None + + return "https://orcid.org/{}".format(orcs[-1]) + + +def correctify_scopus(org_scopus): + """Correct illformatted Scopus.""" + # Get rid of all spaces. + new_scopus = org_scopus.replace(' ', '') + + if not re.search("^\d{1,11}$", new_scopus): + return None + + return new_scopus + + +def correctify_isni(org_isni): + """Correct ill-formatted ISNI.""" + # Remove all spaces. + new_isni = org_isni.replace(' ', '') + + # Upper-case X. + new_isni = new_isni.replace('x', 'X') + + # The last part should hold a valid id like eg: 123412341234123X. + # If not, it is impossible to correct it to the valid isni format + new_isni = new_isni.split('/') + if not re.search("^[0-9]{15}[0-9X]$", new_isni[-1]): + return None + + return "https://isni.org/isni/{}".format(new_isni[-1]) + + +def correctify_researcher_id(org_researcher_id): + """Correct illformatted ResearcherID.""" + # Get rid of all spaces. + researcher_id = org_researcher_id.replace(' ', '') + + # The last part should hold a valid id like eg: A-1234-1234 + # If not, it is impossible to correct it to the valid ResearcherID format + orcs = researcher_id.split('/') + if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): + # Return original value. 
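A rough usage sketch for the helpers in this new module, assuming it is importable as schema_transformations_utils; the identifier values are made up and only chosen to satisfy the formats checked above:

from schema_transformations_utils import (
    correctify_isni, correctify_orcid, correctify_researcher_id, correctify_scopus)

# Repairable identifiers are normalised to their canonical URL or numeric form.
assert correctify_orcid('0000-0002-1825-009x ') == 'https://orcid.org/0000-0002-1825-009X'
assert correctify_scopus('5719 2590 258') == '57192590258'
assert correctify_isni('0000 0001 2281 955X') == 'https://isni.org/isni/000000012281955X'
assert correctify_researcher_id('A-1234-5678') == 'https://www.researcherid.com/rid/A-1234-5678'

# Unrepairable values: the ORCID, Scopus and ISNI helpers return None, while
# correctify_researcher_id keeps returning the original value, as the return just below shows.
assert correctify_orcid('not-an-orcid') is None
assert correctify_researcher_id('bogus') == 'bogus'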
+ return org_researcher_id + + return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/setup.cfg b/setup.cfg index ab6ee494a..a9ef75804 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,4 +5,4 @@ strictness=short docstring_style=sphinx max-line-length=127 exclude=__init__.py,tools,tests/env/ -application-import-names=avu,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils +application-import-names=avu,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils diff --git a/tests/conftest.py b/tests/conftest.py index 70a2520b6..a13b3b9da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,6 @@ datarequest = False deposit = False -intake = False archive = False smoke = False skip_api = False @@ -45,7 +44,6 @@ def pytest_addoption(parser): parser.addoption("--datarequest", action="store_true", default=False, help="Run datarequest tests") parser.addoption("--deposit", action="store_true", default=False, help="Run deposit tests") - parser.addoption("--intake", action="store_true", default=False, help="Run intake tests") parser.addoption("--archive", action="store_true", default=False, help="Run vault archive tests") parser.addoption("--no-env-csrf", action="store_true", default=False, help="Do not get CSRF token from environment (this is enabled by default for smoke tests)") parser.addoption("--smoke", action="store_true", default=False, help="Run Smoke tests") @@ -59,7 +57,6 @@ def pytest_addoption(parser): def pytest_configure(config): config.addinivalue_line("markers", "datarequest: Run datarequest tests") config.addinivalue_line("markers", "deposit: Run deposit tests") - config.addinivalue_line("markers", "intake: Run intake tests") config.addinivalue_line("markers", "archive: Run vault archive tests") config.addinivalue_line("markers", "all: Run all tests") config.addinivalue_line("markers", "ui: UI test") @@ -86,10 +83,9 @@ def pytest_configure(config): global verbose_test verbose_test = config.getoption("--verbose-test") - global datarequest, deposit, intake, archive, smoke, run_all, skip_api, skip_ui, no_env_csrf + global datarequest, deposit, archive, smoke, run_all, skip_api, skip_ui, no_env_csrf datarequest = config.getoption("--datarequest") deposit = config.getoption("--deposit") - intake = config.getoption("--intake") archive = config.getoption("--archive") smoke = config.getoption("--smoke") skip_ui = config.getoption("--skip-ui") @@ -109,7 +105,6 @@ def 
pytest_configure(config): if run_all: datarequest = True deposit = True - intake = True archive = True # Store cookies for each user. @@ -131,10 +126,6 @@ def pytest_bdd_apply_tag(tag, function): marker = pytest.mark.skip(reason="Skip deposit") marker(function) return True - elif tag == 'intake' and not intake: - marker = pytest.mark.skip(reason="Skip intake") - marker(function) - return True elif tag == 'archive' and not archive: marker = pytest.mark.skip(reason="Skip vault archive") marker(function) @@ -228,7 +219,7 @@ def api_request(user, request, data, timeout=10): # Retrieve user cookies. csrf, session = user_cookies[user] - # Disable unsecure connection warning. + # Disable insecure connection warning. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Replace zone name with zone name from environment configuration. @@ -286,7 +277,7 @@ def post_form_data(user, request, files): # Retrieve user cookies. csrf, session = user_cookies[user] - # Disable unsecure connection warning. + # Disable insecure connection warning. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Make POST request. diff --git a/tests/features/api/api_deposit_open.feature b/tests/features/api/api_deposit_open.feature index a120c5c6a..8782e7f2d 100644 --- a/tests/features/api/api_deposit_open.feature +++ b/tests/features/api/api_deposit_open.feature @@ -60,7 +60,7 @@ Feature: Deposit API (open) And deposit exists And deposit is archived And user viewer is authenticated - And as viewer the Yoda browse collections API is queried with # Workaround for https://github.com/pytest-dev/pytest-bdd/issues/689 + And the Yoda browse collections API is queried with Then the response status code is "200" And the browse result contains deposit diff --git a/tests/features/api/api_deposit_restricted.feature b/tests/features/api/api_deposit_restricted.feature index 3155de18e..573957e74 100644 --- a/tests/features/api/api_deposit_restricted.feature +++ b/tests/features/api/api_deposit_restricted.feature @@ -49,7 +49,7 @@ Feature: Deposit API (restricted) And deposit exists And deposit is archived And user viewer is authenticated - And as viewer the Yoda browse collections API is queried with # Workaround for https://github.com/pytest-dev/pytest-bdd/issues/689 + And the Yoda browse collections API is queried with Then the response status code is "200" And the browse result does not contain deposit diff --git a/tests/features/api/api_intake.feature b/tests/features/api/api_intake.feature deleted file mode 100644 index 7f3e1e007..000000000 --- a/tests/features/api/api_intake.feature +++ /dev/null @@ -1,195 +0,0 @@ -@api @intake -Feature: Intake API - - Scenario Outline: Find all studies a user is involved with - Given user is authenticated - And the Yoda intake list studies API is queried - Then the response status code is "200" - And study is returned - - Examples: - | user | study | - | researcher | initial | - | researcher | test | - | datamanager | initial | - | datamanager | test | - - - Scenario Outline: Find all studies a user is datamanager of - Given user is authenticated - And the Yoda intake list datamanager studies API is queried - Then the response status code is "200" - And study is returned - - Examples: - | user | study | - | datamanager | initial | - | datamanager | test | - - - Scenario Outline: Get the total count of all files in a collection - Given user is authenticated - And the Yoda intake count total files API is queried with collection - Then the response status code is 
"200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Get list of all unrecognized and unscanned files - Given user is authenticated - And the Yoda intake list unrecognized files API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/yoda/grp-intake-initial | - | researcher | /tempZone/yoda/grp-intake-initial | - - - Scenario Outline: Get list of all datasets - Given user is authenticated - And the Yoda intake list datasets API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Scan for and recognize datasets in study intake area - Given user is authenticated - And the Yoda intake scan for datasets API is queried with collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | - | datamanager | /tempZone/home/grp-intake-initial | - | researcher | /tempZone/home/grp-intake-initial | - - - Scenario Outline: Lock dataset in study intake area - Given user is authenticated - And the Yoda intake lock API is queried with dataset id and collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot lock non-existent dataset - Given user is authenticated - And the Yoda intake lock API is queried with dataset id and collection - # Errors during locking individual datasets do not result in an error status code. This test - # codifies current behaviour of this API endpoint. - Then the response status code is "200" - And the result is equivalent to {"error_dataset_ids": ["3y\ndiscount\nB99999\nRaw"], "error_msg": "Something went wrong locking datasets", "proc_status": "NOK"} - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B99999*Raw | - - - Scenario Outline: Unlock dataset in study intake area - Given user is authenticated - And the Yoda intake unlock API is queried with dataset id and collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot unlock non-existent dataset - Given user is authenticated - And the Yoda intake unlock API is queried with dataset id and collection - # Errors during unlocking individual datasets do not result in an error status code. This test - # codifies current behaviour of this API endpoint. 
- Then the response status code is "200" - And the result is equivalent to {"error_dataset_ids": ["3y\ndiscount\nB99999\nRaw"], "error_msg": "Something went wrong unlocking datasets", "proc_status": "NOK"} - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B99999*Raw | - - - Scenario Outline: Get all details for a dataset - Given user is authenticated - And the Yoda intake dataset get details API is queried with dataset id and collection - Then the response status code is "200" - # And ... - - Examples: - | user | collection | dataset_id | - | datamanager | /tempZone/home/grp-intake-initial | 3y*discount*B00000*Raw | - | researcher | /tempZone/home/grp-intake-initial | 3y*discount*B00001*Raw | - - - Scenario Outline: Add a comment to a dataset - Given user is authenticated - And the Yoda intake dataset add comment API is queried with dataset id , study id and comment - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | comment | dataset_id | - | datamanager | grp-intake-initial | comment1 | 3y*discount*B00000*Raw | - | researcher | grp-intake-initial | comment2 | 3y*discount*B00001*Raw | - - - Scenario Outline: Cannot add comment to nonexistent dataset - Given user is authenticated - And the Yoda intake dataset add comment API is queried with dataset id , study id and comment - # Adding a comment to a nonexistent dataset currently does not result in an error status code. This test - # codifies current behaviour of this API endpoint. - Then the response status code is "200" - And the result is equivalent to {"error_msg": "Dataset does not exist", "proc_status": "NOK"} - - Examples: - | user | study_id | comment | dataset_id | - | datamanager | grp-intake-initial | comment1 | 3y*discount*B99999*Raw | - - - Scenario Outline: Get vault dataset related counts for reporting for a study - Given user is authenticated - And the Yoda intake report vault dataset counts per study API is queried with study id - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | - | datamanager | grp-intake-initial | - - - Scenario Outline: Get aggregated vault dataset info for reporting for a study - Given user is authenticated - And the Yoda intake report vault aggregated info API is queried with study id - Then the response status code is "200" - # And ... - - Examples: - | user | study_id | - | datamanager | grp-intake-initial | - - - Scenario Outline: Get vault data for export of a study - Given user is authenticated - And the Yoda intake report export study data API is queried with study id - Then the response status code is "200" - # And ... 
- - Examples: - | user | study_id | - | datamanager | grp-intake-initial | diff --git a/tests/features/api/api_resources.feature b/tests/features/api/api_resources.feature index 52ab3356e..ba3c35432 100644 --- a/tests/features/api/api_resources.feature +++ b/tests/features/api/api_resources.feature @@ -21,7 +21,7 @@ Feature: Resources API And only 1 group is found Examples: - | user | group | + | user | group | | researcher | research-core-1 | | datamanager | research-core-1 | @@ -40,7 +40,6 @@ Feature: Resources API | datamanager | deposit-pilot1 | - @intake Scenario Outline: Get paginated result when searching for one specific intake / grp group Given user is authenticated And the Yoda resources API is queried for a paginated range of research groups filtered on group @@ -82,36 +81,35 @@ Feature: Resources API | datamanager | deposit-pilot | - @intake Scenario Outline: Get a full year of storage data for intake group Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | | researcher | research-initial | | datamanager | research-initial | - + @deposit Scenario Outline: Get a full year of differentiated storage data starting from current month and look back one year Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | | researcher | research-deposit-test | | datamanager | research-deposit-test | - @intake + Scenario Outline: Get a full year of differentiated storage data starting from current month and look back one year Given user is authenticated And the Yoda resources full year differentiated group data API is queried with - Then the response status code is "200" - And storage data for group is found + Then the response status code is "200" + And storage data for group is found Examples: | user | group | @@ -136,8 +134,8 @@ Feature: Resources API Scenario Outline: Collect storage stats for all twelve months based upon categories a user is datamanager of Given user is authenticated And the Yoda resources monthly category stats API is queried - Then the response status code is "200" - And storage data for export is found + Then the response status code is "200" + And storage data for export is found Examples: | user | @@ -152,7 +150,7 @@ Feature: Resources API And group data are sorted by in order Examples: - | user | sort_on | sort_order | + | user | sort_on | sort_order | | researcher | name | asc | | researcher | name | desc | | researcher | size | asc | diff --git a/tests/features/api/api_vault.feature b/tests/features/api/api_vault.feature index 0039a709b..4ed3d018b 100644 --- a/tests/features/api/api_vault.feature +++ b/tests/features/api/api_vault.feature @@ -98,6 +98,17 @@ Feature: Vault API | /tempZone/home/vault-default-2 | | /tempZone/home/vault-core-2 | | /tempZone/home/vault-default-3 | + + + Scenario Outline: Published vault package passes troubleshooting script checks + Given user technicaladmin is authenticated + And data package exists in + Then data package in passes troubleshooting script checks + + Examples: + | vault | + | /tempZone/home/vault-default-2 | + | /tempZone/home/vault-default-3 | Scenario 
Outline: Vault preservable formats lists diff --git a/tests/features/ui/ui_admin.feature b/tests/features/ui/ui_admin.feature index cb51ef7af..28c6bebf4 100644 --- a/tests/features/ui/ui_admin.feature +++ b/tests/features/ui/ui_admin.feature @@ -112,3 +112,28 @@ Feature: Admin UI Examples: | user | text | | functionaladminpriv | TemporaryTerms | + + + Scenario Outline: Admin user deletes file format + Given user is logged in + When module "admin" is shown + And the user selects the file format list to delete + And the user clicks the Delete file format list button + Then the success message for deleting the file format list is shown + + Examples: + | user | filename | + | functionaladminpriv | 4TU.json | + | functionaladminpriv | DANS.json | + + + Scenario Outline: Admin user uploads new file format + Given user is logged in + When module "admin" is shown + And the user clicks the Upload file format list + Then the success message of uploading a file format list is shown + + Examples: + | user | filename | + | functionaladminpriv | 4TU.json | + | functionaladminpriv | DANS.json | diff --git a/tests/features/ui/ui_intake.feature b/tests/features/ui/ui_intake.feature deleted file mode 100644 index fea264427..000000000 --- a/tests/features/ui/ui_intake.feature +++ /dev/null @@ -1,43 +0,0 @@ -@ui @intake -Feature: Intake UI - - @fail - Scenario: Intake scan only and find datasets and unrecognized files - Given user datamanager is logged in - And module "intake" is shown - When activate study "test" - And total datasets is "0" - When activate study "initial" - And total datasets is "0" - And unscanned files are present - When scanned for datasets - Then scan button is disabled - When scanning for datasets is successful - And total datasets is "3" - And unrecognized files are present - - When click for details of first dataset row - - When add "COMMENTS" to comment field and press comment button - - When check first dataset for locking - And lock and unlock buttons are "enabled" - - When uncheck first dataset for locking - And lock and unlock buttons are "disabled" - - When check all datasets for locking - - Then click lock button - And wait for all datasets to be in locked state successfully - And wait for all datasets to be in frozen state - And wait for frozen sets to be added to vault - - Scenario: Intake reporting - Given user datamanager is logged in - And module "intake" is shown - - When open intake reporting area - When check reporting result - When export all data and download file - When return to intake area diff --git a/tests/features/ui/ui_statistics.feature b/tests/features/ui/ui_statistics.feature index 516e7d47f..843880d77 100644 --- a/tests/features/ui/ui_statistics.feature +++ b/tests/features/ui/ui_statistics.feature @@ -28,7 +28,6 @@ Feature: Statistics UI | datamanager | deposit-pilot | - @intake Scenario Outline: Viewing storage details of a intake / grp group Given user is logged in And module "stats" is shown @@ -55,7 +54,6 @@ Feature: Statistics UI | datamanager | test-automation | - @intake Scenario Outline: Viewing intake category storage details as a technicaladmin or datamanager Given user is logged in When module "stats" is shown diff --git a/tests/files/file_formats/4TU.json b/tests/files/file_formats/4TU.json new file mode 100644 index 000000000..80686335f --- /dev/null +++ b/tests/files/file_formats/4TU.json @@ -0,0 +1,35 @@ +{ + "name": "4TU Preferred formats", + "help": "Checks if the files in the data folder and subfolders comply with the 4TU Centre for 
Research Data guidelines for preferred file formats as per 1-Aug-2019. Their guidelines states that usage of the preferred file formats is of essential importance in order to ensure that the research data will remain usable in the future. For more information see https://researchdata.4tu.nl/. Disclaimer: Please note that Yoda currently deducts the file format from the filename. It does not inspect the file content.", + "advice": "For files that do not comply with preferred formats, we recommend that you include in the data package a specification of the file format. Should you want to reference an external specification document then try to find a sustainable link, ideally use a DOI. If feasible, also include in a separate folder of your data package a copy of the file transformed into of the preferred formats. Please consult a datamanager or consult the Research Support desk (see https://www.uu.nl/rdm) in case you need any assistance.", + "formats": [ + "txt", + "xml", + "html", + "pdf", + "json", + "pdb", + "ent", + "brk", + "xyz", + "csv", + "jpg", + "jpeg", + "tif", + "tiff", + "png", + "svg", + "gml", + "kml", + "kmz", + "shp", + "shx", + "dbf", + "nc", + "wav", + "zip", + "tar", + "gzip", + "7z" + ] +} diff --git a/tests/files/file_formats/DANS.json b/tests/files/file_formats/DANS.json new file mode 100644 index 000000000..31a274f40 --- /dev/null +++ b/tests/files/file_formats/DANS.json @@ -0,0 +1,49 @@ +{ + "name": "DANS Preferred formats", + "help": "Checks if the files in the data folder and subfolders comply with the DANS guidelines for preferred file formats as per 1-Aug-2019. Their guidelines states that the preferred file formats offer the best long-term guarantees in terms of usability, accessibility and sustainability. For more information see https://dans.knaw.nl/. Disclaimer: Please note that Yoda currently deducts the file format from the filename. It does not inspect the file content.", + "advice": "For files that do not comply with preferred formats, we recommend that you include in the data package a specification of the file format. Should you want to reference an external specification document then try to find a sustainable link, ideally use a DOI. If feasible, also include in a separate folder of your data package a copy of the file transformed into of the preferred formats. 
Please consult a datamanager or consult the Research Support desk (see https://www.uu.nl/rdm) in case you need any assistance.", + "formats": [ + "pdf", + "odt", + "txt", + "xml", + "html", + "css", + "xslt", + "js", + "es", + "ods", + "csv", + "sql", + "siard", + "por", + "dta", + "jpg", + "jpeg", + "tif", + "tiff", + "png", + "jp2", + "dcm", + "svg", + "bwf", + "mxf", + "mka", + "flac", + "mxf", + "mkv", + "dxf", + "gml", + "mif", + "mid", + "asc", + "obj", + "ply", + "x3d", + "dae", + "rdf", + "trig", + "ttl", + "nt" + ] +} diff --git a/tests/requirements.txt b/tests/requirements.txt index ca93ffea6..6caab6ffa 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,7 +3,7 @@ requests==2.32.2 selenium==4.21.0 splinter==0.21.0 pytest-splinter==3.3.2 -pytest_bdd==7.2.0 +pytest_bdd==7.3.0 pytest==8.2.2 deepdiff==6.6.1 pyperclip==1.9.0 diff --git a/tests/step_defs/api/common_vault.py b/tests/step_defs/api/common_vault.py index 2cfa8fa55..9b2706221 100644 --- a/tests/step_defs/api/common_vault.py +++ b/tests/step_defs/api/common_vault.py @@ -174,6 +174,21 @@ def data_package_status(user, vault, data_package, status): raise AssertionError() +@then(parsers.parse('data package in {vault} passes troubleshooting script checks')) +def api_vault_batch_troubleshoot(user, vault, data_package): + http_status, result = api_request( + user, + "batch_troubleshoot_published_data_packages", + {"requested_package": data_package, "log_file": True, "offline": True} + ) + assert http_status == 200 + data = result['data'] + assert len(data) == 1 + # Confirm that all checks passed for this data package + for checks in data.values(): + assert all(checks.values()) + + @then('preservable formats lists are returned') def preservable_formats_lists(api_response): http_status, body = api_response diff --git a/tests/step_defs/api/test_api_deposit.py b/tests/step_defs/api/test_api_deposit.py index e4665cb6e..621dcc25c 100644 --- a/tests/step_defs/api/test_api_deposit.py +++ b/tests/step_defs/api/test_api_deposit.py @@ -143,13 +143,12 @@ def data_access_restriction_restricted(user, deposit_name): ) -# Workaround for https://github.com/pytest-dev/pytest-bdd/issues/689 -@given(parsers.parse("as viewer the Yoda browse collections API is queried with {collection}"), target_fixture="api_response") -def api_browse_folder(collection): +@given(parsers.parse("the Yoda browse collections API is queried with {collection}"), target_fixture="api_response") +def api_browse_collections(user, collection): return api_request( - "viewer", - "browse_folder", - {"coll": collection} + user, + "browse_collections", + {"coll": collection, "sort_order": "desc"} ) diff --git a/tests/step_defs/api/test_api_intake.py b/tests/step_defs/api/test_api_intake.py deleted file mode 100644 index b26223004..000000000 --- a/tests/step_defs/api/test_api_intake.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding=utf-8 -"""Intake API feature tests.""" - -__copyright__ = 'Copyright (c) 2020-2022, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import json - -from deepdiff import DeepDiff -from pytest_bdd import ( - given, - parsers, - scenarios, - then, -) - -from conftest import api_request - -scenarios('../../features/api/api_intake.feature') - - -@given('the Yoda intake list studies API is queried', target_fixture="api_response") -def api_intake_list_studies(user): - return api_request( - user, - "intake_list_studies", - {} - ) - - -@given('the Yoda intake list datamanager studies API is queried', target_fixture="api_response") -def 
api_intake_list_dm_studies(user): - return api_request( - user, - "intake_list_dm_studies", - {} - ) - - -@given(parsers.parse("the Yoda intake count total files API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_count_total_files(user, collection): - return api_request( - user, - "intake_count_total_files", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake list unrecognized files API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_list_unrecognized_files(user, collection): - return api_request( - user, - "intake_list_unrecognized_files", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake list datasets API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_list_datasets(user, collection): - return api_request( - user, - "intake_list_datasets", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake scan for datasets API is queried with collection {collection}"), target_fixture="api_response") -def api_intake_scan_for_datasets(user, collection): - return api_request( - user, - "intake_scan_for_datasets", - {"coll": collection} - ) - - -@given(parsers.parse("the Yoda intake lock API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_lock_dataset(user, dataset_id, collection): - return api_request( - user, - "intake_lock_dataset", - {"path": collection, "dataset_ids": dataset_id.replace("*", "\n")} - ) - - -@given(parsers.parse("the Yoda intake unlock API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_unlock_dataset(user, dataset_id, collection): - return api_request( - user, - "intake_unlock_dataset", - {"path": collection, "dataset_ids": dataset_id.replace("*", "\n")} - ) - - -@given(parsers.parse("the Yoda intake dataset get details API is queried with dataset id {dataset_id} and collection {collection}"), target_fixture="api_response") -def api_intake_dataset_get_details(user, dataset_id, collection): - return api_request( - user, - "intake_dataset_get_details", - {"coll": collection, "dataset_id": dataset_id.replace("*", "\t")} - ) - - -@given(parsers.parse("the Yoda intake dataset add comment API is queried with dataset id {dataset_id}, study id {study_id} and comment {comment}"), target_fixture="api_response") -def api_intake_dataset_add_comment(user, dataset_id, study_id, comment): - return api_request( - user, - "intake_dataset_add_comment", - {"study_id": study_id, "dataset_id": dataset_id.replace("*", "\n"), "comment": comment} - ) - - -@given(parsers.parse("the Yoda intake report vault dataset counts per study API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_vault_dataset_counts_per_study(user, study_id): - return api_request( - user, - "intake_report_vault_dataset_counts_per_study", - {"study_id": study_id} - ) - - -@given(parsers.parse("the Yoda intake report vault aggregated info API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_vault_aggregated_info(user, study_id): - return api_request( - user, - "intake_report_vault_aggregated_info", - {"study_id": study_id} - ) - - -@given(parsers.parse("the Yoda intake report export study data API is queried with study id {study_id}"), target_fixture="api_response") -def api_intake_report_export_study_data(user, study_id): - return 
api_request(
-        user,
-        "intake_report_export_study_data",
-        {"study_id": study_id}
-    )
-
-
-@then(parsers.parse("study {study} is returned"))
-def study_returned(api_response, study):
-    _, body = api_response
-
-    assert study in body['data']
-
-
-@then('debug')
-def debug(api_response):
-    _, body = api_response
-
-    assert 0, body
-
-
-@then(parsers.parse("the result is equivalent to {result}"))
-def result_equivalent_to(api_response, result):
-    _, body = api_response
-
-    assert DeepDiff(json.loads(result), body['data']) == {}
diff --git a/tests/step_defs/ui/test_ui_admin.py b/tests/step_defs/ui/test_ui_admin.py
index 1c45870ae..adda1b237 100644
--- a/tests/step_defs/ui/test_ui_admin.py
+++ b/tests/step_defs/ui/test_ui_admin.py
@@ -4,6 +4,7 @@
 __copyright__ = "Copyright (c) 2024, Utrecht University"
 __license__ = "GPLv3, see LICENSE"
 
+import os
 import time
 
 from pytest_bdd import (
@@ -182,3 +183,41 @@ def ui_admin_removed_text_not_displayed(browser, text):
     time.sleep(1)
     terms = browser.find_by_id('admin-publication-terms').first.value
     assert text not in terms
+
+
+@when(parsers.parse('the user clicks the Upload file format list {filename}'))
+def ui_admin_clicks_upload_file_format_button(browser, filename):
+    browser.execute_script("document.getElementById('upload-button').scrollIntoView();")
+    browser.find_by_css("#upload-button")
+
+    cwd = os.getcwd()
+    if os.name == 'nt':
+        browser.find_by_css('input[type="file"]')[0].fill("{}\\files\\file_formats\\{}".format(cwd, filename))
+    else:
+        browser.find_by_css('input[type="file"]')[0].fill("{}/files/file_formats/{}".format(cwd, filename))
+
+
+@then(parsers.parse('the success message of uploading a file format list {filename} is shown'))
+def ui_admin_upload_file_format_success(browser, filename):
+    assert browser.is_text_present("File format list '{}' uploaded successfully.".format(filename))
+
+
+@when(parsers.parse('the user selects the file format list {filename} to delete'))
+def ui_admin_select_file_format(browser, filename):
+    browser.execute_script("document.getElementById('file-formats-list').scrollIntoView();")
+    browser.find_by_css('#file-formats-list').click()
+    options = browser.find_by_css('#file-formats-list option')
+    for option in options:
+        if option.value == filename.split('.')[0]:
+            option.click()
+            break
+
+
+@when('the user clicks the Delete file format list button')
+def ui_admin_click_delete_button(browser):
+    browser.find_by_css("#delete-format-button").click()
+
+
+@then(parsers.parse('the success message for deleting the file format list {filename} is shown'))
+def ui_admin_delete_file_format_success(browser, filename):
+    assert browser.is_text_present("File format list '{}' deleted successfully.".format(filename))
diff --git a/tests/step_defs/ui/test_ui_intake.py b/tests/step_defs/ui/test_ui_intake.py
deleted file mode 100644
index d6d0f226d..000000000
--- a/tests/step_defs/ui/test_ui_intake.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# coding =utf-8
-"""Vault UI feature tests."""
-
-__copyright__ = 'Copyright (c) 2020-2021, Utrecht University'
-__license__ = 'GPLv3, see LICENSE'
-
-import time
-
-from pytest_bdd import (
-    parsers,
-    scenarios,
-    then,
-    when,
-)
-
-scenarios('../../features/ui/ui_intake.feature')
-
-
-# GENERIC FUNCTIONS
-def get_unscanned_from_error_area_text(browser):
-    # Unrecognised and unscanned (17) files or Unrecognised (12) and unscanned (-) files
-    error_area_text = browser.find_by_id('scan_result_text')
-    parts = error_area_text.value.split(' and ')
-    s = parts[1]
-    return s[s.find("(") + 
1:s.find(")")] - - -def get_unrecognized_from_error_area_text(browser): - error_area_text = browser.find_by_id('scan_result_text') - parts = error_area_text.value.split(' and ') - s = parts[0] - first_bracket = s.find("(") - if first_bracket == -1: - return "0" - return s[first_bracket + 1:s.find(")")] - - -# SCENARIO 1 -@when(parsers.parse('activate study "{study}"')) -def ui_intake_activate_study(browser, study): - dropdown = browser.find_by_id('dropdown-select-study') - dropdown.click() - table = browser.find_by_id('select-study') - rows = table.find_by_tag('tr') - for row in rows: - if row.has_class('ta-' + study): - row.find_by_tag('td').click() - return True - assert False - - -@when(parsers.parse('total datasets is "{dataset_count}"')) -def ui_intake_total_dataset_count(browser, dataset_count): - dataset_count_area = browser.find_by_id('datatable_info') - if dataset_count == '0': - assert dataset_count_area.value == 'No datasets present' - else: - assert dataset_count_area.value == "Total datasets: " + dataset_count - - -@when('unscanned files are present') # ben ik hier niet de prerequisite aan het testen??? -def ui_intake_unscanned_files_present(browser): - assert int(get_unscanned_from_error_area_text(browser)) > 0 - - -@when('scanned for datasets') -def ui_intake_scanned_for_datasets(browser): - browser.find_by_id('btn-start-scan').click() - - -@then('scan button is disabled') -def ui_intake_scan_button_is_disabled(browser): - assert browser.find_by_id('btn-start-scan').has_class('disabled') - - -@when('scanning for datasets is successful') -def ui_intake_scanning_is_successful(browser): - assert browser.is_text_present('Successfully scanned for datasets.', wait_time=20) - - -@when('unrecognized files are present') -def ui_intake_unrecognized_files_are_present(browser): - assert int(get_unrecognized_from_error_area_text(browser)) > 0 - - -@when('click for details of first dataset row') -def ui_intake_click_for_details_of_first_dataset_row(browser): - browser.find_by_id('datatable')[0].click() - - -@when(parsers.parse('add "{comments}" to comment field and press comment button')) -def ui_intake_add_comments_to_dataset(browser, comments): - browser.find_by_name('comments').fill(comments) - browser.find_by_css(".btn-add-comment").click() - - -@when('check first dataset for locking') -def ui_check_first_dataset_for_locking(browser): - browser.find_by_css('.cbDataSet')[0].click() - - -@when(parsers.parse('lock and unlock buttons are "{enabled_state}"')) -def ui_intake_lock_and_unlock_buttons_are(browser, enabled_state): - if enabled_state == 'enabled': - assert not browser.find_by_id('btn-unlock').has_class('disabled') - assert not browser.find_by_id('btn-lock').has_class('disabled') - else: - assert browser.find_by_id('btn-unlock').has_class('disabled') - assert browser.find_by_id('btn-lock').has_class('disabled') - - -@when('uncheck first dataset for locking') -def ui_uncheck_first_dataset_for_locking(browser): - # if not checkbox.is_selected() meenemen hier - browser.find_by_css('.cbDataSet')[0].click() - - -@when('check all datasets for locking') -def ui_check_all_datasets_for_locking(browser): - browser.find_by_css('.control-all-cbDataSets').click() - - -@then('click lock button') -def ui_intake_click_lock_button(browser): - browser.find_by_id("btn-lock").click() - - -@then('wait for all datasets to be in locked state successfully') -def ui_intake_wait_all_datasets_in_locked_state(browser): - assert browser.is_text_present('Successfully locked the selected dataset(s).', 
wait_time=30) - - assert len(browser.find_by_css('.datasetstatus_locked', wait_time=30)) == 2 - - -@then('wait for all datasets to be in frozen state') -def ui_intake_wait_all_datasets_in_frozen_state(browser): - i = 0 - no_more_locked_datasets_present = False - while i < 20: - time.sleep(20) - browser.visit(browser.url) - # if there are no longer datasets in locked state -> frozen or error - if len(browser.find_by_css('.datasetstatus_locked', wait_time=5)) == 0: # .datasetstatus_frozen - no_more_locked_datasets_present = True - # either datasets are frozen now. Or have been marked errorenous - break - i = i + 1 - assert no_more_locked_datasets_present - - -@then('wait for frozen sets to be added to vault') -def ui_intake_wait_frozen_datasets_to_vault(browser): - # When all frozen datasets have been moved to the vault only 1 will remain with dataset_status_scanned - i = 0 - no_more_frozen_datasets_present = False - while i < 20: - time.sleep(20) - browser.visit(browser.url) - # if there are no longer datasets in locked state -> frozen or error - if len(browser.find_by_css('.datasetstatus_scanned', wait_time=5)) == 3: # .datasetstatus_frozen - no_more_frozen_datasets_present = True - # either datasets are frozen now. Or have been marked errorenous - break - i = i + 1 - assert no_more_frozen_datasets_present - - -# SCENARIO 2 -@when('open intake reporting area') -def ui_intake_open_intake_reporting_area(browser): - browser.find_by_css('.btn-goto-reports').click() - - -@when('check reporting result') -def ui_intake_check_reporting_result(browser): - # classes are part of rows in result table. - assert len(browser.find_by_css('.dataset-type-counts-raw')) > 0 - assert len(browser.find_by_css('.dataset-type-counts-processed')) == 0 - assert len(browser.find_by_css('.dataset-aggregated-version-raw')) > 0 - assert len(browser.find_by_css('.dataset-aggregated-version-processed')) > 0 - assert len(browser.find_by_css('.dataset-aggregated-version-total')) > 0 - - -@when('export all data and download file') -def ui_intake_export_all_data_and_download_file(browser): - browser.find_by_css('.btn-export-data').click() - - -@when('return to intake area') -def ui_intake_return_to_intake_area(browser): - browser.find_by_css('.btn-goto-intake').click() diff --git a/tools/api/generate-openapi.py b/tools/api/generate-openapi.py index c3393aafc..b922b86e1 100755 --- a/tools/api/generate-openapi.py +++ b/tools/api/generate-openapi.py @@ -282,7 +282,7 @@ def gen_fn_spec(name, fn): name = re.sub('^api_', '', name) if core: - modules = ['datarequest', 'deposit', 'intake'] + modules = ['datarequest', 'deposit'] if name.startswith(tuple(modules)): continue diff --git a/tools/arb-update-resources.py b/tools/arb-update-resources.py index 20874af8f..eda353f53 100644 --- a/tools/arb-update-resources.py +++ b/tools/arb-update-resources.py @@ -13,6 +13,7 @@ import os import socket import ssl +import sys from collections import OrderedDict from io import StringIO @@ -53,7 +54,7 @@ def parse_args(): def parse_cs_values(input): """Parses a comma-separated list of key:value pairs as a dict.""" - result = dict() + result = {} for kv_pair in input.split(","): if kv_pair == "": continue @@ -96,7 +97,7 @@ def setup_session(irods_environment_config, ca_file="/etc/pki/tls/certs/chain.cr 'encryption_num_hash_rounds': 16, 'encryption_salt_size': 8, 'ssl_context': ssl_context} - settings = dict() + settings = {} settings.update(irods_environment_config) settings.update(ssl_settings) settings["password"] = password @@ -160,8 +161,8 @@ 
def call_rule(session, rulename, params, number_outputs, rule_engine='irods_rule output=output_params, **re_config) - outArray = myrule.execute() - buf = outArray.MsParam_PI[0].inOutStruct.stdoutBuf.buf.decode( + out_array = myrule.execute() + buf = out_array.MsParam_PI[0].inOutStruct.stdoutBuf.buf.decode( 'utf-8').splitlines() return buf[:number_outputs] @@ -192,8 +193,18 @@ def main(): args = parse_args() env = get_irods_environment() + for ca_file_option in ["/etc/pki/tls/certs/chain.pem", + "/etc/ssl/certs/chain.crt", + "/etc/ssl/certs/localhost.crt"]: + if os.path.isfile(ca_file_option): + ca_file = ca_file_option + break + else: + print("Error: could not find CA chain file.", file=sys.stderr) + sys.exit(1) + try: - session = setup_session(env) + session = setup_session(env, ca_file=ca_file) override_free_dict = parse_cs_values(args.override_free) override_total_dict = parse_cs_values(args.override_total) local_ufs_resources = get_local_ufs_resources(session) @@ -208,7 +219,7 @@ def main(): print("Updating misc resources ...") call_rule_update_misc(session) except NetworkException: - print("Could not connect to iRODS sever ...") + print("Could not connect to iRODS server ...") if __name__ == '__main__': diff --git a/tools/intake/ExportDatasetErrorsAndWarnings.r b/tools/intake/ExportDatasetErrorsAndWarnings.r deleted file mode 100644 index ce2bedf1a..000000000 --- a/tools/intake/ExportDatasetErrorsAndWarnings.r +++ /dev/null @@ -1,115 +0,0 @@ -# Date: 2019-01-16 -# Functionality: -# Find files within the dynamic area of an intake study that have errors and/or warnings at file level. -# A check for errors/warnings is performed ONLY on file level. -# Errors that can be found on dataset-toplevel or on collection level within a dataset, are NOT reported - -# Parameters: -# - Study: Name of the study the export has to search - -# Run with DatasetErrorsAndWarnins.sh script to have the export added to a csv file. - -ExportDatasetErrorsAndWarnings { - ## OVERRULE PARAMS FOR NOW as I wasn't able to add multiple input params -# *studyParam="test"; - - # Possibly use uuClientFullName as user, or $userNameClienterNameClient; ???????????????????????? 
- # writeLine("stdout", "uuClientFullName: " ++ uuClientFullName); - - - # Initialisation of variables based on command line parameters -# *user="datamanager"; - *user = uuClientFullName - *study = *studyParam; - *datamanagerGroup = 'grp-datamanager-' ++ *study; - *studyFolder = "/" ++ $rodsZoneClient ++ "/" ++ 'home/grp-intake-' ++ *studyParam; - - # Check whether user is a datamanager for the study involved - *isDatamanager = false; - foreach (*row in - SELECT USER_NAME - WHERE USER_TYPE = 'rodsgroup' - AND USER_NAME = *datamanagerGroup ) { - - uuGroupUserExists(*datamanagerGroup, *user, true, *membership) - if (*membership) { - *isDatamanager = true; - } - } - - if (!*isDatamanager) { - writeLine("stdout", 'Not the datamanager of current group'); - succeed; # the journey ends here - } - - - # Setup list of dataset ids that are later used to find data objects having this dataset_id's - *datasetList = list(); - foreach(*row in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE WHERE COLL_NAME like '*studyFolder%%' AND META_DATA_ATTR_NAME='dataset_toplevel') { - msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *datasetId); - *datasetList = cons(*datasetId, *datasetList); - } - - foreach(*row in SELECT COLL_ID, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE WHERE COLL_NAME like '*studyFolder%%' AND META_COLL_ATTR_NAME='dataset_toplevel') { - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *datasetId); - *datasetList = cons(*datasetId, *datasetList); - } - - # Write header row for the export table - writeLine('stdout', "Wave,Experiment type,Pseudocode,Version,Bestand,Errors,Warnings"); - - # At first find datasets, designated by presence of metadata attribute 'dataset_toplevel'. - # The value of the datasetId is combination of wepv and path to make it unique. - foreach(*datasetId in *datasetList) { - # Collect all data objects with a given datasetId - # And per data object find out whether it contains errors or warnings in its metadata - foreach(*row2 in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, DATA_NAME, DATA_ID, COLL_NAME WHERE META_DATA_ATTR_VALUE='*datasetId' AND META_DATA_ATTR_NAME='dataset_id') { - msiGetValByKey(*row2, "DATA_NAME", *dataName); - msiGetValByKey(*row2, "COLL_NAME", *collName); - msiGetValByKey(*row2, "DATA_ID", *dataId); - - # Given 1 object step thtough all its metadata attributes. - - msiString2KeyValPair("", *kvp); - - # build list of all attributes that are involved - *attrList = list('wave', 'experiment_type', 'pseudocode', 'version', 'error', 'warning'); - # initialize all attributes to empty strings - foreach (*attr in *attrList) { - *kvp."*attr" = ''; - } - - foreach(*row3 in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE WHERE DATA_ID=*dataId ) { - msiGetValByKey(*row3, "META_DATA_ATTR_NAME", *attrName); - msiGetValByKey(*row3, "META_DATA_ATTR_VALUE", *attrValue); - - foreach (*attr in *attrList) { - #writeLine('stdout', 'attrLIST: ' ++ *attr); - if (*attrName==*attr) { - if (*attr=='error' || *attr=='warning') { # must be concatination as there can be more errors/warnings on 1 data object - if (strlen(*kvp."*attr")>0) { - *kvp."*attr" = *kvp."*attr" ++ ' - ' ++ *attrValue; - } - else { - *kvp."*attr" = *attrValue; - } - } - else { - *kvp."*attr" = *attrValue; - } - } - } - } - # Add data object to file - only if errors or warnins present. 
- if (strlen(*kvp.'error')>0 || strlen(*kvp.'warning')>0) { - *dataPath = *collName ++ '/' ++ *dataName; - writeLine('stdout', *kvp."wave" ++ "," ++ *kvp."experiment_type" ++ "," ++ *kvp."pseudocode"++ "," ++ *kvp."version" ++ "," ++ *dataPath ++ "," ++ *kvp."error" ++ "," ++ *kvp."warning"); - } - } - } -} - - -input *studyParam="test" -output ruleExecOut - diff --git a/tools/intake/ExportDatasetErrorsAndWarnings.sh b/tools/intake/ExportDatasetErrorsAndWarnings.sh deleted file mode 100755 index e63ddce41..000000000 --- a/tools/intake/ExportDatasetErrorsAndWarnings.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -# /Date: 2019-01-16 -# /Functionality: -# /Find files within the dynamic area of an intake study that have errors and/or warnings at file level. -# /A check for errors/warnings is performed ONLY on file level. -# /Errors that can be found on dataset-toplevel or on collection level within a dataset, are NOT reported - -# /Parameters: -# /Study: Name of the study the export has to search - -# /Run with DatasetErrorsAndWarnins.sh script to have the export added to a csv file. - -irule -r irods_rule_engine_plugin-irods_rule_language-instance -F ExportDatasetErrorsAndWarnings.r "*studyParam='$1'" > DatasetErrorsAndWarnings.csv diff --git a/tools/intake/collCopyPseudo.r b/tools/intake/collCopyPseudo.r deleted file mode 100644 index 6264a99f6..000000000 --- a/tools/intake/collCopyPseudo.r +++ /dev/null @@ -1,31 +0,0 @@ -#Author Harm de Raaff -#Date: 2019-01-16 - -collCopyPseudo { - #changes YYYY-MM-DD.hh:mm:ss into seconds since epoch format - msiHumanToSystemTime(*datefrom, *datefrom) - msiHumanToSystemTime(*datetill, *datetill) - - # pseudocodes are passes as a comma-separated list. - *pseudoList = split(*pseudoCodes,','); - - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - foreach(*pc in *pseudoList) { - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-vault-%' - AND META_COLL_ATTR_NAME = 'pseudocode' - AND META_COLL_ATTR_VALUE = *pc - AND COLL_CREATE_TIME between *datefrom *datetill - #datefrom must be the same amount of digits as datetill - #wont be a problem if chosing times from yodas existence till future - ) { - *name=*row2.COLL_NAME; - writeLine('stdout', *name); - } - } - } -} - -input *pseudoCodes="", *datefrom="", *datetill="" -output ruleExecOut diff --git a/tools/intake/collCopyPseudo.sh b/tools/intake/collCopyPseudo.sh deleted file mode 100755 index a6a752a07..000000000 --- a/tools/intake/collCopyPseudo.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# \author Niek Bats -# \date 2019-01-19 -# \file collCopyPseudo.sh -# \brief copies all collections which matches pseudocodes as passed in a file ($3) and in between datefrom ($4) and datetill ($5) to a folder ($1) -# \how to use store the .sh file and .r file to your linux folder and make it the current directory (using cd) -# \ if you want to copy the collections to your linux subfolder, specify iget ($2). The folder ($1) is created in your current linux folder. -# \ if you want to copy the collections to a yoda subfolder, specify icp ($2) instead. The folder ($1) should be preceeded by the yoda -# \ group-folder (e.g. research-copiedcollections/pseudocodelist1, the folder pseudocodelist1 is created by the script) -# \copyright Copyright (c) 2018, Utrecht University. All rights reserved -# \dependencies requires login on an irods user (e.g. 
datamanager) with execution right to this script and permission to execute user icommands -# \usage bash randomCollCopy.sh - -#invalid input handling - -if [[ $1 = "" || $2 = "" || $3 = "" || $4 = "" || $5 = "" ]] ; then - echo "the usage of this script is: " - echo "bash randomCollCopy.sh " - echo "where folder, howtoCopy is text. dateFrom and dateTill is text in YYYY-MM-DD.HH:mm:ss format" - echo "folder is the created subfolder, when using iget. For icp, the folder to be created should be preceeded by the yoda research-name " - echo "e.g. 'research-copiedcollections/pseudocodelist1' and you must be a user of research-copiedcollection." - exit 1 -fi - -#convert input params to named variables for readability also insta docu of what they are -folder="$1" #is text -copyHow="$2" #iget or icp -pseudocodeCsvFile="$3" #is filename of file holding pseudocodes -dateFrom="$4" #is text in YYYY-MM-DD.HH:mm:ss format -dateTill="$5" #is text in YYYY-MM-DD.HH:mm:ss format - -if [[ $copyHow != "iget" && $copyHow != "icp" ]] ; then - echo "Your copy method is not correct. It must either be 'iget' or 'icp'" - echo "Now it is $copyHow" - exit 1 -fi - -#Collect comma separated pseudocodes from file -pseudoCodes=`cat $pseudocodeCsvFile` -echo "pseudocodes: $pseudoCodes" - -#run rule put output in an array -read -ra array <<< $(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F collCopyPseudo.r "'$pseudoCodes'" "'$dateFrom'" "'$dateTill'") - -#if array is empty give notice and exit -if [ ${#array[@]} -eq 0 ]; then - echo "couldnt find any collections matching your parameters at the moment" - echo "possible causes there arent any matches, the servers are down or you dont have a connection" - exit 1 -fi - -#make folder -if [[ "$copyHow" == "iget" ]] ; then - mkdir "$folder" - cd "$folder" -fi -if [[ "$copyHow" == "icp" ]] ; then - imkdir ../"$folder" - icd ../"$folder" -fi - - -echo "Copy selection: " -for item in ${array[@]} -do - echo "$item" - - if [[ "$copyHow" == "iget" ]] ; then - iget -r "$item" - fi - if [[ "$copyHow" == "icp" ]] ; then - icp -r "$item" . 
- fi -done - diff --git a/tools/intake/intakeDataCheck.sh b/tools/intake/intakeDataCheck.sh deleted file mode 100644 index e9bcb8e60..000000000 --- a/tools/intake/intakeDataCheck.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-intake-folder using specified parameter(s) - -#input check and build query -if [[ "$1" != "" ]] #if no wave dont do anything -then - query="like '%/grp-intake-%' AND DATA_PATH like '%$1%'" - if [[ "$2" != "" ]] - then - query="$query AND DATA_PATH like '%$2%'" - if [[ "$3" != "" ]] - then - query="$query AND DATA_PATH like '%$3%'" - fi - elif [[ "$3" != "" ]] - then - exit 1 - fi - -echo $query - -#icommand format query is in printf format -output=$(iquest ""%s";%s" "SELECT DATA_PATH, DATA_SIZE WHERE DATA_PATH $query") - -#echo $output - -printf ""Filepath/name";"filesize"\n" > outputIntake.csv -printf "$output" >> outputIntake.csv - -fi diff --git a/tools/intake/randomCollCopy.r b/tools/intake/randomCollCopy.r deleted file mode 100644 index 2ab5c9e4c..000000000 --- a/tools/intake/randomCollCopy.r +++ /dev/null @@ -1,36 +0,0 @@ -#Author Niek Bats -#Date: 2019-01-16 - -randomCollCopy { - #changes YYYY-MM-DD.hh:mm:ss into seconds since epoch format - msiHumanToSystemTime(*datefrom, *datefrom) - msiHumanToSystemTime(*datetill, *datetill) - - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-vault-%' - AND META_COLL_ATTR_NAME = 'wave' - AND META_COLL_ATTR_VALUE = *wave - # AND COLL_CREATE_TIME between *datefrom *datetill - #datefrom must be the same amount of digits as datetill - #wont be a problem if chosing times from yodas existence till future - ) { - *name=*row2.COLL_NAME; - foreach(*row3 in SELECT COLL_CREATE_TIME - WHERE COLL_NAME = *name - AND META_COLL_ATTR_NAME = 'experiment_type' - AND META_COLL_ATTR_VALUE = *experiment - ) { - *collCreateTime=int(*row3.COLL_CREATE_TIME); - writeLine("stdout", "*name"); - - # test if already present in list - we do not want multiples. - } - } - } -} - -input *wave="", *experiment="", *datefrom="", *datetill="" -output ruleExecOut - diff --git a/tools/intake/randomCollCopy.sh b/tools/intake/randomCollCopy.sh deleted file mode 100755 index 9d6ac9dfc..000000000 --- a/tools/intake/randomCollCopy.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# \author Niek Bats -# \date 2019-01-16 -# \file randomCollCopy.sh -# \brief copies random collections which matches selected wave ($3) experiment ($4) in between datefrom ($5) and datetill ($6) to a folder ($1) -# \ with a maximum $6 collections, if specified. -# \how to use store the .sh file and .r file to your linux folder and make it the current directory (using cd) -# \ if you want to copy the collections to your linux subfolder, specify iget ($2). The folder ($1) is created in your current linux folder. -# \ if you want to copy the collections to a yoda subfolder, specify icp ($2) instead. The folder ($1) should be preceeded by the yoda -# \ group-folder (e.g. research-collection/30w-pci, the folder 30w-pci is created by the script) -# \ will be created and the collections copied -# \copyright Copyright (c) 2018, Utrecht University. All rights reserved -# \dependencies requires login on an irods user (e.g. 
datamanager) with execution right to this script and permission to execute user icommands -# \usage bash randomCollCopy.sh <(optional) amount> - -#invalid input handling - -if [[ $1 = "" || $2 = "" || $3 = "" || $4 = "" || $5 = "" || $6 = "" ]] || [[ ! $7 -gt 0 && ! $7 = "" ]] ; then -#[[ ! $6 -gt 0 ]] check if = a number and more then 0 - echo "the usage of this script is: " - echo "bash randomCollCopy.sh <(optional) amount>" - echo "where folder, wave, experimentType is text. dateFrom and dateTill is text in YYYY-MM-DD.HH:mm:ss format and amount is an number" - echo "folder is the created subfolder, when using iget. For icp, the folder to be created should be preceeded by the yoda research-name" - echo "e.g. 'research-copiedcollection/30w-pci' and you should be a user of research-copiedcollection." - exit 1 -fi - -#convert input params to named variables for readability also insta docu of what they are -folder="$1" #is text -copyHow="$2" #iget or icp -wave="$3" #is text -experimentType="$4" #is text -dateFrom="$5" #is text in YYYY-MM-DD.HH:mm:ss format -dateTill="$6" #is text in YYYY-MM-DD.HH:mm:ss format -amount=10 #is a positive number default=10 -if [[ $7 != "" ]] ; then - amount="$7" -fi - -if [[ $copyHow != "iget" && $copyHow != "icp" ]] ; then - echo "Your copy method is not correct. It must either be 'iget' or 'icp'" - echo "Now it is $copyHow" - exit 1 -fi - -#run rule put output in an array -read -ra array <<< $(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F randomCollCopy.r "'$wave'" "'$experimentType'" "'$dateFrom'" "'$dateTill'") - -#if array is empty give notice and exit -if [ ${#array[@]} -eq 0 ]; then - echo "couldnt find any collections matching your parameters at the moment" - echo "possible causes there arent any matches, the servers are down or you dont have a connection" - exit 1 -fi - -echo "Selecting $amount items from following list: " -for item in ${array[@]} -do - echo "$item" -done - -#make folder -if [[ "$copyHow" == "iget" ]] ; then - mkdir "$folder" - cd "$folder" -fi -if [[ "$copyHow" == "icp" ]] ; then - imkdir ../"$folder" - icd ../"$folder" - fi - -echo "selected: " -#make loop to select amount collections from array -for (( i=0; i<$amount; i++ )); -do - #select a random collection from list - - if [[ ${#array[@]} -ne 0 ]] ; then - randomNr=$(( RANDOM % ${#array[@]} )) - #echo which one is copied and copy - echo "${array[$randomNr]}" - if [[ "$copyHow" == "iget" ]] ; then - iget -r "${array[$randomNr]}" - fi - if [[ "$copyHow" == "icp" ]] ; then - icp -r "${array[$randomNr]}" . 
- fi - - #remove from list - unset array[$randomNr] - array=( "${array[@]}" ) - fi -done diff --git a/tools/intake/vaultedDataCheck.sh b/tools/intake/vaultedDataCheck.sh deleted file mode 100644 index 97da7c2f8..000000000 --- a/tools/intake/vaultedDataCheck.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-vault-folder using specified parameter(s) - -#input check -if("$1" == "") do #if no wave kill script - exit 1 -done - -#build iquest query -query="%" -for arg in "$@" #add per argument: "$argument/" -do - if [ "$arg" != "" ] - then - query="$query$arg/" - fi -done - -query="$query%" - -#icommand format query is in printf format -output=$(iquest ""%s";%s" "SELECT DATA_PATH, DATA_SIZE WHERE DATA_PATH like '$query'") - -printf ""Filepath/name";"filesize"\n" > outputVault.csv -printf "$output" >> outputVault.csv diff --git a/tools/intake/youthIntakeCheck.r b/tools/intake/youthIntakeCheck.r deleted file mode 100644 index 585e92dcc..000000000 --- a/tools/intake/youthIntakeCheck.r +++ /dev/null @@ -1,91 +0,0 @@ -#Author Niek Bats - -youthIntakeCheck { - *intakeOrVault="intake"; #intake vault - - #non empty *wave, *experiment and *pseudocode - if ((*wave != "") && (*experiment != "") && (*pseudocode != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - - foreach(*row4 in SELECT DATA_SIZE - WHERE DATA_NAME = *nameExtension - AND COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'pseudocode' - AND META_DATA_ATTR_VALUE = *pseudocode) { - *size=*row4.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - } - - #non empty *wave and *experiment - else if ((*wave != "") && (*experiment != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME, DATA_SIZE - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - *size=*row3.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - - #non empty wave pseudocode is empty - else if (*wave != "" && *pseudocode == "") then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME, DATA_NAME, DATA_SIZE - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME ='wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - *nameExtension=*row2.DATA_NAME; - *size=*row2.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - - else { - writeLine("stdout", 
"Invalid input"); - } -} - -input *wave="", *experiment="", *pseudocode="" -output ruleExecOut \ No newline at end of file diff --git a/tools/intake/youthIntakeCheck.sh b/tools/intake/youthIntakeCheck.sh deleted file mode 100644 index bb24b4821..000000000 --- a/tools/intake/youthIntakeCheck.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-intake-folder using specified parameter(s) - -output=$(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F youthIntakeCheck.r "'$1'" "'$2'" "'$3'") -#echo $output -if [[ "$output" == "" ]] -then - echo "no results with parameters $1 $2 $3" - echo "please note that files have to be scanned to be found" - -elif [[ $output == "Invalid input" ]] -then - echo "$output" - -else - outputFile="intake-$1" - if [[ "$2" != "" ]] - then - outputFile="$outputFile-$2" - fi - if [[ "$3" != "" ]] - then - outputFile="$outputFile-$3" - fi - outputFile="$outputFile.csv" - - printf "\"Filepath\";\"name\";\"extension\";\"filesize\"\n" > "$outputFile" - printf "$output" >> "$outputFile" -fi diff --git a/tools/intake/youthVaultCheck.r b/tools/intake/youthVaultCheck.r deleted file mode 100644 index 1b2dbbdb1..000000000 --- a/tools/intake/youthVaultCheck.r +++ /dev/null @@ -1,91 +0,0 @@ -#Author Niek Bats - -youthVaultCheck { - *intakeOrVault="vault"; #intake vault - - #non empty *wave, *experiment and *pseudocode - if ((*wave != "") && (*experiment != "") && (*pseudocode != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - - foreach(*row4 in SELECT DATA_SIZE - WHERE DATA_NAME = *nameExtension - AND COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'pseudocode' - AND META_DATA_ATTR_VALUE = *pseudocode) { - *size=*row4.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - } - - #non empty *wave and *experiment - else if ((*wave != "") && (*experiment != "")) then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME = 'wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - - foreach(*row3 in SELECT DATA_NAME, DATA_SIZE - WHERE COLL_NAME = *path - AND META_DATA_ATTR_NAME = 'experiment_type' - AND META_DATA_ATTR_VALUE = *experiment) { - *nameExtension=*row3.DATA_NAME; - *size=*row3.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - } - - #non empty wave pseudocode is empty - else if (*wave != "" && *pseudocode == "") then { - foreach(*row in SELECT COLL_OWNER_ZONE) { - *zone=*row.COLL_OWNER_ZONE; - - foreach(*row2 in SELECT COLL_NAME, DATA_NAME, DATA_SIZE - WHERE COLL_NAME like '/*zone/home/grp-*intakeOrVault-%' - AND META_DATA_ATTR_NAME ='wave' - AND META_DATA_ATTR_VALUE = *wave) { - *path=*row2.COLL_NAME; - *nameExtension=*row2.DATA_NAME; - 
*size=*row2.DATA_SIZE; - *name=trimr(*nameExtension, "."); - *extension=triml(*nameExtension, *name); - - writeLine("stdout", "\"*path\";\"*name\";\"*extension\";\"*size\""); - } - } - } - - else { - writeLine("stdout", "Invalid input"); - } -} - -input *wave="", *experiment="", *pseudocode="" -output ruleExecOut \ No newline at end of file diff --git a/tools/intake/youthVaultCheck.sh b/tools/intake/youthVaultCheck.sh deleted file mode 100644 index 2f4d9c6c7..000000000 --- a/tools/intake/youthVaultCheck.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -#Author Niek Bats -#$1 wave -#$2 experiment -#$3 pseudocode -#lists all files, when found any grp-vault-folder using specified parameter(s) - -output=$(irule -r irods_rule_engine_plugin-irods_rule_language-instance -F youthVaultCheck.r "'$1'" "'$2'" "'$3'") -#echo $output -if [[ "$output" == "" ]] -then - echo "no results with parameters $1 $2 $3" - -elif [[ $output == "Invalid input" ]] -then - echo "$output" - -else - outputFile="vault-$1" - if [[ "$2" != "" ]] - then - outputFile="$outputFile-$2" - fi - if [[ "$3" != "" ]] - then - outputFile="$outputFile-$3" - fi - outputFile="$outputFile.csv" - - printf "\"Filepath\";\"name\";\"extension\";\"filesize\"\n" > "$outputFile" - printf "$output" >> "$outputFile" -fi diff --git a/tools/job_scan.r b/tools/job_scan.r deleted file mode 100644 index dd9b2377c..000000000 --- a/tools/job_scan.r +++ /dev/null @@ -1,43 +0,0 @@ -# \file -# \brief job -# \author Ton Smeele, Sietse Snel -# \copyright Copyright (c) 2015-2021, Utrecht university. All rights reserved -# \license GPLv3, see LICENSE -# -# This file can be executed manually or scheduled e.g. once a day. -# It scans an intake collection for datasets and checks the sets, if no collection -# is provided, it will scan a predefined list on intake groups (*groupList) -# -# Prerequisite: the irods user should have write access on the collection and its objects -# -# - - -uuYcRunIntakeScan { - *collectionList = list(); - # intake areas can be added to the group list as needed - *groupList = list('youth'); - *zone = $rodsZoneClient; - - if ( *intakeRoot == 'dummy' ) { - foreach (*grp in *groupList) { - *root = "/*zone/home/grp-intake-*grp"; - *collectionList = cons( *root, *collectionList); - } - } - else { - *collectionList = cons (*intakeRoot, *collectionList); - } - - foreach (*coll in *collectionList) { - writeLine("stdout","Running intake scan for *coll ..."); - *status = "0"; - rule_intake_scan_for_datasets(*coll, *status); - if (*status == "0" ) then *result = "ok" else *result = "ERROR (*status)"; - writeLine("stdout","RunIntakeScan for *intakeRoot result = *result"); - } - -} - -input *intakeRoot='dummy' -output ruleExecOut diff --git a/tools/troubleshoot-published-data.py b/tools/troubleshoot-published-data.py new file mode 100644 index 000000000..bba14bc72 --- /dev/null +++ b/tools/troubleshoot-published-data.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +"""This script collects all published packages and checks that they have all the required info. 
+ +Example: +To check all published packages: +python3 troubleshoot-published-data.py + +To check one specific package by name: +python3 troubleshoot-published-data.py -p research-initial[1725262507] + +To put results into a log file and complete the checks offline: +python3 troubleshoot-published-data.py -l -o +""" +import argparse +import subprocess + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="troubleshoot-published-data.py", + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("-l", "--log-file", action='store_true', + help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log") + parser.add_argument("-o", "--offline", action='store_true', + help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).") + parser.add_argument("-n", "--no-datacite", action='store_true', + help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).") + parser.add_argument("-p", "--package", type=str, required=False, + help="Troubleshoot a specific data package by name (default: troubleshoot all packages)") + return parser.parse_args() + + +def main(): + args = parse_args() + rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r" + data_package = f"*data_package={args.package}" + log_loc = f"*log_loc={args.log_file if args.log_file else ''}" + offline = f"*offline={args.offline}" + no_datacite = f"*no_datacite={args.no_datacite}" + subprocess.call(['irule', '-r', 'irods_rule_engine_plugin-python-instance', '-F', + rule_name, data_package, log_loc, offline, no_datacite]) + + +if __name__ == '__main__': + main() diff --git a/tools/troubleshoot_data.r b/tools/troubleshoot_data.r new file mode 100644 index 000000000..3caac4671 --- /dev/null +++ b/tools/troubleshoot_data.r @@ -0,0 +1,11 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F + +def main(rule_args, callback, rei): + data_package = global_vars["*data_package"].strip('"') + log_loc = global_vars["*log_loc"].strip('"') + offline = global_vars["*offline"].strip('"') + no_datacite = global_vars["*no_datacite"].strip('"') + callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite) + +INPUT *data_package="", *log_loc="", *offline="", *no_datacite="" +OUTPUT ruleExecOut diff --git a/tools/update-publications.r b/tools/update-publications.r index e4ef73069..7e7f5c1ca 100644 --- a/tools/update-publications.r +++ b/tools/update-publications.r @@ -11,47 +11,7 @@ # $ irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/update-publications.r # updatePublications() { - writeLine("stdout", "[UPDATE PUBLICATIONS] Start for *package"); - *packagesFound = 0; - - # Scan for published vault packages. 
- *ContInxOld = 1; - msiAddSelectFieldToGenQuery("COLL_NAME", "", *GenQ2Inp); - msiAddConditionToGenQuery("COLL_NAME", "like", "%%/home/vault-%%", *GenQ2Inp); - msiAddConditionToGenQuery("META_COLL_ATTR_NAME", "=", UUORGMETADATAPREFIX ++ "vault_status", *GenQ2Inp); - msiAddConditionToGenQuery("META_COLL_ATTR_VALUE", "=", PUBLISHED, *GenQ2Inp); - - msiExecGenQuery(*GenQ2Inp, *GenQ2Out); - msiGetContInxFromGenQueryOut(*GenQ2Out, *ContInxNew); - - while(*ContInxOld > 0) { - foreach(*row in *GenQ2Out) { - *collName = *row.COLL_NAME; - - # Check if this really is a vault package, or selected vault package - if ((*package == '*' && *collName like regex "/[^/]+/home/vault-.*") || - (*package != '*' && *collName like regex "/[^/]+/home/vault-.*" && *collName == *package ) ) { - *packagesFound = 1; - *status = '' - *statusInfo = ''; - rule_update_publication(*collName, *updateDatacite, *updateLandingpage, *updateMOAI, *status, *statusInfo); - writeLine("stdout", "*collName: *status *statusInfo"); - } - } - - *ContInxOld = *ContInxNew; - if(*ContInxOld > 0) { - msiGetMoreRows(*GenQ2Inp, *GenQ2Out, *ContInxNew); - } - } - msiCloseGenQuery(*GenQ2Inp, *GenQ2Out); - - if (*packagesFound == 0) { - writeLine("stdout", "[UPDATE PUBLICATIONS] No packages found for *package") - } - else { - writeLine("stdout", "[UPDATE PUBLICATIONS] Finished for *package"); - } + rule_update_publication(*package, *updateDatacite, *updateLandingpage, *updateMOAI); } input *updateDatacite="Yes", *updateLandingpage="Yes", *updateMOAI="Yes", *package='*' diff --git a/unit-tests/test_intake.py b/unit-tests/test_intake.py deleted file mode 100644 index 43d737dcc..000000000 --- a/unit-tests/test_intake.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Unit tests for the intake module -""" - -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' -__license__ = 'GPLv3, see LICENSE' - -import os -import sys -from unittest import TestCase - -sys.path.append('..') - -from intake_utils import dataset_make_id, dataset_parse_id, intake_extract_tokens, intake_extract_tokens_from_name, intake_scan_get_metadata_update, intake_tokens_identify_dataset - - -class IntakeTest(TestCase): - - def test_intake_tokens_identify_dataset(self): - empty_data = dict() - self.assertFalse(intake_tokens_identify_dataset(empty_data)) - missing_key_data = {"wave": "1", "pseudocode": "2"} - self.assertFalse(intake_tokens_identify_dataset(missing_key_data)) - missing_value_data = {"wave": "1", "pseudocode": "2", "experiment_type": ""} - self.assertFalse(intake_tokens_identify_dataset(missing_value_data)) - complete_data = {"wave": "1", "pseudocode": "2", "experiment_type": "3"} - self.assertTrue(intake_tokens_identify_dataset(complete_data)) - - def test_intake_extract_tokens(self): - no_token_data = intake_extract_tokens(None, "") - self.assertEquals(len(no_token_data), 0) - wave_data = intake_extract_tokens(None, "20w") - self.assertEquals(len(wave_data), 1) - self.assertEquals(wave_data["wave"], "20w") - et_data = intake_extract_tokens(None, "chantigap") - self.assertEquals(len(et_data), 1) - self.assertEquals(et_data["experiment_type"], "chantigap") - pseudocode_data = intake_extract_tokens(None, "B12345") - self.assertEquals(len(pseudocode_data), 1) - self.assertEquals(pseudocode_data["pseudocode"], "B12345") - version_data = intake_extract_tokens(None, "VerABC") - self.assertEquals(len(version_data), 1) - self.assertEquals(version_data["version"], "ABC") - - def test_intake_extract_tokens_from_name(self): - buffer = dict() - output = 
intake_extract_tokens_from_name(None, "20w_chantigap_B12345_VerABC.txt", buffer) - self.assertEquals(len(output), 4) - self.assertEquals(output["wave"], "20w") - self.assertEquals(output["experiment_type"], "chantigap") - self.assertEquals(output["version"], "ABC") - self.assertEquals(output["pseudocode"], "B12345") - - def test_intake_scan_get_metadata_update_coll_in_dataset(self): - parent_path = "/foo/bar/chantigap_10w_B12345" - path = parent_path + "/chantigap_20w_B12346" - complete_metadata = {"wave": "1", - "pseudocode": "2", - "experiment_type": "3", - "version": "Raw", - "directory": parent_path, - "dataset_id": "4", - "dataset_toplevel": "5"} - - output = intake_scan_get_metadata_update(None, path, True, True, complete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 6) - self.assertEquals(output["new_metadata"]["directory"], parent_path) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "2") - self.assertEquals(output["new_metadata"]["experiment_type"], "3") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["dataset_id"], "4") - self.assertTrue("dataset_toplevel" not in output["new_metadata"]) - - def test_intake_scan_get_metadata_update_coll_out_dataset_complete(self): - incomplete_metadata = {"wave": "1", "pseudocode": "2"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_B12346" - output = intake_scan_get_metadata_update(None, path, True, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 7) - self.assertEquals(output["new_metadata"]["directory"], path) - self.assertEquals(output["new_metadata"]["dataset_toplevel"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["dataset_id"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - self.assertEquals(output["new_metadata"]["experiment_type"], "chantigap") - - def test_intake_scan_get_metadata_update_coll_out_dataset_incomplete(self): - incomplete_metadata = {"wave": "1"} - path = "/foo/bar/chantigap_10w_B12345/B12346" - output = intake_scan_get_metadata_update(None, path, True, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], False) - self.assertEquals(len(output["new_metadata"]), 2) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - - def test_intake_scan_get_metadata_update_do_in_dataset(self): - complete_metadata = {"wave": "1", - "pseudocode": "2", - "experiment_type": "3", - "version": "Raw", - "dataset_id": "4", - "dataset_toplevel": "5", - "directory": "6"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_20w_B12346.txt" - output = intake_scan_get_metadata_update(None, path, False, True, complete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 6) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "2") - self.assertEquals(output["new_metadata"]["experiment_type"], "3") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["dataset_id"], "4") - self.assertTrue("dataset_toplevel" not in 
output["new_metadata"]) - - def test_intake_scan_get_metadata_update_do_out_dataset_complete(self): - incomplete_metadata = {"wave": "1", "pseudocode": "2"} - path = "/foo/bar/chantigap_10w_B12345/chantigap_B12346.txt" - coll = os.path.dirname(path) - output = intake_scan_get_metadata_update(None, path, False, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], True) - self.assertEquals(len(output["new_metadata"]), 7) - self.assertEquals(output["new_metadata"]["directory"], coll) - self.assertEquals(output["new_metadata"]["dataset_id"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["dataset_toplevel"], dataset_make_id(output["new_metadata"])) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["version"], "Raw") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - self.assertEquals(output["new_metadata"]["experiment_type"], "chantigap") - - def test_intake_scan_get_metadata_update_do_out_dataset_incomplete(self): - incomplete_metadata = {"wave": "1"} - path = "/foo/bar/chantigap_10w_B12345/B12346.txt" - output = intake_scan_get_metadata_update(None, path, False, False, incomplete_metadata) - self.assertEquals(output["in_dataset"], False) - self.assertEquals(len(output["new_metadata"]), 2) - self.assertEquals(output["new_metadata"]["wave"], "1") - self.assertEquals(output["new_metadata"]["pseudocode"], "B12346") - - def test_dataset_make_id(self): - input = {"wave": "20w", - "experiment_type": "echo", - "pseudocode": "B12345", - "version": "Raw", - "directory": "/foo/bar/baz"} - self.assertEquals(dataset_make_id(input), - "20w\techo\tB12345\tRaw\t/foo/bar/baz") - - def test_dataset_parse_id(self): - input = "20w\techo\tB12345\tRaw\t/foo/bar/baz" - output = dataset_parse_id(input) - self.assertEquals(output.get("wave"), "20w") - self.assertEquals(output.get("experiment_type"), "echo") - self.assertEquals(output.get("pseudocode"), "B12345") - self.assertEquals(output.get("version"), "Raw") - self.assertEquals(output.get("directory"), "/foo/bar/baz") diff --git a/unit-tests/test_schema_transformations.py b/unit-tests/test_schema_transformations.py new file mode 100644 index 000000000..d273365ca --- /dev/null +++ b/unit-tests/test_schema_transformations.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +"""Unit tests for the correctify functions in schema_transformations""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import sys +from unittest import TestCase + +sys.path.append('..') + +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_scopus + + +class CorrectifyIsniTest(TestCase): + def test_isni_correct_format(self): + """Test ISNI with correct format""" + isni = "https://isni.org/isni/1234123412341234" + self.assertEqual(correctify_isni(isni), isni) + + def test_isni_correct_format_containing_x(self): + """Test ISNI with correct format""" + isni = "https://isni.org/isni/123412341234123x" + correct_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), correct_isni) + + def test_isni_invalid_format(self): + """Test ISNI with invalid format (1 less number)""" + isni = "123412341234123" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_malformed_format(self): + """Test ISNI with invalid format""" + isni = "foobar0123456789" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_with_spaces(self): + """Test ISNI that contains spaces and 
should be corrected""" + isni = " https://isni.org/isni/123412341234123x " + corrected_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), corrected_isni) + + +class CorrectifyOrcidTest(TestCase): + def test_orcid_correct_format(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-1234" + self.assertEqual(correctify_orcid(orcid), orcid) + + def test_orcid_correct_format_containing_x(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-123x" + correct_orcid = "https://orcid.org/1234-1234-1234-123X" + self.assertEqual(correctify_orcid(orcid), correct_orcid) + + def test_orcid_invalid_format(self): + """Test ORCID with invalid format (1 less number)""" + orcid = "1234-1234-1234-123" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_malformed_format(self): + """Test ORCID with invalid format""" + orcid = "1234-foo-bar-1234" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_with_spaces(self): + """Test ORCID that contains spaces and should be corrected""" + orcid = " https://orcid.org/1234-1234-1234-123x " + corrected_orcid = "https://orcid.org/1234-1234-1234-123X" + self.assertEqual(correctify_orcid(orcid), corrected_orcid) + + +class CorrectifyScopusTest(TestCase): + def test_correctify_format(self): + """Test SCOPUS with correct format""" + scopus = "12345678901" + self.assertEqual(correctify_scopus(scopus), scopus) + + def test_correctify_invalid_format(self): + """Test SCOPUS with invalid format""" + scopus = "123456789012" + self.assertIsNone(correctify_scopus(scopus)) + + def test_malformed_format(self): + """Test SCOPUS with invalid format""" + scopus = "foobar1234" + self.assertIsNone(correctify_scopus(scopus)) + + def test_orcid_with_spaces(self): + """Test SCOPUS that contains spaces and should be corrected""" + scopus = " 01234567890 " + corrected_scopus = "01234567890" + self.assertEqual(correctify_scopus(scopus), corrected_scopus) diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py index cddbe5fcd..428fa33e8 100644 --- a/unit-tests/test_util_misc.py +++ b/unit-tests/test_util_misc.py @@ -6,35 +6,200 @@ import sys import time -from collections import OrderedDict +from collections import namedtuple, OrderedDict from unittest import TestCase sys.path.append('../util') -from misc import human_readable_size, last_run_time_acceptable, remove_empty_objects +from misc import check_data_package_system_avus, human_readable_size, last_run_time_acceptable, remove_empty_objects + +# AVs of a successfully published data package, that is the first version of the package +avs_success_data_package = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/ICGVFV-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/ICGVFV-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/ICGVFV.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/ICGVFV.html", + "org_publication_lastModifiedDateTime": "2024-10-04T15:32:46.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": 
"https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-04T15:33:17.853806", + "org_publication_randomId": "ICGVFV", + "org_publication_status": "OK", + "org_publication_submission_actor": "researcher#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-default-3/research-default-3[1728048679]", + "org_publication_versionDOI": "10.00012/UU01-ICGVFV", + "org_publication_versionDOIMinted": "yes", +} + +avs_success_data_package_multiversion = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseDOIMinted": "yes", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/YU0JDH-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/YU0JDH-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/YU0JDH.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/YU0JDH.html", + "org_publication_lastModifiedDateTime": "2024-10-11T08:49:17.000000", + "org_publication_license": "Custom", + "org_publication_oaiUploaded": "yes", + "org_publication_previous_version": "/tempZone/home/vault-initial1/new-group01[1728550839]", + "org_publication_publicationDate": "2024-10-11T08:50:01.812220", + "org_publication_randomId": "YU0JDH", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728629336]", + "org_publication_versionDOI": "10.00012/UU01-YU0JDH", + "org_publication_versionDOIMinted": "yes" +} + +avs_success_data_package_multiversion_first = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/T8D8QU-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/T8D8QU-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/T8D8QU.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/T8D8QU.html", + "org_publication_lastModifiedDateTime": "2024-10-10T09:06:05.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_next_version": "/tempZone/home/vault-initial1/new-group01[1728545387]", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-10T09:06:02.177810", + "org_publication_randomId": "T8D8QU", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728543897]", + "org_publication_versionDOI": "10.00012/UU01-T8D8QU", + "org_publication_versionDOIMinted": "yes", +} + +# From avu.py +Avu = namedtuple('Avu', list('avu')) +Avu.attr = 
Avu.a +Avu.value = Avu.v +Avu.unit = Avu.u class UtilMiscTest(TestCase): + def test_check_data_package_system_avus(self): + # Success + avs = avs_success_data_package + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, extra optional avu + avs['org_publication_baseDOIAvailable'] = 'yes' + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + del avs['org_publication_baseDOIAvailable'] + + # Missing license Uri for non-custom license + del avs['org_publication_licenseUri'] + avus_missing_license_uri = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_license_uri) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Custom license, no license Uri (happy flow) + avs['org_publication_license'] = "Custom" + avus_custom_license = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_custom_license) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Unexpected + avs['org_publication_userAddedSomethingWeird'] = "yodayoda:)" + avus_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_unexpected) + self.assertTrue(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing and unexpected + del avs['org_publication_landingPagePath'] + avus_missing_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_unexpected) + self.assertFalse(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing + del avs['org_publication_userAddedSomethingWeird'] + avus_missing = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, latest version of a publication + avs = avs_success_data_package_multiversion + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, first version of a publication that has had other versions + avs = 
avs_success_data_package_multiversion_first + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + def test_last_run_time_acceptable(self): """Test the last run time for copy to vault""" - # No last run time (job hasn't be tried before) + # No last run time (job hasn't been tried before) found = False last_run = 1 - self.assertEqual(last_run_time_acceptable("b", found, last_run, 300), True) + self.assertEqual(last_run_time_acceptable(found, last_run, 300), True) # Last run time greater than the backoff, so can run now = int(time.time()) found = True copy_backoff_time = 300 last_run = now - copy_backoff_time - 1 - self.assertEqual(last_run_time_acceptable("b", found, last_run, copy_backoff_time), True) + self.assertEqual(last_run_time_acceptable(found, last_run, copy_backoff_time), True) # Last run time more recent than the backoff, so should not run found = True copy_backoff_time = 300 last_run = now - self.assertEqual(last_run_time_acceptable("b", found, int(time.time()), copy_backoff_time), False) + self.assertEqual(last_run_time_acceptable(found, int(time.time()), copy_backoff_time), False) def test_human_readable_size(self): output = human_readable_size(0) diff --git a/unit-tests/unit_tests.py b/unit-tests/unit_tests.py index a008c8607..8af940d91 100644 --- a/unit-tests/unit_tests.py +++ b/unit-tests/unit_tests.py @@ -6,9 +6,9 @@ from unittest import makeSuite, TestSuite from test_group_import import GroupImportTest -from test_intake import IntakeTest from test_policies import PoliciesTest from test_revisions import RevisionTest +from test_schema_transformations import CorrectifyIsniTest, CorrectifyOrcidTest, CorrectifyScopusTest from test_util_misc import UtilMiscTest from test_util_pathutil import UtilPathutilTest from test_util_yoda_names import UtilYodaNamesTest @@ -16,8 +16,10 @@ def suite(): test_suite = TestSuite() + test_suite.addTest(makeSuite(CorrectifyIsniTest)) + test_suite.addTest(makeSuite(CorrectifyOrcidTest)) + test_suite.addTest(makeSuite(CorrectifyScopusTest)) test_suite.addTest(makeSuite(GroupImportTest)) - test_suite.addTest(makeSuite(IntakeTest)) test_suite.addTest(makeSuite(PoliciesTest)) test_suite.addTest(makeSuite(RevisionTest)) test_suite.addTest(makeSuite(UtilMiscTest)) diff --git a/util/avu.py b/util/avu.py index 470620403..0098fcea4 100644 --- a/util/avu.py +++ b/util/avu.py @@ -35,6 +35,18 @@ def of_coll(ctx, coll): "COLL_NAME = '{}'".format(coll))) +def get_attr_val_of_coll(ctx, coll, attr): + """Get the value corresponding to an attr for a given collection.""" + iter = genquery.Query( + ctx, + "META_COLL_ATTR_VALUE", + "META_COLL_ATTR_NAME = '{}' AND COLL_NAME = '{}'".format(attr, coll)) + + for row in iter: + return row + raise ValueError("Attribute {} not found in AVUs of collection {}".format(attr, coll)) + + def inside_coll(ctx, path, recursive=False): """Get a list of all AVUs inside a collection with corresponding paths. 
diff --git a/util/config.py b/util/config.py index 731d131d9..4827294c1 100644 --- a/util/config.py +++ b/util/config.py @@ -99,7 +99,6 @@ def __repr__(self): enable_deposit=False, enable_open_search=False, enable_inactivity_notification=False, - enable_intake=False, enable_datarequest=False, enable_data_package_archive=False, enable_data_package_download=False, diff --git a/util/group.py b/util/group.py index b56b32067..7ea8d4c10 100644 --- a/util/group.py +++ b/util/group.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Utility / convenience functions for querying user info.""" +"""Utility / convenience functions for querying group info.""" __copyright__ = 'Copyright (c) 2019-2023, Utrecht University' __license__ = 'GPLv3, see LICENSE' diff --git a/util/log.py b/util/log.py index 897b9562c..545e626ca 100644 --- a/util/log.py +++ b/util/log.py @@ -17,15 +17,20 @@ import user -def write(ctx, message): - """Write a message to the log, including client name and originating module. +def write(ctx, message, write_stdout=False): + """Write a message to the log or stdout. + Includes client name and originating module if writing to log. - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log + :param ctx: Combined type of a callback and rei struct + :param message: Message to write to log + :param write_stdout: Whether to write to stdout (used for a few of our scripts) """ - stack = inspect.stack()[1] - module = inspect.getmodule(stack[0]) - _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) + if write_stdout: + ctx.writeLine("stdout", message) + else: + stack = inspect.stack()[1] + module = inspect.getmodule(stack[0]) + _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) def _write(ctx, message): @@ -40,15 +45,6 @@ def _write(ctx, message): ctx.writeLine('serverLog', message) -def write_stdout(ctx, message): - """Write a message to stdout. Used for some of our scripts. - - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log - """ - ctx.writeLine("stdout", message) - - def debug(ctx, message): """"Write a message to the log, if in a development environment. diff --git a/util/misc.py b/util/misc.py index 12df2a0af..73b05d2e6 100644 --- a/util/misc.py +++ b/util/misc.py @@ -8,8 +8,90 @@ import time from collections import OrderedDict +import constants -def last_run_time_acceptable(coll, found, last_run, config_backoff_time): + +def check_data_package_system_avus(extracted_avus): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. 
+ + :param extracted_avus: AVUs of the data package in AVU form + + :returns: Dictionary of the results of the check + """ + # Filter those starting with 'org_publication' + extracted_avs = {} + for m in extracted_avus: + if m.attr.startswith(constants.UUORGMETADATAPREFIX + 'publication_'): + extracted_avs[m.attr] = m.value + extracted_attrs = set(extracted_avs.keys()) + + # Define the set of ground truth AVUs + avu_names_suffix = { + 'approval_actor', 'randomId', + 'versionDOI', 'dataCiteJsonPath', 'license', + 'anonymousAccess', 'versionDOIMinted', + 'accessRestriction', 'landingPagePath', + 'publicationDate', + 'vaultPackage', 'submission_actor', 'status', + 'lastModifiedDateTime', 'combiJsonPath', + 'landingPageUploaded', 'oaiUploaded', + 'landingPageUrl', 'dataCiteMetadataPosted' + } + + # If the license is not Custom, it must have a licenseUri + if constants.UUORGMETADATAPREFIX + 'publication_license' in extracted_attrs: + if extracted_avs[constants.UUORGMETADATAPREFIX + 'publication_license'] != "Custom": + avu_names_suffix.add('licenseUri') + + # Define additional set of AVUs with more than one version of publication + avu_names_version_suffix = { + 'previous_version', 'baseDOI', 'baseRandomId', + 'baseDOIMinted' + } + + # Define additional set of AVUs expected for the first version of a publication, when there are multiple versions + avu_names_first_version_suffix = { + 'baseRandomId', 'baseDOI', 'next_version' + } + + # for the second version, all we need is next_version in addition to avu_names_version_suffix + avu_names_previous_version_suffix = {'next_version'} + + # optional avus + avu_names_optional_suffix = { + 'versionDOIAvailable', 'baseDOIAvailable' + } + + combined_avu_names_suffix = avu_names_suffix + + if constants.UUORGMETADATAPREFIX + 'publication_previous_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_version_suffix) + if constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_previous_version_suffix) + elif constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_first_version_suffix) + + ground_truth_avus = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + combined_avu_names_suffix.update(avu_names_optional_suffix) + ground_truth_avus_with_optional = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + # Find missing and unexpected AVUs + missing_avus = ground_truth_avus - extracted_attrs + unexpected_avus = extracted_attrs - ground_truth_avus_with_optional + + results = { + 'no_missing_avus': not bool(missing_avus), + 'missing_avus': list(missing_avus), + 'no_unexpected_avus': not bool(unexpected_avus), + 'unexpected_avus': list(unexpected_avus) + } + + return results + + +def last_run_time_acceptable(found, last_run, config_backoff_time): """Return whether the last run time is acceptable to continue with task.""" now = int(time.time()) @@ -48,5 +130,5 @@ def remove_empty_objects(d): # Clean lists by filtering out empty objects. return [remove_empty_objects(item) for item in d if remove_empty_objects(item) not in (None, '', {}, [])] else: - # Return the value abecause it is not a dict or list. + # Return the value because it is not a dict or list. 
return d diff --git a/util/user.py b/util/user.py index ed94b67e6..28725672b 100644 --- a/util/user.py +++ b/util/user.py @@ -57,11 +57,17 @@ def from_str(ctx, s): def exists(ctx, user): - """Check if a user exists.""" + """Check if a user ('rodsuser' or 'rodsadmin') exists. + + :param ctx: Combined type of a callback and rei struct + :param user: Given user + + :returns: Boolean indicating if user exists + """ if type(user) is str: user = from_str(ctx, user) - return genquery.Query(ctx, "USER_TYPE", "USER_NAME = '{}' AND USER_ZONE = '{}'".format(*user)).first() is not None + return genquery.Query(ctx, "USER_TYPE", "USER_NAME = '{}' AND USER_ZONE = '{}'".format(*user)).first() in ["rodsuser", "rodsadmin"] def user_type(ctx, user=None): diff --git a/uuGroup.r b/uuGroup.r index 266f5aa11..6430f01b5 100644 --- a/uuGroup.r +++ b/uuGroup.r @@ -118,26 +118,6 @@ uuGroupVaultPathExists(*vaultName, *exists) { } } -# \brief Check if a rodsuser or rodsadmin with the given name exists. -# -# \param[in] userName username(#zone) -# \param[out] exists -# -uuUserExists(*user, *exists) { - *exists = false; - uuGetUserAndZone(*user, *userName, *userZone); - foreach ( - *row in - SELECT USER_NAME, USER_TYPE - WHERE USER_NAME = '*userName' - AND USER_ZONE = '*userZone' - ) { - if (*row."USER_TYPE" == "rodsuser" || *row."USER_TYPE" == "rodsadmin") { - *exists = true; - break; - } - } -} # \brief Check if a user is a member of the given group. # @@ -160,6 +140,7 @@ uuGroupUserExists(*group, *user, *includeRo, *membership) { } } + # \brief Check if the home collection belonging to a group is empty. # # \param[in] groupName group name (no zone) @@ -958,19 +939,20 @@ uuGroupUserAdd(*groupName, *user, *creatorUser, *creatorZone, *status, *message) # Check that the creator user exists *fullNameCreator = "*creatorUser#*creatorZone"; - uuUserExists(*fullNameCreator, *exists); + *exists = "" + rule_user_exists(*fullNameCreator, *exists); # If creator does not exist, exit - if (!*exists) { - succeed; # Return here (fail would ruin the status and error message). + if (*exists != "true") { + succeed; # Return here (fail would ruin the status and error message). } uuGetUserAndZone(*user, *userName, *userZone); *fullName = "*userName#*userZone"; - uuUserExists(*fullName, *exists); + rule_user_exists(*fullName, *exists); # User does not exist, add user to iRODS first. - if (!*exists) { + if (*exists != "true") { *kv."forGroup" = *groupName; *status = str(errorcode(msiSudoUserAdd(*fullName, "", "", "", *kv))); if (*status != '0') { @@ -1028,7 +1010,7 @@ uuGroupUserAdd(*groupName, *user, *creatorUser, *creatorZone, *status, *message) uuGroupUserAdd(*groupName, *user, *status, *message) { *status = '1'; *message = "An internal error occurred."; - + uuGroupUserAdd(*groupName, *user, $userNameClient, $rodsZoneClient, *status, *message) } diff --git a/uuGroupPolicies.r b/uuGroupPolicies.r index 7bf09aec9..c2cea29a3 100644 --- a/uuGroupPolicies.r +++ b/uuGroupPolicies.r @@ -59,14 +59,14 @@ uuGroupPreSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *p uuGetBaseGroup(*groupName, *baseName); if (*baseName == *groupName) { # Do not allow creating a standalone "read-" or "vault-" group. - # There must always be a corresponding "intake-" or "research-" group. + # There must always be a corresponding "research-" group. 
fail; } uuGroupUserIsManager(*baseName, uuClientFullName, *isManagerInBaseGroup); if (!*isManagerInBaseGroup) { # Only allow creation of a read or vault group if the creator is a - # manager in the base group. (research or intake). + # manager in the research group. fail; } @@ -469,14 +469,14 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic # taken after automatic creation of vault groups. } else { - # This is a group manager managed group (i.e. 'research-', 'deposit-','grp-', 'intake-', 'priv-', 'datamanager-'). + # This is a group manager managed group (i.e. 'research-', 'deposit-', 'priv-', 'datamanager-'). # Add the creator as a member. errorcode(msiSudoGroupMemberAdd(*groupName, uuClientFullName, "")); - # Perform group prefix-dependent actions (e.g. create vaults for intake/research groups). + # Perform group prefix-dependent actions (e.g. create vaults for research groups). - if (*groupName like regex "(intake|research)-.*") { + if (*groupName like regex "research-.*") { # Create a corresponding RO group. uuChop(*groupName, *_, *baseName, "-", true); *roGroupName = "read-*baseName"; @@ -502,7 +502,7 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic } else if (*groupName like "datamanager-*") { # Give the newly created datamanager group read access to all - # existing intake/research home dirs and vaults in its category. + # existing research home dirs and vaults in its category. *category = *policyKv."category"; foreach ( @@ -513,9 +513,9 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic AND META_USER_ATTR_NAME = 'category' AND META_USER_ATTR_VALUE = '*category' ) { - # Filter down to intake/research groups and get their vault groups. + # Filter down to research groups and get their vault groups. *catGroup = *row."USER_GROUP_NAME"; - if (*catGroup like regex "(intake|research)-.*") { + if (*catGroup like regex "research-.*") { *aclKv."forGroup" = *catGroup; msiSudoObjAclSet("recursive", "read", *groupName, "/$rodsZoneClient/home/*catGroup", *aclKv); @@ -548,7 +548,7 @@ uuPostSudoGroupAdd(*groupName, *initialAttr, *initialValue, *initialUnit, *polic # Set group manager-managed group metadata. # - # Note: Setting the category of an intake/research group will trigger + # Note: Setting the category of an research group will trigger # an ACL change: The datamanager group in the category, if it exists # will get read access to this group an its accompanying vault. # See uuPostSudoObjMetaSet. 
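Note for reviewers (illustrative only, not part of the patch): the uuGroupPolicyChecks.r hunk that follows tightens the group-name check so that only the 'research-' and 'deposit-' prefixes are accepted. A minimal Python sketch of the equivalent validation — the helper name is hypothetical and the ^/$ anchoring is an assumption of this sketch, the ruleset itself implements the check in the rule language — behaves as follows:

    import re

    # Mirrors the tightened uuGroupNameIsValid regex from uuGroupPolicyChecks.r.
    # ^/$ anchoring is added for this sketch only.
    _GROUP_NAME_RE = re.compile(r"^(research|deposit)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])$")

    def is_valid_group_name(name):
        """Hypothetical helper: lowercase letters, digits and hyphens; no leading/trailing hyphen."""
        return _GROUP_NAME_RE.match(name) is not None

    assert is_valid_group_name("research-initial")
    assert is_valid_group_name("deposit-pilot-2024")
    assert not is_valid_group_name("intake-youth")    # 'intake-' is no longer a valid prefix
    assert not is_valid_group_name("research-bad-")   # may not end with a hyphen
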
diff --git a/uuGroupPolicyChecks.r b/uuGroupPolicyChecks.r index 54d08ee04..374b84e44 100644 --- a/uuGroupPolicyChecks.r +++ b/uuGroupPolicyChecks.r @@ -38,7 +38,7 @@ uuUserNameIsValid(*name) # # Group names must: # -# - be prefixed with 'intake-' or 'research-' or 'deposit-' +# - be prefixed with 'research-' or 'deposit-' # - contain only lowercase characters, numbers and hyphens # - not start or end with a hyphen # @@ -49,7 +49,7 @@ uuUserNameIsValid(*name) # \param[in] name # uuGroupNameIsValid(*name) - = *name like regex ``(intake|research|deposit)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; + = *name like regex ``(research|deposit)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; uuGroupNameIsDatamanager(*name) = *name like regex ``(datamanager)-([a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])``; @@ -198,7 +198,7 @@ uuGroupPolicyCanGroupAdd(*actor, *groupName, *category, *subcategory, *expiratio uuChop(*groupName, *prefix, *base, "-", true); - # For research and intake groups: Make sure their ro and + # For research groups: Make sure their ro and # vault groups do not exist yet. *roName = "read-*base"; uuGroupExists(*roName, *roExists); @@ -274,7 +274,7 @@ uuGroupPolicyCanGroupAdd(*actor, *groupName, *category, *subcategory, *expiratio *reason = "You must have priv-group-add and priv-cat-add to add a datamanger group" } } else { - *reason = "Group names must start with one of 'intake-', 'research-', 'deposit-', or 'datamanager-' and may only contain lowercase letters (a-z) and hyphens (-)."; + *reason = "Group names must start with one of 'research-', 'deposit-', or 'datamanager-' and may only contain lowercase letters (a-z) and hyphens (-)."; } } else { *reason = "You cannot create groups because you are not a member of the priv-group-add group."; diff --git a/uuLock.r b/uuLock.r deleted file mode 100644 index 2f16e4820..000000000 --- a/uuLock.r +++ /dev/null @@ -1,140 +0,0 @@ -# \file uuLock.r -# \brief Locking functions. -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht University. All rights reserved. -# \license GPLv3, see LICENSE. - -# \brief Obtain a lock on a collection. -# -# \param[in] collection name of the collection to be locked -# \param[out] status 0 = locked, nonzero = lock failed (e.g. in use) -# -uuLock(*collection, *status) { - msiGetIcatTime(*dateTime, "unix"); - *lockId = $userNameClient ++ ":" ++ *dateTime; - # let everyone know we need a lock - # NB: a race condition could happen when another process owned by - # the same user requests a lock at the very same second. 
- # to minimize the risk we include username in the lockid - msiString2KeyValPair("uuLockRequest=*lockId",*kvLockRequest); - msiAssociateKeyValuePairsToObj(*kvLockRequest, *collection, "-C"); - # check upstream and on collection itself if lock (request) exists - *path = ""; - *lockFound = false; - foreach (*segment in split(*collection, '/')) { - *path = "*path/*segment"; - if (*path != *collection) { - uuLockExists(*path, *lockFound); - if (*lockFound) { - break; - } - } else { - # TODO check collection itself yet ignore our own request - foreach (*row in SELECT META_COLL_ATTR_NAME,META_COLL_ATTR_VALUE - WHERE COLL_NAME = *collection - AND META_COLL_ATTR_NAME LIKE "uuLock%" - ) { - msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key); - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value); - if ("*key=*value" != "uuLockRequest=*lockId"){ - *lockFound = true; - } - } - } - } - if (!*lockFound) { - # also check downstream if other have (requested) a lock - # we can check all subcollections in one go - foreach (*rows in SELECT META_COLL_ATTR_NAME,COLL_NAME - WHERE COLL_PARENT_NAME LIKE '*collection%' - AND META_COLL_ATTR_NAME LIKE 'uuLock%' - ){ - # SELECT does not support 'OR' construct, therefore we need to - # check and ignore collections that start with similar prefix - # yet are in a different tree - # e.g. /zone/home/col/col2 and /zone/home/cola/col2 - # both cases col2 appears to have parent "col%" - msiGetValByKey(*rows, "COLL_NAME", *thisCollection); - if (*thisCollection like "*collection/\*") { - # we have an existing lock - *lockFound = true; - break; - } - } - } - if (*lockFound) { - *status = 1; - # retract our lock request, someone else got a lock - msiRemoveKeyValuePairsFromObj(*kvLockRequest, *collection, "-C"); - } else { - # change our request into a real lock - msiString2KeyValPair("uuLocked=*lockId",*kvLock); - msiAssociateKeyValuePairsToObj(*kvLock, *collection, "-C"); - msiRemoveKeyValuePairsFromObj(*kvLockRequest, *collection, "-C"); - *status = 0; - } -} - -# -# \brief uuUnlock unlocks a collection -# -# \param[in] collection name of the collection to unlock -uuUnlock(*collection) { - # NB: always succeeds regardless if lock actually exists - foreach (*rows in SELECT META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*collection' - AND META_COLL_ATTR_NAME = 'uuLocked' - ){ - # should return max 1 row, otherwise we have multiple locks?? - msiGetValByKey(*rows,"META_COLL_ATTR_VALUE",*lockValue); - msiString2KeyValPair("uuLocked=*lockValue",*kvLocked); - msiRemoveKeyValuePairsFromObj(*kvLocked, *collection, "-C") - } -} - -# \brief See if a collection has a lock on it. 
-# -# \param[in] collection name of the collection -# \param[out] isLocked true if collection has a lock(request) -# -uuLockExists(*collection, *isLocked) { - # NB: reports true for both existing locks and lock requests - *isLocked = false; - msiGetIcatTime(*currentTime, "unix"); - foreach (*row in SELECT META_COLL_ATTR_NAME,META_COLL_ATTR_VALUE - WHERE COLL_NAME = *collection - AND META_COLL_ATTR_NAME LIKE "uuLock%" - ) { - # rows found means there is an existing lock (request) - # our last hope is that this is an expired request that we can ignore - msiGetValByKey(*row,"META_COLL_ATTR_NAME",*lockKey); - msiGetValByKey(*row,"META_COLL_ATTR_VALUE",*lockValue); - *lockTime = double(uuLockGetDateTime(*lockValue)); - if ( - ((*lockTime + 7 * 86400 ) < *currentTime) - # remove locks/requests after expire time of 1 week - # && (*lockKey == "lockRequest") - ) { - # cleanup lock requests older than 5 minutes - msiString2KeyValPair("*lockKey=*lockValue",*kvExpiredLock); - msiRemoveKeyValuePairsFromObj(*kvExpiredLock, *collection, "-C"); - } else { - # there is a valid existing lock - *isLocked = true; - } - } -} - -# \brief Function to get the username part of a lock. -# -# \param[in] lock name of the lock -# \return username -# -uuLockGetUser(*lock) = substr(*lock, 0, strlen(*lock) - strlen(triml(*lock,":")) -1); - -# \brief Function to get the datestamp part of a lock. -# -# \param[in] lock name of the lock -# \return datetimestamp (in seconds since epoch) -# -uuLockGetDateTime(*lock) = triml(*lock,":"); diff --git a/yc2Vault.r b/yc2Vault.r deleted file mode 100644 index ce9331e6a..000000000 --- a/yc2Vault.r +++ /dev/null @@ -1,388 +0,0 @@ -# \file -# \brief move selected datasets from intake area to the vault area -# this rule is to be executed by a background process with write access to vault -# and read access to the intake area -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht university. 
All rights reserved -# \license GPLv3, see LICENSE -# -#test { -# *intakeRoot = '/nluu1ot/home/grp-intake-youth'; -# *vaultRoot = '/nluu1ot/home/grp-vault-youth'; -# uuYc2Vault(*intakeRoot, *vaultRoot, *status); -# writeLine("serverLog","result status of yc2Vault is *status"); -#} - - -# \brief -# -# \param[in] path pathname of the tree-item -# \param[in] name segment of path, name of collection or data object -# \param[in] isCol true if the object is a collection, otherwise false -# \param[in,out] buffer -# -#uuTreeMyRule(*parent, *objectName, *isCol, *buffer) { -# writeLine("serverLog","parent = *parent"); -# writeLine("serverLog","name = *objectName"); -# writeLine("serverLog","isCol = *isCol"); -# writeLine("serverLog","buffer[path]= " ++ *buffer."path"); -# if (*isCol) { -# *buffer."path" = *buffer."path"++"="; -# } -#} - - - - -uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *datasetPath) { - uuYcDatasetParseId(*datasetId, *datasetComponents); - *wave = *datasetComponents."wave"; - *experimentType = *datasetComponents."experiment_type"; - *pseudocode = *datasetComponents."pseudocode"; - *version = *datasetComponents."version"; - *sep = "_"; - *wepv = *wave ++ *sep ++ *experimentType ++ *sep ++ *pseudocode ++ *sep ++ "ver*version"; - *datasetPath = "*vaultRoot/*wave/*experimentType/*pseudocode/*wepv"; -} - -uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists) { - *exists = false; - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *datasetPath); - foreach (*row in SELECT COLL_NAME WHERE COLL_NAME = '*datasetPath') { - *exists = true; - break; - } -} - - -uuYcVaultDatasetAddMeta(*vaultPath, *datasetId) { - uuYcDatasetParseId(*datasetId, *datasetComponents); - *wave = *datasetComponents."wave"; - *experimentType = *datasetComponents."experiment_type"; - *pseudocode = *datasetComponents."pseudocode"; - *version = *datasetComponents."version"; - msiGetIcatTime(*date, "unix"); - msiAddKeyVal(*kv, "wave", *wave); - msiAddKeyVal(*kv, "experiment_type", *experimentType); - msiAddKeyVal(*kv, "pseudocode", *pseudocode); - msiAddKeyVal(*kv, "version", *version); - msiAddKeyVal(*kv, "dataset_date_created", *date); - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); -} - -uuYcVaultWalkRemoveObject(*itemParent, *itemName, *itemIsCollection, *buffer, *status) { -# writeLine("serverLog", "...removing *itemParent/*itemName"); - if (*itemIsCollection) { - msiRmColl("*itemParent/*itemName", "forceFlag=", *status); - } else { - msiDataObjUnlink("objPath=*itemParent/*itemName++++forceFlag=", *status); - } -} - - -uuYcVaultIngestObject(*objectPath, *isCollection, *vaultPath, *status) { - # from the original object only the below list '*copiedMetadata' of metadata keys - # is copied to the vault object, other info is ignored - *copiedMetadata = list("wave", "experiment_type", "pseudocode", "version", - "error", "warning", "comment", "dataset_error", - "dataset_warning", "datasetid"); - *status = 0; - if (*isCollection) { - msiCollCreate(*vaultPath, "1", *status); - if (*status == 0) { - foreach (*row in SELECT META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*objectPath' - ) { - msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key); - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value); - msiString2KeyValPair("*key=*value",*kv); - # add relevant kvlist to vault collection object - foreach (*meta in *copiedMetadata) { - if (*key == *meta) { - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); - } - } - } - foreach (*row in SELECT COLL_OWNER_NAME, COLL_OWNER_ZONE, COLL_CREATE_TIME - WHERE 
COLL_NAME = '*objectPath' - ) { - msiGetValByKey(*row, "COLL_OWNER_NAME", *ownerName); - msiGetValByKey(*row, "COLL_OWNER_ZONE", *ownerZone); - msiGetValByKey(*row, "COLL_CREATE_TIME", *createTime); - msiString2KeyValPair("submitted_by=*ownerName#*ownerZone",*kvSubmittedBy); - msiString2KeyValPair("submitted_date=*createTime",*kvSubmittedDate); - msiAssociateKeyValuePairsToObj(*kvSubmittedBy, *vaultPath, "-C"); - msiAssociateKeyValuePairsToObj(*kvSubmittedDate, *vaultPath, "-C"); - } - } - } else { # its not a collection but a data object - # first chksum the original file, then use it to verify the vault copy - msiDataObjChksum(*objectPath, "forceChksum=", *checksum); - msiDataObjCopy(*objectPath, *vaultPath, "verifyChksum=", *status); - if (*status == 0) { - uuChopPath(*objectPath, *collection, *dataName); - foreach (*row in SELECT META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE - WHERE COLL_NAME = '*collection' - AND DATA_NAME = '*dataName' - ) { - msiGetValByKey(*row, "META_DATA_ATTR_NAME", *key); - msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *value); - # add relevant kvlist to vault collection object - msiString2KeyValPair("*key=*value",*kv); - foreach (*meta in *copiedMetadata) { - if (*key == *meta) { - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-d"); - } - } - } - # add metadata found in system info - foreach (*row in SELECT DATA_OWNER_NAME, DATA_OWNER_ZONE, DATA_CREATE_TIME - WHERE COLL_NAME = '*collection' - AND DATA_NAME = '*dataName' - ) { - msiGetValByKey(*row, "DATA_OWNER_NAME", *ownerName); - msiGetValByKey(*row, "DATA_OWNER_ZONE", *ownerZone); - msiGetValByKey(*row, "DATA_CREATE_TIME", *createTime); - msiString2KeyValPair("submitted_by=*ownerName#*ownerZone",*kvSubmittedBy); - msiString2KeyValPair("submitted_date=*createTime",*kvSubmittedDate); - msiAssociateKeyValuePairsToObj(*kvSubmittedBy, *vaultPath, "-d"); - msiAssociateKeyValuePairsToObj(*kvSubmittedDate, *vaultPath, "-d"); - # Skip duplicas - break; - } - } - } -} - - - -uuYcVaultWalkIngestObject(*itemParent, *itemName, *itemIsCollection, *buffer, *status) { - *sourcePath = "*itemParent/*itemName"; - *destPath = *buffer."destination"; # top level destination is specified - if (*sourcePath != *buffer."source") { - # rewrite path to copy objects that are located underneath the toplevel collection - *sourceLength = strlen(*sourcePath); - *relativePath = substr(*sourcePath, strlen(*buffer."source") + 1, *sourceLength); - *destPath = *buffer."destination" ++ "/" ++ *relativePath; - } -# writeLine("serverLog","VLT from = *sourcePath"); -# writeLine("serverLog","VLT to = *destPath"); - uuYcVaultIngestObject(*sourcePath, *itemIsCollection, *destPath, *status); -} - - -uuYcDatasetCollectionMove2Vault(*intakeRoot,*topLevelCollection, *datasetId, *vaultRoot, *status) { - writeLine("serverLog","\nmoving dataset-typeA *datasetId from *topLevelCollection to vault"); - *status = 0; - uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists); - if (!*exists) { - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *vaultPath); - # create the in-between levels of the path to the toplevel collection - uuChopPath(*vaultPath, *vaultParent, *vaultCollection); - msiCollCreate(*vaultParent, "1", *status); -# writeLine("serverLog","VAULT: dataset created *datasetId status=*status path=*vaultPath"); - if (*status == 0) { - # copy the dataset tree to the vault - uuChopPath(*topLevelCollection, *intakeParent, *intakeCollection); - *buffer."source" = *topLevelCollection; - *buffer."destination" = *vaultPath; -# writeLine("serverLog","VAULT: source = 
*topLevelCollection"); -# writeLine("serverLog","VAULT: dest = *vaultPath"); - uuTreeWalk( - "forward", - *topLevelCollection, - "uuYcVaultWalkIngestObject", - *buffer, - *status - ); - uuKvClear(*buffer); - if (*status == 0) { - # stamp the vault dataset collection with additional metadata - msiGetIcatTime(*date, "unix"); - msiAddKeyVal(*kv, "dataset_date_created", *date); - msiAssociateKeyValuePairsToObj(*kv, *vaultPath, "-C"); - # and finally remove the dataset original in the intake area - msiRmColl(*topLevelCollection, "forceFlag=", *error); -# uuTreeWalk( -# "reverse", -# *topLevelCollection, -# "uuYcVaultWalkRemoveObject", -# *buffer, -# *error -# ); - if (*error != 0) { - writeLine("serverLog", - "ERROR: unable to remove intake collection *topLevelCollection"); - } - } else { - # move failed (partially), cleanup vault - # NB: keep the dataset in the vault queue so we can retry some other time - writeLine("serverLog","ERROR: Ingest failed for *datasetId error = *status"); - uuTreeWalk("reverse", *vaultPath, "uuYcVaultWalkRemoveObject", *buffer, *error); - } - - } - } else { - writeLine("serverLog","INFO: version already exists in vault: *datasetId"); - # duplicate dataset, signal error and throw out of vault queue - *message = "Duplicate dataset, version already exists in vault"; - uuYcDatasetErrorAdd(*intakeRoot, *datasetId,*message); - uuYcDatasetMelt(*topLevelCollection, *datasetId, *status); - uuYcDatasetUnlock(*topLevelCollection, *datasetId, *status); - *status = 1; # duplicate dataset version error - } -} - -uuYcDatasetObjectsOnlyMove2Vault(*intakeRoot, *topLevelCollection, *datasetId, *vaultRoot, *status) { - writeLine("serverLog","\nmoving dataset-typeB *datasetId from *topLevelCollection to vault"); - uuYcVaultDatasetExists(*vaultRoot, *datasetId, *exists); - if (!*exists) { - # new dataset(version) we can safely ingest into vault - uuYcVaultDatasetGetPath(*vaultRoot, *datasetId, *vaultPath); - # create path to and including the toplevel collection (will create in-between levels) - msiCollCreate(*vaultPath, "1", *status); -# writeLine("serverLog","VAULT: dataset created *datasetId status=*status path=*vaultPath"); - if (*status == 0) { - # stamp the vault dataset collection with default metadata - uuYcVaultDatasetAddMeta(*vaultPath, *datasetId); - # copy data objects to the vault - foreach (*dataRow in SELECT DATA_NAME - WHERE COLL_NAME = '*topLevelCollection' - AND META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - ) { - msiGetValByKey(*dataRow, "DATA_NAME", *dataName); - *intakePath = "*topLevelCollection/*dataName"; - uuYcVaultIngestObject(*intakePath, false, "*vaultPath/*dataName", *status); - if (*status != 0) { - break; - } - } - if (*status == 0) { - # data ingested, what's left is to delete the original in intake area - # this will also melt/unfreeze etc because metadata is removed too - foreach (*dataRow in SELECT DATA_NAME - WHERE COLL_NAME = '*topLevelCollection' - AND META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - ) { - msiGetValByKey(*dataRow, "DATA_NAME", *dataName); - *intakePath = "*topLevelCollection/*dataName"; -# writeLine("serverLog","removing intake file: *intakePath"); - msiDataObjUnlink("objPath=*intakePath++++forceFlag=", *error); - if (*error != 0) { - writeLine("serverLog","ERROR: unable to remove intake object *intakePath"); - } - } - } else { - # error occurred during ingest, cleanup vault area and relay the error to user - # NB: keep the dataset in the vault queue so we can 
retry some other time - writeLine("serverLog","ERROR: Ingest failed for *datasetId error = *status"); - *buffer = "required yet dummy parameter"; - uuTreeWalk("reverse", *vaultPath, "uuYcVaultWalkRemoveObject", *buffer, *error); - } - } - } else { - # duplicate dataset, signal error and throw out of vault queue - writeLine("serverLog","INFO: version already exists in vault: *datasetId"); - *message = "Duplicate dataset, version already exists in vault"; - uuYcDatasetErrorAdd(*intakeRoot, *datasetId,*message); - uuYcDatasetMelt(*topLevelCollection, *datasetId, *status); - uuYcDatasetUnlock(*topLevelCollection, *datasetId, *status); - *status = 1; # duplicate dataset version error - } -} - - - -# \brief move all locked datasets to the vault -# -# \param[in] intakeCollection pathname root of intake area -# \param[in] vaultCollection pathname root of vault area -# \param[out] status result of operation either "ok" or "error" -# -uuYc2Vault(*intakeRoot, *vaultRoot, *status) { - # 1. add to_vault_freeze metadata lock to the dataset - # 2. check that dataset does not yet exist in the vault - # 3. copy dataset to vault with its metadata - # 4. remove dataset from intake - # upon any error: - # - delete partial data from vault - # - add error to intake dataset metadata - # - remove locks on intake dataset (to_vault_freeze, to_vault_lock) - *status = 0; # 0 is success, nonzero is error - *datasets_moved = 0; - - # note that we have to allow for multiple types of datasets: - # type A: a single toplevel collection with a tree underneath - # type B: one or more data files located within the same collection - # processing varies slightly between them, so process each type in turn - # - # TYPE A: - foreach (*row in SELECT COLL_NAME, META_COLL_ATTR_VALUE - WHERE META_COLL_ATTR_NAME = 'dataset_toplevel' - AND COLL_NAME like '*intakeRoot/%') { - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *datasetId); - uuYcObjectIsLocked(*topLevelCollection, true, *locked, *frozen); - if (*locked) { - uuYcDatasetFreeze(*topLevelCollection, *datasetId, *status); - if (*status == 0) { - # dataset frozen; now move to vault and remove from intake area - uuYcDatasetCollectionMove2Vault( - *intakeRoot, - *topLevelCollection, - *datasetId, - *vaultRoot, - *status - ); - if (*status == 0) { - *datasets_moved = *datasets_moved + 1; - } - } - } - } - # TYPE B: - foreach (*row in SELECT COLL_NAME, META_DATA_ATTR_VALUE - WHERE META_DATA_ATTR_NAME = 'dataset_toplevel' - AND COLL_NAME like '*intakeRoot%' -# fixme: skip collnames that are not in the same tree yet share the prefix - ) { - - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - msiGetValByKey(*row, "META_DATA_ATTR_VALUE", *datasetId); - # check if to_vault_lock exists on all the dataobjects of this dataset - *allLocked = true; - foreach (*dataRow in SELECT DATA_NAME - WHERE COLL_NAME = '*topLevelCollection' - AND META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - ) { - msiGetValByKey(*dataRow, "DATA_NAME", *dataName); - uuYcObjectIsLocked("*topLevelCollection/*dataName", false, *locked, *frozen); - *allLocked = *allLocked && *locked; - } - if (*allLocked) { - uuYcDatasetFreeze(*topLevelCollection, *datasetId, *status); - if (*status == 0) { - # dataset frozen, now move to fault and remove from intake area - uuYcDatasetObjectsOnlyMove2Vault( - *intakeRoot, - *topLevelCollection, - *datasetId, - *vaultRoot, - *status - ); - if (*status == 0) { - *datasets_moved = *datasets_moved 
+ 1; - } - } - } - } - if (*datasets_moved > 0) { - writeLine("serverLog","\nmoved in total *datasets_moved dataset(s) to the vault"); - } -} - -#input null -#output ruleExecOut diff --git a/ycDataset.r b/ycDataset.r deleted file mode 100644 index 4cb09dab7..000000000 --- a/ycDataset.r +++ /dev/null @@ -1,175 +0,0 @@ -# \file -# \brief Youth Cohort - Dataset related functions. -# \author Chris Smeele -# \copyright Copyright (c) 2015, Utrecht University. All rights reserved. -# \license GPLv3, see LICENSE - -# \brief Generate a dataset identifier based on WEPV values. -# -# \param[in] idComponents a kvList containing WEPV values -# \param[out] id a dataset id string -# -uuYcDatasetMakeId(*idComponents, *id){ - *id = - *idComponents."wave" - ++ "\t" ++ *idComponents."experiment_type" - ++ "\t" ++ *idComponents."pseudocode" - ++ "\t" ++ *idComponents."version" - ++ "\t" ++ *idComponents."directory"; -} - -# \brief Parse a dataset identifier and return WEPV values. -# -# \param[in] id a dataset id string -# \param[out] idComponents a kvList containing WEPV values -# -uuYcDatasetParseId(*id, *idComponents){ - *idParts = split(*id, "\t"); - *idComponents."wave" = elem(*idParts, 0); - *idComponents."experiment_type" = elem(*idParts, 1); - *idComponents."pseudocode" = elem(*idParts, 2); - *idComponents."version" = elem(*idParts, 3); - *idComponents."directory" = elem(*idParts, 4); -} - -# \brief Find dataset ids under *root. -# -# \param[in] root -# \param[out] ids a list of dataset ids -# -uuYcDatasetGetIds(*root, *ids) { - *idsString = ""; - foreach (*item in SELECT META_DATA_ATTR_VALUE WHERE COLL_NAME = "*root" AND META_DATA_ATTR_NAME = 'dataset_id') { - # Datasets directly under *root need to be checked for separately due to limitations on the general query system. - if (strlen(*idsString) > 0) { - *idsString = *idsString ++ "\n"; - } - *idsString = *idsString ++ *item."META_DATA_ATTR_VALUE"; - } - foreach (*item in SELECT META_DATA_ATTR_VALUE WHERE COLL_NAME LIKE "*root/%" AND META_DATA_ATTR_NAME = 'dataset_id') { - if (strlen(*idsString) > 0) { - *idsString = *idsString ++ "\n"; - } - *idsString = *idsString ++ *item."META_DATA_ATTR_VALUE"; - } - *ids = split(*idsString, "\n"); -} - -# \brief Get a list of toplevel objects that belong to the given dataset id. -# -# \param[in] root -# \param[in] id -# \param[out] objects a list of toplevel object paths -# \param[out] isCollection whether this dataset consists of a single toplevel collection -# -uuYcDatasetGetToplevelObjects(*root, *id, *objects, *isCollection) { - *isCollection = false; - - *objectsString = ""; - foreach (*item in SELECT COLL_NAME WHERE COLL_NAME LIKE "*root/%" AND META_COLL_ATTR_NAME = 'dataset_toplevel' AND META_COLL_ATTR_VALUE = "*id") { - *isCollection = true; - *objectsString = *item."COLL_NAME"; - } - if (!*isCollection) { - foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME = "*root" AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = "*id") { - # Datasets directly under *root need to be checked for separately due to limitations on the general query system. 
- if (strlen(*objectsString) > 0) { - *objectsString = *objectsString ++ "\n"; - } - *objectsString = *objectsString ++ *item."COLL_NAME" ++ "/" ++ *item."DATA_NAME"; - } - foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME LIKE "*root/%" AND META_DATA_ATTR_NAME = 'dataset_toplevel' AND META_DATA_ATTR_VALUE = "*id") { - if (strlen(*objectsString) > 0) { - *objectsString = *objectsString ++ "\n"; - } - *objectsString = *objectsString ++ *item."COLL_NAME" ++ "/" ++ *item."DATA_NAME"; - } - } - *objects = split(*objectsString, "\n"); - #writeLine("stdout", "Got dataset toplevel objects for <*id>: *objectsString"); -} - -# \brief Get a list of relative paths to all data objects in a dataset. -# -# \param[in] root -# \param[in] id -# \param[out] objects a list of relative object paths (e.g. file1.dat, some-subdir/file2.dat...) -# -uuYcDatasetGetDataObjectRelPaths(*root, *id, *objects) { - - uuYcDatasetGetToplevelObjects(*root, *id, *toplevelObjects, *isCollection); - - # NOTE: This will crash when an invalid dataset id is provided. - if (*isCollection) { - *parentCollection = elem(*toplevelObjects, 0); - } else { - uuChopPath(elem(*toplevelObjects, 0), *dataObjectParent, *dataObjectName); - *parentCollection = *dataObjectParent; - } - - *objectsString = ""; - foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME = "*parentCollection" AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = "*id") { - # Datasets directly under *root need to be checked for separately due to limitations on the general query system. - if (strlen(*objectsString) > 0) { - *objectsString = *objectsString ++ "\n"; - } - *objectsString = *objectsString ++ *item."DATA_NAME"; - } - foreach (*item in SELECT DATA_NAME, COLL_NAME WHERE COLL_NAME LIKE "*parentCollection/%" AND META_DATA_ATTR_NAME = 'dataset_id' AND META_DATA_ATTR_VALUE = "*id") { - if (strlen(*objectsString) > 0) { - *objectsString = *objectsString ++ "\n"; - } - *objectsString = *objectsString - ++ substr(*item."COLL_NAME", strlen(*parentCollection)+1, strlen(*item."COLL_NAME")) - ++ "/" - ++ *item."DATA_NAME"; - } - *objects = split(*objectsString, "\n"); -} - -# \brief Check if a dataset id is locked. -# -# \param[in] root -# \param[in] id -# \param[out] isLocked -# \param[out] isFrozen -# -uuYcDatasetIsLocked(*root, *id, *isLocked, *isFrozen) { - uuYcDatasetGetToplevelObjects(*root, *id, *toplevelObjects, *isCollection); - - *isLocked = false; - *isFrozen = false; - foreach (*item in *toplevelObjects) { - uuYcObjectIsLocked(*item, *isCollection, *isLocked, *isFrozen); - if (*isLocked || *isFrozen) { - break; - } - } -} - - -# \brief Adds an error to the dataset specified by *datasetId. -# -# \param[in] root -# \param[in] datasetId -# \param[in] message -# -uuYcDatasetErrorAdd(*root, *datasetId, *message) { - - uuYcDatasetGetToplevelObjects(*root, *datasetId, *toplevelObjects, *isCollection); - - foreach (*toplevel in *toplevelObjects) { - msiAddKeyVal(*kv, "dataset_error", "*message"); - # note that we want to silently ignore any duplicates of the message (using errorcode) - errorcode(msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollection then "-C" else "-d")); - - # This does not work for some reason. 
- #uuSetMetaData( - # *toplevel, - # "comment", - # *comment, - # if *isCollection then "-C" else "-d" - #); - } -} - diff --git a/ycDatasetGetToplevel.r b/ycDatasetGetToplevel.r deleted file mode 100644 index 54817e1fe..000000000 --- a/ycDatasetGetToplevel.r +++ /dev/null @@ -1,76 +0,0 @@ -# \file -# \brief dataset lookup function -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht university. All rights reserved -# \license GPLv3, see LICENSE -# - -#test { -# uuYcDatasetGetTopLevel("/tsm/home/rods", "x", *collection, *isCol); -# writeLine("stdout","coll = *collection and isCol = *isCol"); -#} - - -# \brief uuYcDatasetGetTopLevel retrieves the collection path and dataset type for a dataset -# -# \param[in] rootcollection path of a tree to search for the dataset -# \param[in] datasetid unique identifier of the dataset -# \param[out] topLevelCollection collection that has the dataset -# if dataset is not found an empty string is returned -# \param[out] topLevelIsCollection type of dataset: true = collection false = data objects -# -uuYcDatasetGetTopLevel(*rootCollection, *datasetId, *topLevelCollection, *topLevelIsCollection) { - # datasets can be - # A) one collection with a subtree - # B) one or more data objects located (possibly with other objects) in same collection - *topLevelIsCollection = false; - *topLevelCollection = ""; - # try to find a collection. note we will expect 0 or 1 rows: - foreach (*row in SELECT COLL_NAME - WHERE META_COLL_ATTR_NAME = 'dataset_toplevel' - AND META_COLL_ATTR_VALUE = '*datasetId' - AND COLL_NAME LIKE '*rootCollection/%' - ) { - *topLevelIsCollection = true; - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - } - if (! *topLevelIsCollection) { - # also try the root itself - foreach (*row in SELECT COLL_NAME - WHERE META_COLL_ATTR_NAME = 'dataset_toplevel' - AND META_COLL_ATTR_VALUE = '*datasetId' - AND COLL_NAME = '*rootCollection' - ) { - *topLevelIsCollection = true; - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - } - } - if (! *topLevelIsCollection) { - # apparently not a collection, let's search for data objects instead - foreach (*row in SELECT COLL_NAME,DATA_NAME - WHERE META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - AND COLL_NAME LIKE '*rootCollection/%' - ) { - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - break; - } - if (*topLevelCollection == "") { - # not found yet, maybe data object(s) in the rootcollection itself? - - foreach (*row in SELECT COLL_NAME,DATA_NAME - WHERE META_DATA_ATTR_NAME = 'dataset_toplevel' - AND META_DATA_ATTR_VALUE = '*datasetId' - AND COLL_NAME = '*rootCollection' - ) { - msiGetValByKey(*row, "COLL_NAME", *topLevelCollection); - break; - } - } else { - # dataset not found! - } - } -} - -#input null -#output ruleExecOut diff --git a/ycDatasetLock.r b/ycDatasetLock.r deleted file mode 100644 index f0497d59b..000000000 --- a/ycDatasetLock.r +++ /dev/null @@ -1,253 +0,0 @@ -# \file -# \brief lock/freeze and unlock/unfreeze datasets within a collection -# \author Ton Smeele -# \copyright Copyright (c) 2015, Utrecht university. 
All rights reserved -# \license GPLv3, see LICENSE -# - -#test { -#*collection = "/nluu1ot/home/ton"; -#*datasetId = "y"; -#uuYcDatasetLock(*collection, *datasetId, *result); -#writeLine("stdout","lock result = *result"); -#uuYcDatasetFreeze(*collection, *datasetId, *result); -#writeLine("stdout","freeze result = *result"); -#uuYcObjectIsLocked("*collection/Newfile.txt",false, *locked, *frozen); -#writeLine("stdout","locked = *locked and frozen = *frozen"); - -#uuYcDatasetUnlock(*collection, *datasetId, *result); -#writeLine("stdout","unlock result = *result"); -#uuYcDatasetMelt(*collection, *datasetId, *result); -#writeLine("stdout","melt result = *result"); -#uuYcDatasetUnlock(*collection, *datasetId, *result); -#writeLine("stdout","unlock result = *result"); -#} - -uuYcDatasetLockChangeObject(*parentCollection, *objectName, *isCollection, - *lockName, *lockIt, *dateTime,*result) { - *objectType = "-d"; - *path = "*parentCollection/*objectName"; - if (*isCollection) { - *objectType = "-C"; - *collection = *objectName; - } - if (*lockIt) { - msiString2KeyValPair("*lockName=*dateTime",*kvPair); - *result = errorcode(msiSetKeyValuePairsToObj(*kvPair, *path, *objectType)); - } else { # unlock it - # - # if the lock is of type to_vault_lock this operation is - # disallowed if the object also has a to_vault_freeze lock - uuYcObjectIsLocked(*path,*isCollection,*locked,*frozen); - *allowed = (*lockName == "to_vault_freeze") || !*frozen; - if (*allowed) { - *result = 0; - # in order to remove the key we need to lookup its value(s) - if (*isCollection) { - # remove lock from collection - foreach (*row in SELECT META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*path' - AND META_COLL_ATTR_NAME = '*lockName') { - msiGetValByKey(*row, "META_COLL_ATTR_VALUE", *value); - msiString2KeyValPair("*lockName=*value", *kvPair); - *result = errorcode( - msiRemoveKeyValuePairsFromObj(*kvPair, *path, "-C") - ); - if (*result != 0) { - break; - } - } - } else { - # remove lock from data object - foreach (*row in SELECT META_DATA_ATTR_VALUE - WHERE DATA_NAME = '*objectName' - AND COLL_NAME = '*parentCollection' - AND META_DATA_ATTR_NAME = '*lockName' - ) { - msiGetValByKey(*row,"META_DATA_ATTR_VALUE",*value); - msiString2KeyValPair("*lockName=*value",*kvPair); - *result = errorcode( - msiRemoveKeyValuePairsFromObj( - *kvPair, - "*parentCollection/*objectName", - "-d" - ) - ); - if (*result != 0) { - break; - } - } - } # end else remove lock from dataobject - } else { # unlock not allowed - *result = -1; - } - } -} - -uuYcDatasetWalkVaultLock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) { - msiGetIcatTime(*dateTime,"unix"); - uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection, - "to_vault_lock", true, *dateTime, *error); -} - -uuYcDatasetWalkVaultUnlock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) { - msiGetIcatTime(*dateTime,"unix"); - uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection, - "to_vault_lock", false, *dateTime, *error); -} - -uuYcDatasetWalkFreezeLock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) { - msiGetIcatTime(*dateTime,"unix"); - uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection, - "to_vault_freeze", true, *dateTime, *error); -} - - -uuYcDatasetWalkFreezeUnlock(*itemCollection, *itemName, *itemIsCollection, *buffer, *error) { - msiGetIcatTime(*dateTime,"unix"); - uuYcDatasetLockChangeObject(*itemCollection, *itemName, *itemIsCollection, - "to_vault_freeze", false, *dateTime, 
*error);
-}
-
-
-uuYcDatasetLockChange(*rootCollection, *datasetId, *lockName, *lockIt, *status){
-    *status = -1;
-    *lock = "Unlock";
-    if (*lockIt) {
-        *lock = "Lock";
-    }
-    *lockProcedure = "Vault";
-    if (*lockName == "to_vault_freeze") {
-        *lockProcedure = "Freeze";
-    }
-    # find the toplevel collection for this dataset
-    uuYcDatasetGetTopLevel(*rootCollection, *datasetId, *collection, *isCollection);
-    if (*collection != "") {
-        # we found the dataset, now change the lock on each object
-        if (*isCollection) {
-            *buffer = "dummy";
-            uuTreeWalk("forward", *collection, "uuYcDatasetWalk*lockProcedure*lock", *buffer, *error);
-            *status = *error;
-#            if (*error == "0") {
-#                *status = 0;
-#            }
-        } else {
-            # dataset is not a collection, let's find the objects and make the change
-            msiGetIcatTime(*dateTime,"unix");
-            *status = 0;
-            foreach (*row in SELECT DATA_NAME
-                             WHERE COLL_NAME = '*collection'
-                               AND META_DATA_ATTR_NAME = 'dataset_toplevel'
-                               AND META_DATA_ATTR_VALUE = '*datasetId'
-                    ) {
-                msiGetValByKey(*row,"DATA_NAME",*dataName);
-                # now change it ....
-                uuYcDatasetLockChangeObject(
-                        *collection,
-                        *dataName,
-                        false,
-                        *lockName,
-                        *lockIt,
-                        *dateTime,
-                        *error);
-                if (*error != 0 ) {
-                    *status = *error;
-                    break;
-                }
-            }
-        }
-
-    } else {
-        # result is false "dataset not found"
-    }
-}
-
-
-# \brief uuYcDatasetLock locks (all objects of) a dataset
-#
-# \param[in] collection  collection that may have datasets
-# \param[in] datasetId   identifier to depict the dataset
-# \param[out] status     0 upon success, otherwise nonzero
-#
-uuYcDatasetLock(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId,"to_vault_lock", true, *status);
-}
-
-# \brief uuYcDatasetUnlock unlocks (all objects of) a dataset
-#
-# \param[in] collection  collection that may have datasets
-# \param[in] datasetId   identifier to depict the dataset
-# \param[out] result     "true" upon success, otherwise "false"
-# \param[out] status     0 upon success, otherwise nonzero
-#
-uuYcDatasetUnlock(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId, "to_vault_lock", false, *status);
-}
-
-# \brief uuYcDatasetFreeze freeze-locks (all objects of) a dataset
-#
-# \param[in] collection  collection that may have datasets
-# \param[in] datasetId   identifier to depict the dataset
-# \param[out] status     0 upon success, otherwise nonzero
-#
-uuYcDatasetFreeze(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId,"to_vault_freeze", true, *status);
-}
-
-# \brief uuYcDatasetUnfreeze undo freeze-locks (all objects of) a dataset
-#
-# \param[in] collection  collection that may have datasets
-# \param[in] datasetId   identifier to depict the dataset
-# \param[out] status     0 upon success, otherwise nonzero
-#
-uuYcDatasetMelt(*collection, *datasetId, *status) {
-    uuYcDatasetLockChange(*collection, *datasetId, "to_vault_freeze", false, *status);
-}
-
-# \brief uuYcObjectIsLocked query an object to see if it is locked
-#
-# \param[in] objectPath    full path to collection of data object
-# \param[in] isCollection  true if path references a collection
-# \param[out] locked       true if the object is vault-locked
-# \param[out] frozen       true if the object is vault-frozen
-
-uuYcObjectIsLocked(*objectPath, *isCollection, *locked, *frozen) {
-    *locked = false;
-    *frozen = false;
-    if (*isCollection) {
-        foreach (*row in SELECT META_COLL_ATTR_NAME
-                         WHERE COLL_NAME = '*objectPath'
-                ) {
-            msiGetValByKey(*row, "META_COLL_ATTR_NAME", *key);
-            if (   *key == "to_vault_lock"
-                || *key == "to_vault_freeze"
-               ) {
-                *locked = true;
-                if (*key == "to_vault_freeze") {
-                    *frozen = true;
-                    break;
-                }
-            }
-        }
-    } else {
-        uuChopPath(*objectPath, *parentCollection, *dataName);
-        foreach (*row in SELECT META_DATA_ATTR_NAME
-                         WHERE COLL_NAME = '*parentCollection'
-                           AND DATA_NAME = '*dataName'
-                ) {
-            msiGetValByKey(*row, "META_DATA_ATTR_NAME", *key);
-            if (   *key == "to_vault_lock"
-                || *key == "to_vault_freeze"
-               ) {
-                *locked = true;
-                if (*key == "to_vault_freeze") {
-                    *frozen = true;
-                    break;
-                }
-            }
-        }
-    }
-}
-
-#input null
-#output ruleExecOut
diff --git a/ycModule.r b/ycModule.r
deleted file mode 100644
index e37ea9afa..000000000
--- a/ycModule.r
+++ /dev/null
@@ -1,194 +0,0 @@
-# \file      ycModule.r
-# \brief     Youth Cohort module
-# \copyright Copyright (c) 2016-2021, Utrecht University. All rights reserved.
-# \license   GPLv3, see LICENSE
-
-
-# \brief (over)write data object with a list of vault object checksums
-#
-# \param[in] vaultRoot          root collection to be indexed
-# \param[in] destinationObject  dataobject that will be written to
-# \param[out] status            0 = success, nonzero is error
-uuYcGenerateDatasetsIndex(*vaultRoot, *destinationObject, *status) {
-    *status = 0;
-    msiDataObjCreate(*destinationObject, "forceFlag=", *FHANDLE);
-
-    foreach (*row in SELECT COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE
-                     WHERE COLL_NAME = "*vaultRoot" ) {
-        *checksum = *row."DATA_CHECKSUM";
-        *name = *row."DATA_NAME";
-        *col = *row."COLL_NAME";
-        *size = *row."DATA_SIZE";
-        uuChopChecksum(*checksum, *type, *checksumOut);
-        *textLine = "*type *checksumOut *size *col/*name\n";
-        msiStrlen(*textLine, *length);
-        msiStrToBytesBuf(*textLine, *buffer);
-        msiDataObjWrite(*FHANDLE, *buffer, *bytesWritten);
-        if (int(*length) != *bytesWritten) then {
-            *status = 1;
-        }
-    }
-    foreach (*row in SELECT COLL_NAME, DATA_NAME, DATA_CHECKSUM, DATA_SIZE
-                     WHERE COLL_NAME like '*vaultRoot/%' ) {
-        *checksum = *row."DATA_CHECKSUM";
-        *name = *row."DATA_NAME";
-        *col = *row."COLL_NAME";
-        *size = *row."DATA_SIZE";
-        uuChopChecksum(*checksum, *type, *checksumOut);
-        *textLine = "*type *checksumOut *size *col/*name\n";
-        msiStrlen(*textLine, *length);
-        msiStrToBytesBuf(*textLine, *buffer);
-        msiDataObjWrite(*FHANDLE, *buffer, *bytesWritten);
-        if (int(*length) != *bytesWritten) then {
-            *status = 1;
-        }
-    }
-    msiDataObjClose(*FHANDLE, *status2);
-    *status;
-}
-
-# \brief Add a dataset warning to all given dataset toplevels.
-#
-# \param[in] toplevels
-# \param[in] isCollectionToplevel
-# \param[in] text
-#
-uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, *text) {
-    msiAddKeyVal(*kv, "dataset_warning", *text);
-
-    foreach (*toplevel in *toplevels) {
-        msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollectionToplevel then "-C" else "-d");
-    }
-}
-
-# \brief Add a dataset error to all given dataset toplevels.
-#
-# \param[in] toplevels
-# \param[in] isCollectionToplevel
-# \param[in] text
-#
-uuYcIntakeCheckAddDatasetError(*toplevels, *isCollectionToplevel, *text) {
-    msiAddKeyVal(*kv, "dataset_error", *text);
-
-    foreach (*toplevel in *toplevels) {
-        msiAssociateKeyValuePairsToObj(*kv, *toplevel, if *isCollectionToplevel then "-C" else "-d");
-    }
-}
-
-# Reusable check utilities {{{
-
-# \brief Check if a certain filename pattern has enough occurrences in a dataset.
-#
-# Adds a warning if the match count is out of range.
-#
-# NOTE: Currently, patterns must match the full relative object path.
-#       At the time of writing, Echo is the only experiment type we run this
-#       check for, and it is a flat dataset without subdirectories, so it makes
-#       no difference there.
-#
-#       For other experiment types it may be desirable to match patterns with
-#       basenames instead of paths. In this case the currently commented-out
-#       code in this function can be used.
-#
-# \param[in] datasetParent        either the dataset collection or the first parent of a data-object dataset toplevel
-# \param[in] toplevels            a list of toplevel objects
-# \param[in] isCollectionToplevel
-# \param[in] objects              a list of dataset object paths relative to the datasetParent parameter
-# \param[in] patternHuman         a human-readable pattern (e.g.: 'I0000000.raw')
-# \param[in] patternRegex         a regular expression that matches filenames (e.g.: 'I[0-9]{7}\.raw')
-# \param[in] min                  the minimum amount of occurrences. set to -1 to disable minimum check.
-# \param[in] max                  the maximum amount of occurrences. set to -1 to disable maximum check.
-#
-uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollectionToplevel, *objects, *patternHuman, *patternRegex, *min, *max) {
-    *count = 0;
-    foreach (*path in *objects) {
-        *name = *path;
-
-        #if (*path like "*/*") {
-        #    # We might want to match basenames instead of paths relative to the dataset root.
-        #    uuChopPath(*path, *parent, *name);
-        #} else {
-        #    *name = *path;
-        #}
-        if (*name like regex *patternRegex) {
-            *count = *count + 1;
-        }
-    }
-
-    if (*min != -1 && *count < *min) {
-        uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, "Expected at least *min files of type '*patternHuman', found *count");
-    }
-    if (*max != -1 && *count > *max) {
-        uuYcIntakeCheckAddDatasetWarning(*toplevels, *isCollectionToplevel, "Expected at most *max files of type '*patternHuman', found *count");
-    }
-}
-
-# }}}
-# Generic checks {{{
-
-# \brief Check if a dataset's wave is a valid one.
-#
-# \param[in] root
-# \param[in] id         the dataset id to check
-# \param[in] toplevels  a list of toplevel objects for this dataset id
-# \param[in] isCollectionToplevel
-#
-uuYcIntakeCheckWaveValidity(*root, *id, *toplevels, *isCollectionToplevel) {
-    # Note: It might be cleaner to grab the wave metadata tag from the toplevel instead.
-    uuYcDatasetParseId(*id, *idComponents);
-    uuStrToLower(*idComponents."wave", *wave);
-
-    *waves = list(
-        "20w", "30w",
-        "0m", "5m", "10m",
-        "3y", "6y", "9y", "12y", "15y"
-    );
-
-    uuListContains(*waves, *wave, *waveIsValid);
-    if (!*waveIsValid) {
-        uuYcIntakeCheckAddDatasetError(*toplevels, *isCollectionToplevel, "The wave '*wave' is not in the list of accepted waves");
-    }
-}
-
-# \brief Run checks that must be applied to all datasets regardless of WEPV values.
-#
-# Call any generic checks you make in this function.
-#
-# \param[in] root
-# \param[in] id         the dataset id to check
-# \param[in] toplevels  a list of toplevel objects for this dataset id
-# \param[in] isCollection
-#
-uuYcIntakeCheckGeneric(*root, *id, *toplevels, *isCollection) {
-    uuYcIntakeCheckWaveValidity(*root, *id, *toplevels, *isCollection);
-}
-
-# }}}
-# Experiment type specific checks {{{
-# Echo {{{
-
-# \brief Run checks specific to the Echo experiment type.
-#
-# \param[in] root
-# \param[in] id         the dataset id to check
-# \param[in] toplevels  a list of toplevel objects for this dataset id
-# \param[in] isCollection
-#
-uuYcIntakeCheckEtEcho(*root, *id, *toplevels, *isCollection) {
-    if (*isCollection) {
-        *datasetParent = elem(*toplevels, 0);
-    } else {
-        uuChopPath(elem(*toplevels, 0), *dataObjectParent, *dataObjectName);
-        *datasetParent = *dataObjectParent;
-    }
-
-    uuYcDatasetGetDataObjectRelPaths(*root, *id, *objects);
-
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.index.jpg``, ``(.*/)?I[0-9]{7}\.index\.jpe?g``, 13, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.raw``, ``(.*/)?I[0-9]{7}\.raw``, 7, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.dcm``, ``(.*/)?I[0-9]{7}\.dcm``, 6, -1);
-    uuYcIntakeCheckFileCount(*datasetParent, *toplevels, *isCollection, *objects, ``I0000000.vol``, ``(.*/)?I[0-9]{7}\.vol``, 6, -1);
-}
-
-# }}}
-# }}}
diff --git a/ycUtil.r b/ycUtil.r
deleted file mode 100644
index 5fa8f4d19..000000000
--- a/ycUtil.r
+++ /dev/null
@@ -1,36 +0,0 @@
-# Youth cohort utility functions
-
-# \brief Clears a kv-list's contents.
-#
-# \param kvList
-#
-uuKvClear(*kvList) {
-    *kvList."." = ".";
-    foreach (*key in *kvList) {
-        *kvList.*key = ".";
-    }
-}
-
-uuYcObjectIsLocked(*objPath, *locked) {
-    msiGetObjType(*objPath, *objType);
-    *locked = false;
-    if (*objType == '-d') {
-        uuChopPath(*objPath, *collection, *dataName);
-        foreach (*row in SELECT META_DATA_ATTR_VALUE
-                         WHERE COLL_NAME = '*collection'
-                           AND DATA_NAME = '*dataName'
-                           AND META_DATA_ATTR_NAME = 'to_vault_lock'
-                ) {
-            *locked = true;
-            break;
-        }
-    } else {
-        foreach (*row in SELECT META_COLL_ATTR_VALUE
-                         WHERE COLL_NAME = '*objPath'
-                           AND META_COLL_ATTR_NAME = 'to_vault_lock'
-                ) {
-            *locked = true;
-            break;
-        }
-    }
-}