Merge pull request #344 from UtrechtUniversity/rc-1.8.11
Version 1.8.11
stsnel authored Oct 11, 2023
2 parents 06c0db7 + 57ba611 commit 961ff3f
Showing 16 changed files with 986 additions and 437 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/unit-tests.yml
@@ -0,0 +1,31 @@
name: "Run unit tests"

on:
push:
paths-ignore:
- 'tests/**'
pull_request:
paths-ignore:
- 'tests/**'

jobs:
flake8:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [2.7]
steps:
- uses: actions/checkout@v3

- name: Set up Python
# setup-python stopped supporting Python 2.7, use https://github.com/MatteoH2O1999/setup-python
uses: MatteoH2O1999/[email protected]
with:
python-version: ${{ matrix.python-version }}
allow-build: info
cache-build: true

- name: Run unit tests
run: |
cd unit-tests
python2 -m unittest unit_tests
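
The workflow runs "python2 -m unittest unit_tests", so it assumes a unit_tests module inside the unit-tests directory. A minimal sketch of what such a module could look like; the module contents and test names here are hypothetical, not taken from this repository:

    # unit-tests/unit_tests.py -- hypothetical minimal module (Python 2.7)
    import unittest

    class SanityTest(unittest.TestCase):
        def test_sanity(self):
            # Placeholder assertion; real tests would exercise the rule code.
            self.assertEqual(1 + 1, 2)

    if __name__ == '__main__':
        unittest.main()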
88 changes: 66 additions & 22 deletions intake.py
@@ -5,7 +5,9 @@
__license__ = 'GPLv3, see LICENSE'

import fnmatch
import itertools
import time
import traceback

import genquery

@@ -112,15 +114,20 @@ def api_intake_count_total_files(ctx, coll):
:returns: Total file count
"""
# Include coll name as equal names do occur and genquery delivers distinct results.
iter = genquery.row_iterator(
main_collection_iterator = genquery.row_iterator(
"COLL_NAME, DATA_NAME",
"COLL_NAME = '" + coll + "'",
genquery.AS_LIST, ctx
)

subcollection_iterator = genquery.row_iterator(
"COLL_NAME, DATA_NAME",
"COLL_NAME like '" + coll + "%'",
"COLL_NAME like '" + coll + "/%'",
genquery.AS_LIST, ctx
)

count = 0
for row in iter:
for row in itertools.chain(main_collection_iterator, subcollection_iterator):
exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS)
if not exclusion_matched:
count += 1
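
This hunk shows the pattern applied throughout this release: a single prefix query of the form COLL_NAME like '<coll>%' is split into an exact match on the collection itself plus a LIKE on '<coll>/%', with the two genquery iterators chained via itertools.chain. The reason is that a bare prefix also matches sibling collections whose names merely start with the same string. A self-contained sketch of the difference, with made-up paths:

    import itertools

    coll = '/zone/home/grp-intake-youth'
    candidates = [
        '/zone/home/grp-intake-youth',         # the collection itself
        '/zone/home/grp-intake-youth/wave1',   # a genuine subcollection
        '/zone/home/grp-intake-youth2',        # sibling matched by the old prefix query
    ]

    # Old condition: COLL_NAME like coll + '%' (over-matches the sibling).
    old_matches = [c for c in candidates if c.startswith(coll)]

    # New condition: COLL_NAME = coll, chained with COLL_NAME like coll + '/%'.
    new_matches = list(itertools.chain(
        [c for c in candidates if c == coll],
        [c for c in candidates if c.startswith(coll + '/')]))

    assert '/zone/home/grp-intake-youth2' in old_matches
    assert '/zone/home/grp-intake-youth2' not in new_matches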
@@ -150,14 +157,20 @@ def api_intake_list_unrecognized_files(ctx, coll):
return {}

# Include coll name as equal names do occur and genquery delivers distinct results.
iter = genquery.row_iterator(
main_collection_iterator = genquery.row_iterator(
"COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME",
"COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'unrecognized'",
genquery.AS_LIST, ctx
)

subcollection_iterator = genquery.row_iterator(
"COLL_NAME, DATA_NAME, COLL_CREATE_TIME, DATA_OWNER_NAME",
"COLL_NAME like '" + coll + "%' AND META_DATA_ATTR_NAME = 'unrecognized'",
"COLL_NAME like '" + coll + "/%' AND META_DATA_ATTR_NAME = 'unrecognized'",
genquery.AS_LIST, ctx
)

files = []
for row in iter:
for row in itertools.chain(main_collection_iterator, subcollection_iterator):
# Check whether object type is within exclusion pattern
exclusion_matched = any(fnmatch.fnmatch(row[1], p) for p in INTAKE_FILE_EXCLUSION_PATTERNS)
if not exclusion_matched:
@@ -201,22 +214,36 @@ def api_intake_list_datasets(ctx, coll):
datasets = []

# 1) Query for datasets distinguished by collections
iter = genquery.row_iterator(
c_main_collection_iterator = genquery.row_iterator(
"META_COLL_ATTR_VALUE, COLL_NAME",
"COLL_NAME like '" + coll + "%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ",
"COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ",
genquery.AS_LIST, ctx
)
for row in iter:

c_subcollection_iterator = genquery.row_iterator(
"META_COLL_ATTR_VALUE, COLL_NAME",
"COLL_NAME LIKE '" + coll + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' ",
genquery.AS_LIST, ctx
)

for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator):
dataset = get_dataset_details(ctx, row[0], row[1])
datasets.append(dataset)

# 2) Query for datasets distinguished by dataobjects
iter = genquery.row_iterator(
d_main_collection_iterator = genquery.row_iterator(
"META_DATA_ATTR_VALUE, COLL_NAME",
"COLL_NAME like '" + coll + "%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ",
"COLL_NAME = '" + coll + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ",
genquery.AS_LIST, ctx
)
for row in iter:

d_subcollection_iterator = genquery.row_iterator(
"META_DATA_ATTR_VALUE, COLL_NAME",
"COLL_NAME LIKE '" + coll + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' ",
genquery.AS_LIST, ctx
)

for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator):
dataset = get_dataset_details(ctx, row[0], row[1])
datasets.append(dataset)

@@ -239,7 +266,6 @@ def get_dataset_details(ctx, dataset_id, path):
# Parse dataset_id to get WEPV-items individually
dataset_parts = dataset_id.split('\t')
dataset['wave'] = dataset_parts[0]
dataset['expType'] = dataset_parts[1]
dataset['experiment_type'] = dataset_parts[1]
dataset['pseudocode'] = dataset_parts[2]
dataset['version'] = dataset_parts[3]
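
get_dataset_details splits dataset_id on tabs into the four WEPV parts: wave, experiment type, pseudocode and version. A sketch of the identifier round trip, with made-up values; the tab separator comes from the split above:

    dataset_id = '\t'.join(['20w', 'echo', 'B00001', 'raw'])
    wave, experiment_type, pseudocode, version = dataset_id.split('\t')
    assert (wave, experiment_type, pseudocode, version) == ('20w', 'echo', 'B00001', 'raw')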
@@ -363,30 +389,47 @@ def get_dataset_toplevel_objects(ctx, root, dataset_id):
If not a collection, all objects are returned with full object paths.
:param ctx: Combined type of a callback and rei struct
:param root: Path to a dataset
:param root: Path within which to search for datasets (e.g. an intake group collection)
:param dataset_id: Identifier of the dataset
:returns: Dict holding objects for the dataset
:returns: Dict holding top-level object paths for the dataset (in the 'objects' key) and a boolean value which
says whether it is a collection-based dataset (in the 'is_collection' key)
"""
iter = genquery.row_iterator(
c_main_collection_iterator = genquery.row_iterator(
"COLL_NAME",
"COLL_NAME LIKE '" + root + "%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' "
"COLL_NAME = '" + root + "' AND META_COLL_ATTR_NAME = 'dataset_toplevel' "
"AND META_COLL_ATTR_VALUE = '" + dataset_id + "'",
genquery.AS_LIST, ctx
)
for row in iter:

c_subcollection_iterator = genquery.row_iterator(
"COLL_NAME",
"COLL_NAME LIKE '" + root + "/%' AND META_COLL_ATTR_NAME = 'dataset_toplevel' "
"AND META_COLL_ATTR_VALUE = '" + dataset_id + "'",
genquery.AS_LIST, ctx
)

for row in itertools.chain(c_main_collection_iterator, c_subcollection_iterator):
return {'is_collection': True,
'objects': [row[0]]}

# For dataobject situation gather all object path strings as a list
iter = genquery.row_iterator(
d_main_collection_iterator = genquery.row_iterator(
"DATA_NAME, COLL_NAME",
"COLL_NAME like '" + root + "%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' "
"COLL_NAME = '" + root + "' AND META_DATA_ATTR_NAME = 'dataset_toplevel' "
"AND META_DATA_ATTR_VALUE = '" + dataset_id + "'",
genquery.AS_LIST, ctx
)

d_subcollection_iterator = genquery.row_iterator(
"DATA_NAME, COLL_NAME",
"COLL_NAME LIKE '" + root + "/%' AND META_DATA_ATTR_NAME = 'dataset_toplevel' "
"AND META_DATA_ATTR_VALUE = '" + dataset_id + "'",
genquery.AS_LIST, ctx
)

objects = []
for row in iter:
for row in itertools.chain(d_main_collection_iterator, d_subcollection_iterator):
objects.append(row[1] + '/' + row[0])
return {'is_collection': False,
'objects': objects}
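
Callers can branch on the returned 'is_collection' flag. A sketch of the consuming side; ctx, root, dataset_id and the process() handler are placeholders, not names from this repository:

    result = get_dataset_toplevel_objects(ctx, root, dataset_id)
    if result['is_collection']:
        # Collection-based dataset: a single top-level collection path.
        toplevel_coll = result['objects'][0]
    else:
        # Data-object-based dataset: full paths of all top-level objects.
        for obj_path in result['objects']:
            process(obj_path)  # hypothetical handler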
Expand All @@ -407,6 +450,7 @@ def api_intake_scan_for_datasets(ctx, coll):
try:
_intake_scan_for_datasets(ctx, coll)
except Exception:
log.write(ctx, "Intake scan failed with the following exception: " + traceback.format_exc())
return {"proc_status": "NOK", "error_msg": "Error during scanning process"}
else:
return {"proc_status": "NOK", "error_msg": "No permissions to scan collection"}
57 changes: 38 additions & 19 deletions intake_dataset.py
@@ -4,6 +4,8 @@
__copyright__ = 'Copyright (c) 2019-2021, Utrecht University'
__license__ = 'GPLv3, see LICENSE'

import itertools

import genquery

from util import *
@@ -27,12 +29,16 @@ def intake_report_export_study_data(ctx, study_id):
"""
zone = user.zone(ctx)

result = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
"COLL_NAME like '/{}/home/grp-vault-{}%' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)
main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
" = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)

subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
"COLL_NAME like '/{}/home/grp-vault-{}/%' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)

datasets = {}
for row in result:
for row in itertools.chain(main_collection_iterator, subcollection_iterator):
path = row[0]
try:
datasets[path][row[2]] = row[3]
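
The loop pivots flat (collection, parent, attribute, value) rows into one dict per collection path, creating the inner dict the first time a path is seen; the KeyError branch is cut off in this hunk but the same pattern is visible in intake_youth_get_datasets_in_study below. A self-contained sketch with made-up rows:

    rows = [
        ('/zone/home/grp-vault-youth/ds1', '/zone/home/grp-vault-youth', 'wave', '20w'),
        ('/zone/home/grp-vault-youth/ds1', '/zone/home/grp-vault-youth', 'version', 'raw'),
    ]
    datasets = {}
    for path, _parent, attr, value in rows:
        try:
            datasets[path][attr] = value
        except KeyError:
            datasets[path] = {attr: value}
    assert datasets['/zone/home/grp-vault-youth/ds1']['wave'] == '20w'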
@@ -48,10 +54,15 @@
real_datasets[set_path]['totalFiles'] = 0

# get the filesize and file count
result = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)",
"COLL_NAME like '{}%'".format(set_path),
genquery.AS_LIST, ctx)
for row in result:
stat_main_collection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)",
"COLL_NAME = '{}'".format(set_path),
genquery.AS_LIST, ctx)

stat_subcollection_iterator = genquery.row_iterator("count(DATA_ID), sum(DATA_SIZE)",
"COLL_NAME like '{}/%'".format(set_path),
genquery.AS_LIST, ctx)

for row in itertools.chain(stat_main_collection_iterator, stat_subcollection_iterator):
real_datasets[set_path]['totalFiles'] = int(row[0]) / 2
totalFileSize = 0
if row[1]:
@@ -65,37 +76,41 @@ def intake_youth_get_datasets_in_study(ctx, study_id):
"""Get the of datasets (with relevant metadata) in a study.
Retrieved metadata:
- 'dataset_id'
- 'dataset_date_created'
- 'wave'
- 'version'
- 'experiment_type'
- 'pseudocode'
:param ctx: Combined type of a callback and rei struct
:param study_id: Unique identifier op study
:param study_id: Unique identifier of study
:returns: Dict with datasets and relevant metadata.
"""
zone = user.zone(ctx)

result = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
"COLL_NAME like '/{}/home/grp-vault-{}%' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)
main_collection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
"COLL_NAME = '/{}/home/grp-vault-{}' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)

subcollection_iterator = genquery.row_iterator("COLL_NAME, COLL_PARENT_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
"COLL_NAME LIKE '/{}/home/grp-vault-{}/*' AND META_COLL_ATTR_NAME IN ('dataset_id', 'dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode')".format(zone, study_id),
genquery.AS_LIST, ctx)

datasets = {}

# Construct all datasets.
for row in result:
for row in itertools.chain(main_collection_iterator, subcollection_iterator):
dataset = row[0]
attribute_name = row[2]
attribute_value = row[3]

if attribute_name in ['dataset_date_created', 'wave', 'version', 'experiment_type', 'pseudocode']:
if attribute_name in ['version', 'experiment_type']:
val = attribute_value.lower()
# datasets[dataset][attribute_name] = attribute_value.lower()
else:
val = attribute_value
# datasets[dataset][attribute_name] = attribute_value
try:
datasets[dataset][attribute_name] = val
except KeyError:
@@ -206,11 +221,15 @@ def vault_aggregated_info(ctx, study_id):
continue

zone = user.zone(ctx)
result = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME",
"COLL_NAME like '/{}/home/grp-vault-{}%'".format(zone, study_id),
genquery.AS_LIST, ctx)
main_collection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME",
"COLL_NAME = '/{}/home/grp-vault-{}'".format(zone, study_id),
genquery.AS_LIST, ctx)

subcollection_iterator = genquery.row_iterator("DATA_NAME, COLL_NAME, DATA_SIZE, COLL_CREATE_TIME",
"COLL_NAME like '/{}/home/grp-vault-{}/%'".format(zone, study_id),
genquery.AS_LIST, ctx)

for row in result:
for row in itertools.chain(main_collection_iterator, subcollection_iterator):
coll_name = row[1]
data_size = int(row[2])
coll_create_time = int(row[3])