
YDA-5829 troubleshooting tool for published data packages #517

Merged
merged 44 commits into from Oct 23, 2024
Changes from 42 commits
Commits
44 commits
ce5123c
initial commit
FuHsinyu Aug 29, 2024
85453b4
add rule to find all published data packages
FuHsinyu Aug 29, 2024
03ba1df
wrap up case 2,3,4
FuHsinyu Aug 30, 2024
c17f76c
add case 1 wrapped
FuHsinyu Aug 30, 2024
e4937ce
refactoring codes
FuHsinyu Aug 30, 2024
ae0cfc9
remove unused files
FuHsinyu Aug 30, 2024
e473b29
separate verify package schema from schema report for case 1
FuHsinyu Aug 30, 2024
f997da3
clean up codes
FuHsinyu Aug 30, 2024
69b7e9d
clean up codes
FuHsinyu Aug 30, 2024
c97a6c8
fix lint
FuHsinyu Aug 30, 2024
238809d
fix lint
FuHsinyu Aug 30, 2024
583d44b
improve doc strings
FuHsinyu Aug 30, 2024
93360d2
improve comments and doc strings
FuHsinyu Aug 30, 2024
5331ce9
fix lint
FuHsinyu Aug 30, 2024
32d94b7
improve doc strings
FuHsinyu Aug 30, 2024
953d962
add error info to doc string
FuHsinyu Aug 30, 2024
1124916
wip troubleshoot change to stdout, configurable log
claravox Oct 14, 2024
0541461
troubleshoot by url not md5
claravox Sep 4, 2024
0d7e509
troubleshoot by url for landing page and combi JSON
claravox Sep 6, 2024
0d4b06b
use schema cache, remove md5 code
claravox Oct 14, 2024
b02d053
Move verify vault package metadata function to meta.py
claravox Oct 14, 2024
bb9db6d
add try catch for Connection Error
claravox Sep 9, 2024
73a319f
lint
claravox Sep 9, 2024
69ba0b8
YDA-5829 - Work in progress for troubleshooting publications.
kaur16 Sep 22, 2024
c043d7e
add offline check option for landing page and datacite check
claravox Sep 30, 2024
912a5ef
simplify get landing page path and urls
claravox Sep 30, 2024
4b44fa6
minor changes after testing requests.get
claravox Oct 14, 2024
6c8b4fc
Add offline mode to datacite doi check
claravox Oct 2, 2024
b34c8a5
delete shell script in favor of python script
claravox Oct 2, 2024
48ffb88
Move write_stdout to log.write
claravox Oct 2, 2024
8a31a22
Call log.write rather than write_stdout in publication
claravox Oct 2, 2024
309ee5f
Fix print all test results bug
claravox Oct 2, 2024
136697c
Support both script and api version of troubleshoot publications
claravox Oct 2, 2024
4ddbe4d
api tests troubleshoot publication happy flow
claravox Oct 3, 2024
374684d
Fix check landing page call
claravox Oct 7, 2024
d337e2b
Unittest: check system avus
claravox Oct 14, 2024
04120e2
Add avu get func and its integration tests
claravox Oct 8, 2024
929b73f
Function for checking datacite doi registration
claravox Oct 9, 2024
ef936e6
Refine system AVUs check and unit tests
claravox Oct 14, 2024
9345d2c
Remove now unneeded inbox user
claravox Oct 14, 2024
adfdac0
Support optional avus for datacite
claravox Oct 15, 2024
6ff897a
Add datacite check toggle
claravox Oct 15, 2024
abbdfbd
Rename troubleshoot publication file
claravox Oct 18, 2024
1c9a5b4
Update log.write function description
claravox Oct 18, 2024
1 change: 1 addition & 0 deletions __init__.py
@@ -41,6 +41,7 @@
from schema import *
from schema_transformation import *
from schema_transformations import *
from troubleshoot_data import *
from vault import *
from datacite import *
from epic import *
27 changes: 27 additions & 0 deletions integration_tests.py
@@ -117,6 +117,27 @@ def _test_avu_rmw_collection(ctx, rmw_attributes):
return result


def _test_avu_get_attr_val_of_coll(ctx, attr, value):
# Test getting the value of an attribute on a collection
tmp_coll = _create_tmp_collection(ctx)
ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz")
result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr)
collection.remove(ctx, tmp_coll)
return result


def _test_avu_get_attr_val_of_coll_exception(ctx):
# Test that getting a non existing attribute on a collection raises an exception (True for exception raised)
tmp_coll = _create_tmp_collection(ctx)
result = False
try:
result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo")
except Exception:
result = True
collection.remove(ctx, tmp_coll)
return result


def _test_folder_set_retry_avus(ctx):
tmp_coll = _create_tmp_collection(ctx)
folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2)
@@ -482,6 +503,12 @@ def _test_folder_secure_func(ctx, func):
"check": lambda x: (("aap", "noot", "mies") in x
and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1
)},
{"name": "avu.get_attr_val_of_coll.exists.yes",
"test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"),
"check": lambda x: x == "bar"},
{"name": "avu.get_attr_val_of_coll.exists.no",
"test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx),
"check": lambda x: x},
{"name": "avu.apply_atomic_operations.collection",
"test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx),
"check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)},
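These tests exercise the new avu.get_attr_val_of_coll helper, whose implementation is not part of this diff. A minimal sketch of such a helper, following the genquery pattern used elsewhere in the ruleset (the actual implementation in util/avu.py may differ):

import genquery


def get_attr_val_of_coll(ctx, coll, attr):
    """Return the value of attribute 'attr' on collection 'coll'; raise if it is not set."""
    iter = genquery.row_iterator(
        "META_COLL_ATTR_VALUE",
        "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = '{}'".format(coll, attr),
        genquery.AS_LIST, ctx)

    for row in iter:
        return row[0]

    raise ValueError("Attribute {} not found on collection {}".format(attr, coll))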
48 changes: 48 additions & 0 deletions meta.py
@@ -13,6 +13,7 @@
import irods_types
from deepdiff import DeepDiff

import meta_form
import provenance
import publication
import schema as schema_
@@ -790,3 +791,50 @@ def copy_user_metadata(ctx, source, target):
log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target))
except Exception:
log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target))


def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout):
"""Process a single data package to retrieve and validate that its metadata conforms to the schema.

:param ctx: Combined type of a callback and rei struct
:param coll_name: String representing the data package collection path.
:param schema_cache: Dictionary storing schema blueprints, can be empty.
:param report_name: Name of report script (for logging)
:param write_stdout: A boolean representing whether to write to stdout or rodsLog

:returns: A dictionary result containing if schema matches and the schema short name.
"""
metadata_path = get_latest_vault_metadata_path(ctx, coll_name)

if not metadata_path:
log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout)
return None

try:
metadata = jsonutil.read(ctx, metadata_path)
except Exception as exc:
log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout)
log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout)
return None

# Determine schema
schema_id = schema_.get_schema_id(ctx, metadata_path)
schema_shortname = schema_id.split("/")[-2]

# Retrieve schema and cache it for future use
schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id)
if schema_shortname in schema_cache:
schema_contents = schema_cache[schema_shortname]
else:
schema_contents = jsonutil.read(ctx, schema_path)
schema_cache[schema_shortname] = schema_contents

# Check whether metadata matches schema and log any errors
error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
match_schema = len(error_list) == 0
if not match_schema:
errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout)
log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout)

return {"schema": schema_shortname, "match_schema": match_schema}
10 changes: 5 additions & 5 deletions publication.py
@@ -1326,10 +1326,10 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp
:param update_moai: Flag that indicates updating MOAI (OAI-PMH)
"""
if user.user_type(ctx) != 'rodsadmin':
log.write_stdout(ctx, "User is no rodsadmin")
log.write(ctx, "User is no rodsadmin", True)
return

log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True)
collections = genquery.row_iterator(
"COLL_NAME",
"COLL_NAME like '%%/home/vault-%%' "
@@ -1345,12 +1345,12 @@
if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)):
packages_found = True
output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes')
log.write_stdout(ctx, coll_name + ': ' + output)
log.write(ctx, coll_name + ': ' + output, True)

if not packages_found:
log.write_stdout(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True)
else:
log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True)


def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False):
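The calls above replace log.write_stdout with log.write plus an explicit stdout flag. A rough sketch of that dispatch, assuming the flag defaults to False (the actual util/log.py implementation may differ):

# Assumed shape of log.write() after this change; the real function may differ.
def write(ctx, message, write_stdout=False):
    """Write a message to stdout (for script runs) or to the rodsLog (default)."""
    if write_stdout:
        ctx.writeLine("stdout", message)
    else:
        ctx.writeLine("serverLog", message)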
41 changes: 6 additions & 35 deletions schema_transformation.py
@@ -19,7 +19,6 @@
import session_vars

import meta
import meta_form
import schema
import schema_transformations
from util import *
@@ -405,41 +404,13 @@ def rule_batch_vault_metadata_schema_report(ctx):
genquery.AS_LIST, ctx)

for row in iter:
coll_name = row[0]
metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name)

if metadata_path == '' or metadata_path is None:
log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found."
% (coll_name))
continue

try:
metadata = jsonutil.read(ctx, metadata_path)
except Exception as exc:
log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s."
% (coll_name, metadata_path, str(exc)))
coll_name = row[0]
result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True)
if result:
results[coll_name] = result
except Exception as e:
log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e)))
continue

# Determine schema
schema_id = schema.get_schema_id(ctx, metadata_path)
schema_shortname = schema_id.split("/")[-2]

# Retrieve schema and cache it for future use
schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
if schema_shortname in schema_cache:
schema_contents = schema_cache[schema_shortname]
else:
schema_contents = jsonutil.read(ctx, schema_path)
schema_cache[schema_shortname] = schema_contents

# Check whether metadata matches schema and log any errors
error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
match_schema = len(error_list) == 0
if not match_schema:
log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" %
(metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list])))

# Update results
results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema}

return json.dumps(results)
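For reference, the JSON string returned by the report rule now maps each vault collection to the dictionary produced by meta.vault_metadata_matches_schema; an illustrative value (the path and schema short name are placeholders):

import json

example = json.dumps({
    "/tempZone/home/vault-default-2/research-initial[1725262507]": {
        "schema": "default-3",
        "match_schema": True,
    }
})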
11 changes: 11 additions & 0 deletions tests/features/api/api_vault.feature
@@ -98,6 +98,17 @@ Feature: Vault API
| /tempZone/home/vault-default-2 |
| /tempZone/home/vault-core-2 |
| /tempZone/home/vault-default-3 |


Scenario Outline: Published vault package passes troubleshooting script checks
Given user technicaladmin is authenticated
And data package exists in <vault>
Then data package in <vault> passes troubleshooting script checks

Examples:
| vault |
| /tempZone/home/vault-default-2 |
| /tempZone/home/vault-default-3 |


Scenario Outline: Vault preservable formats lists
15 changes: 15 additions & 0 deletions tests/step_defs/api/common_vault.py
@@ -174,6 +174,21 @@ def data_package_status(user, vault, data_package, status):
raise AssertionError()


@then(parsers.parse('data package in {vault} passes troubleshooting script checks'))
def api_vault_batch_troubleshoot(user, vault, data_package):
http_status, result = api_request(
user,
"batch_troubleshoot_published_data_packages",
{"requested_package": data_package, "log_file": True, "offline": True}
)
assert http_status == 200
data = result['data']
assert len(data) == 1
# Confirm that all checks passed for this data package
for checks in data.values():
assert all(checks.values())
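The assertions above imply that the API returns one entry for the requested package, keyed by the package and mapped to named boolean checks. An illustrative response that would satisfy them (the key format and check names are assumptions, not the real check names):

# Hypothetical API result shape; key and check names are placeholders.
result = {
    "data": {
        "/tempZone/home/vault-default-2/research-initial[1725262507]": {
            "schema_check": True,
            "landing_page_check": True,
        }
    }
}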


@then('preservable formats lists are returned')
def preservable_formats_lists(api_response):
http_status, body = api_response
46 changes: 46 additions & 0 deletions tools/troubleshoot-published-data.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""This script collects all published packages and checks that they have all the required info.

Example:
To check all published packages:
python3 troubleshoot-published-data.py

To check one specific package by name:
python3 troubleshoot-published-data.py -p research-initial[1725262507]

To put results into a log file and complete the checks offline:
python3 troubleshoot-published-data.py -l -o
"""
import argparse
import subprocess


def parse_args():
parser = argparse.ArgumentParser(
prog="troubleshoot-published-data.py",
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-l", "--log-file", action='store_true',
help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log")
parser.add_argument("-o", "--offline", action='store_true',
help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).")
parser.add_argument("-n", "--no-datacite", action='store_true',
help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).")
parser.add_argument("-p", "--package", type=str, required=False,
help="Troubleshoot a specific data package by name (default: troubleshoot all packages)")
return parser.parse_args()


def main():
args = parse_args()
rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r"
data_package = f"*data_package={args.package}"
log_loc = f"*log_loc={args.log_file if args.log_file else ''}"
offline = f"*offline={args.offline}"
no_datacite = f"*no_datacite={args.no_datacite}"
subprocess.call(['irule', '-r', 'irods_rule_engine_plugin-python-instance', '-F',
rule_name, data_package, log_loc, offline, no_datacite])


if __name__ == '__main__':
main()
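For reference, with -p, -l and -o set, the subprocess call above is equivalent to running (shell quoting added):

irule -r irods_rule_engine_plugin-python-instance \
    -F /etc/irods/yoda-ruleset/tools/troubleshoot_data.r \
    '*data_package=research-initial[1725262507]' '*log_loc=True' '*offline=True' '*no_datacite=False'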
11 changes: 11 additions & 0 deletions tools/troubleshoot_data.r
@@ -0,0 +1,11 @@
#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F

def main(rule_args, callback, rei):
data_package = global_vars["*data_package"].strip('"')
log_loc = global_vars["*log_loc"].strip('"')
offline = global_vars["*offline"].strip('"')
no_datacite = global_vars["*no_datacite"].strip('"')
callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite)

INPUT *data_package="", *log_loc="", *offline="", *no_datacite=""
OUTPUT ruleExecOut
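Note that the Python wrapper passes booleans through as the strings "True"/"False", so the receiving rule has to normalise them. A hypothetical sketch of that normalisation (the actual rule_batch_troubleshoot_published_data_packages in troubleshoot_data.py may differ):

# Hypothetical sketch; parameter handling in troubleshoot_data.py may differ.
def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_loc, offline, no_datacite):
    # Parameters arrive as strings from the rule file above; convert the boolean-ish ones.
    write_to_log = log_loc == "True"
    offline = offline == "True"
    no_datacite = no_datacite == "True"
    # ... collect the published packages and run the checks for each ...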