
YDA-5829 troubleshooting tool for published data packages #517

Merged
merged 44 commits into from Oct 23, 2024
Changes from 42 commits
Commits
44 commits
ce5123c
initial commit
FuHsinyu Aug 29, 2024
85453b4
add rule to find all published data packages
FuHsinyu Aug 29, 2024
03ba1df
wrap up case 2,3,4
FuHsinyu Aug 30, 2024
c17f76c
add case 1 wrapped
FuHsinyu Aug 30, 2024
e4937ce
refactoring codes
FuHsinyu Aug 30, 2024
ae0cfc9
remove unused files
FuHsinyu Aug 30, 2024
e473b29
separate verify package schema from schema report for case 1
FuHsinyu Aug 30, 2024
f997da3
clean up codes
FuHsinyu Aug 30, 2024
69b7e9d
clean up codes
FuHsinyu Aug 30, 2024
c97a6c8
fix lint
FuHsinyu Aug 30, 2024
238809d
fix lint
FuHsinyu Aug 30, 2024
583d44b
improve doc strings
FuHsinyu Aug 30, 2024
93360d2
improve comments and doc strings
FuHsinyu Aug 30, 2024
5331ce9
fix lint
FuHsinyu Aug 30, 2024
32d94b7
improve doc strings
FuHsinyu Aug 30, 2024
953d962
add error info to doc string
FuHsinyu Aug 30, 2024
1124916
wip troubleshoot change to stdout, configurable log
claravox Oct 14, 2024
0541461
troubleshoot by url not md5
claravox Sep 4, 2024
0d7e509
troubleshoot by url for landing page and combi JSON
claravox Sep 6, 2024
0d4b06b
use schema cache, remove md5 code
claravox Oct 14, 2024
b02d053
Move verify vault package metadata function to meta.py
claravox Oct 14, 2024
bb9db6d
add try catch for Connection Error
claravox Sep 9, 2024
73a319f
lint
claravox Sep 9, 2024
69ba0b8
YDA-5829 - Work in progress for troubleshooting publications.
kaur16 Sep 22, 2024
c043d7e
add offline check option for landing page and datacite check
claravox Sep 30, 2024
912a5ef
simplify get landing page path and urls
claravox Sep 30, 2024
4b44fa6
minor changes after testing requests.get
claravox Oct 14, 2024
6c8b4fc
Add offline mode to datacite doi check
claravox Oct 2, 2024
b34c8a5
delete shell script in favor of python script
claravox Oct 2, 2024
48ffb88
Move write_stdout to log.write
claravox Oct 2, 2024
8a31a22
Call log.write rather than write_stdout in publication
claravox Oct 2, 2024
309ee5f
Fix print all test results bug
claravox Oct 2, 2024
136697c
Support both script and api version of troubleshoot publications
claravox Oct 2, 2024
4ddbe4d
api tests troubleshoot publication happy flow
claravox Oct 3, 2024
374684d
Fix check landing page call
claravox Oct 7, 2024
d337e2b
Unittest: check system avus
claravox Oct 14, 2024
04120e2
Add avu get func and its integration tests
claravox Oct 8, 2024
929b73f
Function for checking datacite doi registration
claravox Oct 9, 2024
ef936e6
Refine system AVUs check and unit tests
claravox Oct 14, 2024
9345d2c
Remove now unneeded inbox user
claravox Oct 14, 2024
adfdac0
Support optional avus for datacite
claravox Oct 15, 2024
6ff897a
Add datacite check toggle
claravox Oct 15, 2024
abbdfbd
Rename troubleshoot publication file
claravox Oct 18, 2024
1c9a5b4
Update log.write function description
claravox Oct 18, 2024
1 change: 1 addition & 0 deletions __init__.py
@@ -41,6 +41,7 @@
from schema import *
from schema_transformation import *
from schema_transformations import *
from troubleshoot_data import *
from vault import *
from datacite import *
from epic import *
27 changes: 27 additions & 0 deletions integration_tests.py
@@ -117,6 +117,27 @@ def _test_avu_rmw_collection(ctx, rmw_attributes):
return result


def _test_avu_get_attr_val_of_coll(ctx, attr, value):
# Test getting the value of an attribute on a collection
tmp_coll = _create_tmp_collection(ctx)
ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz")
result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr)
collection.remove(ctx, tmp_coll)
return result


def _test_avu_get_attr_val_of_coll_exception(ctx):
# Test that getting a non existing attribute on a collection raises an exception (True for exception raised)
tmp_coll = _create_tmp_collection(ctx)
result = False
try:
result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo")
except Exception:
result = True
collection.remove(ctx, tmp_coll)
return result


def _test_folder_set_retry_avus(ctx):
tmp_coll = _create_tmp_collection(ctx)
folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2)
@@ -482,6 +503,12 @@ def _test_folder_secure_func(ctx, func):
"check": lambda x: (("aap", "noot", "mies") in x
and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1
)},
{"name": "avu.get_attr_val_of_coll.exists.yes",
"test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"),
"check": lambda x: x == "bar"},
{"name": "avu.get_attr_val_of_coll.exists.no",
"test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx),
"check": lambda x: x},
{"name": "avu.apply_atomic_operations.collection",
"test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx),
"check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)},
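These tests exercise the new avu.get_attr_val_of_coll helper, whose implementation is not part of this diff. A minimal sketch of such a helper, following the genquery pattern used elsewhere in the ruleset (the actual implementation in util/avu.py may differ):

import genquery


def get_attr_val_of_coll(ctx, coll, attr):
    """Return the value of attribute 'attr' on collection 'coll'; raise if it is not set."""
    iter = genquery.row_iterator(
        "META_COLL_ATTR_VALUE",
        "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = '{}'".format(coll, attr),
        genquery.AS_LIST, ctx)

    for row in iter:
        return row[0]

    raise ValueError("Attribute {} not found on collection {}".format(attr, coll))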
48 changes: 48 additions & 0 deletions meta.py
@@ -13,6 +13,7 @@
import irods_types
from deepdiff import DeepDiff

import meta_form
import provenance
import publication
import schema as schema_
@@ -790,3 +791,50 @@ def copy_user_metadata(ctx, source, target):
log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}/original>".format(source, target))
except Exception:
log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target))


def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout):
"""Process a single data package to retrieve and validate that its metadata conforms to the schema.

:param ctx: Combined type of a callback and rei struct
:param coll_name: String representing the data package collection path.
:param schema_cache: Dictionary storing schema blueprints, can be empty.
:param report_name: Name of report script (for logging)
:param write_stdout: A boolean representing whether to write to stdout or rodsLog

:returns: A dictionary result containing if schema matches and the schema short name.
"""
metadata_path = get_latest_vault_metadata_path(ctx, coll_name)

if not metadata_path:
log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout)
return None

try:
metadata = jsonutil.read(ctx, metadata_path)
except Exception as exc:
log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout)
log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout)
return None

# Determine schema
schema_id = schema_.get_schema_id(ctx, metadata_path)
schema_shortname = schema_id.split("/")[-2]

# Retrieve schema and cache it for future use
schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id)
if schema_shortname in schema_cache:
schema_contents = schema_cache[schema_shortname]
else:
schema_contents = jsonutil.read(ctx, schema_path)
schema_cache[schema_shortname] = schema_contents

# Check whether metadata matches schema and log any errors
error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
match_schema = len(error_list) == 0
if not match_schema:
errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]
log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout)
log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout)

return {"schema": schema_shortname, "match_schema": match_schema}
10 changes: 5 additions & 5 deletions publication.py
@@ -1326,10 +1326,10 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp
:param update_moai: Flag that indicates updating MOAI (OAI-PMH)
"""
if user.user_type(ctx) != 'rodsadmin':
log.write_stdout(ctx, "User is no rodsadmin")
log.write(ctx, "User is no rodsadmin", True)
return

log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True)
collections = genquery.row_iterator(
"COLL_NAME",
"COLL_NAME like '%%/home/vault-%%' "
@@ -1345,12 +1345,12 @@
if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)):
packages_found = True
output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes')
log.write_stdout(ctx, coll_name + ': ' + output)
log.write(ctx, coll_name + ': ' + output, True)

if not packages_found:
log.write_stdout(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True)
else:
log.write_stdout(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package))
log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True)


def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False):
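The calls above replace log.write_stdout with log.write plus an explicit stdout flag. A rough sketch of that dispatch, assuming the flag defaults to False (the actual util/log.py implementation may differ):

# Assumed shape of log.write() after this change; the real function may differ.
def write(ctx, message, write_stdout=False):
    """Write a message to stdout (for script runs) or to the rodsLog (default)."""
    if write_stdout:
        ctx.writeLine("stdout", message)
    else:
        ctx.writeLine("serverLog", message)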
41 changes: 6 additions & 35 deletions schema_transformation.py
@@ -19,7 +19,6 @@
import session_vars

import meta
import meta_form
import schema
import schema_transformations
from util import *
@@ -405,41 +404,13 @@ def rule_batch_vault_metadata_schema_report(ctx):
genquery.AS_LIST, ctx)

for row in iter:
coll_name = row[0]
metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name)

if metadata_path == '' or metadata_path is None:
log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found."
% (coll_name))
continue

try:
metadata = jsonutil.read(ctx, metadata_path)
except Exception as exc:
log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s."
% (coll_name, metadata_path, str(exc)))
coll_name = row[0]
result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True)
if result:
results[coll_name] = result
except Exception as e:
log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e)))
continue

# Determine schema
schema_id = schema.get_schema_id(ctx, metadata_path)
schema_shortname = schema_id.split("/")[-2]

# Retrieve schema and cache it for future use
schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
if schema_shortname in schema_cache:
schema_contents = schema_cache[schema_shortname]
else:
schema_contents = jsonutil.read(ctx, schema_path)
schema_cache[schema_shortname] = schema_contents

# Check whether metadata matches schema and log any errors
error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
match_schema = len(error_list) == 0
if not match_schema:
log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" %
(metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list])))

# Update results
results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema}

return json.dumps(results)
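For reference, the JSON string returned by the report rule now maps each vault collection to the dictionary produced by meta.vault_metadata_matches_schema; an illustrative value (the path and schema short name are placeholders):

import json

example = json.dumps({
    "/tempZone/home/vault-default-2/research-initial[1725262507]": {
        "schema": "default-3",
        "match_schema": True,
    }
})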
11 changes: 11 additions & 0 deletions tests/features/api/api_vault.feature
@@ -98,6 +98,17 @@ Feature: Vault API
| /tempZone/home/vault-default-2 |
| /tempZone/home/vault-core-2 |
| /tempZone/home/vault-default-3 |


Scenario Outline: Published vault package passes troubleshooting script checks
Given user technicaladmin is authenticated
And data package exists in <vault>
Then data package in <vault> passes troubleshooting script checks

Examples:
| vault |
| /tempZone/home/vault-default-2 |
| /tempZone/home/vault-default-3 |


Scenario Outline: Vault preservable formats lists
15 changes: 15 additions & 0 deletions tests/step_defs/api/common_vault.py
@@ -174,6 +174,21 @@ def data_package_status(user, vault, data_package, status):
raise AssertionError()


@then(parsers.parse('data package in {vault} passes troubleshooting script checks'))
def api_vault_batch_troubleshoot(user, vault, data_package):
http_status, result = api_request(
user,
"batch_troubleshoot_published_data_packages",
{"requested_package": data_package, "log_file": True, "offline": True}
)
assert http_status == 200
data = result['data']
assert len(data) == 1
# Confirm that all checks passed for this data package
for checks in data.values():
assert all(checks.values())
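The assertions above imply that the API returns one entry for the requested package, keyed by the package and mapped to named boolean checks. An illustrative response that would satisfy them (the key format and check names are assumptions, not the real check names):

# Hypothetical API result shape; key and check names are placeholders.
result = {
    "data": {
        "/tempZone/home/vault-default-2/research-initial[1725262507]": {
            "schema_check": True,
            "landing_page_check": True,
        }
    }
}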


@then('preservable formats lists are returned')
def preservable_formats_lists(api_response):
http_status, body = api_response
46 changes: 46 additions & 0 deletions tools/troubleshoot-published-data.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""This script collects all published packages and checks that they have all the required info.

Example:
To check all published packages:
python3 troubleshoot-published-data.py

To check one specific package by name:
python3 troubleshoot-published-data.py -p research-initial[1725262507]

To put results into a log file and complete the checks offline:
python3 troubleshoot-published-data.py -l -o
"""
import argparse
import subprocess


def parse_args():
parser = argparse.ArgumentParser(
prog="troubleshoot-published-data.py",
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-l", "--log-file", action='store_true',
help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log")
parser.add_argument("-o", "--offline", action='store_true',
help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).")
parser.add_argument("-n", "--no-datacite", action='store_true',
help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).")
parser.add_argument("-p", "--package", type=str, required=False,
help="Troubleshoot a specific data package by name (default: troubleshoot all packages)")
return parser.parse_args()


def main():
args = parse_args()
rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r"
data_package = f"*data_package={args.package}"
log_loc = f"*log_loc={args.log_file if args.log_file else ''}"
offline = f"*offline={args.offline}"
no_datacite = f"*no_datacite={args.no_datacite}"
subprocess.call(['irule', '-r', 'irods_rule_engine_plugin-python-instance', '-F',
rule_name, data_package, log_loc, offline, no_datacite])


if __name__ == '__main__':
main()
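For reference, with -p, -l and -o set, the subprocess call above is equivalent to running (shell quoting added):

irule -r irods_rule_engine_plugin-python-instance \
    -F /etc/irods/yoda-ruleset/tools/troubleshoot_data.r \
    '*data_package=research-initial[1725262507]' '*log_loc=True' '*offline=True' '*no_datacite=False'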
11 changes: 11 additions & 0 deletions tools/troubleshoot_data.r
@@ -0,0 +1,11 @@
#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F

def main(rule_args, callback, rei):
data_package = global_vars["*data_package"].strip('"')
log_loc = global_vars["*log_loc"].strip('"')
offline = global_vars["*offline"].strip('"')
no_datacite = global_vars["*no_datacite"].strip('"')
callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite)

INPUT *data_package="", *log_loc="", *offline="", *no_datacite=""
OUTPUT ruleExecOut
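Note that the Python wrapper passes booleans through as the strings "True"/"False", so the receiving rule has to normalise them. A hypothetical sketch of that normalisation (the actual rule_batch_troubleshoot_published_data_packages in troubleshoot_data.py may differ):

# Hypothetical sketch; parameter handling in troubleshoot_data.py may differ.
def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_loc, offline, no_datacite):
    # Parameters arrive as strings from the rule file above; convert the boolean-ish ones.
    write_to_log = log_loc == "True"
    offline = offline == "True"
    no_datacite = no_datacite == "True"
    # ... collect the published packages and run the checks for each ...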