From c1eb47483ee20cc508d822bbd173465ecbc59087 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:56:20 -0300 Subject: [PATCH 01/20] Update analysis tooling --- scripts/.gitignore | 37 +++++++ scripts/README.md | 56 +++++----- scripts/data_analysis.py | 55 ++++++++++ scripts/datastream_export.py | 137 +++++++++++++++++++++++ scripts/datastream_updater.py | 200 ++++++++++++++++++++++++++++++++++ scripts/export_metadata.sh | 33 ------ scripts/metadata_analysis.sh | 66 ----------- scripts/queries.py | 108 ++++++++++++++++++ scripts/requirements.txt | 6 + scripts/util.in | 89 --------------- scripts/utils.py | 24 ++++ 11 files changed, 598 insertions(+), 213 deletions(-) create mode 100644 scripts/.gitignore create mode 100644 scripts/data_analysis.py create mode 100644 scripts/datastream_export.py create mode 100644 scripts/datastream_updater.py delete mode 100644 scripts/export_metadata.sh delete mode 100644 scripts/metadata_analysis.sh create mode 100644 scripts/queries.py create mode 100644 scripts/requirements.txt delete mode 100644 scripts/util.in create mode 100644 scripts/utils.py diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 0000000..b0df7e8 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,37 @@ +# Python +*.py[cod] +__pycache__/ +venv/ +*.pyc + +# Jupyter Notebook +.ipynb_checkpoints/ + +# IDEs +.idea/ +.vscode/ + +# Compiled files +*.pyd +*.pyo +*.pyw +*.pyz +*.pyzw + +# Distribution / packaging +dist/ +build/ +*.egg-info/ +*.egg + +# Environment +.env +.env.* + +# Script Outputs +results/* +output/* +*.xml + +# Etc +.DS_Store diff --git a/scripts/README.md b/scripts/README.md index 8b37736..1d2cdc9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,56 +1,62 @@ # FCREPO3 Analysis Helpers - ## Introduction -Scripts to analyse and export metadata from an FCREPO3 instance. 
+Tools to analyse and export metadata from an FCREPO3 instance using Python scripts. ## Table of Contents - * [Setup](#setup) * [Features](#features) * [Usage](#usage) ## Setup +These tools are designed to be run with a Python environment. Ensure Python 3.6 or higher is installed on your system. You will need to set up a Python virtual environment and install the required packages: -These scripts require an FCREPO3 instance to be run over. In the event, these scripts are run on a separate system from where -the repository lives, modifications may be required to the `fedora-xacml-policies` directory located at `$FEDORA_HOME/data/fedora-xacml-policies`. +```bash +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` -The metadata export command requires [GNU Parallel](https://www.gnu.org/software/parallel/parallel.html) to be installed -for faster processing. +The scripts also require an FCREPO3 instance. If these tools are run on a system separate from where the repository is hosted, modifications might be necessary in the `fedora-xacml-policies` directory at `$FEDORA_HOME/data/fedora-xacml-policies`. ## Features - ### Metadata Analysis -A script to generate the following: -1. A total count of all objects in the repository. -2. A breakdown of objects by content models and their count in CSV form (`models.csv`). -3. A breakdown of unique datastream IDs and their count in CSV form (`dsids.csv`). +Python scripts that perform the following: +1. Count all objects in the repository. +2. Provide a breakdown of objects by content models (`models.csv`). +3. Output a breakdown of unique datastream IDs (`dsids.csv`). ### Metadata Export -A script to export all objects within the repository that contain a specified metadata datastream ID. +Scripts to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. 
## Usage - ### Metadata Analysis #### Command ```bash -sudo bash /path_to_the_module/scripts/metadata_analysis.sh --fedora_pass=the_password +python3 data_analysis.py --url=http://your-fedora-url --user=admin --password=secret --output_dir=./results ``` - #### Output -``` -The total number of objects is 40. -Outputted model breakdown to CSV (models.csv). -Outputted DSID breakdown to CSV (dsids.csv). -``` +Exports all queries found in `queries.py` to their own CSV in the `results` folder by default. Can be changed with the `--output_dir` flag. ### Metadata Export #### Command ```bash -sudo bash shell_scripts/export_metadata.sh --fedora_pass=the_password --skip_auth_check +python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --password=secret --dsid=DSID --output_dir=./output --pid_file=./some_pids +``` +> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). + +#### Output +Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory. +Each file's name will be in the format `pid-DSID.xml`. + +### Datastream Updater +#### Command +```bash +python3 datastream_updater.py --xml=input.xml --dsid=DSID --content=content.bin --label='New Version' --output=output.xml ``` +> This script allows you to specify the XML file to modify, the datastream ID, the binary content file (which will be base64 encoded), and optionally a label for the new datastream version. -> Utilizing the `--skip_auth_check` flag here is an important performance optimization as it will greatly speed up the -export operation due to not needing to validate the request prior. +The only optional argument is `label`, which lets you specify a custom label for the new datastream version. If previous datastream versions do not have a label and you didn't specify one in the args, it will prompt you for a new one. 
#### Output -The command does not output anything but will export all objects in the form of `the:pid-DSID.xml`. +Updates the specified XML file with a new version of the datastream, encoding the provided binary content into base64. The updated XML is saved to the specified output file. + diff --git a/scripts/data_analysis.py b/scripts/data_analysis.py new file mode 100644 index 0000000..a71af10 --- /dev/null +++ b/scripts/data_analysis.py @@ -0,0 +1,55 @@ +import argparse +import os +from utils import perform_http_request +from queries import queries + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Process SPARQL queries and save results." + ) + parser.add_argument("--url", type=str, help="Fedora server URL", required=True) + parser.add_argument("--user", type=str, help="Fedora username", required=True) + parser.add_argument("--password", type=str, help="Fedora password", required=True) + parser.add_argument( + "--output_dir", + type=str, + default="./results", + help="Directory to save CSV files", + ) + return parser.parse_args() + + +def save_to_csv(data, filename, output_dir): + """ + Save the given data to a CSV file. + + Args: + data (str): The data to be written to the CSV file. + filename (str): The name of the CSV file. + output_dir (str): The directory where the CSV file will be saved. 
+ + Returns: + None + """ + os.makedirs(output_dir, exist_ok=True) + with open(os.path.join(output_dir, filename), "w", newline="") as file: + file.write(data) + + +def main(): + args = parse_args() + + for query_name, query in queries.items(): + print(f"Processing query '{query_name}'...") + result = perform_http_request(query, args.url, args.user, args.password) + if result: + csv_filename = f"{query_name}.csv" + print(f"Saving results to {csv_filename}...\n") + save_to_csv(result, csv_filename, args.output_dir) + else: + print(f"Failed to retrieve data for query '{query_name}'.\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/datastream_export.py b/scripts/datastream_export.py new file mode 100644 index 0000000..90aa0bd --- /dev/null +++ b/scripts/datastream_export.py @@ -0,0 +1,137 @@ +import argparse +import requests +from tqdm import tqdm +import concurrent.futures +import os +import mimetypes +from utils import perform_http_request + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Export metadata using SPARQL query and save as XML." + ) + parser.add_argument("--url", required=True, help="Fedora base URL") + parser.add_argument("--user", required=True, help="Username for Fedora access") + parser.add_argument("--password", required=True, help="Password for Fedora access") + parser.add_argument("--dsid", required=True, help="Datastream ID for querying") + parser.add_argument( + "--output_dir", default="./output", help="Directory to save XML files" + ) + parser.add_argument( + "--pid_file", type=str, help="File containing PIDs to process", required=False + ) + return parser.parse_args() + + +def fetch_data(dsid, base_url, user, password, output_dir, obj_id): + """ + Fetches the datastream content for a given datastream ID (dsid) and object ID (obj_id) from a Fedora repository. + + Args: + dsid (str): The ID of the datastream to fetch. + base_url (str): The base URL of the Fedora repository. 
+ user (str): The username for authentication. + password (str): The password for authentication. + output_dir (str): The directory where the fetched data will be saved. + obj_id (str): The ID of the object that contains the datastream. + + Returns: + bool: True if the datastream content was successfully fetched and saved, False otherwise. + """ + obj_id = obj_id.replace("info:fedora/", "") + url = f"{base_url}/fedora/objects/{obj_id}/datastreams/{dsid}/content" + print(f"Downloading {dsid} for PID: {obj_id}") + try: + response = requests.get(url, auth=(user, password)) + response.raise_for_status() + dsid_dir = os.path.join(output_dir, dsid) + os.makedirs(dsid_dir, exist_ok=True) + content_type = response.headers.get("Content-Type", "") + extension = mimetypes.guess_extension(content_type) if content_type else "" + filename = f"{obj_id}-{dsid}{extension}" + with open(os.path.join(dsid_dir, filename), "wb") as f: + f.write(response.content) + print(f"Successfully saved {filename}") + return True + except Exception as e: + print(f"Failed to fetch data for {obj_id}, error: {str(e)}") + return False + + +def process_pid_file(filepath): + """ + Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. + Supports comments in the file using '#' character. + Replace '%3A' with ':' in PIDs. + + Args: + filepath (str): The path to the file containing PIDs. + + Returns: + list: A list of PIDs extracted from the file. + """ + pids = [] + with open(filepath, "r") as file: + for line in file: + line = line.strip() + if "#" in line: + line = line[: line.index("#")].strip() + if line: + line = line.replace("%3A", ":") + pids.append(line) + return pids + + +def main(): + args = parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + object_ids = [] + + # If a PID file is provided, process the file to get the list of PIDs. 
+ if args.pid_file: + object_ids = process_pid_file(args.pid_file) + else: + query = f""" + SELECT ?obj WHERE {{ + ?obj ; + ?model; + ?ds. + ?ds + FILTER(!sameTerm(?model, )) + FILTER(!sameTerm(?model, )) + }} + """ + + result = perform_http_request(query, args.url, args.user, args.password) + object_ids.extend(result.strip().split("\n")[1:]) + + # Download metadata for each PID in parallel using ThreadPoolExecutor. + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm( + total=len(object_ids), desc="Downloading Metadata" + ) as progress: + futures = { + executor.submit( + fetch_data, + args.dsid, + args.url, + args.user, + args.password, + args.output_dir, + obj_id, + ): obj_id + for obj_id in object_ids + } + for future in concurrent.futures.as_completed(futures): + obj_id = futures[future] + try: + success = future.result() + if success: + progress.update(1) + except Exception as exc: + print(f"{obj_id} generated an exception: {exc}") + + +if __name__ == "__main__": + main() diff --git a/scripts/datastream_updater.py b/scripts/datastream_updater.py new file mode 100644 index 0000000..0b2e6cb --- /dev/null +++ b/scripts/datastream_updater.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python + +import base64 +import datetime +import argparse +from lxml import etree as ET +from lxml.etree import QName +import mimetypes +import logging + +# Setting up basic logging +logging.basicConfig(level=logging.INFO) + + +def format_xml_element(element, level=0, indent=" "): + """ + Formats an XML element by adding appropriate spacing and indentation. + + Args: + element (Element): The XML element to format. + level (int, optional): The current level of indentation. Defaults to 0. + indent (str, optional): The string used for indentation. Defaults to " ". 
+ + Returns: + None + """ + spacing = "\n" + level * indent + + if len(element): + if not element.text or not element.text.strip(): + element.text = spacing + indent + if not element.tail or not element.tail.strip(): + element.tail = spacing + for child in element: + format_xml_element(child, level + 1, indent) + else: + if level and (not element.tail or not element.tail.strip()): + element.tail = spacing + + +def compress_and_encode(file_path): + """ + Compresses and encodes the binary data from the given file path. + + Args: + file_path (str): The path to the file containing the binary data. + + Returns: + tuple: A tuple containing the indented base64-encoded data and the original size of the binary data. + """ + with open(file_path, "rb") as f_in: + binary_data = f_in.read() + original_size = len(binary_data) + base64_data = base64.b64encode(binary_data) + base64_lines = [ + base64_data[i : i + 80].decode("utf-8") + for i in range(0, len(base64_data), 80) + ] + indented_base64 = "\n ".join(base64_lines) + return indented_base64, original_size + + +def register_namespaces(xml_path): + """ + Registers XML namespaces from the given XML file. + + Args: + xml_path (str): The path to the XML file. + + Raises: + Exception: If there is an error registering the namespaces. + """ + try: + namespaces = dict( + [node for _, node in ET.iterparse(xml_path, events=["start-ns"])] + ) + for ns in namespaces: + ET.register_namespace(ns, namespaces[ns]) + except Exception as e: + logging.error(f"Error registering namespaces: {e}") + raise + + +def add_datastream_version( + xml_path, dsid, base64_data, original_size, mimetype, label=None +): + """ + Adds a new version of a datastream to an XML file. + + Args: + xml_path (str): The path to the XML file. + dsid (str): The ID of the datastream. + base64_data (str): The base64-encoded content of the datastream. + original_size (int): The original size of the datastream in bytes. + mimetype (str): The MIME type of the datastream. 
+ label (str, optional): The label for the datastream version. If not provided, a default label will be used. + + Returns: + str: The XML string with the new datastream version added. + + Raises: + ET.ParseError: If there is an error parsing the XML file. + Exception: If there is an error creating the XML string. + """ + try: + root = ET.parse(xml_path).getroot() + except ET.ParseError as e: + logging.exception(f"XML parsing error: {e}") + return + + nsmap = { + "foxml": "info:fedora/fedora-system:def/foxml#", + "xsi": "http://www.w3.org/2001/XMLSchema-instance", + "audit": "info:fedora/fedora-system:def/audit#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "fedora": "info:fedora/fedora-system:def/relations-external#", + "fedora-model": "info:fedora/fedora-system:def/model#", + "islandora": "http://islandora.ca/ontology/relsext#", + } + + # Have to use qualified names when creating an element. + ds_version_tag = QName(nsmap["foxml"], "datastreamVersion") + binary_content_tag = QName(nsmap["foxml"], "binaryContent") + + datastream = root.find(f".//foxml:datastream[@ID='{dsid}']", namespaces=nsmap) + if datastream is None: + logging.warning(f"Datastream with ID of {dsid} does not exist.") + return + + if label is None: + datastream_version = datastream.find( + ".//foxml:datastreamVersion[last()]", namespaces=nsmap + ) + label = ( + datastream_version.get("LABEL") + if datastream_version is not None + else "default_label" + ) + + new_id = "{}.{}".format( + dsid, len(datastream.findall(".//foxml:datastreamVersion", namespaces=nsmap)) + ) + datastream_version = ET.SubElement( + datastream, + ds_version_tag, + { + "ID": new_id, + "LABEL": label, + "MIMETYPE": mimetype, + "SIZE": str(original_size), + }, + ) + + dt = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" + datastream_version.set("CREATED", dt) + + binary_content = ET.SubElement(datastream_version, binary_content_tag) + binary_content.text = "\n " + base64_data + "\n " + + try: 
+ ET.indent(root, space=" ") + format_xml_element(root) + xml_string = ET.tostring( + root, encoding="utf-8", method="xml", xml_declaration=True + ) + except Exception as e: + logging.exception(f"Error creating XML string: {e}") + raise + + return xml_string + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--xml", help="path to the XML file to modify", required=True) + parser.add_argument("--dsid", help="ID of the datastream to modify", required=True) + parser.add_argument( + "--content", + help="path to the binary content to add as a new datastreamVersion", + required=True, + ) + parser.add_argument("--label", help="label of the new datastream version") + parser.add_argument("--output", help="path to the output XML file", required=True) + args = parser.parse_args() + + try: + mimetype, _ = mimetypes.guess_type(args.content) + mimetype = mimetype or "application/octet-stream" + + base64_data, original_size = compress_and_encode(args.content) + register_namespaces(args.xml) + updated_xml = add_datastream_version( + args.xml, args.dsid, base64_data, original_size, mimetype, args.label + ) + + if updated_xml: + with open(args.output, "w") as f_out: + f_out.write(updated_xml.decode("utf-8")) + except Exception as e: + logging.exception(f"Error in script execution: {e}") diff --git a/scripts/export_metadata.sh b/scripts/export_metadata.sh deleted file mode 100644 index 1ed4fbc..0000000 --- a/scripts/export_metadata.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -SCRIPT_DIR=$(dirname $(readlink -f $0)) - -. "$SCRIPT_DIR/util.in" - -# Ensure GNU Parallel is installed. -if ! type parallel >/dev/null 2>&1 ; then - printf "*****************************************************\n" - printf "* Error: GNU Parallel is not installed. *\n" - printf "*****************************************************\n" - exit 1 -fi - -DSID_QUERY=$(cat << EOQ -SELECT ?obj -WHERE { - ?obj ; - ?model ; - ?ds . 
- ?ds -FILTER(!sameTerm(?model, )) -FILTER(!sameTerm(?model, )) -} -EOQ -) - -# Go perform the query for all objects and pass off to parallel to do the heavy lifting. To note the URI here chops off -# the info:fedora/ piece from the front with Perl. -# @see: https://www.gnu.org/software/parallel/parallel_tutorial.html#perl-expression-replacement-string -do_curl "${DSID_QUERY}" "CSV" | parallel --jobs 3 --skip-first-line curl --location "${FEDORA_URL}:8080/fedora/objects/{= s/info:fedora\/// =}/datastreams/${DSID}/content" \ --u "${FEDORA_USER}:${FEDORA_PASS}" \ --o "${SCRIPT_DIR}/{= s/info:fedora\/// =}-${DSID}.xml" diff --git a/scripts/metadata_analysis.sh b/scripts/metadata_analysis.sh deleted file mode 100644 index c4fe19b..0000000 --- a/scripts/metadata_analysis.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -SCRIPT_DIR=$(dirname $(readlink -f $0)) - -. "$SCRIPT_DIR/util.in" - -# Base RI query to build things up. -BASE_QUERY=$(cat << EOQ -WHERE { - ?obj ; - ?model ; -FILTER(!sameTerm(?model, )) -FILTER(!sameTerm(?model, )) -} -EOQ -) - -# Retrieves a list of content models and their count to CSV. -model_breakdown() { - local QUERY=$(cat << EOQ -SELECT ?model (COUNT(?model) as ?count) -${BASE_QUERY} -GROUP BY ?model -EOQ -) - - do_curl "$QUERY" "CSV" > "$SCRIPT_DIR"/models.csv - echo "Outputted model breakdown to CSV (${SCRIPT_DIR}/models.csv)." -} - -# Retrieves the total amount of objects in the repository. -total_count() { - local QUERY=$(cat << EOQ -SELECT ?obj -${BASE_QUERY} -EOQ -) - - local COUNT=$(do_curl "$QUERY" "count") - echo "The total number of objects is ${COUNT}." -} - -# Breaks down the unique datastream IDs and their count to CSV. 
-dsid_breakdown() { - local QUERY=$(cat << EOQ -SELECT ?ds (COUNT(?ds) as ?count) -WHERE { - ?obj ; - ?model ; - [ ?ds] -FILTER(!sameTerm(?model, )) -FILTER(!sameTerm(?model, )) -} -GROUP BY ?ds -EOQ -) - - do_curl "$QUERY" "CSV" > "$SCRIPT_DIR"/dsids.csv - echo "Outputted DSID breakdown to CSV (${SCRIPT_DIR}/dsids.csv)." -} - -total_count -model_breakdown -dsid_breakdown - -exit 0 diff --git a/scripts/queries.py b/scripts/queries.py new file mode 100644 index 0000000..7e9571d --- /dev/null +++ b/scripts/queries.py @@ -0,0 +1,108 @@ +queries = { + "content_model_distribution": """ + SELECT ?model (COUNT(?obj) as ?count) + FROM <#ri> + WHERE { + ?obj ?model; + } + GROUP BY ?model + """, + + "object_count": """ + SELECT (COUNT(?obj) as ?count) + FROM <#ri> + WHERE { + ?obj . + } + LIMIT 1 + """, + + "active_deleted_count": """ + SELECT (COUNT(?activeObj) AS ?ActiveCount) (COUNT(?deletedObj) AS ?DeletedCount) + FROM <#ri> + WHERE { + { + SELECT ?activeObj + WHERE { + ?activeObj . + } + } UNION { + SELECT ?deletedObj + WHERE { + ?deletedObj . + } + } + } + """, + + "deleted_objects": """ + SELECT ?obj + FROM <#ri> + WHERE { + ?obj + } + """, + + "datastream_distribution": """ + SELECT ?datastream (COUNT(?datastream) as ?count) + FROM <#ri> + WHERE { + ?obj ; + OPTIONAL { + ?obj ?c . + ?c ?datastream ; + } + } + GROUP BY ?datastream + """, + + "owner_distribution": """ + SELECT ?owner (COUNT(?obj) as ?count) + FROM <#ri> + WHERE { + ?obj ?owner; + } + GROUP BY ?owner + """, + + "collection_distribution": """ + SELECT ?collection (COUNT(?obj) as ?count) + FROM <#ri> + WHERE { + ?obj ?collection; + } + GROUP BY ?collection + """, + + "relationships": """ + SELECT DISTINCT ?relationship + FROM <#ri> + WHERE { + ?o ?relationship ?s . + ?o + } + """, + + "orphaned_objects": """ + SELECT DISTINCT ?orphan + FROM <#ri> + WHERE { + ?orphan + FILTER NOT EXISTS { + ?orphan ?subject . + } + FILTER NOT EXISTS { + ?orphan ?subject . 
+ } + } + """, + + "mimetype_distribution": """ + SELECT ?mimetype (COUNT(?mimetype) as ?count) + FROM <#ri> + WHERE { + ?o ?mimetype + } + GROUP BY ?mimetype + """ +} diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..cbf27c6 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,6 @@ +### Non Version Specific Requirements +pyyaml +requests +tqdm +bs4 +lxml diff --git a/scripts/util.in b/scripts/util.in deleted file mode 100644 index de1db76..0000000 --- a/scripts/util.in +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash - -FEDORA_URL=http://localhost -FEDORA_USER=fedoraAdmin -SKIP_AUTH=false -DSID=MODS - -print_help() { - cat <<-HELP -This script is used to generate statistics or pull data from a FCREPO 3 instance for migration data analysis. -The following arguments are provided: - --fedora_url [FCREPO3 URL] - Default ($FEDORA_URL) - --fedora_user [FCREPO3 User] - Default ($FEDORA_USER) - --fedora_pass [FCREPO3 Password] - REQUIRED - --skip_auth_check [Skips verification of credentials on a request] - Default (false) - --dsid [Metadata datastream ID to be used for metadata export] - Default ($DSID) -HELP -exit 0 -} - - -# Helper to perform RI searches. -do_curl() { -curl --location --request POST "${FEDORA_URL}:8080/fedora/risearch" \ --u "${FEDORA_USER}:${FEDORA_PASS}" \ --s \ ---header 'Content-Type: application/x-www-form-urlencoded' \ ---data-urlencode "type=tuples" \ ---data-urlencode 'lang=sparql' \ ---data-urlencode "format=$2" \ ---data-urlencode 'limit=' \ ---data-urlencode 'dt=on' \ ---data-urlencode "query=$1" -} - -# Helper to ensure can authenticate to Fedora. 
-check_auth() { -CODE=$(curl --location "${FEDORA_URL}:8080/fedora/objects/fedora-system:ContentModel-3.0/export" \ --u "${FEDORA_USER}:${FEDORA_PASS}" \ --s \ --w '%{http_code}' \ --o /dev/null \ -) -echo "${CODE}" -} - -while [ "$#" -gt 0 ]; do - case "$1" in - --fedora_url=*) - FEDORA_URL="${1#*=}" - ;; - --fedora_user=*) - FEDORA_USER="${1#*=}" - ;; - --fedora_pass=*) - FEDORA_PASS="${1#*=}" - ;; - --skip_auth_check) - SKIP_AUTH=true - ;; - --dsid=*) - DSID="${1#*=}" - ;; - --help) print_help;; - *) - printf "************************************************************\n" - printf "* Error: Invalid argument, run --help for valid arguments. *\n" - printf "************************************************************\n" - exit 1 - esac - shift -done - -if [ -z "${FEDORA_PASS}" ]; then - printf "*****************************************************\n" - printf "* Error: A password for the Fedora user is required *\n" - printf "*****************************************************\n" - exit 1 -fi; - -if [ "${SKIP_AUTH}" = false ]; then - AUTH_CODE=$(check_auth) - if [ "${AUTH_CODE}" != 200 ]; then - printf "************************************************************\n" - printf "* Error: Authentication failed to Fedora. 
*\n" - printf "************************************************************\n" - exit 1 - fi -fi diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..4012fd7 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,24 @@ +import requests + + +def perform_http_request(query, endpoint_url, user, password, output_format="CSV"): + headers = {"Content-Type": "application/x-www-form-urlencoded"} + payload = { + "type": "tuples", + "lang": "sparql", + "format": output_format, + "limit": "", + "dt": "on", + "query": query, + } + response = requests.post( + f"{endpoint_url}/fedora/risearch", + auth=(user, password), + headers=headers, + data=payload, + ) + if response.status_code == 200: + return response.text + else: + print(f"Error {response.status_code} while querying: {query}") + return None From 69ab4ee071bfe2903a647ccec870553235aa7fe1 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 12:02:10 -0300 Subject: [PATCH 02/20] Update README --- scripts/README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 1d2cdc9..05fb31d 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -20,13 +20,23 @@ The scripts also require an FCREPO3 instance. If these tools are run on a system ## Features ### Metadata Analysis -Python scripts that perform the following: -1. Count all objects in the repository. -2. Provide a breakdown of objects by content models (`models.csv`). -3. Output a breakdown of unique datastream IDs (`dsids.csv`). +Script to run SPARQL queries against an FCREPO's RI and gather information. 
Current queries include: + - Content model distribution + - Total object count + - Count of active and deleted objects + - List of deleted objects + - Datastream distribution + - Owner distribution + - Collection distribution + - List of relationships + - List of orphaned objects + - MIME type distribution ### Metadata Export -Scripts to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. +Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. + +### Datastream Updater +Script to inject a binary into an archival FOXML as base64 encoded data within a datastream. ## Usage ### Metadata Analysis From 08beabc558add75031c69e8fa101f40373a19517 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:11:49 -0300 Subject: [PATCH 03/20] Add FOXML export --- scripts/README.md | 13 +++++ scripts/datastream_export.py | 62 ++++++++---------------- scripts/foxml_export.py | 93 ++++++++++++++++++++++++++++++++++++ scripts/utils.py | 36 ++++++++++++++ 4 files changed, 161 insertions(+), 43 deletions(-) create mode 100644 scripts/foxml_export.py diff --git a/scripts/README.md b/scripts/README.md index 05fb31d..afbe2c9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -35,6 +35,9 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr ### Metadata Export Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. +### FOXML Export +Script to export FOXML archival objects from a Fedora repository given a list of PIDs. + ### Datastream Updater Script to inject a binary into an archival FOXML as base64 encoded data within a datastream. 
@@ -58,6 +61,16 @@ python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --pa Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory. Each file's name will be in the format `pid-DSID.xml`. +### FOXML Export +#### Command +```bash +python3 foxml_export.py --url=http://your-fedora-url:8080 --user=admin --pasword=secret --pid_file=./some_pids --output_dir=./output +``` +> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). + +#### Output +Exports all archival FOXML found in the associated PID file passed in through arguments to their own folder in `output_dir/FOXML`. + ### Datastream Updater #### Command ```bash diff --git a/scripts/datastream_export.py b/scripts/datastream_export.py index 90aa0bd..78ce767 100644 --- a/scripts/datastream_export.py +++ b/scripts/datastream_export.py @@ -4,7 +4,7 @@ import concurrent.futures import os import mimetypes -from utils import perform_http_request +from utils import perform_http_request, process_pid_file def parse_args(): @@ -24,9 +24,9 @@ def parse_args(): return parser.parse_args() -def fetch_data(dsid, base_url, user, password, output_dir, obj_id): +def fetch_data(dsid, base_url, user, password, output_dir, pid): """ - Fetches the datastream content for a given datastream ID (dsid) and object ID (obj_id) from a Fedora repository. + Fetches the datastream content for a given datastream ID (dsid) and PID from a Fedora repository. Args: dsid (str): The ID of the datastream to fetch. @@ -34,14 +34,14 @@ def fetch_data(dsid, base_url, user, password, output_dir, obj_id): user (str): The username for authentication. password (str): The password for authentication. output_dir (str): The directory where the fetched data will be saved. - obj_id (str): The ID of the object that contains the datastream. 
+ pid (str): The PID of the object that contains the datastream. Returns: bool: True if the datastream content was successfully fetched and saved, False otherwise. """ - obj_id = obj_id.replace("info:fedora/", "") - url = f"{base_url}/fedora/objects/{obj_id}/datastreams/{dsid}/content" - print(f"Downloading {dsid} for PID: {obj_id}") + pid = pid.replace("info:fedora/", "") + url = f"{base_url}/fedora/objects/{pid}/datastreams/{dsid}/content" + print(f"Downloading {dsid} for PID: {pid}") try: response = requests.get(url, auth=(user, password)) response.raise_for_status() @@ -49,49 +49,25 @@ def fetch_data(dsid, base_url, user, password, output_dir, obj_id): os.makedirs(dsid_dir, exist_ok=True) content_type = response.headers.get("Content-Type", "") extension = mimetypes.guess_extension(content_type) if content_type else "" - filename = f"{obj_id}-{dsid}{extension}" + filename = f"{pid}-{dsid}{extension}" with open(os.path.join(dsid_dir, filename), "wb") as f: f.write(response.content) - print(f"Successfully saved {filename}") + print(f"Successfully saved {filename}\n") return True except Exception as e: - print(f"Failed to fetch data for {obj_id}, error: {str(e)}") + print(f"Failed to fetch data for {pid}, error: {str(e)}\n") return False -def process_pid_file(filepath): - """ - Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. - Supports comments in the file using '#' character. - Replace '%3A' with ':' in PIDs. - - Args: - filepath (str): The path to the file containing PIDs. - - Returns: - list: A list of PIDs extracted from the file. 
- """ - pids = [] - with open(filepath, "r") as file: - for line in file: - line = line.strip() - if "#" in line: - line = line[: line.index("#")].strip() - if line: - line = line.replace("%3A", ":") - pids.append(line) - return pids - - def main(): args = parse_args() os.makedirs(args.output_dir, exist_ok=True) - object_ids = [] + pids = [] # If a PID file is provided, process the file to get the list of PIDs. if args.pid_file: - object_ids = process_pid_file(args.pid_file) + pids = process_pid_file(args.pid_file) else: query = f""" SELECT ?obj WHERE {{ @@ -105,11 +81,11 @@ def main(): """ result = perform_http_request(query, args.url, args.user, args.password) - object_ids.extend(result.strip().split("\n")[1:]) + pids.extend(result.strip().split("\n")[1:]) # Download metadata for each PID in parallel using ThreadPoolExecutor. with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm( - total=len(object_ids), desc="Downloading Metadata" + total=len(pids), desc="Downloading Metadata" ) as progress: futures = { executor.submit( @@ -119,18 +95,18 @@ def main(): args.user, args.password, args.output_dir, - obj_id, - ): obj_id - for obj_id in object_ids + pid, + ): pid + for pid in pids } for future in concurrent.futures.as_completed(futures): - obj_id = futures[future] + pid = futures[future] try: success = future.result() if success: progress.update(1) except Exception as exc: - print(f"{obj_id} generated an exception: {exc}") + print(f"{pid} generated an exception: {exc}") if __name__ == "__main__": diff --git a/scripts/foxml_export.py b/scripts/foxml_export.py new file mode 100644 index 0000000..5ee5a90 --- /dev/null +++ b/scripts/foxml_export.py @@ -0,0 +1,93 @@ +import argparse +import requests +from tqdm import tqdm +import concurrent.futures +import os +import mimetypes +from utils import process_pid_file + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Export metadata using SPARQL query and save as XML." 
+ ) + parser.add_argument("--url", required=True, help="Fedora base URL") + parser.add_argument("--user", required=True, help="Username for Fedora access") + parser.add_argument("--password", required=True, help="Password for Fedora access") + parser.add_argument( + "--output_dir", default="./output", help="Directory to save XML files" + ) + parser.add_argument( + "--pid_file", type=str, required=True, help="File containing PIDs to process" + ) + return parser.parse_args() + + +def fetch_foxml(base_url, user, password, output_dir, pid): + """ + Fetches the archival FOXML for a given PID from a Fedora repository. + + Args: + base_url (str): The base URL of the Fedora repository. + user (str): The username for authentication. + password (str): The password for authentication. + output_dir (str): The directory where the fetched data will be saved. + pid (str): The ID of the object that contains the datastream. + + Returns: + bool: True if the datastream content was successfully fetched and saved, False otherwise. 
+ """ + pid = pid.replace("info:fedora/", "") + url = f"{base_url}/fedora/objects/{pid}/export?context=archive" + print(f"Downloading FOXML for PID: {pid}") + try: + response = requests.get(url, auth=(user, password)) + response.raise_for_status() + foxml_dir = os.path.join(output_dir, "FOXML") + os.makedirs(foxml_dir, exist_ok=True) + content_type = response.headers.get("Content-Type", "") + extension = mimetypes.guess_extension(content_type) if content_type else "" + filename = f"{pid}-FOXML{extension}" + with open(os.path.join(foxml_dir, filename), "wb") as f: + f.write(response.content) + print(f"Successfully saved {filename}\n") + return True + except Exception as e: + print(f"Failed to fetch FOXML for {pid}, error: {str(e)}\n") + return False + + +def main(): + args = parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + pids = [] + + pids = process_pid_file(args.pid_file) + + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm( + total=len(pids), desc="Downloading FOXML" + ) as progress: + futures = { + executor.submit( + fetch_foxml, + args.url, + args.user, + args.password, + args.output_dir, + pid, + ): pid + for pid in pids + } + for future in concurrent.futures.as_completed(futures): + pid = futures[future] + try: + success = future.result() + if success: + progress.update(1) + except Exception as exc: + print(f"{pid} generated an exception: {exc}") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils.py b/scripts/utils.py index 4012fd7..17eeb86 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -2,6 +2,19 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV"): + """ + Perform an HTTP request to a specified endpoint URL with the given query. + + Args: + query (str): The SPARQL query to be executed. + endpoint_url (str): The URL of the SPARQL endpoint. + user (str): The username for authentication. + password (str): The password for authentication. 
+ output_format (str, optional): The desired format of the response. Defaults to "CSV". + + Returns: + str: The response text if the request is successful, None otherwise. + """ headers = {"Content-Type": "application/x-www-form-urlencoded"} payload = { "type": "tuples", @@ -22,3 +35,26 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV else: print(f"Error {response.status_code} while querying: {query}") return None + +def process_pid_file(filepath): + """ + Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. + Supports comments in the file using '#' character. + Replace '%3A' with ':' in PIDs. + + Args: + filepath (str): The path to the file containing PIDs. + + Returns: + list: A list of PIDs extracted from the file. + """ + pids = [] + with open(filepath, "r") as file: + for line in file: + line = line.strip() + if "#" in line: + line = line[: line.index("#")].strip() + if line: + line = line.replace("%3A", ":") + pids.append(line) + return pids From 6a2edb636c27df3586b7ee128782d81b9c49cc97 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:25:33 -0300 Subject: [PATCH 04/20] Address some old linting issues --- src/Plugin/migrate/process/Parse.php | 8 ++++---- src/Plugin/migrate/source/Foxml.php | 6 +++--- src/StreamWrapper/Foxml.php | 6 +++--- .../Fedora3/Element/AbstractStreamOffsetContent.php | 2 +- src/Utility/Fedora3/FoxmlParser.php | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Plugin/migrate/process/Parse.php b/src/Plugin/migrate/process/Parse.php index 7d84ba8..efa3b5f 100644 --- a/src/Plugin/migrate/process/Parse.php +++ b/src/Plugin/migrate/process/Parse.php @@ -2,14 +2,14 @@ namespace Drupal\foxml\Plugin\migrate\process; +use Drupal\Core\Plugin\ContainerFactoryPluginInterface; + +use Drupal\foxml\Utility\Fedora3\FoxmlParser; +use Drupal\foxml\Utility\Fedora3\Element\DigitalObject; use 
Drupal\migrate\ProcessPluginBase; use Drupal\migrate\MigrateExecutableInterface; use Drupal\migrate\Row; use Drupal\migrate\MigrateException; -use Drupal\foxml\Utility\Fedora3\FoxmlParser; -use Drupal\foxml\Utility\Fedora3\Element\DigitalObject; - -use Drupal\Core\Plugin\ContainerFactoryPluginInterface; use Symfony\Component\DependencyInjection\ContainerInterface; diff --git a/src/Plugin/migrate/source/Foxml.php b/src/Plugin/migrate/source/Foxml.php index 6d1f8d2..8521e7a 100644 --- a/src/Plugin/migrate/source/Foxml.php +++ b/src/Plugin/migrate/source/Foxml.php @@ -2,13 +2,13 @@ namespace Drupal\foxml\Plugin\migrate\source; +use Drupal\Core\StringTranslation\StringTranslationTrait; +use Drupal\Core\Plugin\ContainerFactoryPluginInterface; + use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface; use Drupal\migrate\Plugin\migrate\source\SourcePluginBase; use Drupal\migrate\Plugin\MigrationInterface; -use Drupal\Core\StringTranslation\StringTranslationTrait; -use Drupal\Core\Plugin\ContainerFactoryPluginInterface; - use Symfony\Component\DependencyInjection\ContainerInterface; /** diff --git a/src/StreamWrapper/Foxml.php b/src/StreamWrapper/Foxml.php index 601ab11..3599595 100644 --- a/src/StreamWrapper/Foxml.php +++ b/src/StreamWrapper/Foxml.php @@ -2,13 +2,13 @@ namespace Drupal\foxml\StreamWrapper; -use Drupal\foxml\Utility\Fedora3\DatastreamLowLevelAdapterInterface; -use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface; - use Drupal\Core\File\FileSystem; use Drupal\Core\StreamWrapper\LocalReadOnlyStream; use Drupal\Core\Url; +use Drupal\foxml\Utility\Fedora3\DatastreamLowLevelAdapterInterface; +use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface; + /** * FOXML stream wrapper. 
*/ diff --git a/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php b/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php index ede8e58..8b7f5b0 100644 --- a/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php +++ b/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php @@ -2,8 +2,8 @@ namespace Drupal\foxml\Utility\Fedora3\Element; -use Drupal\foxml\Utility\Fedora3\AbstractParser; use Drupal\foxml\StreamWrapper\Substream; +use Drupal\foxml\Utility\Fedora3\AbstractParser; /** * Abstract element handler for inline content. diff --git a/src/Utility/Fedora3/FoxmlParser.php b/src/Utility/Fedora3/FoxmlParser.php index 6d34b93..b23f559 100644 --- a/src/Utility/Fedora3/FoxmlParser.php +++ b/src/Utility/Fedora3/FoxmlParser.php @@ -2,9 +2,9 @@ namespace Drupal\foxml\Utility\Fedora3; +use Drupal\Core\Cache\CacheBackendInterface; use Drupal\Core\Lock\LockBackendInterface; use Drupal\foxml\Utility\Fedora3\Element\DigitalObject; -use Drupal\Core\Cache\CacheBackendInterface; /** * Foxml parser. 
From ec995590739519a27d08286e97ce09b09958dde5 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:28:08 -0300 Subject: [PATCH 05/20] A few more linting issues --- src/Plugin/migrate/process/Parse.php | 4 ++-- src/Plugin/migrate/source/Foxml.php | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Plugin/migrate/process/Parse.php b/src/Plugin/migrate/process/Parse.php index efa3b5f..e3fb2fb 100644 --- a/src/Plugin/migrate/process/Parse.php +++ b/src/Plugin/migrate/process/Parse.php @@ -4,12 +4,12 @@ use Drupal\Core\Plugin\ContainerFactoryPluginInterface; -use Drupal\foxml\Utility\Fedora3\FoxmlParser; use Drupal\foxml\Utility\Fedora3\Element\DigitalObject; +use Drupal\foxml\Utility\Fedora3\FoxmlParser; use Drupal\migrate\ProcessPluginBase; +use Drupal\migrate\MigrateException; use Drupal\migrate\MigrateExecutableInterface; use Drupal\migrate\Row; -use Drupal\migrate\MigrateException; use Symfony\Component\DependencyInjection\ContainerInterface; diff --git a/src/Plugin/migrate/source/Foxml.php b/src/Plugin/migrate/source/Foxml.php index 8521e7a..77425aa 100644 --- a/src/Plugin/migrate/source/Foxml.php +++ b/src/Plugin/migrate/source/Foxml.php @@ -2,12 +2,12 @@ namespace Drupal\foxml\Plugin\migrate\source; -use Drupal\Core\StringTranslation\StringTranslationTrait; use Drupal\Core\Plugin\ContainerFactoryPluginInterface; +use Drupal\Core\StringTranslation\StringTranslationTrait; use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface; -use Drupal\migrate\Plugin\migrate\source\SourcePluginBase; use Drupal\migrate\Plugin\MigrationInterface; +use Drupal\migrate\Plugin\migrate\source\SourcePluginBase; use Symfony\Component\DependencyInjection\ContainerInterface; From e4f8136ad3ef54d750d4ee4ad3783ce333d0eef4 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:29:00 -0300 Subject: [PATCH 06/20] Move 
SourcePluginBase --- src/Plugin/migrate/source/Foxml.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Plugin/migrate/source/Foxml.php b/src/Plugin/migrate/source/Foxml.php index 77425aa..27fff67 100644 --- a/src/Plugin/migrate/source/Foxml.php +++ b/src/Plugin/migrate/source/Foxml.php @@ -6,8 +6,8 @@ use Drupal\Core\StringTranslation\StringTranslationTrait; use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface; -use Drupal\migrate\Plugin\MigrationInterface; use Drupal\migrate\Plugin\migrate\source\SourcePluginBase; +use Drupal\migrate\Plugin\MigrationInterface; use Symfony\Component\DependencyInjection\ContainerInterface; From c9bc7b4c167b344bc5edfeda5d298d9a80041536 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:30:08 -0300 Subject: [PATCH 07/20] Alphabetical imports --- src/Plugin/migrate/process/Parse.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Plugin/migrate/process/Parse.php b/src/Plugin/migrate/process/Parse.php index e3fb2fb..61aa5f3 100644 --- a/src/Plugin/migrate/process/Parse.php +++ b/src/Plugin/migrate/process/Parse.php @@ -6,9 +6,9 @@ use Drupal\foxml\Utility\Fedora3\Element\DigitalObject; use Drupal\foxml\Utility\Fedora3\FoxmlParser; -use Drupal\migrate\ProcessPluginBase; use Drupal\migrate\MigrateException; use Drupal\migrate\MigrateExecutableInterface; +use Drupal\migrate\ProcessPluginBase; use Drupal\migrate\Row; use Symfony\Component\DependencyInjection\ContainerInterface; From cdb3f626663473001374914ac2096a829ec09de4 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:44:49 -0300 Subject: [PATCH 08/20] Remove pyyaml from requirements.txt --- scripts/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index cbf27c6..3d9bd8c 100644 --- a/scripts/requirements.txt +++ 
b/scripts/requirements.txt @@ -1,5 +1,4 @@ ### Non Version Specific Requirements -pyyaml requests tqdm bs4 From 604a5a5315663148041db5de6ede7ee072672ec4 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:46:08 -0300 Subject: [PATCH 09/20] Add newline according to Black formatter --- scripts/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/utils.py b/scripts/utils.py index 17eeb86..d99e822 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -36,6 +36,7 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV print(f"Error {response.status_code} while querying: {query}") return None + def process_pid_file(filepath): """ Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. From 2f0d97e4dc9b267f7655e38514510a04804af277 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:44:28 -0300 Subject: [PATCH 10/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index afbe2c9..1e25837 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -8,7 +8,7 @@ Tools to analyse and export metadata from an FCREPO3 instance using Python scrip * [Usage](#usage) ## Setup -These tools are designed to be run with a Python environment. Ensure Python 3.6 or higher is installed on your system. You will need to set up a Python virtual environment and install the required packages: +These tools are designed to be run with a Python environment. Ensure Python 3.6 or higher is installed on your system; you can check the version with `python3 --version`. 
You will need to set up a Python virtual environment and install the required packages; this can be done using these commands within this 'scripts' directory: ```bash python3 -m venv venv From 5b80bd0dfb4c341c5663038f1acad686102d4bd7 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:44:46 -0300 Subject: [PATCH 11/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 1e25837..1c011c1 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -45,7 +45,7 @@ Script to inject a binary into an archival FOXML as base64 encoded data within a ### Metadata Analysis #### Command ```bash -python3 data_analysis.py --url=http://your-fedora-url --user=admin --password=secret --output_dir=./results +python3 data_analysis.py --url= --user= --password= --output_dir=<./results> ``` #### Output Exports all queries found in `queries.py` to their own CSV in the `results` folder by default. Can be changed with the `--output_dir` flag.
From d4fe7d53e63391abdbcea22e5621492d56b57e83 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:44:54 -0300 Subject: [PATCH 12/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 1c011c1..251d861 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -53,7 +53,7 @@ Exports all queries found in `queries.py` to their own CSV in the `results` fold ### Metadata Export #### Command ```bash -python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --password=secret --dsid=DSID --output_dir=./output --pid_file=./some_pids +python3 datastream_export.py --url= --user= --password= --dsid= --output_dir=<./output> --pid_file=<./some_pids> ``` > The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). From 20db80ddfcab35e08532d94e18623d6cadb719ae Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:45:01 -0300 Subject: [PATCH 13/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 251d861..5f5558e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -64,7 +64,7 @@ Each file's name will be in the format `pid-DSID.xml`. ### FOXML Export #### Command ```bash -python3 foxml_export.py --url=http://your-fedora-url:8080 --user=admin --pasword=secret --pid_file=./some_pids --output_dir=./output +python3 foxml_export.py --url= --user= --pasword= --pid_file=<./some_pids_to_export> --output_dir=<./output> ``` > The script supports adding comments in the pid_file using `#`. 
PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). From ee1f43f7343d9e4b5d2f54f780d5f3633167741b Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:45:13 -0300 Subject: [PATCH 14/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 5f5558e..5359edb 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -55,7 +55,8 @@ Exports all queries found in `queries.py` to their own CSV in the `results` fold ```bash python3 datastream_export.py --url= --user= --password= --dsid= --output_dir=<./output> --pid_file=<./some_pids> ``` -> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). +> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). Expected format of the `pid_file` is one PID per line. +If `--pid_file` isn't specified, the script will do a query intended to get a list of all pids in the system and export all of them. #### Output Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory. From f9e66904d026c51731e1d94f50bd2a17b03a8d66 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:45:31 -0300 Subject: [PATCH 15/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 5359edb..ef28730 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -67,7 +67,7 @@ Each file's name will be in the format `pid-DSID.xml`. 
```bash python3 foxml_export.py --url= --user= --pasword= --pid_file=<./some_pids_to_export> --output_dir=<./output> ``` -> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). +> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded). Expected format of the `pid_file` is one PID per line. #### Output Exports all archival FOXML found in the associated PID file passed in through arguments to their own folder in `output_dir/FOXML`. From c613dc42898964b535d7fa3400a991482a0efac6 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:33:25 -0300 Subject: [PATCH 16/20] Update README --- scripts/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index ef28730..69ca502 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -84,3 +84,6 @@ The only non-required argument is `label` which is in the case if you want to sp #### Output Updates the specified XML file with a new version of the datastream, encoding the provided binary content into base64. The updated XML is saved to the specified output file. +## Known Issues: +* `datastream_updater.py` is very finicky and will probably fail on most FOXML objects. + * The eventual intention with this script is to update it using `xmltodict`, and simplify it even more. Most of its current issues derive from XML namespaces.
From 5300afeee14b3121e5b36c5bb54e95b9dc78d96c Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Thu, 2 May 2024 11:36:51 -0300 Subject: [PATCH 17/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 69ca502..23370a9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -33,7 +33,7 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr - MIME type distribution ### Metadata Export -Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. +Script to export all objects (or a specified list of PIDs) within the repository that contain a specified metadata datastream ID, saving results as XML. ### FOXML Export Script to export FOXML archival objects from a Fedora repository given a list of PIDs. From 42f403dccfbf6b6b5c5919a4b395f9d03c576eea Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Thu, 2 May 2024 11:36:58 -0300 Subject: [PATCH 18/20] Update scripts/README.md Co-authored-by: JojoVes --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 23370a9..a625cd9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -75,7 +75,7 @@ Exports all archival FOXML found in the associated PID file passed in through ar ### Datastream Updater #### Command ```bash -python3 datastream_updater.py --xml=input.xml --dsid=DSID --content=content.bin --label='New Version' --output=output.xml +python3 datastream_updater.py --xml= --dsid= --content= --label=<'New Version'> --output= ``` > This script allows you to specify the XML file to modify, the datastream ID, the binary content file (which will be base64 encoded), and optionally a label for the new datastream version. 
From a2b50607aec82ed4cdbf868d644079f92e3a2fcb Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Thu, 2 May 2024 14:51:42 -0300 Subject: [PATCH 19/20] Update some queries. --- scripts/queries.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/queries.py b/scripts/queries.py index 7e9571d..76f3336 100644 --- a/scripts/queries.py +++ b/scripts/queries.py @@ -18,7 +18,7 @@ """, "active_deleted_count": """ - SELECT (COUNT(?activeObj) AS ?ActiveCount) (COUNT(?deletedObj) AS ?DeletedCount) + SELECT (COUNT(?activeObj) AS ?active) (COUNT(?deletedObj) AS ?deleted) (COUNT(?inactiveObj) AS ?inactive) FROM <#ri> WHERE { { @@ -31,6 +31,11 @@ WHERE { ?deletedObj . } + } UNION { + SELECT ?inactiveObj + WHERE { + ?inactiveObj . + } } } """, @@ -43,6 +48,14 @@ } """, + "inactive_objects": """ + SELECT ?obj + FROM <#ri> + WHERE { + ?obj + } + """, + "datastream_distribution": """ SELECT ?datastream (COUNT(?datastream) as ?count) FROM <#ri> @@ -69,7 +82,8 @@ SELECT ?collection (COUNT(?obj) as ?count) FROM <#ri> WHERE { - ?obj ?collection; + ?obj ?collection . + ?collection } GROUP BY ?collection """, From a10f682adbf48b285b173255aea5ccc49d97a0f2 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Thu, 2 May 2024 14:54:08 -0300 Subject: [PATCH 20/20] Provide clarity on the metadata analysis scripts. --- scripts/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index a625cd9..f9f0a69 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -32,6 +32,10 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr - List of orphaned objects - MIME type distribution + Before running this as a script, you might want to verify that the queries provided in `queries.py` are compatible with the system you are querying. 
If the system has Mulgara instead of Blazegraph, it would be restricted to SPARQL 1.0; to check what features would not be available for SPARQL 1.0, you can see a list of new features added in SPARQL 1.1 at the bottom of https://www.w3.org/TR/sparql11-query/. + +If you find that the system you are querying has some relationships outside of the ones covered in these queries by default, you will need to modify the relevant queries to get an accurate analysis. For example, sometimes there are relationships that have different capitalization or typos; these relationships need to be accounted for in this analysis phase for completeness and accuracy of the analysis as well as ensuring they get mapped appropriately in the actual migration. + ### Metadata Export Script to export all objects (or a specified list of PIDs) within the repository that contain a specified metadata datastream ID, saving results as XML.