diff --git a/scripts/.gitignore b/scripts/.gitignore
new file mode 100644
index 0000000..b0df7e8
--- /dev/null
+++ b/scripts/.gitignore
@@ -0,0 +1,37 @@
+# Python
+*.py[cod]
+__pycache__/
+venv/
+*.pyc
+
+# Jupyter Notebook
+.ipynb_checkpoints/
+
+# IDEs
+.idea/
+.vscode/
+
+# Compiled files
+*.pyd
+*.pyo
+*.pyw
+*.pyz
+*.pyzw
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+
+# Environment
+.env
+.env.*
+
+# Script Outputs
+results/*
+output/*
+*.xml
+
+# Etc
+.DS_Store
diff --git a/scripts/README.md b/scripts/README.md
index 8b37736..f9f0a69 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,56 +1,93 @@
 # FCREPO3 Analysis Helpers
-
 ## Introduction
-Scripts to analyse and export metadata from an FCREPO3 instance.
+Tools to analyse and export metadata from an FCREPO3 instance using Python scripts.
 
 ## Table of Contents
-
 * [Setup](#setup)
 * [Features](#features)
 * [Usage](#usage)
 
 ## Setup
+These tools are designed to be run with a Python environment. Ensure Python 3.6 or higher is installed on your system; you can check the version with `python3 --version`. You will need to set up a Python virtual environment and install the required packages; this can be done by running the following commands within this 'scripts' directory:
-These scripts require an FCREPO3 instance to be run over. In the event, these scripts are run on a separate system from where
-the repository lives, modifications may be required to the `fedora-xacml-policies` directory located at `$FEDORA_HOME/data/fedora-xacml-policies`.
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
-The metadata export command requires [GNU Parallel](https://www.gnu.org/software/parallel/parallel.html) to be installed
-for faster processing.
+The scripts also require an FCREPO3 instance. If these tools are run on a system separate from where the repository is hosted, modifications might be necessary in the `fedora-xacml-policies` directory at `$FEDORA_HOME/data/fedora-xacml-policies`.
 
 ## Features
-
 ### Metadata Analysis
-A script to generate the following:
-1. A total count of all objects in the repository.
-2. A breakdown of objects by content models and their count in CSV form (`models.csv`).
-3. A breakdown of unique datastream IDs and their count in CSV form (`dsids.csv`).
+Script to run SPARQL queries against an FCREPO's RI and gather information. Current queries include:
+ - Content model distribution
+ - Total object count
+ - Count of active and deleted objects
+ - List of deleted objects
+ - Datastream distribution
+ - Owner distribution
+ - Collection distribution
+ - List of relationships
+ - List of orphaned objects
+ - MIME type distribution
+
+Before running this as a script, you might want to verify that the queries provided in `queries.py` are compatible with the system you are querying. If the system has Mulgara instead of Blazegraph, it will be restricted to SPARQL 1.0; to check which features are unavailable in SPARQL 1.0, see the list of new features added in SPARQL 1.1 at the bottom of https://www.w3.org/TR/sparql11-query/.
+
+If you find that the system you are querying has some relationships outside of the ones covered in these queries by default, you will need to modify the relevant queries to get an accurate analysis. For example, relationships sometimes appear with inconsistent capitalization or typos; these need to be accounted for during the analysis phase, both for completeness and accuracy and to ensure they are mapped appropriately in the actual migration.
 
 ### Metadata Export
-A script to export all objects within the repository that contain a specified metadata datastream ID.
+Script to export all objects (or a specified list of PIDs) within the repository that contain a specified metadata datastream ID, saving results as XML.
 
-## Usage
+### FOXML Export
+Script to export FOXML archival objects from a Fedora repository given a list of PIDs.
+
+### Datastream Updater
+Script to inject a binary into an archival FOXML as base64-encoded data within a datastream.
+## Usage
 ### Metadata Analysis
 #### Command
 ```bash
-sudo bash /path_to_the_module/scripts/metadata_analysis.sh --fedora_pass=the_password
+python3 data_analysis.py --url= --user= --password= --output_dir=<./results>
 ```
-
 #### Output
+Exports the results of each query found in `queries.py` to its own CSV file, saved to the `results` folder by default. This can be changed with the `--output_dir` flag.
+
+### Metadata Export
+#### Command
+```bash
+python3 datastream_export.py --url= --user= --password= --dsid= --output_dir=<./output> --pid_file=<./some_pids>
 ```
-The total number of objects is 40.
-Outputted model breakdown to CSV (models.csv).
-Outputted DSID breakdown to CSV (dsids.csv).
+> The script supports comments in the pid_file using `#`. PIDs can also contain URL-encoded characters (e.g., `%3A` for `:`), which will be automatically decoded. The expected format of the `pid_file` is one PID per line.
+If `--pid_file` isn't specified, the script will run a query intended to list all PIDs in the system and export all of them.
+
+#### Output
+Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory.
+Each file's name will be in the format `pid-DSID.xml`.
+
+### FOXML Export
+#### Command
+```bash
+python3 foxml_export.py --url= --user= --password= --pid_file=<./some_pids_to_export> --output_dir=<./output>
 ```
+> The script supports comments in the pid_file using `#`. PIDs can also contain URL-encoded characters (e.g., `%3A` for `:`), which will be automatically decoded. The expected format of the `pid_file` is one PID per line.
 
-### Metadata Export
+#### Output
+Exports the archival FOXML for each PID in the supplied PID file to `output_dir/FOXML`.
+
+### Datastream Updater
 #### Command
 ```bash
-sudo bash shell_scripts/export_metadata.sh --fedora_pass=the_password --skip_auth_check
+python3 datastream_updater.py --xml= --dsid= --content= --label=<'New Version'> --output=
 ```
+> This script allows you to specify the XML file to modify, the datastream ID, the binary content file (which will be base64-encoded), and optionally a label for the new datastream version.
-> Utilizing the `--skip_auth_check` flag here is an important performance optimization as it will greatly speed up the
-export operation due to not needing to validate the request prior.
+The only optional argument is `label`, for when you want to specify a custom label. If previous datastream versions do not have a label and you did not specify one in the arguments, the script will prompt you for one.
 
 #### Output
-The command does not output anything but will export all objects in the form of `the:pid-DSID.xml`.
+Updates the specified XML file with a new version of the datastream, encoding the provided binary content into base64. The updated XML is saved to the specified output file.
+
+## Known Issues:
+* `datastream_updater.py` is very finicky and will probably fail on most FOXML objects.
+  * The eventual intention is to rewrite this script using `xmltodict` and simplify it further; most of its current issues derive from XML namespaces.
diff --git a/scripts/data_analysis.py b/scripts/data_analysis.py
new file mode 100644
index 0000000..a71af10
--- /dev/null
+++ b/scripts/data_analysis.py
@@ -0,0 +1,55 @@
+import argparse
+import os
+from utils import perform_http_request
+from queries import queries
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Process SPARQL queries and save results."
+    )
+    parser.add_argument("--url", type=str, help="Fedora server URL", required=True)
+    parser.add_argument("--user", type=str, help="Fedora username", required=True)
+    parser.add_argument("--password", type=str, help="Fedora password", required=True)
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./results",
+        help="Directory to save CSV files",
+    )
+    return parser.parse_args()
+
+
+def save_to_csv(data, filename, output_dir):
+    """
+    Save the given data to a CSV file.
+
+    Args:
+        data (str): The data to be written to the CSV file.
+        filename (str): The name of the CSV file.
+        output_dir (str): The directory where the CSV file will be saved.
+
+    Returns:
+        None
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    with open(os.path.join(output_dir, filename), "w", newline="") as file:
+        file.write(data)
+
+
+def main():
+    args = parse_args()
+
+    for query_name, query in queries.items():
+        print(f"Processing query '{query_name}'...")
+        result = perform_http_request(query, args.url, args.user, args.password)
+        if result:
+            csv_filename = f"{query_name}.csv"
+            print(f"Saving results to {csv_filename}...\n")
+            save_to_csv(result, csv_filename, args.output_dir)
+        else:
+            print(f"Failed to retrieve data for query '{query_name}'.\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/datastream_export.py b/scripts/datastream_export.py
new file mode 100644
index 0000000..78ce767
--- /dev/null
+++ b/scripts/datastream_export.py
@@ -0,0 +1,114 @@
+import argparse
+import requests
+from tqdm import tqdm
+import concurrent.futures
+import os
+import mimetypes
+from utils import perform_http_request, process_pid_file
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Export metadata using SPARQL query and save as XML."
+    )
+    parser.add_argument("--url", required=True, help="Fedora base URL")
+    parser.add_argument("--user", required=True, help="Username for Fedora access")
+    parser.add_argument("--password", required=True, help="Password for Fedora access")
+    parser.add_argument("--dsid", required=True, help="Datastream ID for querying")
+    parser.add_argument(
+        "--output_dir", default="./output", help="Directory to save XML files"
+    )
+    parser.add_argument(
+        "--pid_file", type=str, help="File containing PIDs to process", required=False
+    )
+    return parser.parse_args()
+
+
+def fetch_data(dsid, base_url, user, password, output_dir, pid):
+    """
+    Fetches the datastream content for a given datastream ID (dsid) and PID from a Fedora repository.
+
+    Args:
+        dsid (str): The ID of the datastream to fetch.
+        base_url (str): The base URL of the Fedora repository.
+        user (str): The username for authentication.
+        password (str): The password for authentication.
+        output_dir (str): The directory where the fetched data will be saved.
+        pid (str): The PID of the object that contains the datastream.
+
+    Returns:
+        bool: True if the datastream content was successfully fetched and saved, False otherwise.
+    """
+    pid = pid.replace("info:fedora/", "")
+    url = f"{base_url}/fedora/objects/{pid}/datastreams/{dsid}/content"
+    print(f"Downloading {dsid} for PID: {pid}")
+    try:
+        response = requests.get(url, auth=(user, password))
+        response.raise_for_status()
+        dsid_dir = os.path.join(output_dir, dsid)
+        os.makedirs(dsid_dir, exist_ok=True)
+        content_type = response.headers.get("Content-Type", "")
+        extension = mimetypes.guess_extension(content_type) if content_type else ""
+        filename = f"{pid}-{dsid}{extension}"
+        with open(os.path.join(dsid_dir, filename), "wb") as f:
+            f.write(response.content)
+        print(f"Successfully saved {filename}\n")
+        return True
+    except Exception as e:
+        print(f"Failed to fetch data for {pid}, error: {str(e)}\n")
+        return False
+
+
+def main():
+    args = parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    pids = []
+
+    # If a PID file is provided, process the file to get the list of PIDs.
+    if args.pid_file:
+        pids = process_pid_file(args.pid_file)
+    else:
+        query = f"""
+        SELECT ?obj WHERE {{
+            ?obj ;
+            ?model;
+            ?ds.
+            ?ds
+            FILTER(!sameTerm(?model, ))
+            FILTER(!sameTerm(?model, ))
+        }}
+        """
+
+        result = perform_http_request(query, args.url, args.user, args.password)
+        if result:
+            pids.extend(result.strip().split("\n")[1:])
+
+    # Download metadata for each PID in parallel using ThreadPoolExecutor.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
+        total=len(pids), desc="Downloading Metadata"
+    ) as progress:
+        futures = {
+            executor.submit(
+                fetch_data,
+                args.dsid,
+                args.url,
+                args.user,
+                args.password,
+                args.output_dir,
+                pid,
+            ): pid
+            for pid in pids
+        }
+        for future in concurrent.futures.as_completed(futures):
+            pid = futures[future]
+            try:
+                success = future.result()
+                if success:
+                    progress.update(1)
+            except Exception as exc:
+                print(f"{pid} generated an exception: {exc}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/datastream_updater.py b/scripts/datastream_updater.py
new file mode 100644
index 0000000..0b2e6cb
--- /dev/null
+++ b/scripts/datastream_updater.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+
+import base64
+import datetime
+import argparse
+from lxml import etree as ET
+from lxml.etree import QName
+import mimetypes
+import logging
+
+# Setting up basic logging
+logging.basicConfig(level=logging.INFO)
+
+
+def format_xml_element(element, level=0, indent=" "):
+    """
+    Formats an XML element by adding appropriate spacing and indentation.
+
+    Args:
+        element (Element): The XML element to format.
+        level (int, optional): The current level of indentation. Defaults to 0.
+        indent (str, optional): The string used for indentation. Defaults to " ".
+
+    Returns:
+        None
+    """
+    spacing = "\n" + level * indent
+
+    if len(element):
+        if not element.text or not element.text.strip():
+            element.text = spacing + indent
+        if not element.tail or not element.tail.strip():
+            element.tail = spacing
+        for child in element:
+            format_xml_element(child, level + 1, indent)
+    else:
+        if level and (not element.tail or not element.tail.strip()):
+            element.tail = spacing
+
+
+def compress_and_encode(file_path):
+    """
+    Compresses and encodes the binary data from the given file path.
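+
+    Note: despite the function's name, no compression is applied; the data is only base64-encoded.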
+
+    Args:
+        file_path (str): The path to the file containing the binary data.
+
+    Returns:
+        tuple: A tuple containing the indented base64-encoded data and the original size of the binary data.
+    """
+    with open(file_path, "rb") as f_in:
+        binary_data = f_in.read()
+        original_size = len(binary_data)
+        base64_data = base64.b64encode(binary_data)
+        base64_lines = [
+            base64_data[i : i + 80].decode("utf-8")
+            for i in range(0, len(base64_data), 80)
+        ]
+        indented_base64 = "\n ".join(base64_lines)
+    return indented_base64, original_size
+
+
+def register_namespaces(xml_path):
+    """
+    Registers XML namespaces from the given XML file.
+
+    Args:
+        xml_path (str): The path to the XML file.
+
+    Raises:
+        Exception: If there is an error registering the namespaces.
+    """
+    try:
+        namespaces = dict(
+            [node for _, node in ET.iterparse(xml_path, events=["start-ns"])]
+        )
+        for ns in namespaces:
+            ET.register_namespace(ns, namespaces[ns])
+    except Exception as e:
+        logging.error(f"Error registering namespaces: {e}")
+        raise
+
+
+def add_datastream_version(
+    xml_path, dsid, base64_data, original_size, mimetype, label=None
+):
+    """
+    Adds a new version of a datastream to an XML file.
+
+    Args:
+        xml_path (str): The path to the XML file.
+        dsid (str): The ID of the datastream.
+        base64_data (str): The base64-encoded content of the datastream.
+        original_size (int): The original size of the datastream in bytes.
+        mimetype (str): The MIME type of the datastream.
+        label (str, optional): The label for the datastream version. If not provided, a default label will be used.
+
+    Returns:
+        str: The XML string with the new datastream version added.
+
+    Raises:
+        ET.ParseError: If there is an error parsing the XML file.
+        Exception: If there is an error creating the XML string.
+    """
+    try:
+        root = ET.parse(xml_path).getroot()
+    except ET.ParseError as e:
+        logging.exception(f"XML parsing error: {e}")
+        return
+
+    nsmap = {
+        "foxml": "info:fedora/fedora-system:def/foxml#",
+        "xsi": "http://www.w3.org/2001/XMLSchema-instance",
+        "audit": "info:fedora/fedora-system:def/audit#",
+        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "fedora": "info:fedora/fedora-system:def/relations-external#",
+        "fedora-model": "info:fedora/fedora-system:def/model#",
+        "islandora": "http://islandora.ca/ontology/relsext#",
+    }
+
+    # Have to use qualified names when creating an element.
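+    # e.g. QName("info:fedora/fedora-system:def/foxml#", "datastreamVersion") yields the
+    # fully-qualified tag "{info:fedora/fedora-system:def/foxml#}datastreamVersion".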
+    ds_version_tag = QName(nsmap["foxml"], "datastreamVersion")
+    binary_content_tag = QName(nsmap["foxml"], "binaryContent")
+
+    datastream = root.find(f".//foxml:datastream[@ID='{dsid}']", namespaces=nsmap)
+    if datastream is None:
+        logging.warning(f"Datastream with ID of {dsid} does not exist.")
+        return
+
+    if label is None:
+        datastream_version = datastream.find(
+            ".//foxml:datastreamVersion[last()]", namespaces=nsmap
+        )
+        label = (
+            datastream_version.get("LABEL")
+            if datastream_version is not None
+            else "default_label"
+        )
+
+    new_id = "{}.{}".format(
+        dsid, len(datastream.findall(".//foxml:datastreamVersion", namespaces=nsmap))
+    )
+    datastream_version = ET.SubElement(
+        datastream,
+        ds_version_tag,
+        {
+            "ID": new_id,
+            "LABEL": label,
+            "MIMETYPE": mimetype,
+            "SIZE": str(original_size),
+        },
+    )
+
+    dt = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
+    datastream_version.set("CREATED", dt)
+
+    binary_content = ET.SubElement(datastream_version, binary_content_tag)
+    binary_content.text = "\n " + base64_data + "\n "
+
+    try:
+        ET.indent(root, space=" ")
+        format_xml_element(root)
+        xml_string = ET.tostring(
+            root, encoding="utf-8", method="xml", xml_declaration=True
+        )
+    except Exception as e:
+        logging.exception(f"Error creating XML string: {e}")
+        raise
+
+    return xml_string
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--xml", help="path to the XML file to modify", required=True)
+    parser.add_argument("--dsid", help="ID of the datastream to modify", required=True)
+    parser.add_argument(
+        "--content",
+        help="path to the binary content to add as a new datastreamVersion",
+        required=True,
+    )
+    parser.add_argument("--label", help="label of the new datastream version")
+    parser.add_argument("--output", help="path to the output XML file", required=True)
+    args = parser.parse_args()
+
+    try:
+        mimetype, _ = mimetypes.guess_type(args.content)
+        mimetype = mimetype or "application/octet-stream"
+
+        base64_data, original_size = compress_and_encode(args.content)
+        register_namespaces(args.xml)
+        updated_xml = add_datastream_version(
+            args.xml, args.dsid, base64_data, original_size, mimetype, args.label
+        )
+
+        if updated_xml:
+            with open(args.output, "w") as f_out:
+                f_out.write(updated_xml.decode("utf-8"))
+    except Exception as e:
+        logging.exception(f"Error in script execution: {e}")
diff --git a/scripts/export_metadata.sh b/scripts/export_metadata.sh
deleted file mode 100644
index 1ed4fbc..0000000
--- a/scripts/export_metadata.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-SCRIPT_DIR=$(dirname $(readlink -f $0))
-
-. "$SCRIPT_DIR/util.in"
-
-# Ensure GNU Parallel is installed.
-if ! type parallel >/dev/null 2>&1 ; then
-  printf "*****************************************************\n"
-  printf "* Error: GNU Parallel is not installed. *\n"
-  printf "*****************************************************\n"
-  exit 1
-fi
-
-DSID_QUERY=$(cat << EOQ
-SELECT ?obj
-WHERE {
-  ?obj ;
-  ?model ;
-  ?ds .
-  ?ds
-FILTER(!sameTerm(?model, ))
-FILTER(!sameTerm(?model, ))
-}
-EOQ
-)
-
-# Go perform the query for all objects and pass off to parallel to do the heavy lifting. To note the URI here chops off
-# the info:fedora/ piece from the front with Perl.
-# @see: https://www.gnu.org/software/parallel/parallel_tutorial.html#perl-expression-replacement-string
-do_curl "${DSID_QUERY}" "CSV" | parallel --jobs 3 --skip-first-line curl --location "${FEDORA_URL}:8080/fedora/objects/{= s/info:fedora\/// =}/datastreams/${DSID}/content" \
--u "${FEDORA_USER}:${FEDORA_PASS}" \
--o "${SCRIPT_DIR}/{= s/info:fedora\/// =}-${DSID}.xml"
diff --git a/scripts/foxml_export.py b/scripts/foxml_export.py
new file mode 100644
index 0000000..5ee5a90
--- /dev/null
+++ b/scripts/foxml_export.py
@@ -0,0 +1,91 @@
+import argparse
+import requests
+from tqdm import tqdm
+import concurrent.futures
+import os
+import mimetypes
+from utils import process_pid_file
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Export archival FOXML for a list of PIDs and save as XML."
+    )
+    parser.add_argument("--url", required=True, help="Fedora base URL")
+    parser.add_argument("--user", required=True, help="Username for Fedora access")
+    parser.add_argument("--password", required=True, help="Password for Fedora access")
+    parser.add_argument(
+        "--output_dir", default="./output", help="Directory to save XML files"
+    )
+    parser.add_argument(
+        "--pid_file", type=str, required=True, help="File containing PIDs to process"
+    )
+    return parser.parse_args()
+
+
+def fetch_foxml(base_url, user, password, output_dir, pid):
+    """
+    Fetches the archival FOXML for a given PID from a Fedora repository.
+
+    Args:
+        base_url (str): The base URL of the Fedora repository.
+        user (str): The username for authentication.
+        password (str): The password for authentication.
+        output_dir (str): The directory where the fetched data will be saved.
+        pid (str): The PID of the object to export.
+
+    Returns:
+        bool: True if the FOXML was successfully fetched and saved, False otherwise.
+    """
+    pid = pid.replace("info:fedora/", "")
+    url = f"{base_url}/fedora/objects/{pid}/export?context=archive"
+    print(f"Downloading FOXML for PID: {pid}")
+    try:
+        response = requests.get(url, auth=(user, password))
+        response.raise_for_status()
+        foxml_dir = os.path.join(output_dir, "FOXML")
+        os.makedirs(foxml_dir, exist_ok=True)
+        content_type = response.headers.get("Content-Type", "")
+        extension = mimetypes.guess_extension(content_type) if content_type else ""
+        filename = f"{pid}-FOXML{extension}"
+        with open(os.path.join(foxml_dir, filename), "wb") as f:
+            f.write(response.content)
+        print(f"Successfully saved {filename}\n")
+        return True
+    except Exception as e:
+        print(f"Failed to fetch FOXML for {pid}, error: {str(e)}\n")
+        return False
+
+
+def main():
+    args = parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    pids = process_pid_file(args.pid_file)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
+        total=len(pids), desc="Downloading FOXML"
+    ) as progress:
+        futures = {
+            executor.submit(
+                fetch_foxml,
+                args.url,
+                args.user,
+                args.password,
+                args.output_dir,
+                pid,
+            ): pid
+            for pid in pids
+        }
+        for future in concurrent.futures.as_completed(futures):
+            pid = futures[future]
+            try:
+                success = future.result()
+                if success:
+                    progress.update(1)
+            except Exception as exc:
+                print(f"{pid} generated an exception: {exc}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/metadata_analysis.sh b/scripts/metadata_analysis.sh
deleted file mode 100644
index c4fe19b..0000000
--- a/scripts/metadata_analysis.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-SCRIPT_DIR=$(dirname $(readlink -f $0))
-
-. "$SCRIPT_DIR/util.in"
-
-# Base RI query to build things up.
-BASE_QUERY=$(cat << EOQ
-WHERE {
-  ?obj ;
-  ?model ;
-FILTER(!sameTerm(?model, ))
-FILTER(!sameTerm(?model, ))
-}
-EOQ
-)
-
-# Retrieves a list of content models and their count to CSV.
-model_breakdown() {
-  local QUERY=$(cat << EOQ
-SELECT ?model (COUNT(?model) as ?count)
-${BASE_QUERY}
-GROUP BY ?model
-EOQ
-)
-
-  do_curl "$QUERY" "CSV" > "$SCRIPT_DIR"/models.csv
-  echo "Outputted model breakdown to CSV (${SCRIPT_DIR}/models.csv)."
-}
-
-# Retrieves the total amount of objects in the repository.
-total_count() {
-  local QUERY=$(cat << EOQ
-SELECT ?obj
-${BASE_QUERY}
-EOQ
-)
-
-  local COUNT=$(do_curl "$QUERY" "count")
-  echo "The total number of objects is ${COUNT}."
-}
-
-# Breaks down the unique datastream IDs and their count to CSV.
-dsid_breakdown() {
-  local QUERY=$(cat << EOQ
-SELECT ?ds (COUNT(?ds) as ?count)
-WHERE {
-  ?obj ;
-  ?model ;
-  [ ?ds]
-FILTER(!sameTerm(?model, ))
-FILTER(!sameTerm(?model, ))
-}
-GROUP BY ?ds
-EOQ
-)
-
-  do_curl "$QUERY" "CSV" > "$SCRIPT_DIR"/dsids.csv
-  echo "Outputted DSID breakdown to CSV (${SCRIPT_DIR}/dsids.csv)."
-}
-
-total_count
-model_breakdown
-dsid_breakdown
-
-exit 0
diff --git a/scripts/queries.py b/scripts/queries.py
new file mode 100644
index 0000000..76f3336
--- /dev/null
+++ b/scripts/queries.py
@@ -0,0 +1,122 @@
+queries = {
+    "content_model_distribution": """
+        SELECT ?model (COUNT(?obj) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?obj ?model;
+        }
+        GROUP BY ?model
+    """,
+
+    "object_count": """
+        SELECT (COUNT(?obj) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?obj .
+        }
+        LIMIT 1
+    """,
+
+    "active_deleted_count": """
+        SELECT (COUNT(?activeObj) AS ?active) (COUNT(?deletedObj) AS ?deleted) (COUNT(?inactiveObj) AS ?inactive)
+        FROM <#ri>
+        WHERE {
+            {
+                SELECT ?activeObj
+                WHERE {
+                    ?activeObj .
+                }
+            } UNION {
+                SELECT ?deletedObj
+                WHERE {
+                    ?deletedObj .
+                }
+            } UNION {
+                SELECT ?inactiveObj
+                WHERE {
+                    ?inactiveObj .
+                }
+            }
+        }
+    """,
+
+    "deleted_objects": """
+        SELECT ?obj
+        FROM <#ri>
+        WHERE {
+            ?obj
+        }
+    """,
+
+    "inactive_objects": """
+        SELECT ?obj
+        FROM <#ri>
+        WHERE {
+            ?obj
+        }
+    """,
+
+    "datastream_distribution": """
+        SELECT ?datastream (COUNT(?datastream) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?obj ;
+            OPTIONAL {
+                ?obj ?c .
+                ?c ?datastream ;
+            }
+        }
+        GROUP BY ?datastream
+    """,
+
+    "owner_distribution": """
+        SELECT ?owner (COUNT(?obj) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?obj ?owner;
+        }
+        GROUP BY ?owner
+    """,
+
+    "collection_distribution": """
+        SELECT ?collection (COUNT(?obj) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?obj ?collection .
+            ?collection
+        }
+        GROUP BY ?collection
+    """,
+
+    "relationships": """
+        SELECT DISTINCT ?relationship
+        FROM <#ri>
+        WHERE {
+            ?o ?relationship ?s .
+            ?o
+        }
+    """,
+
+    "orphaned_objects": """
+        SELECT DISTINCT ?orphan
+        FROM <#ri>
+        WHERE {
+            ?orphan
+            FILTER NOT EXISTS {
+                ?orphan ?subject .
+            }
+            FILTER NOT EXISTS {
+                ?orphan ?subject .
+            }
+        }
+    """,
+
+    "mimetype_distribution": """
+        SELECT ?mimetype (COUNT(?mimetype) as ?count)
+        FROM <#ri>
+        WHERE {
+            ?o ?mimetype
+        }
+        GROUP BY ?mimetype
+    """
+}
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 0000000..3d9bd8c
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1,5 @@
+### Non Version Specific Requirements
+requests
+tqdm
+bs4
+lxml
diff --git a/scripts/util.in b/scripts/util.in
deleted file mode 100644
index de1db76..0000000
--- a/scripts/util.in
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-FEDORA_URL=http://localhost
-FEDORA_USER=fedoraAdmin
-SKIP_AUTH=false
-DSID=MODS
-
-print_help() {
-  cat <<-HELP
-This script is used to generate statistics or pull data from a FCREPO 3 instance for migration data analysis.
-The following arguments are provided:
-  --fedora_url [FCREPO3 URL] - Default ($FEDORA_URL)
-  --fedora_user [FCREPO3 User] - Default ($FEDORA_USER)
-  --fedora_pass [FCREPO3 Password] - REQUIRED
-  --skip_auth_check [Skips verification of credentials on a request] - Default (false)
-  --dsid [Metadata datastream ID to be used for metadata export] - Default ($DSID)
-HELP
-exit 0
-}
-
-
-# Helper to perform RI searches.
-do_curl() {
-curl --location --request POST "${FEDORA_URL}:8080/fedora/risearch" \
--u "${FEDORA_USER}:${FEDORA_PASS}" \
--s \
---header 'Content-Type: application/x-www-form-urlencoded' \
---data-urlencode "type=tuples" \
---data-urlencode 'lang=sparql' \
---data-urlencode "format=$2" \
---data-urlencode 'limit=' \
---data-urlencode 'dt=on' \
---data-urlencode "query=$1"
-}
-
-# Helper to ensure can authenticate to Fedora.
-check_auth() {
-CODE=$(curl --location "${FEDORA_URL}:8080/fedora/objects/fedora-system:ContentModel-3.0/export" \
--u "${FEDORA_USER}:${FEDORA_PASS}" \
--s \
--w '%{http_code}' \
--o /dev/null \
-)
-echo "${CODE}"
-}
-
-while [ "$#" -gt 0 ]; do
-  case "$1" in
-    --fedora_url=*)
-      FEDORA_URL="${1#*=}"
-      ;;
-    --fedora_user=*)
-      FEDORA_USER="${1#*=}"
-      ;;
-    --fedora_pass=*)
-      FEDORA_PASS="${1#*=}"
-      ;;
-    --skip_auth_check)
-      SKIP_AUTH=true
-      ;;
-    --dsid=*)
-      DSID="${1#*=}"
-      ;;
-    --help) print_help;;
-    *)
-      printf "************************************************************\n"
-      printf "* Error: Invalid argument, run --help for valid arguments. *\n"
-      printf "************************************************************\n"
-      exit 1
-  esac
-  shift
-done
-
-if [ -z "${FEDORA_PASS}" ]; then
-  printf "*****************************************************\n"
-  printf "* Error: A password for the Fedora user is required *\n"
-  printf "*****************************************************\n"
-  exit 1
-fi;
-
-if [ "${SKIP_AUTH}" = false ]; then
-  AUTH_CODE=$(check_auth)
-  if [ "${AUTH_CODE}" != 200 ]; then
-    printf "************************************************************\n"
-    printf "* Error: Authentication failed to Fedora. *\n"
-    printf "************************************************************\n"
-    exit 1
-  fi
-fi
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..d99e822
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,61 @@
+import requests
+
+
+def perform_http_request(query, endpoint_url, user, password, output_format="CSV"):
+    """
+    Perform an HTTP request to a specified endpoint URL with the given query.
+
+    Args:
+        query (str): The SPARQL query to be executed.
+        endpoint_url (str): The URL of the SPARQL endpoint.
+        user (str): The username for authentication.
+        password (str): The password for authentication.
+        output_format (str, optional): The desired format of the response. Defaults to "CSV".
+
+    Returns:
+        str: The response text if the request is successful, None otherwise.
+    """
+    headers = {"Content-Type": "application/x-www-form-urlencoded"}
+    payload = {
+        "type": "tuples",
+        "lang": "sparql",
+        "format": output_format,
+        "limit": "",
+        "dt": "on",
+        "query": query,
+    }
+    response = requests.post(
+        f"{endpoint_url}/fedora/risearch",
+        auth=(user, password),
+        headers=headers,
+        data=payload,
+    )
+    if response.status_code == 200:
+        return response.text
+    else:
+        print(f"Error {response.status_code} while querying: {query}")
+        return None
+
+
+def process_pid_file(filepath):
+    """
+    Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs.
+    Supports comments in the file using the '#' character.
+    Replaces '%3A' with ':' in PIDs.
+
+    Args:
+        filepath (str): The path to the file containing PIDs.
+
+    Returns:
+        list: A list of PIDs extracted from the file.
+    """
+    pids = []
+    with open(filepath, "r") as file:
+        for line in file:
+            line = line.strip()
+            if "#" in line:
+                line = line[: line.index("#")].strip()
+            if line:
+                line = line.replace("%3A", ":")
+                pids.append(line)
+    return pids
diff --git a/src/Plugin/migrate/process/Parse.php b/src/Plugin/migrate/process/Parse.php
index 7d84ba8..61aa5f3 100644
--- a/src/Plugin/migrate/process/Parse.php
+++ b/src/Plugin/migrate/process/Parse.php
@@ -2,14 +2,14 @@
 
 namespace Drupal\foxml\Plugin\migrate\process;
 
-use Drupal\migrate\ProcessPluginBase;
+use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
+
+use Drupal\foxml\Utility\Fedora3\Element\DigitalObject;
+use Drupal\foxml\Utility\Fedora3\FoxmlParser;
+use Drupal\migrate\MigrateException;
 use Drupal\migrate\MigrateExecutableInterface;
+use Drupal\migrate\ProcessPluginBase;
 use Drupal\migrate\Row;
-use Drupal\migrate\MigrateException;
-use Drupal\foxml\Utility\Fedora3\FoxmlParser;
-use Drupal\foxml\Utility\Fedora3\Element\DigitalObject;
-
-use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
 use Symfony\Component\DependencyInjection\ContainerInterface;
 
diff --git a/src/Plugin/migrate/source/Foxml.php b/src/Plugin/migrate/source/Foxml.php
index 6d1f8d2..27fff67 100644
--- a/src/Plugin/migrate/source/Foxml.php
+++ b/src/Plugin/migrate/source/Foxml.php
@@ -2,13 +2,13 @@
 
 namespace Drupal\foxml\Plugin\migrate\source;
 
+use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
+use Drupal\Core\StringTranslation\StringTranslationTrait;
+
 use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface;
 use Drupal\migrate\Plugin\migrate\source\SourcePluginBase;
 use Drupal\migrate\Plugin\MigrationInterface;
-use Drupal\Core\StringTranslation\StringTranslationTrait;
-use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
-
 use Symfony\Component\DependencyInjection\ContainerInterface;
 
 /**
diff --git a/src/StreamWrapper/Foxml.php b/src/StreamWrapper/Foxml.php
index 601ab11..3599595 100644
--- a/src/StreamWrapper/Foxml.php
+++ b/src/StreamWrapper/Foxml.php
@@ -2,13 +2,13 @@
 
 namespace Drupal\foxml\StreamWrapper;
 
-use Drupal\foxml\Utility\Fedora3\DatastreamLowLevelAdapterInterface;
-use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface;
-
 use Drupal\Core\File\FileSystem;
 use Drupal\Core\StreamWrapper\LocalReadOnlyStream;
 use Drupal\Core\Url;
 
+use Drupal\foxml\Utility\Fedora3\DatastreamLowLevelAdapterInterface;
+use Drupal\foxml\Utility\Fedora3\ObjectLowLevelAdapterInterface;
+
 /**
  * FOXML stream wrapper.
 */
diff --git a/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php b/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php
index ede8e58..8b7f5b0 100644
--- a/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php
+++ b/src/Utility/Fedora3/Element/AbstractStreamOffsetContent.php
@@ -2,8 +2,8 @@
 
 namespace Drupal\foxml\Utility\Fedora3\Element;
 
-use Drupal\foxml\Utility\Fedora3\AbstractParser;
 use Drupal\foxml\StreamWrapper\Substream;
+use Drupal\foxml\Utility\Fedora3\AbstractParser;
 
 /**
  * Abstract element handler for inline content.
diff --git a/src/Utility/Fedora3/FoxmlParser.php b/src/Utility/Fedora3/FoxmlParser.php
index 6d34b93..b23f559 100644
--- a/src/Utility/Fedora3/FoxmlParser.php
+++ b/src/Utility/Fedora3/FoxmlParser.php
@@ -2,9 +2,9 @@
 
 namespace Drupal\foxml\Utility\Fedora3;
 
+use Drupal\Core\Cache\CacheBackendInterface;
 use Drupal\Core\Lock\LockBackendInterface;
 use Drupal\foxml\Utility\Fedora3\Element\DigitalObject;
-use Drupal\Core\Cache\CacheBackendInterface;
 
 /**
  * Foxml parser.