From 08beabc558add75031c69e8fa101f40373a19517 Mon Sep 17 00:00:00 2001 From: Chris MacDonald <31731869+chrismacdonaldw@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:11:49 -0300 Subject: [PATCH] Add FOXML export --- scripts/README.md | 13 +++++ scripts/datastream_export.py | 62 ++++++++---------------- scripts/foxml_export.py | 93 ++++++++++++++++++++++++++++++++++++ scripts/utils.py | 36 ++++++++++++++ 4 files changed, 161 insertions(+), 43 deletions(-) create mode 100644 scripts/foxml_export.py diff --git a/scripts/README.md b/scripts/README.md index 05fb31d..afbe2c9 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -35,6 +35,9 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr ### Metadata Export Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML. +### FOXML Export +Script to export FOXML archival objects from a Fedora repository given a list of PIDs. + ### Datastream Updater Script to inject a binary into an archival FOXML as base64 encoded data within a datastream. @@ -58,6 +61,16 @@ python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --pa Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory. Each file's name will be in the format `pid-DSID.xml`. +### FOXML Export +#### Command +```bash +python3 foxml_export.py --url=http://your-fedora-url:8080 --user=admin --password=secret --pid_file=./some_pids --output_dir=./output +``` +> The script supports adding comments in the pid_file using `#`. PIDs may also contain the URL-encoded colon `%3A`, which is automatically decoded to `:`. + +#### Output +Exports all archival FOXML found in the associated PID file passed in through arguments to their own folder in `output_dir/FOXML`. 
+ ### Datastream Updater #### Command ```bash diff --git a/scripts/datastream_export.py b/scripts/datastream_export.py index 90aa0bd..78ce767 100644 --- a/scripts/datastream_export.py +++ b/scripts/datastream_export.py @@ -4,7 +4,7 @@ import concurrent.futures import os import mimetypes -from utils import perform_http_request +from utils import perform_http_request, process_pid_file def parse_args(): @@ -24,9 +24,9 @@ def parse_args(): return parser.parse_args() -def fetch_data(dsid, base_url, user, password, output_dir, obj_id): +def fetch_data(dsid, base_url, user, password, output_dir, pid): """ - Fetches the datastream content for a given datastream ID (dsid) and object ID (obj_id) from a Fedora repository. + Fetches the datastream content for a given datastream ID (dsid) and PID from a Fedora repository. Args: dsid (str): The ID of the datastream to fetch. @@ -34,14 +34,14 @@ def fetch_data(dsid, base_url, user, password, output_dir, obj_id): user (str): The username for authentication. password (str): The password for authentication. output_dir (str): The directory where the fetched data will be saved. - obj_id (str): The ID of the object that contains the datastream. + pid (str): The PID of the object that contains the datastream. Returns: bool: True if the datastream content was successfully fetched and saved, False otherwise. 
""" - obj_id = obj_id.replace("info:fedora/", "") - url = f"{base_url}/fedora/objects/{obj_id}/datastreams/{dsid}/content" - print(f"Downloading {dsid} for PID: {obj_id}") + pid = pid.replace("info:fedora/", "") + url = f"{base_url}/fedora/objects/{pid}/datastreams/{dsid}/content" + print(f"Downloading {dsid} for PID: {pid}") try: response = requests.get(url, auth=(user, password)) response.raise_for_status() @@ -49,49 +49,25 @@ def fetch_data(dsid, base_url, user, password, output_dir, obj_id): os.makedirs(dsid_dir, exist_ok=True) content_type = response.headers.get("Content-Type", "") extension = mimetypes.guess_extension(content_type) if content_type else "" - filename = f"{obj_id}-{dsid}{extension}" + filename = f"{pid}-{dsid}{extension}" with open(os.path.join(dsid_dir, filename), "wb") as f: f.write(response.content) - print(f"Successfully saved {filename}") + print(f"Successfully saved {filename}\n") return True except Exception as e: - print(f"Failed to fetch data for {obj_id}, error: {str(e)}") + print(f"Failed to fetch data for {pid}, error: {str(e)}\n") return False -def process_pid_file(filepath): - """ - Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. - Supports comments in the file using '#' character. - Replace '%3A' with ':' in PIDs. - - Args: - filepath (str): The path to the file containing PIDs. - - Returns: - list: A list of PIDs extracted from the file. - """ - pids = [] - with open(filepath, "r") as file: - for line in file: - line = line.strip() - if "#" in line: - line = line[: line.index("#")].strip() - if line: - line = line.replace("%3A", ":") - pids.append(line) - return pids - - def main(): args = parse_args() os.makedirs(args.output_dir, exist_ok=True) - object_ids = [] + pids = [] # If a PID file is provided, process the file to get the list of PIDs. 
if args.pid_file: - object_ids = process_pid_file(args.pid_file) + pids = process_pid_file(args.pid_file) else: query = f""" SELECT ?obj WHERE {{ @@ -105,11 +81,11 @@ def main(): """ result = perform_http_request(query, args.url, args.user, args.password) - object_ids.extend(result.strip().split("\n")[1:]) + pids.extend(result.strip().split("\n")[1:]) # Download metadata for each PID in parallel using ThreadPoolExecutor. with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm( - total=len(object_ids), desc="Downloading Metadata" + total=len(pids), desc="Downloading Metadata" ) as progress: futures = { executor.submit( @@ -119,18 +95,18 @@ def main(): args.user, args.password, args.output_dir, - obj_id, - ): obj_id - for obj_id in object_ids + pid, + ): pid + for pid in pids } for future in concurrent.futures.as_completed(futures): - obj_id = futures[future] + pid = futures[future] try: success = future.result() if success: progress.update(1) except Exception as exc: - print(f"{obj_id} generated an exception: {exc}") + print(f"{pid} generated an exception: {exc}") if __name__ == "__main__": diff --git a/scripts/foxml_export.py b/scripts/foxml_export.py new file mode 100644 index 0000000..5ee5a90 --- /dev/null +++ b/scripts/foxml_export.py @@ -0,0 +1,93 @@ +import argparse +import requests +from tqdm import tqdm +import concurrent.futures +import os +import mimetypes +from utils import process_pid_file + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Export metadata using SPARQL query and save as XML." 
+ ) + parser.add_argument("--url", required=True, help="Fedora base URL") + parser.add_argument("--user", required=True, help="Username for Fedora access") + parser.add_argument("--password", required=True, help="Password for Fedora access") + parser.add_argument( + "--output_dir", default="./output", help="Directory to save XML files" + ) + parser.add_argument( + "--pid_file", type=str, required=True, help="File containing PIDs to process" + ) + return parser.parse_args() + + +def fetch_foxml(base_url, user, password, output_dir, pid): + """ + Fetches the archival FOXML for a given PID from a Fedora repository. + + Args: + base_url (str): The base URL of the Fedora repository. + user (str): The username for authentication. + password (str): The password for authentication. + output_dir (str): The directory where the fetched data will be saved. + pid (str): The PID of the object whose archival FOXML is exported. + + Returns: + bool: True if the FOXML was successfully fetched and saved, False otherwise. 
+ """ + pid = pid.replace("info:fedora/", "") + url = f"{base_url}/fedora/objects/{pid}/export?context=archive" + print(f"Downloading FOXML for PID: {pid}") + try: + response = requests.get(url, auth=(user, password)) + response.raise_for_status() + foxml_dir = os.path.join(output_dir, "FOXML") + os.makedirs(foxml_dir, exist_ok=True) + content_type = response.headers.get("Content-Type", "") + extension = mimetypes.guess_extension(content_type) if content_type else "" + filename = f"{pid}-FOXML{extension}" + with open(os.path.join(foxml_dir, filename), "wb") as f: + f.write(response.content) + print(f"Successfully saved {filename}\n") + return True + except Exception as e: + print(f"Failed to fetch FOXML for {pid}, error: {str(e)}\n") + return False + + +def main(): + args = parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + pids = [] + + pids = process_pid_file(args.pid_file) + + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm( + total=len(pids), desc="Downloading FOXML" + ) as progress: + futures = { + executor.submit( + fetch_foxml, + args.url, + args.user, + args.password, + args.output_dir, + pid, + ): pid + for pid in pids + } + for future in concurrent.futures.as_completed(futures): + pid = futures[future] + try: + success = future.result() + if success: + progress.update(1) + except Exception as exc: + print(f"{pid} generated an exception: {exc}") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils.py b/scripts/utils.py index 4012fd7..17eeb86 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -2,6 +2,19 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV"): + """ + Perform an HTTP request to a specified endpoint URL with the given query. + + Args: + query (str): The SPARQL query to be executed. + endpoint_url (str): The URL of the SPARQL endpoint. + user (str): The username for authentication. + password (str): The password for authentication. 
+ output_format (str, optional): The desired format of the response. Defaults to "CSV". + + Returns: + str: The response text if the request is successful, None otherwise. + """ headers = {"Content-Type": "application/x-www-form-urlencoded"} payload = { "type": "tuples", @@ -22,3 +35,26 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV else: print(f"Error {response.status_code} while querying: {query}") return None + +def process_pid_file(filepath): + """ + Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs. + Supports comments in the file using '#' character. + Replace '%3A' with ':' in PIDs. + + Args: + filepath (str): The path to the file containing PIDs. + + Returns: + list: A list of PIDs extracted from the file. + """ + pids = [] + with open(filepath, "r") as file: + for line in file: + line = line.strip() + if "#" in line: + line = line[: line.index("#")].strip() + if line: + line = line.replace("%3A", ":") + pids.append(line) + return pids