Add FOXML export

discoverygarden · Apr 19, 2024 · 08beabc · 08beabc
1 parent 69ab4ee
commit 08beabc
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 43 deletions.
diff --git a/scripts/README.md b/scripts/README.md
@@ -35,6 +35,9 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr
 ### Metadata Export
 Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML.
 
+### FOXML Export
+Script to export FOXML archival objects from a Fedora repository given a list of PIDs.
+
 ### Datastream Updater
 Script to inject a binary into an archival FOXML as base64 encoded data within a datastream.
 
@@ -58,6 +61,16 @@ python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --pa
 Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory.
 Each file's name will be in the format `pid-DSID.xml`.
 
+### FOXML Export
+#### Command
+```bash
+python3 foxml_export.py --url=http://your-fedora-url:8080 --user=admin --password=secret --pid_file=./some_pids --output_dir=./output
+```
+> The script supports comments in the pid_file using `#`. PIDs may use the URL-encoded form `%3A` in place of `:`; it is automatically decoded.
+
+#### Output
+Exports the archival FOXML of every PID listed in the PID file, saving one file per object in `output_dir/FOXML`.
+
 ### Datastream Updater
 #### Command
 ```bash

diff --git a/scripts/datastream_export.py b/scripts/datastream_export.py
@@ -4,7 +4,7 @@
 import concurrent.futures
 import os
 import mimetypes
-from utils import perform_http_request
+from utils import perform_http_request, process_pid_file
 
 
 def parse_args():
@@ -24,74 +24,50 @@ def parse_args():
     return parser.parse_args()
 
 
-def fetch_data(dsid, base_url, user, password, output_dir, obj_id):
+def fetch_data(dsid, base_url, user, password, output_dir, pid):
     """
-    Fetches the datastream content for a given datastream ID (dsid) and object ID (obj_id) from a Fedora repository.
+    Fetches the datastream content for a given datastream ID (dsid) and PID from a Fedora repository.
 
     Args:
         dsid (str): The ID of the datastream to fetch.
         base_url (str): The base URL of the Fedora repository.
         user (str): The username for authentication.
         password (str): The password for authentication.
         output_dir (str): The directory where the fetched data will be saved.
-        obj_id (str): The ID of the object that contains the datastream.
+        pid (str): The PID of the object that contains the datastream.
 
     Returns:
         bool: True if the datastream content was successfully fetched and saved, False otherwise.
     """
-    obj_id = obj_id.replace("info:fedora/", "")
-    url = f"{base_url}/fedora/objects/{obj_id}/datastreams/{dsid}/content"
-    print(f"Downloading {dsid} for PID: {obj_id}")
+    pid = pid.replace("info:fedora/", "")
+    url = f"{base_url}/fedora/objects/{pid}/datastreams/{dsid}/content"
+    print(f"Downloading {dsid} for PID: {pid}")
     try:
         response = requests.get(url, auth=(user, password))
         response.raise_for_status()
         dsid_dir = os.path.join(output_dir, dsid)
         os.makedirs(dsid_dir, exist_ok=True)
         content_type = response.headers.get("Content-Type", "")
         extension = mimetypes.guess_extension(content_type) if content_type else ""
-        filename = f"{obj_id}-{dsid}{extension}"
+        filename = f"{pid}-{dsid}{extension}"
         with open(os.path.join(dsid_dir, filename), "wb") as f:
             f.write(response.content)
-        print(f"Successfully saved {filename}")
+        print(f"Successfully saved {filename}\n")
         return True
     except Exception as e:
-        print(f"Failed to fetch data for {obj_id}, error: {str(e)}")
+        print(f"Failed to fetch data for {pid}, error: {str(e)}\n")
         return False
 
 
-def process_pid_file(filepath):
-    """
-    Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs.
-    Supports comments in the file using '#' character.
-    Replace '%3A' with ':' in PIDs.
-
-    Args:
-        filepath (str): The path to the file containing PIDs.
-
-    Returns:
-        list: A list of PIDs extracted from the file.
-    """
-    pids = []
-    with open(filepath, "r") as file:
-        for line in file:
-            line = line.strip()
-            if "#" in line:
-                line = line[: line.index("#")].strip()
-            if line:
-                line = line.replace("%3A", ":")
-                pids.append(line)
-    return pids
-
-
 def main():
     args = parse_args()
     os.makedirs(args.output_dir, exist_ok=True)
 
-    object_ids = []
+    pids = []
 
     # If a PID file is provided, process the file to get the list of PIDs.
     if args.pid_file:
-        object_ids = process_pid_file(args.pid_file)
+        pids = process_pid_file(args.pid_file)
     else:
         query = f"""
         SELECT ?obj WHERE {{
@@ -105,11 +81,11 @@ def main():
         """
 
         result = perform_http_request(query, args.url, args.user, args.password)
-        object_ids.extend(result.strip().split("\n")[1:])
+        pids.extend(result.strip().split("\n")[1:])
 
     # Download metadata for each PID in parallel using ThreadPoolExecutor.
     with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
-        total=len(object_ids), desc="Downloading Metadata"
+        total=len(pids), desc="Downloading Metadata"
     ) as progress:
         futures = {
             executor.submit(
@@ -119,18 +95,18 @@ def main():
                 args.user,
                 args.password,
                 args.output_dir,
-                obj_id,
-            ): obj_id
-            for obj_id in object_ids
+                pid,
+            ): pid
+            for pid in pids
         }
         for future in concurrent.futures.as_completed(futures):
-            obj_id = futures[future]
+            pid = futures[future]
             try:
                 success = future.result()
                 if success:
                     progress.update(1)
             except Exception as exc:
-                print(f"{obj_id} generated an exception: {exc}")
+                print(f"{pid} generated an exception: {exc}")
 
 
 if __name__ == "__main__":

diff --git a/scripts/foxml_export.py b/scripts/foxml_export.py
@@ -0,0 +1,93 @@
+import argparse
+import requests
+from tqdm import tqdm
+import concurrent.futures
+import os
+import mimetypes
+from utils import process_pid_file
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Export metadata using SPARQL query and save as XML."
+    )
+    parser.add_argument("--url", required=True, help="Fedora base URL")
+    parser.add_argument("--user", required=True, help="Username for Fedora access")
+    parser.add_argument("--password", required=True, help="Password for Fedora access")
+    parser.add_argument(
+        "--output_dir", default="./output", help="Directory to save XML files"
+    )
+    parser.add_argument(
+        "--pid_file", type=str, required=True, help="File containing PIDs to process"
+    )
+    return parser.parse_args()
+
+
+def fetch_foxml(base_url, user, password, output_dir, pid):
+    """
+    Fetches the archival FOXML for a given PID from a Fedora repository.
+
+    Args:
+        base_url (str): The base URL of the Fedora repository.
+        user (str): The username for authentication.
+        password (str): The password for authentication.
+        output_dir (str): The directory where the fetched data will be saved.
+        pid (str): The PID of the object whose archival FOXML is exported.
+
+    Returns:
+        bool: True if the FOXML was successfully fetched and saved, False otherwise.
+    """
+    pid = pid.replace("info:fedora/", "")
+    url = f"{base_url}/fedora/objects/{pid}/export?context=archive"
+    print(f"Downloading FOXML for PID: {pid}")
+    try:
+        response = requests.get(url, auth=(user, password))
+        response.raise_for_status()
+        foxml_dir = os.path.join(output_dir, "FOXML")
+        os.makedirs(foxml_dir, exist_ok=True)
+        content_type = response.headers.get("Content-Type", "")
+        extension = mimetypes.guess_extension(content_type) if content_type else ""
+        filename = f"{pid}-FOXML{extension}"
+        with open(os.path.join(foxml_dir, filename), "wb") as f:
+            f.write(response.content)
+        print(f"Successfully saved {filename}\n")
+        return True
+    except Exception as e:
+        print(f"Failed to fetch FOXML for {pid}, error: {str(e)}\n")
+        return False
+
+
+def main():
+    args = parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    pids = []
+
+    pids = process_pid_file(args.pid_file)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
+        total=len(pids), desc="Downloading FOXML"
+    ) as progress:
+        futures = {
+            executor.submit(
+                fetch_foxml,
+                args.url,
+                args.user,
+                args.password,
+                args.output_dir,
+                pid,
+            ): pid
+            for pid in pids
+        }
+        for future in concurrent.futures.as_completed(futures):
+            pid = futures[future]
+            try:
+                success = future.result()
+                if success:
+                    progress.update(1)
+            except Exception as exc:
+                print(f"{pid} generated an exception: {exc}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/utils.py b/scripts/utils.py
@@ -2,6 +2,19 @@
 
 
 def perform_http_request(query, endpoint_url, user, password, output_format="CSV"):
+    """
+    Perform an HTTP request to a specified endpoint URL with the given query.
+
+    Args:
+        query (str): The SPARQL query to be executed.
+        endpoint_url (str): The URL of the SPARQL endpoint.
+        user (str): The username for authentication.
+        password (str): The password for authentication.
+        output_format (str, optional): The desired format of the response. Defaults to "CSV".
+
+    Returns:
+        str: The response text if the request is successful, None otherwise.
+    """
     headers = {"Content-Type": "application/x-www-form-urlencoded"}
     payload = {
         "type": "tuples",
@@ -22,3 +35,26 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV
     else:
         print(f"Error {response.status_code} while querying: {query}")
         return None
+
+def process_pid_file(filepath):
+    """
+    Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs.
+    Supports comments in the file using '#' character.
+    Replace '%3A' with ':' in PIDs.
+
+    Args:
+        filepath (str): The path to the file containing PIDs.
+
+    Returns:
+        list: A list of PIDs extracted from the file.
+    """
+    pids = []
+    with open(filepath, "r") as file:
+        for line in file:
+            line = line.strip()
+            if "#" in line:
+                line = line[: line.index("#")].strip()
+            if line:
+                line = line.replace("%3A", ":")
+                pids.append(line)
+    return pids