Skip to content

Commit

Permalink
Add FOXML export
Browse files Browse the repository at this point in the history
  • Loading branch information
chrismacdonaldw committed Apr 19, 2024
1 parent 69ab4ee commit 08beabc
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 43 deletions.
13 changes: 13 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ Script to run SPARQL queries against an FCREPO's RI and gather information. Curr
### Metadata Export
Script to export all objects within the repository that contain a specified metadata datastream ID, saving results as XML.

### FOXML Export
Script to export FOXML archival objects from a Fedora repository given a list of PIDs.

### Datastream Updater
Script to inject a binary into an archival FOXML as base64 encoded data within a datastream.

Expand All @@ -58,6 +61,16 @@ python3 datastream_export.py --url=http://your-fedora-url:8080 --user=admin --pa
Exports all metadata entries related to the specified DSID into XML files stored in the defined output directory.
Each file's name will be in the format `pid-DSID.xml`.

### FOXML Export
#### Command
```bash
python3 foxml_export.py --url=http://your-fedora-url:8080 --user=admin --password=secret --pid_file=./some_pids --output_dir=./output
```
> The script supports adding comments in the pid_file using `#`. PIDs can also contain URL encoded characters (e.g., `%3A` for `:` which will be automatically decoded).
#### Output
Exports the archival FOXML of every PID listed in the supplied PID file into a `FOXML` folder inside `output_dir`.

### Datastream Updater
#### Command
```bash
Expand Down
62 changes: 19 additions & 43 deletions scripts/datastream_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import concurrent.futures
import os
import mimetypes
from utils import perform_http_request
from utils import perform_http_request, process_pid_file


def parse_args():
Expand All @@ -24,74 +24,50 @@ def parse_args():
return parser.parse_args()


def fetch_data(dsid, base_url, user, password, output_dir, pid):
    """
    Fetches the datastream content for a given datastream ID (dsid) and PID from a Fedora repository.

    Args:
        dsid (str): The ID of the datastream to fetch.
        base_url (str): The base URL of the Fedora repository.
        user (str): The username for authentication.
        password (str): The password for authentication.
        output_dir (str): The directory where the fetched data will be saved.
        pid (str): The PID of the object that contains the datastream.

    Returns:
        bool: True if the datastream content was successfully fetched and saved, False otherwise.
    """
    # PIDs coming from a resource-index query are prefixed with "info:fedora/".
    pid = pid.replace("info:fedora/", "")
    url = f"{base_url}/fedora/objects/{pid}/datastreams/{dsid}/content"
    print(f"Downloading {dsid} for PID: {pid}")
    try:
        response = requests.get(url, auth=(user, password))
        response.raise_for_status()
        # Group output by datastream ID: output_dir/<DSID>/<pid>-<DSID><ext>
        dsid_dir = os.path.join(output_dir, dsid)
        os.makedirs(dsid_dir, exist_ok=True)
        # Derive a file extension from the response's Content-Type, if any.
        content_type = response.headers.get("Content-Type", "")
        extension = mimetypes.guess_extension(content_type) if content_type else ""
        filename = f"{pid}-{dsid}{extension}"
        filepath = os.path.join(dsid_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"Successfully saved {filepath}\n")
        return True
    except Exception as e:
        print(f"Failed to fetch data for {pid}, error: {str(e)}\n")
        return False


def process_pid_file(filepath):
    """
    Read a file of PIDs (Persistent Identifiers) and return them as a list.

    Lines may carry trailing comments introduced by '#'; commented text and
    blank lines are ignored.  Occurrences of '%3A' are decoded to ':'.

    Args:
        filepath (str): Path to the file containing PIDs.

    Returns:
        list: The PIDs extracted from the file.
    """
    results = []
    with open(filepath, "r") as handle:
        for raw in handle:
            text = raw.strip()
            # Truncate at the first '#' so inline comments are allowed.
            comment_pos = text.find("#")
            if comment_pos != -1:
                text = text[:comment_pos].strip()
            if not text:
                continue
            results.append(text.replace("%3A", ":"))
    return results


def main():
args = parse_args()
os.makedirs(args.output_dir, exist_ok=True)

object_ids = []
pids = []

# If a PID file is provided, process the file to get the list of PIDs.
if args.pid_file:
object_ids = process_pid_file(args.pid_file)
pids = process_pid_file(args.pid_file)
else:
query = f"""
SELECT ?obj WHERE {{
Expand All @@ -105,11 +81,11 @@ def main():
"""

result = perform_http_request(query, args.url, args.user, args.password)
object_ids.extend(result.strip().split("\n")[1:])
pids.extend(result.strip().split("\n")[1:])

# Download metadata for each PID in parallel using ThreadPoolExecutor.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
total=len(object_ids), desc="Downloading Metadata"
total=len(pids), desc="Downloading Metadata"
) as progress:
futures = {
executor.submit(
Expand All @@ -119,18 +95,18 @@ def main():
args.user,
args.password,
args.output_dir,
obj_id,
): obj_id
for obj_id in object_ids
pid,
): pid
for pid in pids
}
for future in concurrent.futures.as_completed(futures):
obj_id = futures[future]
pid = futures[future]
try:
success = future.result()
if success:
progress.update(1)
except Exception as exc:
print(f"{obj_id} generated an exception: {exc}")
print(f"{pid} generated an exception: {exc}")


if __name__ == "__main__":
Expand Down
93 changes: 93 additions & 0 deletions scripts/foxml_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import argparse
import requests
from tqdm import tqdm
import concurrent.futures
import os
import mimetypes
from utils import process_pid_file


def parse_args():
    """
    Parse command-line arguments for the FOXML export script.

    Returns:
        argparse.Namespace: Parsed arguments (url, user, password,
        output_dir, pid_file).
    """
    parser = argparse.ArgumentParser(
        # Fixed copy-pasted description: this script exports archival FOXML,
        # it does not run SPARQL queries.
        description="Export archival FOXML objects from a Fedora repository given a list of PIDs."
    )
    parser.add_argument("--url", required=True, help="Fedora base URL")
    parser.add_argument("--user", required=True, help="Username for Fedora access")
    parser.add_argument("--password", required=True, help="Password for Fedora access")
    parser.add_argument(
        "--output_dir", default="./output", help="Directory to save FOXML files"
    )
    parser.add_argument(
        "--pid_file", type=str, required=True, help="File containing PIDs to process"
    )
    return parser.parse_args()


def fetch_foxml(base_url, user, password, output_dir, pid):
    """
    Fetches the archival FOXML for a given PID from a Fedora repository.

    Args:
        base_url (str): The base URL of the Fedora repository.
        user (str): The username for authentication.
        password (str): The password for authentication.
        output_dir (str): The directory where the fetched FOXML will be saved.
        pid (str): The PID of the object to export.

    Returns:
        bool: True if the FOXML was successfully fetched and saved, False otherwise.
    """
    # PIDs coming from a resource-index query are prefixed with "info:fedora/".
    pid = pid.replace("info:fedora/", "")
    # "context=archive" asks Fedora for the self-contained archival export.
    url = f"{base_url}/fedora/objects/{pid}/export?context=archive"
    print(f"Downloading FOXML for PID: {pid}")
    try:
        response = requests.get(url, auth=(user, password))
        response.raise_for_status()
        foxml_dir = os.path.join(output_dir, "FOXML")
        os.makedirs(foxml_dir, exist_ok=True)
        # Derive a file extension from the response's Content-Type, if any.
        content_type = response.headers.get("Content-Type", "")
        extension = mimetypes.guess_extension(content_type) if content_type else ""
        filename = f"{pid}-FOXML{extension}"
        filepath = os.path.join(foxml_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"Successfully saved {filepath}\n")
        return True
    except Exception as e:
        print(f"Failed to fetch FOXML for {pid}, error: {str(e)}\n")
        return False


def main():
    """Entry point: export the archival FOXML of every PID in the PID file."""
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # --pid_file is required, so there is always a file to read.
    pids = process_pid_file(args.pid_file)

    # Download FOXML for each PID in parallel using a small thread pool;
    # the progress bar advances only on successful downloads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor, tqdm(
        total=len(pids), desc="Downloading FOXML"
    ) as progress:
        futures = {
            executor.submit(
                fetch_foxml,
                args.url,
                args.user,
                args.password,
                args.output_dir,
                pid,
            ): pid
            for pid in pids
        }
        for future in concurrent.futures.as_completed(futures):
            pid = futures[future]
            try:
                if future.result():
                    progress.update(1)
            except Exception as exc:
                print(f"{pid} generated an exception: {exc}")


# Run the export only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@


def perform_http_request(query, endpoint_url, user, password, output_format="CSV"):
"""
Perform an HTTP request to a specified endpoint URL with the given query.
Args:
query (str): The SPARQL query to be executed.
endpoint_url (str): The URL of the SPARQL endpoint.
user (str): The username for authentication.
password (str): The password for authentication.
output_format (str, optional): The desired format of the response. Defaults to "CSV".
Returns:
str: The response text if the request is successful, None otherwise.
"""
headers = {"Content-Type": "application/x-www-form-urlencoded"}
payload = {
"type": "tuples",
Expand All @@ -22,3 +35,26 @@ def perform_http_request(query, endpoint_url, user, password, output_format="CSV
else:
print(f"Error {response.status_code} while querying: {query}")
return None

def process_pid_file(filepath):
    """
    Process a file containing PIDs (Persistent Identifiers) and return a list of PIDs.

    Supports comments in the file using the '#' character; blank lines are
    skipped.  URL-encoded characters in a PID (e.g. '%3A' for ':') are decoded.

    Args:
        filepath (str): The path to the file containing PIDs.

    Returns:
        list: A list of PIDs extracted from the file.
    """
    # Local import keeps this helper self-contained within the shared module.
    from urllib.parse import unquote

    pids = []
    with open(filepath, "r") as file:
        for line in file:
            line = line.strip()
            # Drop anything after a '#' so inline comments are allowed.
            if "#" in line:
                line = line[: line.index("#")].strip()
            if line:
                # Decode all percent-encoded characters, not only '%3A' —
                # the README documents general URL-decoding of PIDs.
                pids.append(unquote(line))
    return pids

0 comments on commit 08beabc

Please sign in to comment.