From 8fbf299314a0ecdf6629a689433961761f7c3b0b Mon Sep 17 00:00:00 2001 From: Matt Burridge Date: Thu, 12 Sep 2024 20:18:01 +0100 Subject: [PATCH] Broken - attempt at not changing dirs during zip process --- .github/workflows/python.yml | 5 - python/Cargo.lock | 2 +- python/Cargo.toml | 2 +- src/ro_crate/write.rs | 360 ++++++++++++++---- .../fixtures/_ro-crate-metadata-minimal.json | 2 +- 5 files changed, 298 insertions(+), 73 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 47d7335..195b139 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -21,12 +21,7 @@ jobs: if: "startsWith(github.ref, 'refs/tags/')" steps: - uses: actions/checkout@v4 - - name: Install dependencies - run: brew install gettext - uses: actions/setup-python@v5 - with: - python-version: ${{ env.PYTHON_VERSION }} - architecture: x64 - uses: dtolnay/rust-toolchain@stable - name: Build wheels - x86_64 uses: PyO3/maturin-action@v1 diff --git a/python/Cargo.lock b/python/Cargo.lock index 90c842d..09d664e 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1587,7 +1587,7 @@ dependencies = [ [[package]] name = "rocraters-python" -version = "0.1.0" +version = "0.2.0" dependencies = [ "chrono", "pyo3", diff --git a/python/Cargo.toml b/python/Cargo.toml index 17f6656..b430f73 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rocraters-python" -version = "0.1.0" +version = "0.2.0" edition = "2021" authors = ["Matt Burridge "] description = "Lightweight Python library for RO-Crate manipulation implemented in Rust" diff --git a/src/ro_crate/write.rs b/src/ro_crate/write.rs index 30ca3eb..0045e0c 100644 --- a/src/ro_crate/write.rs +++ b/src/ro_crate/write.rs @@ -254,7 +254,7 @@ pub fn zip_crate_external( // Pop all non-urls ids.retain(|id| is_not_url(id)); - let nonrels = get_nonrelative_paths(&ids, crate_path); + let nonrels = get_nonrelative_paths(ids, crate_path); // if nonrels is not empty, means data entities are external // therefore we need to package them @@ -301,28 +301,35 @@ pub fn zip_crate_external( /// It currently focuses only on Windows extended-length path prefixes and needs to be updated in /// line with new ids. fn update_zip_ids(rocrate: &mut RoCrate, id: PathBuf, zip_id: &str) { - let id_str = id.to_str().unwrap_or_default(); - - // NOTE: this only really checks for extended length path failures - others may be present so this can - // be refactored when needed - // base update on direct match - if rocrate.update_id_recursive(id_str, zip_id).is_none() { - // if fail - check if the ID string contains the '\\?\' prefix - if id_str.starts_with(r"\\?\") { - let stripped_id = &id_str[4..]; - if let Some(_) = rocrate.update_id_recursive(stripped_id, zip_id) { - } else { - // if win extend length not an issue, check \\ stripping - if id_str.contains("\\\\") { - if let Some(_) = rocrate.update_id_recursive(stripped_id, zip_id) {} - } else { - if let Some(_) = rocrate.update_id_recursive(stripped_id, zip_id) {} - } - } - } else { + let id_str = match id.to_str() { + Some(s) => s, + None => { + // Log or handle the error in some way instead of defaulting silently. + eprintln!("Failed to convert path to string"); + return; } + }; + + // Try updating the ID directly + if rocrate.update_id_recursive(id_str, zip_id).is_some() { + return; + } + + // Handle Windows extended-length paths (\\?\) + let stripped_id = if id_str.starts_with(r"\\?\") { + &id_str[4..] // Strip the extended-length path prefix } else { + id_str + }; + + // Attempt the update with the stripped ID + if rocrate.update_id_recursive(stripped_id, zip_id).is_some() { + return; } + + // Handle the case where the path contains double backslashes + let normalized_id = stripped_id.replace("\\\\", "\\"); + rocrate.update_id_recursive(&normalized_id, zip_id); } /// Identifies file paths that are not relative to the given RO-Crate directory. @@ -336,58 +343,53 @@ fn update_zip_ids(rocrate: &mut RoCrate, id: PathBuf, zip_id: &str) { /// /// # Returns /// A vector of `PathBuf` objects representing files that are outside the crate's base directory. -fn get_nonrelative_paths(ids: &Vec<&String>, crate_dir: &Path) -> Vec { +fn get_nonrelative_paths(ids: Vec<&String>, crate_dir: &Path) -> Vec { let mut nonrels: Vec = Vec::new(); + // Get the absolute path of the crate directory let rocrate_path = get_absolute_path(crate_dir).unwrap(); - let root_dir = rocrate_path.parent(); - // Extract the directory part of the path - if let Some(directory_path) = root_dir { - // Try to change the current working directory - let _ = env::set_current_dir(directory_path); - } else { - } + println!("rocrate_path = {:?}", rocrate_path); - // Iterate over all the ids, check the paths are relative to crate. - // If not relative to crate and a file, then grab, add to extern folder - // and zip + // Iterate over all the ids, check the paths are relative to the crate. + // If not relative to the crate and it's a file, then mark it as non-relative for id in ids.iter() { + // Skip if the ID is a fragment (starts with '#') if id.starts_with('#') { continue; } - if let Some(path) = get_absolute_path(Path::new(id)) { - if path.exists() { - let nonrel = is_outside_base_folder(root_dir.unwrap(), &path); - if nonrel { - if id.starts_with(".") { - nonrels.push(id.into()); - } else { - nonrels.push(path); - } + + // Resolve the absolute path of the ID + let id_path = Path::new(id); + println!("ID Path: {:?}", id_path); + if let Some(abs_id_path) = get_absolute_path(id_path) { + println!("{:?}", abs_id_path); + if abs_id_path.exists() { + // Check if the path is outside of the crate directory + if is_outside_base_folder(&rocrate_path, &abs_id_path) { + nonrels.push(abs_id_path); } - } else { } - } else { - continue; } } + nonrels } -/// Converts a relative path to an absolute one, if possible. -/// -/// This utility function is useful for obtaining the absolute path representation of a file or directory. + +/// Resolves the absolute path of the given path. /// /// # Arguments -/// * `relative_path` - The path to be converted to its absolute form. +/// * `path` - The path to resolve. /// /// # Returns -/// An `Option` containing the absolute path, if the conversion was successful; otherwise, `None`. -fn get_absolute_path(relative_path: &Path) -> Option { - match fs::canonicalize(relative_path) { - Ok(path) => Some(path), - Err(_e) => None, +/// The absolute path as a `PathBuf`, or `None` if the path cannot be resolved. +fn get_absolute_path(path: &Path) -> Option { + if path.is_absolute() { + Some(path.to_path_buf()) + } else { + env::current_dir().ok().map(|cwd| cwd.join(path)) } } + /// Determines whether a given string is not a URL. /// /// This function checks if the provided string represents a file path rather than a URL. It's particularly @@ -405,21 +407,41 @@ fn get_absolute_path(relative_path: &Path) -> Option { /// assert!(!is_not_url("http://example.com")); /// ``` fn is_not_url(path: &str) -> bool { - // Check if the path is likely a Windows extended-length path - let is_extended_windows_path = path.starts_with(r"\\?\"); - - // Check if the path is likely a normal file path - let is_normal_file_path = path.starts_with(r"\\") // UNC path - || path.chars().next().map(|c| c.is_alphabetic() && path.chars().nth(1) == Some(':')).unwrap_or(false) // Drive letter, e.g., C:\ - || path.starts_with('/') // Unix-style path - || path.starts_with('.'); // Relative path - - // If it looks like a file path, return true early - if is_extended_windows_path || is_normal_file_path { - return true; + if is_windows_path(path) || is_unix_path(path) || is_relative_path(path) { + return true; // It's a file path + } + + if Url::parse(path).is_ok() { + return false; // It's a URL + } + + if is_probably_a_domain(path) { + return false; // It's a URL } - Url::parse(path).is_err() + true +} + +/// Determines if a string is likely a Windows-style file path (e.g., "C:\path\to\file.txt"). +fn is_windows_path(path: &str) -> bool { + // Check for UNC paths (\\server\share) or drive letter paths (C:\) + path.starts_with(r"\\") || (path.len() > 2 && path[1..3] == *":\\") +} + +/// Determines if a string is likely a Unix-style file path (e.g., "/home/user/file.txt"). +fn is_unix_path(path: &str) -> bool { + path.starts_with('/') +} + +/// Determines if a string is a relative file path (e.g., "./file.txt" or "../file.txt"). +fn is_relative_path(path: &str) -> bool { + path.starts_with('.') // Relative paths often start with "./" or "../" +} + +/// Determines if a string looks like a domain without a protocol (e.g., "example.com", "www.example.com"). +fn is_probably_a_domain(path: &str) -> bool { + // Check if the string contains a dot, has no spaces, and isn't starting/ending with a dot + path.contains('.') && !path.contains(' ') && !path.starts_with('.') && !path.ends_with('.') } /// Checks if a given file path lies outside of a specified base folder. @@ -477,4 +499,212 @@ mod write_crate_tests { // Clean up: Remove the created file after the test fs::remove_file(file_name).expect("Failed to remove test file"); } + + #[test] + fn test_windows_paths() { + // Windows drive letter paths + assert!(is_not_url(r"C:\path\to\file.txt")); + assert!(is_not_url(r"D:\folder\file.exe")); + + // Windows UNC paths + assert!(is_not_url(r"\\server\share\file.txt")); + + // Windows extended-length path + assert!(is_not_url(r"\\?\C:\extended\path\file.txt")); + } + + #[test] + fn test_unix_paths() { + // Unix-style absolute paths + assert!(is_not_url(r"/home/user/file.txt")); + assert!(is_not_url(r"/usr/local/bin")); + + // Unix-style relative paths + assert!(is_not_url(r"./relative/path")); + assert!(is_not_url(r"../relative/path")); + } + + #[test] + fn test_urls() { + // HTTP URLs + assert!(!is_not_url("http://example.com")); + assert!(!is_not_url("https://example.com/path")); + + // FTP URLs + assert!(!is_not_url("ftp://example.com/file")); + + // URLs without a protocol + assert!(!is_not_url("example.com")); + + // Localhost as URL + assert!(!is_not_url("http://localhost:8080")); + } + + #[test] + fn test_edge_cases() { + // Path that could resemble a domain + assert!(is_not_url(r"C:\example.com\file.txt")); + + // Empty string is not a valid file path or URL + assert!(is_not_url("")); + + // URL-like string without protocol + assert!(!is_not_url("www.example.com")); + } + + // Struct to hold all test strings + struct TestPaths { + relative_file1: String, + relative_file2: String, + absolute_outside1: String, + absolute_outside2: String, + relative_file3: String, + absolute_inside: String, + fragment1: String, + } + + // Helper function to create and return the test paths struct + fn get_test_paths() -> TestPaths { + TestPaths { + relative_file1: String::from("file1.txt"), // Relative inside crate + relative_file2: String::from("subdir/file2.txt"), // Relative inside crate + absolute_outside1: String::from("/mocked/root/otherdir/file3.txt"), // Absolute outside crate + absolute_outside2: String::from("/mocked/root/anotherdir/file4.txt"), // Absolute outside crate + relative_file3: String::from("./subdir/file2.txt"), // Relative inside crate + absolute_inside: String::from("/mocked/root/crate/file_inside.txt"), // Absolute inside crate + fragment1: String::from("#fragment1"), // Fragment identifier + } + } + + #[test] + fn test_relative_paths_within_crate() { + let crate_dir = Path::new("/mocked/root/crate"); + let paths = get_test_paths(); // Fetch test paths + + let ids = vec![&paths.relative_file1, &paths.relative_file2]; + + // Paths are inside the crate directory + let nonrels = get_nonrelative_paths(ids, crate_dir); + assert!( + nonrels.is_empty(), + "All paths should be relative to the crate directory." + ); + } + + #[test] + fn test_absolute_paths_outside_crate() { + let crate_dir = Path::new("/mocked/root/crate"); + let paths = get_test_paths(); // Fetch test paths + + let ids = vec![&paths.absolute_outside1, &paths.absolute_outside2]; + + // Paths are outside the crate directory + let nonrels = get_nonrelative_paths(ids, crate_dir); + assert_eq!( + nonrels.len(), + 2, + "Both paths should be identified as non-relative." + ); + } + + #[test] + fn test_mixed_paths() { + let crate_dir = Path::new("/mocked/root/crate"); + let paths = get_test_paths(); // Fetch test paths + + let ids = vec![ + &paths.relative_file1, // Inside crate + &paths.absolute_outside1, // Outside crate + &paths.relative_file3, // Inside crate + &paths.absolute_inside, // Inside crate + ]; + + // One path is outside the crate, others are inside + let nonrels = get_nonrelative_paths(ids, crate_dir); + println!("{:?}", nonrels); + assert_eq!( + nonrels.len(), + 1, + "One path should be identified as non-relative." + ); + } + + #[test] + fn test_skip_fragment_identifiers() { + let crate_dir = Path::new("/mocked/root/crate"); + let paths = get_test_paths(); // Fetch test paths + + let ids = vec![ + &paths.fragment1, // Should be skipped + &paths.relative_file1, // Inside crate + ]; + + // No external paths, and fragments should be ignored + let nonrels = get_nonrelative_paths(ids, crate_dir); + assert!( + nonrels.is_empty(), + "Fragment identifiers should be ignored." + ); + } + + #[test] + fn test_non_existent_paths() { + let crate_dir = Path::new("/mocked/root/crate"); + + let non_exist_1 = String::from("non_existent_file.txt"); // Non-existent path + let non_exist_2 = String::from("subdir/also_non_existent.txt"); // Non-existent path + let ids = vec![&non_exist_1, &non_exist_2]; + + // Non-existent paths should not be added + let nonrels = get_nonrelative_paths(ids, crate_dir); + assert!( + nonrels.is_empty(), + "Non-existent files should not be considered." + ); + } + + #[test] + fn test_is_outside_base_folder() { + let crate_dir = Path::new("/mocked/root/crate"); + let paths = get_test_paths(); // Fetch test paths + assert!(!is_outside_base_folder( + crate_dir, + Path::new(&paths.relative_file3) + )); + assert!(!is_outside_base_folder( + crate_dir, + Path::new(&paths.absolute_inside) + )); + } + + #[test] + fn test_absolute_path() { + let absolute_path = Path::new("/mocked/root/absolute_file.txt"); + let result = get_absolute_path(absolute_path); + + assert!(result.is_some(), "Absolute path should return Some"); + assert_eq!( + result.unwrap(), + absolute_path, + "The absolute path should match the input" + ); + } + + #[test] + fn test_relative_path() { + let relative_path = Path::new("relative_file.txt"); + + // Get current directory to verify result + let current_dir = env::current_dir().expect("Failed to get current directory"); + let expected_path = current_dir.join(relative_path); + + let result = get_absolute_path(relative_path); + + assert!(result.is_some(), "Relative path should return Some"); + assert_eq!( + result.unwrap(), + expected_path, + "The relative path should be joined with the current directory" + ); + } } diff --git a/tests/fixtures/_ro-crate-metadata-minimal.json b/tests/fixtures/_ro-crate-metadata-minimal.json index c5b373a..4f004f5 100644 --- a/tests/fixtures/_ro-crate-metadata-minimal.json +++ b/tests/fixtures/_ro-crate-metadata-minimal.json @@ -24,4 +24,4 @@ "name": "Attribution-NonCommercial-ShareAlike 3.0 Australia (CC BY-NC-SA 3.0 AU)" } ] -} \ No newline at end of file +}