diff --git a/Cargo.lock b/Cargo.lock index 2822106..f41ebe2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -154,6 +154,28 @@ dependencies = [ "winapi", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + [[package]] name = "getrandom" version = "0.2.15" @@ -165,6 +187,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "heck" version = "0.5.0" @@ -177,6 +205,16 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "is-terminal" version = "0.4.12" @@ -216,6 +254,30 @@ dependencies = [ "libc", ] +[[package]] +name = "libyml" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e281a65eeba3d4503a2839252f86374528f9ceafe6fed97c1d3b52e1fb625c1" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + [[package]] name = "num-conv" version = "0.1.0" @@ -276,9 +338,11 @@ dependencies = [ "rio_api", "rio_turtle", "serde", + "serde_yml", "slog", "slog-async", "slog-term", + "tempfile", ] [[package]] @@ -309,12 +373,31 @@ dependencies = [ "rio_api", ] +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + [[package]] name = "serde" version = "1.0.203" @@ -335,6 +418,34 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_yml" +version = "0.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ce6afeda22f0b55dde2c34897bce76a629587348480384231205c14b59a01f" +dependencies = [ + "indexmap", + "itoa", + "libyml", + "log", + 
"memchr", + "ryu", + "serde", + "serde_json", + "tempfile", +] + [[package]] name = "slog" version = "2.7.0" @@ -389,6 +500,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys", +] + [[package]] name = "term" version = "0.7.0" diff --git a/Cargo.toml b/Cargo.toml index 5834d8e..6eda731 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,3 +17,5 @@ clap = { version = "4.5.7", features = ["derive"] } rio_turtle = "0.8.4" rio_api = "0.8.4" bitflags = "2.5.0" +serde_yml = "0.0.10" +tempfile = "3.10.1" diff --git a/src/io.rs b/src/io.rs index 485913e..941100f 100644 --- a/src/io.rs +++ b/src/io.rs @@ -1,4 +1,6 @@ +use crate::rules::Config; use rio_turtle::NTriplesParser; +use serde_yml; use std::{ boxed::Box, fs::File, @@ -18,7 +20,7 @@ pub fn get_reader(path: &Path) -> Box { pub fn get_writer(path: &Path) -> Box { return match path.to_str().unwrap() { "-" => Box::new(BufWriter::new(stdout())), - path => Box::new(BufWriter::new(File::open(path).unwrap())), + path => Box::new(BufWriter::new(File::create(path).unwrap())), }; } @@ -28,11 +30,22 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser { return NTriplesParser::new(reader); } +// Parse yaml configuration file. 
+pub fn parse_config(path: &Path) -> Config {
+    // Panics on an unreadable file as well as on malformed YAML.
+    let file = File::open(path)
+        .unwrap_or_else(|e| panic!("Cannot open file '{:?}': '{}'.", path, e));
+    serde_yml::from_reader(file).expect("Error parsing config file.")
+}
+
 #[cfg(test)]
 mod tests {
-    use super::parse_ntriples;
+    use super::{parse_config, parse_ntriples};
     use rio_api::parser::TriplesParser;
-    use std::io::{BufRead, BufReader};
+    use std::{
+        io::{BufRead, BufReader},
+        path::Path,
+    };
 
     #[test]
     // Test the parsing of a triple.
@@ -49,4 +62,14 @@ mod tests {
         })
         .expect("Error parsing triple");
     }
+
+    // Test the parsing of a config file.
+    #[test]
+    fn config_parsing() {
+        let config_path = Path::new("tests/data/config.yaml");
+        let config = parse_config(config_path);
+        assert!(config
+            .replace_value_of_predicate
+            .contains("http://schema.org/accessCode"));
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 3f0e177..a24b009 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -50,6 +50,11 @@ struct PseudoArgs {
     #[arg(default_value = "-")]
     input: PathBuf,
 
+    /// The config file descriptor to use for defining RDF elements to pseudonymize.
+    /// Format: yaml
+    #[arg(short, long)]
+    config: PathBuf,
+
     /// Output file descriptor for pseudonymized triples.
     /// Defaults to `stdout`.
     #[arg(short, long, default_value = "-")]
@@ -80,7 +85,7 @@ fn main() {
         }
         Subcommands::Pseudo(args) => {
             info!(log, "Args: {:?}", args);
-            pseudonymize_graph(&log, &args.input, &args.output, &args.index)
+            pseudonymize_graph(&log, &args.input, &args.config, &args.output, &args.index)
         }
     }
 }
diff --git a/src/pass_second.rs b/src/pass_second.rs
index cbaf866..54afdbd 100644
--- a/src/pass_second.rs
+++ b/src/pass_second.rs
@@ -10,6 +10,7 @@ use crate::{
     io,
     log::Logger,
     model::{pseudonymize_triple, TripleMask},
+    rules::Config,
 };
 
 fn mask_triple(triple: &Triple) -> TripleMask {
@@ -18,7 +19,12 @@ fn mask_triple(triple: &Triple) -> TripleMask {
 
 // mask and encode input triple
 // NOTE: This will need the type-map to perform masking
-fn process_triple(triple: &Triple, out: &mut impl Write) -> Result<(), TurtleError> {
+fn process_triple(
+    triple: &Triple,
+    rules_config: &Config,
+    node_to_type: &HashMap,
+    out: &mut impl Write,
+) -> Result<(), TurtleError> {
     let mask = mask_triple(triple);
     let pseudo_triple = pseudonymize_triple(&triple, mask);
     let _ = out.write(&format!("{} .\n", &pseudo_triple.to_string()).into_bytes());
@@ -41,32 +47,44 @@ fn load_type_map(input: impl BufRead) -> HashMap {
     return node_to_type;
 }
 
-pub fn pseudonymize_graph(log: &Logger, input: &Path, output: &Path, index: &Path) {
+/// Pseudonymizes the triples read from `input` and writes them to `output`.
+///
+/// `config` is the YAML rules file and `index` the node-to-type map; both are
+/// loaded up front and handed to `process_triple` for every parsed triple.
+/// Panics if the config cannot be loaded, a parse step fails, or the final
+/// flush of the output fails.
+pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Path, index: &Path) {
     let buf_input = io::get_reader(input);
     let buf_index = io::get_reader(index);
     let mut buf_output = io::get_writer(output);
+    let rules_config = io::parse_config(config);
     let node_to_type: HashMap = load_type_map(buf_index);
     let mut triples = io::parse_ntriples(buf_input);
     while !triples.is_end() {
         triples
-            .parse_step(&mut |t| process_triple(&t, &mut buf_output))
+            .parse_step(&mut |t| process_triple(&t, &rules_config, &node_to_type, &mut buf_output))
             .unwrap();
     }
+    // The writer is a BufWriter whose Drop swallows errors, so flush explicitly.
+    buf_output.flush().expect("Error flushing output.");
 }
 
 #[cfg(test)]
 mod tests {
-    use super::encrypt;
+    use super::pseudonymize_graph;
     use crate::log;
     use std::path::Path;
+    use tempfile::tempdir;
 
     #[test]
-    // Test the parsing of a triple.
+    // Test the pseudonymization of an N-Triples file end to end.
     fn encrypt_nt_file() {
+        let dir = tempdir().unwrap();
         let input_path = Path::new("tests/data/test.nt");
-        let output_path = Path::new("tests/data/output.nt");
+        let config_path = Path::new("tests/data/config.yaml");
+        let output_path = dir.path().join("output.nt");
         let type_map_path = Path::new("tests/data/type_map.nt");
         let logger = log::create_logger(true);
-        encrypt(&logger, &input_path, &output_path, &type_map_path);
+        pseudonymize_graph(&logger, input_path, config_path, &output_path, type_map_path);
     }
 }
diff --git a/src/rules.rs b/src/rules.rs
index 9e201ad..c00452e 100644
--- a/src/rules.rs
+++ b/src/rules.rs
@@ -1,13 +1,16 @@
 use serde::{Deserialize, Serialize};
+use std::collections::{HashMap, HashSet};
 
+/// Pseudonymization rules deserialized from the YAML config file.
 #[derive(Serialize, Deserialize, Debug)]
-struct Config {
-    // Replace values of nodes with a certain type.
-    replace_values_of_nodes_with_type: Vec<String>,
+pub struct Config {
+    /// Type URIs; nodes that are `rdf:type` one of these get their URI replaced.
+    pub replace_uri_of_nodes_with_type: HashSet<String>,
 
-    // Replace values of `subject` & `predicate`.
-    replace_values_of_subject_predicate: Vec<(String, String)>,
+    /// Maps a subject type URI to the predicates whose values are replaced
+    /// for subjects of that type.
+    pub replace_values_of_subject_predicate: HashMap<String, HashSet<String>>,
 
-    // Replace values in matched `predicates`.
-    replace_value_of_predicate: Vec<String>,
+    /// Predicates whose values are replaced on every node.
+    pub replace_value_of_predicate: HashSet<String>,
 }
diff --git a/tests/data/config.yaml b/tests/data/config.yaml
index 085fa43..6d1d791 100644
--- a/tests/data/config.yaml
+++ b/tests/data/config.yaml
@@ -1,15 +1,15 @@
 # Hash URIs of people and online accounts.
-replace-uris-of-nodes-with-type:
-  "http://xmlns.com/foaf/0.1/Person" # All nodes which are `rdf:type Person`.
-  "http://xmlns.com/foaf/OnlineAccount" # or `rdf::type OnlineAccount`
+replace_uri_of_nodes_with_type:
+  - "http://xmlns.com/foaf/0.1/Person" # All nodes which are `rdf:type Person`.
+  - "http://xmlns.com/foaf/OnlineAccount" # or `rdf:type OnlineAccount`.
 
 # Hash name only for instances of person and online account.
-replace-values-of-predicate-object:
+replace_values_of_subject_predicate:
   "http://xmlns.com/foaf/OnlineAccount":
-    "http://schema.org/name"
+    - "http://schema.org/name"
   "http://xmlns.com/foaf/0.1/Person":
-    "http://schema.org/name"
+    - "http://schema.org/name"
 
 # Hash accesscode values for all nodes.
-replace-values-of-predicate:
-  "http://schema.org/accessCode"
\ No newline at end of file
+replace_value_of_predicate:
+  - "http://schema.org/accessCode"