Skip to content

Commit

Permalink
feat: create and load index (#17)
Browse files Browse the repository at this point in the history
Co-authored-by: Gabriel Nützi <[email protected]>
  • Loading branch information
cmdoret and gabyx authored Jun 24, 2024
1 parent 16ebe3a commit 341de0e
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 49 deletions.
20 changes: 15 additions & 5 deletions src/io.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
use rio_turtle::NTriplesParser;
use std::{
boxed::Box,
fs::File,
io::{BufRead, BufReader},
io::{stdin, stdout, BufRead, BufReader, BufWriter, Write},
path::Path,
};

pub fn get_buffer(path: &Path) -> BufReader<File> {
return match File::open(&path) {
Ok(file) => BufReader::new(file),
Err(e) => panic!("Cannot open file '{path:?}': '{e}'."),
/// Get a buffered reader for `path`, reading either from stdin or from a file.
///
/// The special path `-` selects standard input; any other path is opened as
/// a file for reading.
///
/// # Panics
///
/// Panics if the file cannot be opened. The panic message names the
/// offending path and the underlying I/O error.
pub fn get_reader(path: &Path) -> Box<dyn BufRead> {
    // Compare on the raw OS string instead of `to_str().unwrap()`, so a path
    // that is not valid UTF-8 is still opened rather than causing a panic.
    if path.as_os_str() == "-" {
        return Box::new(BufReader::new(stdin()));
    }

    match File::open(path) {
        Ok(file) => Box::new(BufReader::new(file)),
        Err(e) => panic!("Cannot open file '{path:?}': '{e}'."),
    }
}

/// Get a buffered writer for `path`, writing either to stdout or to a file.
///
/// The special path `-` selects standard output. Any other path is created
/// if it does not exist and truncated if it does.
///
/// # Panics
///
/// Panics if the file cannot be created. The panic message names the
/// offending path and the underlying I/O error.
pub fn get_writer(path: &Path) -> Box<dyn Write> {
    // Compare on the raw OS string instead of `to_str().unwrap()`, so a path
    // that is not valid UTF-8 does not cause a panic.
    if path.as_os_str() == "-" {
        return Box::new(BufWriter::new(stdout()));
    }

    // `File::create` opens the file for writing. `File::open` returns a
    // read-only handle and fails outright when the file does not exist yet.
    match File::create(path) {
        Ok(file) => Box::new(BufWriter::new(file)),
        Err(e) => panic!("Cannot create file '{path:?}': '{e}'."),
    }
}

Expand Down
59 changes: 34 additions & 25 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ mod rules;
// Define the imports.
use crate::{
log::{create_logger, info},
pass_second::encrypt,
pass_first::create_type_map,
pass_second::pseudonymize_graph,
};

use clap::{Args, Parser, Subcommand};
Expand All @@ -19,59 +20,67 @@ use std::path::PathBuf;
#[derive(Parser)]
#[command(name = "rdf-protect")]
#[command(version = "0.0.1")]
#[command(about ="A tool to anonymize nodes/edges in RDF graphs.", long_about = None)]
#[command(about ="A tool to pseudonymize URIs and values in RDF graphs.", long_about = None)]
struct Cli {
#[command(subcommand)]
command: Subcommands,
}

#[derive(Args, Debug)]
struct TypeMapArgs {
#[arg(short, long)]
output_file: PathBuf,
struct IndexArgs {
/// Output file descriptor for the node-to-type index.
/// Defaults to `stdout`.
#[arg(short, long, default_value = "-")]
output: PathBuf,

/// File descriptor to read triples from.
/// Defaults to `stdin`.
#[arg(default_value = "-")]
input: PathBuf,
}

#[derive(Args, Debug)]
struct EncryptArgs {
/// The file which maps `node` ids to `type`s.
/// This is used in `encrypt` as the second pass to encrypt RDF triples.
struct PseudoArgs {
/// Index file produced by prepare-index.
/// Required for pseudonymization.
#[arg(short, long)]
type_map_file: PathBuf,
index: PathBuf,

/// The input file descriptor to use for outputting the RDF triples.
/// File descriptor to read input triples from.
/// Defaults to `stdin`.
#[arg(short, long, default_value = "-")]
#[arg(default_value = "-")]
input: PathBuf,

/// The output file descriptor to use for outputting the RDF triples.
// Defaults to `stdout`.
/// Output file descriptor for pseudonymized triples.
/// Defaults to `stdout`.
#[arg(short, long, default_value = "-")]
output: PathBuf,
}

#[derive(Subcommand, Debug)]
enum Subcommands {
/// 1. Pass: Create the node-to-type mapping.
// This is used in `encrypt` for the second pass to
// encrypt RDF triples based on some rules.
CreateTypeMap(TypeMapArgs),
/// 1. Pass: Create a node-to-type index from input triples.
// This is used in `pseudonymize` for the second pass to
// pseudonymize RDF triples based on a configuration.
Index(IndexArgs),

/// 2. Pass: Encrypt RDF triples read from a file descriptor (default `stdin`)
// This is based on rules and output them again on a file descriptor (default `stdout`)
Encrypt(EncryptArgs),
/// 2. Pass: Pseudonymize input triples.
// A config file defines pseudonymization rules. The deidentified triples are sent to the
// output file descriptor. (default `stdout`)
Pseudo(PseudoArgs),
}

fn main() {
let log = create_logger(true);
let log = create_logger(false);
let cli = Cli::parse();

match cli.command {
Subcommands::CreateTypeMap(args) => {
info!(log, "Args: {:?}", args)
Subcommands::Index(args) => {
info!(log, "Args: {:?}", args);
create_type_map(&args.input, &args.output)
}
Subcommands::Encrypt(args) => {
Subcommands::Pseudo(args) => {
info!(log, "Args: {:?}", args);
encrypt(&log, &args.input, &args.output, &args.type_map_file)
pseudonymize_graph(&log, &args.input, &args.output, &args.index)
}
}
}
Expand Down
26 changes: 26 additions & 0 deletions src/pass_first.rs
Original file line number Diff line number Diff line change
@@ -1 +1,27 @@
use rio_api::{model::Triple, parser::TriplesParser};
use rio_turtle::TurtleError;
use std::{io::Write, path::Path};

use crate::io;

/// Write `t` to `out` as one N-Triples line if it is an `rdf:type` statement.
///
/// Triples with any other predicate are skipped and nothing is written.
///
/// # Errors
///
/// Returns an error if writing to `out` fails.
fn index_triple(t: Triple, out: &mut impl Write) -> Result<(), TurtleError> {
    if t.predicate.iri == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
        // `writeln!` writes the complete line, whereas a single `write` call
        // may stop short. The I/O error is propagated instead of discarded.
        writeln!(out, "{t} .")?;
    }

    Ok(())
}

pub fn create_type_map(input: &Path, output: &Path) {
let buf_in = io::get_reader(input);
let mut buf_out = io::get_writer(output);
let mut triples = io::parse_ntriples(buf_in);
while !triples.is_end() {
triples
.parse_step(&mut |t| index_triple(t, &mut buf_out))
.unwrap();
}
}
55 changes: 36 additions & 19 deletions src/pass_second.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use rio_api::{model::Triple, parser::TriplesParser};
use rio_turtle::TurtleError;
use std::{
io::{BufRead, BufReader},
collections::HashMap,
io::{BufRead, Write},
path::Path,
};

Expand All @@ -11,30 +12,46 @@ use crate::{
model::{pseudonymize_triple, TripleMask},
};

/// Select which parts of `triple` are to be pseudonymized.
///
/// The triple's content is not inspected: `TripleMask::SUBJECT` is returned
/// for every input.
fn mask_triple(triple: &Triple) -> TripleMask {
    // The argument is accepted but not consulted.
    let _ = triple;
    TripleMask::SUBJECT
}

// mask and encode input triple
// NOTE: This will need the type-map to perform masking
fn process_triple(triple: &Triple) -> Result<(), TurtleError> {
let mask = TripleMask::SUBJECT;
println!("{}", pseudonymize_triple(&triple, mask).to_string());
/// Mask and pseudonymize `triple`, then write it to `out` as one N-Triples line.
///
/// # Errors
///
/// Returns an error if writing to `out` fails.
fn process_triple(triple: &Triple, out: &mut impl Write) -> Result<(), TurtleError> {
    let mask = mask_triple(triple);
    let pseudo_triple = pseudonymize_triple(&triple, mask);

    // `write_all` writes the complete line, whereas a single `write` call may
    // stop short. The I/O error is propagated instead of discarded.
    out.write_all(format!("{} .\n", pseudo_triple.to_string()).as_bytes())?;

    Ok(())
}

pub fn encrypt(log: &Logger, input: &Path, output: &Path, type_map_file: &Path) {
// Construct the buffer either from `stdio` or from an input file.
//
// This object is constructed on the stack and is a `trait object`.
// The wide-pointer `buffer` will have a pointer to the vtable
// and pointer to data on the stack.
// Normally that would be done with `Box::new(std::io::stdin())` on the heap, but since the
// newest version in Rust that also works on the stack (life-time extensions).
let buffer: &mut dyn BufRead = match input.to_str().unwrap() {
"-" => &mut BufReader::new(std::io::stdin()),
_ => &mut io::get_buffer(input),
};

let mut triples = io::parse_ntriples(buffer);
/// Build an index mapping each node to its type from an N-Triples buffer.
///
/// Every parsed triple contributes one entry keyed by the string form of its
/// subject, with the string form of its object as the value. A later triple
/// with the same subject overwrites the earlier entry.
fn load_type_map(input: impl BufRead) -> HashMap<String, String> {
    let mut index = HashMap::new();
    let mut parser = io::parse_ntriples(input);

    while !parser.is_end() {
        // The result of each parsing step is intentionally discarded here.
        parser
            .parse_step(&mut |t| -> Result<(), TurtleError> {
                index.insert(t.subject.to_string(), t.object.to_string());
                Ok(())
            })
            .ok();
    }

    index
}

pub fn pseudonymize_graph(log: &Logger, input: &Path, output: &Path, index: &Path) {
let buf_input = io::get_reader(input);
let buf_index = io::get_reader(index);
let mut buf_output = io::get_writer(output);

let node_to_type: HashMap<String, String> = load_type_map(buf_index);
let mut triples = io::parse_ntriples(buf_input);
while !triples.is_end() {
triples.parse_step(&mut |t| process_triple(&t)).unwrap();
triples
.parse_step(&mut |t| process_triple(&t, &mut buf_output))
.unwrap();
}
}
#[cfg(test)]
Expand Down

0 comments on commit 341de0e

Please sign in to comment.