-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: oxigraph -> sophia api (#13)
* refactor: rm all oxigraph imports
* refactor(deps): rio+oxigraph -> sophia
* refactor: standalone GraphFormat enum + adapt signatures
* refactor: use sophia 0.8.0 [WIP]
* refactor: complete refactor to sophia
* refactor: use Output enum instead of trait generics
* feat: add benchmark script
* fix(bench): add stdev on viz
* fix(bench): add stdev on viz (bis)
* chore: rm unused deps
* refactor: simplify RdfParser signatures
* doc: module documentation
* tests(io): unit testing
* test(cli): integration tests
* fix(cli): disable output on --no-out
- Loading branch information
Showing
10 changed files
with
694 additions
and
509 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/usr/bin/env bash
# Compare runtime of rdfpipe vs rdfpipe-rs
# hyperfine is the only dependency (besides rdfpipe and rdfpipe-rs)
#
# Usage: <this script> <dataset.nt>
set -euo pipefail

# File path to a (large) ntriples RDF dataset.
# `:?` prints a usage message instead of the bare "unbound variable"
# error that `set -u` would otherwise produce when no argument is given.
DATASET="${1:?usage: $0 <path-to-ntriples-dataset>}"
RDFPIPE_PY="rdfpipe"
RDFPIPE_RS="./target/release/rdfpipe-rs"

# Fail early with a clear message rather than benchmarking a failing `head`.
if [[ ! -r "${DATASET}" ]]; then
    echo "error: cannot read dataset '${DATASET}'" >&2
    exit 1
fi

# hyperfine hands each command string to a shell, so the dataset path must be
# shell-quoted before it is interpolated (paths with spaces, globs, ...).
printf -v DATASET_QUOTED '%q' "${DATASET}"

# Run both implementations with different number of triples
# timings are saved in timings.csv
hyperfine \
    --warmup 1 \
    -L N 1,2,3,4,5,10,15,20,50 \
    -L FMT ttl,xml \
    --export-csv timings.csv \
    "head -n {N}000 ${DATASET_QUOTED} | ${RDFPIPE_PY} -i nt -o {FMT} - > /dev/null" \
    "head -n {N}000 ${DATASET_QUOTED} | ${RDFPIPE_RS} -i nt -o {FMT} - > /dev/null"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Visualization of timings for rdfpipe vs rdfpipe-rs
# tidyverse>=1.1.3 is the only dependency

library(tidyverse)

# timings.csv must provide the columns used below:
# command, mean, stddev, parameter_N and parameter_FMT
timings <- read_csv("timings.csv")

# Keep only the plotted columns under shorter names, then collapse the full
# command line into the name of the tool it runs.
bench <- timings %>%
  select(
    tool = command,
    mean,
    fmt = parameter_FMT,
    stddev,
    thousand_lines = parameter_N
  ) %>%
  mutate(
    tool = if_else(
      str_detect(tool, "rdfpipe-rs"),
      "rdfpipe-rs",
      "rdfpipe",
      missing = "rdfpipe"
    )
  ) %>%
  arrange(thousand_lines, tool)

# Facet titles: output format code -> human readable conversion
fmt_labels <- c(
  "ttl" = "ntriples -> turtle",
  "xml" = "ntriples -> xml"
)

# One panel per output format; the line is the mean runtime (log10) and the
# grey band spans one standard deviation on either side of it.
ggplot(bench, aes(x = thousand_lines, y = log10(mean), color = tool)) +
  geom_ribbon(
    aes(
      y = log10(mean),
      ymin = log10(mean - stddev),
      ymax = log10(mean + stddev)
    ),
    fill = "lightgrey",
    alpha = .5,
    linewidth = 0
  ) +
  geom_line() +
  facet_grid(~fmt, labeller = labeller(fmt = fmt_labels)) +
  coord_fixed(ratio = 10) +
  labs(
    x = "Thousands of lines parsed",
    y = "Log10 time (seconds)"
  ) +
  theme_bw(base_size = 22)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
//! # Conversion logic | ||
//! | ||
//! This module contains the `RdfIO` trait which is used to parse and serialize RDF graphs. | ||
//! Each RDF serialization format should implement this trait. | ||
//! | ||
use crate::io::{Input, Output}; | ||
use sophia::api::prelude::TripleParser; | ||
use sophia::api::serializer::TripleSerializer; | ||
use sophia::api::source::TripleSource; | ||
use sophia::inmem::graph::FastGraph; | ||
|
||
/// The `RdfIO` trait is used to parse and serialize RDF graphs. | ||
pub trait RdfIO<'a, P: TripleParser<Input>, F: TripleSerializer> { | ||
/// Parse an RDF graph from an input source to an in-memory graph. | ||
fn parse(&self, input: Input) -> Result<FastGraph, String> { | ||
let mut graph = FastGraph::new(); | ||
match self.parser().parse(input).add_to_graph(&mut graph) { | ||
Ok(_) => Ok(graph), | ||
Err(_) => Err(String::from("Could not parse graph")), | ||
} | ||
} | ||
|
||
/// Serialize an in-memory RDF graph to an output source. | ||
fn serialize(&self, writer: Output, graph: FastGraph) -> Result<(), String> { | ||
let mut formatter = self.serializer(writer); | ||
match formatter.serialize_graph(&graph) { | ||
Ok(_) => Ok(()), | ||
Err(_) => Err(String::from("Could not serialize graph")), | ||
} | ||
} | ||
|
||
/// Create a new parser for this format. | ||
fn parser(&self) -> P; | ||
|
||
/// Create a new serializer for this format. | ||
fn serializer(&self, writer: Output) -> F; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
//! # Implementation of concrete RDF formats | ||
//! | ||
//! This module implements `RdfIO` trait for each RDF serialization format. | ||
use crate::cli::GraphFormat; | ||
use crate::converter::RdfIO; | ||
use crate::io::{Input, Output}; | ||
use sophia::inmem::graph::FastGraph; | ||
use sophia::turtle::parser::nt::NTriplesParser; | ||
use sophia::turtle::parser::turtle::TurtleParser; | ||
use sophia::turtle::serializer::nt::NtSerializer; | ||
use sophia::turtle::serializer::turtle::TurtleSerializer; | ||
use sophia::xml::parser::RdfXmlParser; | ||
use sophia::xml::serializer::RdfXmlSerializer; | ||
|
||
/// Marker type selecting the N-Triples implementation of `RdfIO`.
#[derive(Debug, Clone, Copy)]
pub(crate) struct NTriples;

/// Marker type selecting the Turtle implementation of `RdfIO`.
#[derive(Debug, Clone, Copy)]
pub(crate) struct Turtle;

/// Marker type selecting the RDF/XML implementation of `RdfIO`.
#[derive(Debug, Clone, Copy)]
pub(crate) struct RdfXml;
|
||
/// The `RdfParser` struct provides a generic interface to parse RDF graphs | ||
/// from different formats. | ||
pub struct RdfParser { | ||
pub graph: FastGraph, | ||
} | ||
|
||
impl RdfParser { | ||
pub fn new(input: Input, format: GraphFormat) -> Result<Self, String> { | ||
let graph = match format { | ||
GraphFormat::NTriples => NTriples.parse(input), | ||
GraphFormat::Turtle => Turtle.parse(input), | ||
GraphFormat::RdfXml => RdfXml.parse(input), | ||
}?; | ||
Ok(RdfParser { graph }) | ||
} | ||
} | ||
|
||
/// The `RdfSerializer` struct provides a generic interface to serialize | ||
/// RDF graphs to different formats. | ||
pub struct RdfSerializer; | ||
|
||
impl RdfSerializer { | ||
pub fn serialize(dest: Output, format: GraphFormat, graph: FastGraph) -> Result<(), String> { | ||
match format { | ||
GraphFormat::NTriples => NTriples.serialize(dest, graph), | ||
GraphFormat::Turtle => Turtle.serialize(dest, graph), | ||
GraphFormat::RdfXml => RdfXml.serialize(dest, graph), | ||
} | ||
} | ||
} | ||
impl<'a> RdfIO<'a, NTriplesParser, NtSerializer<Output>> for NTriples { | ||
fn parser(&self) -> NTriplesParser { | ||
NTriplesParser {} | ||
} | ||
|
||
fn serializer(&self, writer: Output) -> NtSerializer<Output> { | ||
NtSerializer::new(writer) | ||
} | ||
} | ||
|
||
impl<'a> RdfIO<'a, TurtleParser, TurtleSerializer<Output>> for Turtle { | ||
fn parser(&self) -> TurtleParser { | ||
TurtleParser { base: None } | ||
} | ||
|
||
fn serializer(&self, writer: Output) -> TurtleSerializer<Output> { | ||
TurtleSerializer::new(writer) | ||
} | ||
} | ||
|
||
impl<'a> RdfIO<'a, RdfXmlParser, RdfXmlSerializer<Output>> for RdfXml { | ||
fn parser(&self) -> RdfXmlParser { | ||
RdfXmlParser { base: None } | ||
} | ||
|
||
fn serializer(&self, writer: Output) -> RdfXmlSerializer<Output> { | ||
RdfXmlSerializer::new(writer) | ||
} | ||
} |
Oops, something went wrong.