Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(py): caching support #29

Merged
merged 15 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

97 changes: 60 additions & 37 deletions fuzon/src/cache.rs
Original file line number Diff line number Diff line change
@@ -1,70 +1,93 @@
use std::fs;
use std::hash::{DefaultHasher, Hash, Hasher};
use std::path::PathBuf;
use std::{
fs,
hash::{DefaultHasher, Hash, Hasher},
path::PathBuf,
};

use anyhow::Result;
use reqwest::blocking::Client;
use reqwest::Url;
use reqwest::{blocking::Client, Url};

use crate::TermMatcher;

/// Requests headers with redirection to create a stamp for the URL
/// consisting of the last modified date and/or ETag.
pub fn get_url_stamp(url: &str) -> Result<String> {
    let client = Client::new();
    // HEAD request (redirects followed by the client's defaults) so we never
    // download the resource body just to fingerprint it.
    let response = client.head(url).send()?;
    let headers = response.headers();

    // Fall back to an empty string when a header is absent *or* carries a
    // non-visible-ASCII value, instead of panicking on exotic header bytes.
    let etag = headers
        .get("ETag")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("");
    let last_modified = headers
        .get("Last-Modified")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("");

    Ok(format!("{}-{}-{}", url, etag, last_modified))
}

/// Crafts a file metadata to create a stamp consisting of the file path,
/// size and last modified date.
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
pub fn get_file_stamp(path: &str) -> Result<String> {
let metadata = fs::metadata(path).unwrap();
let metadata = fs::metadata(path)?;
let size = metadata.len();
let modified = metadata.modified().unwrap();
return Ok(format!("{}-{}-{:?}", path, size, modified));
let modified = metadata.modified()?;

Ok(format!("{}-{}-{:?}", path, size, modified))
}

/// Generate a fixed cache key based on a collection of source paths.
/// Each path is converted to a stamp in the format "{path}-{fingerprint}-{modified-date}".
/// Stamps are then concatenated and hash of this concatenation is returned.
pub fn get_cache_key(sources: &Vec<&str>) -> String {
let mut paths = sources.clone();
pub fn get_cache_key(paths: &mut Vec<&str>) -> Result<String> {
paths.sort();
let concat = paths
.into_iter()
.map(|s|
if let Ok(_) = Url::parse(s) {
get_url_stamp(&s).unwrap()
} else {
get_file_stamp(&s).unwrap()
}
)
.collect::<Vec<String>>()
.join(" ");

// Craft all stamps and concatenate them into the hasher
let mut state = DefaultHasher::new();
concat.hash(&mut state);
for path in paths.iter() {
let stamp = if let Ok(_) = Url::parse(path) {
get_url_stamp(&path)?
} else if PathBuf::from(path).exists() {
get_file_stamp(&path)?
} else {
return Err(anyhow::anyhow!("Invalid path: {}", path));
};
stamp.hash(&mut state);
}

// Hash the concatenated stamps
let key = state.finish();

return key.to_string()
Ok(key.to_string())
}

/// Get the full cross-platform cache path for a collection of source paths.
pub fn get_cache_path(sources: &Vec<&str>) -> PathBuf {

pub fn get_cache_path(sources: &mut Vec<&str>) -> Result<PathBuf> {
let cache_dir = dirs::cache_dir().unwrap().join("fuzon");
let cache_key = get_cache_key(
&sources
);
let cache_key = get_cache_key(sources)?;
let cache_path = cache_dir.join(&cache_key);

return cache_dir.join(&cache_key)
Ok(cache_path)
}

/// Write one TermMatcher cache file per source, so entries can later be
/// loaded and combined independently.
pub fn cache_by_source(sources: Vec<&str>) -> Result<()> {
    sources.into_iter().try_for_each(|source| {
        // Build the matcher first, then resolve where its cache entry lives.
        let matcher = TermMatcher::from_paths(vec![source])?;
        let target = get_cache_path(&mut vec![source])?;
        matcher.dump(&target)?;
        Ok(())
    })
}

/// Read every per-source cache entry and merge them into one TermMatcher.
pub fn load_by_source(sources: Vec<&str>) -> Result<TermMatcher> {
    // Fold each loaded entry into an initially empty matcher via `Add`.
    sources
        .into_iter()
        .try_fold(TermMatcher { terms: Vec::new() }, |combined, source| {
            let entry_path = get_cache_path(&mut vec![source])?;
            let entry = TermMatcher::load(&entry_path)?;
            Ok(combined + entry)
        })
}

#[cfg(test)]
Expand All @@ -87,9 +110,9 @@ mod tests {

#[test]
fn cache_path() {
    // The resolved cache path must end in the key computed from the same
    // (order-insensitive) set of sources.
    let mut sources = vec!["Cargo.toml", "https://www.rust-lang.org/"];
    let path = get_cache_path(&mut sources.clone()).unwrap();
    let key = get_cache_key(&mut sources).unwrap();
    assert!(path.ends_with(key));
}
}
46 changes: 33 additions & 13 deletions fuzon/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
use core::fmt;
use std::collections::HashSet;
use std::fs::File;
use std::path::Path;
use std::io::{BufRead, BufReader};
use std::{
collections::HashSet,
fs::File,
hash::Hash,
io::{BufRead, BufReader},
ops::Add,
path::Path,
};

use anyhow::Result;
use lazy_static::lazy_static;
use oxrdfio::{RdfFormat, RdfParser};
use postcard;
use reqwest::blocking::Client;
use reqwest::Url;
use reqwest::{blocking::Client, Url};
use serde::{Deserialize, Serialize};

use rff;


pub mod ui;
pub mod cache;
pub mod ui;

// HashMap of common annotation properties
lazy_static! {
Expand All @@ -42,6 +44,23 @@ pub struct TermMatcher {
pub terms: Vec<Term>,
}

impl Add for TermMatcher {
    type Output = Self;

    /// Merge two matchers, keeping each distinct term once.
    fn add(self, rhs: Self) -> Self::Output {
        // Deduplicate through a set, then hand the survivors back as a Vec.
        // NOTE: set iteration order is unspecified, as in the chained version.
        let mut unique: HashSet<Term> = self.terms.into_iter().collect();
        unique.extend(rhs.terms);

        TermMatcher {
            terms: unique.into_iter().collect(),
        }
    }
}

impl TermMatcher {
pub fn new() -> Self {
TermMatcher { terms: Vec::new() }
Expand All @@ -61,18 +80,21 @@ impl TermMatcher {
}
pub fn from_readers(readers: Vec<(impl BufRead, RdfFormat)>) -> Self {
let terms = gather_terms(readers).collect();

TermMatcher { terms }
}

/// Build a matcher from a list of source paths/URLs.
///
/// Propagates a failure to open any source as an error instead of
/// panicking inside the map closure, matching the `Result` return type.
pub fn from_paths(paths: Vec<&str>) -> Result<Self> {
    let readers = paths
        .into_iter()
        .map(|p| get_source(p))
        .collect::<Result<Vec<_>, _>>()?;
    let terms: Vec<Term> = gather_terms(readers).collect();

    Ok(TermMatcher { terms })
}

/// Deserialize a matcher previously written to disk with `dump`.
pub fn load(path: &Path) -> Result<Self> {
    let raw = std::fs::read(path)?;
    let matcher = postcard::from_bytes(&raw)?;

    Ok(matcher)
}

Expand All @@ -83,8 +105,7 @@ impl TermMatcher {
}
}


#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, Eq, Hash, PartialEq)]
pub struct Term {
pub uri: String,
pub label: String,
Expand Down Expand Up @@ -134,10 +155,9 @@ pub fn rank_terms<'a>(query: &str, terms: Vec<&'a Term>) -> Vec<(&'a Term, f64)>
.collect();
ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

return ranked;
ranked
}


// Load URI-label pairs from all sources.
pub fn gather_terms(readers: Vec<(impl BufRead, RdfFormat)>) -> impl Iterator<Item = Term> {
// NOTE: May want to use bulk loader for better performances
Expand All @@ -154,10 +174,10 @@ pub fn gather_terms(readers: Vec<(impl BufRead, RdfFormat)>) -> impl Iterator<It
.collect();
terms.append(&mut out);
}

terms.into_iter()
}


#[cfg(test)]
mod tests {
use super::*;
Expand Down
32 changes: 10 additions & 22 deletions fuzon/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
use std::fs;
use fuzon::ui::{interactive, search};
use std::fs;

use anyhow::Result;
use clap::Parser;
use fuzon::TermMatcher;
use fuzon::cache::get_cache_path;
use fuzon::{cache::get_cache_path, TermMatcher};

/// fuzzy match terms from ontologies to get their uri
#[derive(Parser, Debug)]
Expand All @@ -29,26 +28,21 @@ struct Args {
fn main() -> Result<()> {
let args = Args::parse();

let sources = args.source
.iter()
.map(|s| s.as_str())
.collect();
let mut sources = args.source.iter().map(|s| s.as_str()).collect();

// Attempt to load from cache
let matcher: TermMatcher;
if !args.no_cache {
let cache_path = get_cache_path(
&sources
);
let cache_path = get_cache_path(&mut sources)?;
let _ = fs::create_dir_all(cache_path.parent().unwrap());
// Cache hit
matcher = if let Ok(matcher) = TermMatcher::load(&cache_path) {
matcher
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
matcher
// Cache miss
} else {
let matcher =TermMatcher::from_paths(sources)?;
let matcher = TermMatcher::from_paths(sources)?;
matcher.dump(&cache_path)?;
matcher
matcher
};
} else {
matcher = TermMatcher::from_paths(sources)?;
Expand All @@ -59,18 +53,12 @@ fn main() -> Result<()> {
for (term, score) in search(&matcher, &query, args.top) {
println!("[{}] {}", score, term)
}
return Ok(());
Ok(())
// Or interactively trigger search on keystrokes
} else {
return interactive(&matcher, args.top);
interactive(&matcher, args.top)
}
}



#[cfg(test)]
mod tests {
use super::*;

fn match_urls() {}
}
mod tests {}
Loading
Loading