sdsc-ordes · cmdoret · Oct 22, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/fuzon/src/cache.rs b/fuzon/src/cache.rs
@@ -1,70 +1,73 @@
-use std::fs;
-use std::hash::{DefaultHasher, Hash, Hasher};
-use std::path::PathBuf;
+use std::{
+    fs,
+    hash::{DefaultHasher, Hash, Hasher},
+    path::PathBuf,
+};
 
 use anyhow::Result;
-use reqwest::blocking::Client;
-use reqwest::Url;
-
+use reqwest::{blocking::Client, Url};
 
 /// Requests headers with redirection to create a stamp for the URL
 /// consisting of the last modified date and/or ETag.
 pub fn get_url_stamp(url: &str) -> Result<String> {
     let client = Client::new();
-    let response = client.head(url).send().unwrap();
+    let response = client.head(url).send()?;
     let headers = response.headers();
-    let etag = headers
-        .get("ETag")
-        .map_or("", |v| v.to_str().unwrap());
+    let etag = headers.get("ETag").map_or("", |v| v.to_str().unwrap());
     let last_modified = headers
         .get("Last-Modified")
         .map_or("", |v| v.to_str().unwrap());
+
     return Ok(format!("{}-{}-{}", url, etag, last_modified));
 }
 
 /// Crafts a file metadata to create a stamp consisting of the file path,
 /// size and last modified date.
 pub fn get_file_stamp(path: &str) -> Result<String> {
-    let metadata = fs::metadata(path).unwrap();
+    let metadata = fs::metadata(path)?;
     let size = metadata.len();
-    let modified = metadata.modified().unwrap();
+    let modified = metadata.modified()?;
+
     return Ok(format!("{}-{}-{:?}", path, size, modified));
 }
 
 /// Generate a fixed cache key based on a collection of source paths.
 /// Each path is converted to a stamp in the format "{path}-{fingerprint}-{modified-date}".
 /// Stamps are then concatenated and hash of this concatenation is returned.
-pub fn get_cache_key(sources: &Vec<&str>) -> String {
+pub fn get_cache_key(sources: &Vec<&str>) -> Result<String> {
     let mut paths = sources.clone();
     paths.sort();
-    let concat = paths
-        .into_iter()
-        .map(|s|
-            if let Ok(_) = Url::parse(s) {
-                get_url_stamp(&s).unwrap()
-            } else {
-                get_file_stamp(&s).unwrap()
-            }
-        )
-        .collect::<Vec<String>>()
-        .join(" ");
+
+    // Craft all stamps and concatenate them
+    let mut concat = String::new();
+    for path in paths.into_iter() {
+        if !PathBuf::from(path).exists() && Url::parse(path).is_err() {
+            return Err(anyhow::anyhow!("Invalid path: {}", path));
+        }
+
+        let stamp = if let Ok(_) = Url::parse(path) {
+            get_url_stamp(&path)?
+        } else {
+            get_file_stamp(&path)?
+        };
+        concat.push_str(&stamp);
+    }
+
+    // Hash the concatenated stamps
     let mut state = DefaultHasher::new();
     concat.hash(&mut state);
     let key = state.finish();
 
-    return key.to_string()
+    return Ok(key.to_string());
 }
 
 /// Get the full cross-platform cache path for a collection of source paths.
-pub fn get_cache_path(sources: &Vec<&str>) -> PathBuf {
-
+pub fn get_cache_path(sources: &Vec<&str>) -> Result<PathBuf> {
     let cache_dir = dirs::cache_dir().unwrap().join("fuzon");
-    let cache_key = get_cache_key(
-        &sources
-    );
-
-    return cache_dir.join(&cache_key)
+    let cache_key = get_cache_key(&sources);
+    let cache_path = cache_dir.join(&cache_key?);
 
+    return Ok(cache_path);
 }
 
 #[cfg(test)]
@@ -88,8 +91,8 @@ mod tests {
     #[test]
     fn cache_path() {
         let sources = vec!["Cargo.toml", "https://www.rust-lang.org/"];
-        let path = get_cache_path(&sources);
-        let key = get_cache_key(&sources);
+        let path = get_cache_path(&sources).unwrap();
+        let key = get_cache_key(&sources).unwrap();
         assert!(path.ends_with(key));
     }
 }
diff --git a/fuzon/src/main.rs b/fuzon/src/main.rs
@@ -1,10 +1,9 @@
-use std::fs;
 use fuzon::ui::{interactive, search};
+use std::fs;
 
 use anyhow::Result;
 use clap::Parser;
-use fuzon::TermMatcher;
-use fuzon::cache::get_cache_path;
+use fuzon::{cache::get_cache_path, TermMatcher};
 
 /// fuzzy match terms from ontologies to get their uri
 #[derive(Parser, Debug)]
@@ -29,26 +28,21 @@ struct Args {
 fn main() -> Result<()> {
     let args = Args::parse();
 
-    let sources = args.source
-        .iter()
-        .map(|s| s.as_str())
-        .collect();
+    let sources = args.source.iter().map(|s| s.as_str()).collect();
 
     // Attempt to load from cache
     let matcher: TermMatcher;
     if !args.no_cache {
-        let cache_path = get_cache_path(
-            &sources
-        );
+        let cache_path = get_cache_path(&sources)?;
         let _ = fs::create_dir_all(cache_path.parent().unwrap());
         // Cache hit
         matcher = if let Ok(matcher) = TermMatcher::load(&cache_path) {
-           matcher
+            matcher
         // Cache miss
         } else {
-            let matcher =TermMatcher::from_paths(sources)?;
+            let matcher = TermMatcher::from_paths(sources)?;
             matcher.dump(&cache_path)?;
-            matcher 
+            matcher
         };
     } else {
         matcher = TermMatcher::from_paths(sources)?;
@@ -66,8 +60,6 @@ fn main() -> Result<()> {
     }
 }
 
-
-
 #[cfg(test)]
 mod tests {
     use super::*;

diff --git a/pyfuzon/Cargo.toml b/pyfuzon/Cargo.toml
@@ -15,6 +15,6 @@ fuzon = { version = "0.2.2", path = "../fuzon" }
 lazy_static = "1.5.0"
 oxrdf = "0.1.7"
 oxttl = "0.1.0-rc.1"
-pyo3 = { version = "0.22.2", features = ["abi3-py310"] }
+pyo3 = { version = "0.22.2", features = ["abi3-py310", "anyhow"] }
 ratatui = "0.28.1"
 rff = "0.3.0"
diff --git a/pyfuzon/python/pyfuzon/__init__.py b/pyfuzon/python/pyfuzon/__init__.py
@@ -5,3 +5,4 @@
     __all__ = pyfuzon.__all__
 
 from .matcher import TermMatcher
+from .cache import get_cache_key, get_cache_path
diff --git a/pyfuzon/python/pyfuzon/cache.py b/pyfuzon/python/pyfuzon/cache.py
@@ -0,0 +1,28 @@
+"""Caching utilities for pyfuzon.
+
+This module provides functions to help manage the cache of TermMatchers.
+Cache keys are built by fuzon using the sorted source paths. For each path,
+a stamp is computed as follows (missing values are replaced by empty strings):
+    + file path: {path}-{size}-{last-modified-datetime}
+    + url: {url}-{etag-checksum}-{last-modified-datetime}
+All stamps are then concatenated and the hash of the result is used as the cache key.
+
+Cache paths adhere to the following specifications:
+    + Linux: XDG base / user directory
+    + Windows: Known folder API
+    + macOS: Standard Directories guidelines
+For more information, see: https://github.com/dirs-dev/dirs-rs
+"""
+
+from pathlib import Path
+
+from .pyfuzon import get_cache_key as _get_cache_key
+from .pyfuzon import get_cache_path as _get_cache_path
+
+def get_cache_key(sources: list[str]) -> str:
+    """Return a deterministic cache key based on a collection of source paths."""
+    return _get_cache_key(sources)
+
+def get_cache_path(sources: list[str]) -> Path:
+    """Return a full platform-specific cache path based on a collection of source paths."""
+    return Path(_get_cache_path(sources))
diff --git a/pyfuzon/python/pyfuzon/matcher.py b/pyfuzon/python/pyfuzon/matcher.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import Self
+
 from dataclasses import dataclass
 
 from pyfuzon import Term, score_terms, parse_files, load_terms, dump_terms
@@ -10,7 +11,7 @@ class TermMatcher:
     """Fuzzy matches terms from RDF terminologies to input queries."""
 
     terms: list[Term]
-    
+
     def top(self, query: str, n: int=5) -> list[Term]:
         """Return the n terms most similar to input query."""
         return self.rank(query)[:n]
@@ -19,7 +20,7 @@ def rank(self, query: str) -> list[Term]:
         """Return all terms, ranked by query similarity."""
         scores = self.score(query)
         ranks = [
-            i[0] for i in 
+            i[0] for i in
             sorted(enumerate(scores), key=lambda x:x[1], reverse=True)
         ]
         return [self.terms[rank] for rank in ranks]
@@ -45,4 +46,3 @@ def load(cls, path):
     def dump(self, path):
         """Serialize to disk."""
         dump_terms(self.terms, path)
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,4 @@
		__all__ = pyfuzon.__all__

		from .matcher import TermMatcher
		from .cache import get_cache_key, get_cache_path