diff --git a/src/ringo/molecule/model/molecule.rs b/src/ringo/molecule/model/molecule.rs index d304917..3ac6f2c 100644 --- a/src/ringo/molecule/model/molecule.rs +++ b/src/ringo/molecule/model/molecule.rs @@ -11,6 +11,7 @@ use std::collections::{BTreeSet}; use std::collections::hash_map::DefaultHasher; use std::fmt::Debug; use std::hash::Hasher; +use crate::ringo::math::similarity::tanimoto::tanimoto_vec; use crate::ringo::molecule::smiles::reader::molecule::parse_molecule; pub struct Molecule { @@ -92,7 +93,7 @@ impl Molecule { let mut fp = BitSet::new(); for node in self.graph.node_indices() { - ecfp_recursive(&self.graph, radius, 1, node, &mut fp, fp_length); + ecfp_recursive(&self.graph, radius, 1, node, &mut fp, fp_length, &mut DefaultHasher::new()); } BitVec::from_fn(fp_length, |idx| fp.contains(idx)) @@ -106,22 +107,21 @@ fn ecfp_recursive( node: NodeIndex, fp: &mut BitSet, fp_length: usize, + hasher: &mut DefaultHasher, ) { - //println!("ecfp_recursive({}, {}, {})", radius, depth, node.index()); if depth > radius { return; } - let mut hasher = DefaultHasher::new(); - let atom = graph.node_weight(node).unwrap(); hasher.write_u8(atom.element.atomic_number); hasher.write_u8(atom.isotope); hasher.write_i8(atom.charge); hasher.write_u8(atom.hs); - fp.insert(hasher.finish() as usize % fp_length); + let value = hasher.clone().finish(); + fp.insert(value as usize % fp_length); for edge in graph.edges(node) { let bond = edge.weight(); @@ -133,15 +133,15 @@ fn ecfp_recursive( edge.source() }; - ecfp_recursive(graph, radius, depth + 1, target, fp, fp_length); + ecfp_recursive(graph, radius, depth + 1, target, fp, fp_length, hasher); } } #[test] fn test_ecfp() { - let ecfp_ibuprofen = parse_molecule("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O").unwrap().1.ecfp(6, 128); - let paracetamol = parse_molecule("CC(=O)NC1=CC=C(C=C1)O").unwrap().1.ecfp(6, 128); - println!("{:?}", ecfp_ibuprofen); - println!("{:?}", paracetamol); + let ecfp_ibuprofen = parse_molecule("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O").unwrap().1.ecfp(2, 128); + let ecfp_naproxen = parse_molecule("CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O").unwrap().1.ecfp(2, 128); + let sim = tanimoto_vec(&ecfp_ibuprofen, &ecfp_naproxen); + assert!(0.53 < sim && sim < 0.54); } diff --git a/src/ringo/molecule/smiles/reader/molecule.rs b/src/ringo/molecule/smiles/reader/molecule.rs index e11efde..55bbe32 100644 --- a/src/ringo/molecule/smiles/reader/molecule.rs +++ b/src/ringo/molecule/smiles/reader/molecule.rs @@ -387,7 +387,7 @@ mod tests { for smiles in ["C", "CC", "C1COCNC1", "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"] { println!("{}: ", smiles); let m = parse_molecule(smiles).unwrap().1; - let result = m.ecfp(2, 128); + let result = m.ecfp(2, 512); for bit in result { print!("{}", if bit { 1 } else { 0 }); } diff --git a/src/ringo/ringo/index.rs b/src/ringo/ringo/index.rs index 5b31e35..c95eab5 100644 --- a/src/ringo/ringo/index.rs +++ b/src/ringo/ringo/index.rs @@ -3,28 +3,22 @@ use std::io::{BufRead}; use crate::ringo::molecule::smiles::reader::molecule::parse_molecule; use crate::ringo::ringo::index_item::IndexItem; -fn index(smiles_file: &str) -> Vec { +fn index(smiles_file: &str) { // open file for reading let fi = File::open(smiles_file).expect("Could not open file"); - let mut result = Vec::new(); // open binary file for index let mut offset = 0; // let mut fo = File::create(smiles_file.to_owned() + ".fp"); for line in std::io::BufReader::new(fi).lines() { let line = line.unwrap(); let molecule = parse_molecule(&line).unwrap().1; - // let ecfp = molecule.ecfp(2, 512); - // write ecfp and offset to binary file - result.push(IndexItem::new(offset, molecule.ecfp(2, 512))); - + IndexItem::new(offset, molecule.ecfp(2, 512)); offset += line.len() + 1; } - return result; } #[test] fn test_index() { - let result = index("molecules.smi"); - assert_eq!(result.len(), 2); + index("molecules.smi"); }