Restructure the ringo crate layout and register the ringo-index / ringo-search binaries.

Summary of the changes below:
  * Cargo.toml declares two [[bin]] targets whose entry points currently only print their name.
  * The fingerprint module moves from ringo::ringo::fingerprint to ringo::fingerprint::fingerprint.
  * The index, index_item and search code moves into ringo/ringo/index/ and ringo/ringo/search/.
  * The free function atomic_weight(&Element) becomes the method Element::atomic_weight.
  * Atom and Element derive Debug; unit tests are added for Atom, Bond and Element.
  * The similarity-search test now builds the index for molecules.smi before searching it.

diff --git a/Cargo.toml b/Cargo.toml
index 04895de..3f05f63 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,3 +15,11 @@ bincode = "2.0.0-rc.3"
 fixedbitset = "0.5.7"
 nom = "7.1.3"
 petgraph = "0.6.5"
+
+[[bin]]
+name = "ringo-index"
+path = "src/ringo/ringo/index/main.rs"
+
+[[bin]]
+name = "ringo-search"
+path = "src/ringo/ringo/search/main.rs"
diff --git a/src/bin.rs b/src/bin.rs
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/bin.rs
@@ -0,0 +1 @@
+
diff --git a/src/lib.rs b/src/lib.rs
index d58a822..6424c7f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1 +1,2 @@
+mod bin;
 pub mod ringo;
diff --git a/src/ringo.rs b/src/ringo.rs
index 76b46a1..ee49d68 100644
--- a/src/ringo.rs
+++ b/src/ringo.rs
@@ -1,3 +1,4 @@
+pub mod fingerprint;
 pub mod math;
 pub mod molecule;
 mod ringo;
diff --git a/src/ringo/fingerprint.rs b/src/ringo/fingerprint.rs
new file mode 100644
index 0000000..c2d67fd
--- /dev/null
+++ b/src/ringo/fingerprint.rs
@@ -0,0 +1 @@
+pub mod fingerprint;
diff --git a/src/ringo/ringo/fingerprint.rs b/src/ringo/fingerprint/fingerprint.rs
similarity index 86%
rename from src/ringo/ringo/fingerprint.rs
rename to src/ringo/fingerprint/fingerprint.rs
index c0fd772..dc97a66 100644
--- a/src/ringo/ringo/fingerprint.rs
+++ b/src/ringo/fingerprint/fingerprint.rs
@@ -32,7 +32,7 @@ impl<'de> bincode::BorrowDecode<'de> for Fingerprint {
 
 #[cfg(test)]
 mod tests {
-    use crate::ringo::ringo::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
+    use crate::ringo::fingerprint::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
     use fixedbitset::FixedBitSet;
 
     #[test]
@@ -44,10 +44,9 @@ mod tests {
         let mut buf = vec![0u8; FINGERPRINT_SIZE / 8];
         bincode::encode_into_slice(&fp, buf.as_mut_slice(), bincode::config::standard()).unwrap();
 
-        let decoded: Fingerprint =
-            bincode::decode_from_slice(&buf, bincode::config::standard())
-                .unwrap()
-                .0;
+        let decoded: Fingerprint = bincode::decode_from_slice(&buf, bincode::config::standard())
+            .unwrap()
+            .0;
         assert_eq!(decoded.0.ones().collect::<Vec<_>>(), vec![1, 17]);
     }
 }
diff --git a/src/ringo/molecule/model/atom.rs b/src/ringo/molecule/model/atom.rs
index 6fcde13..521be5d 100644
--- a/src/ringo/molecule/model/atom.rs
+++ b/src/ringo/molecule/model/atom.rs
@@ -1,9 +1,28 @@
 use crate::ringo::molecule::model::element::Element;
 
-#[derive(Hash, Eq, PartialEq)]
+#[derive(Hash, Eq, PartialEq, Debug)]
 pub struct Atom {
     pub element: Element,
     pub isotope: u8,
     pub charge: i8,
     pub hs: u8,
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_atom() {
+        let atom = Atom {
+            element: Element { atomic_number: 6 },
+            isotope: 12,
+            charge: 0,
+            hs: 0,
+        };
+        assert_eq!(atom.element, Element { atomic_number: 6 });
+        assert_eq!(atom.isotope, 12);
+        assert_eq!(atom.charge, 0);
+        assert_eq!(atom.hs, 0);
+    }
+}
diff --git a/src/ringo/molecule/model/bond.rs b/src/ringo/molecule/model/bond.rs
index c37646d..3c2a411 100644
--- a/src/ringo/molecule/model/bond.rs
+++ b/src/ringo/molecule/model/bond.rs
@@ -9,3 +9,16 @@ pub enum BondOrder {
 pub struct Bond {
     pub order: BondOrder,
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bond_order() {
+        let bond = Bond {
+            order: BondOrder::Single,
+        };
+        assert_eq!(bond.order, BondOrder::Single);
+    }
+}
diff --git a/src/ringo/molecule/model/element.rs b/src/ringo/molecule/model/element.rs
index 0fc5eea..2607977 100644
--- a/src/ringo/molecule/model/element.rs
+++ b/src/ringo/molecule/model/element.rs
@@ -1,93 +1,106 @@
-#[derive(Hash, Eq, PartialEq)]
+#[derive(Hash, Eq, PartialEq, Debug)]
 pub struct Element {
     pub atomic_number: u8,
 }
 
-pub fn atomic_weight(element: &Element) -> f64 {
-    match element.atomic_number {
-        1 => 1.007825,
-        2 => 4.002603,
-        3 => 6.938,
-        4 => 9.0121831,
-        5 => 10.806,
-        6 => 12.011,
-        7 => 14.007,
-        8 => 15.999,
-        9 => 18.998403163,
-        10 => 20.180,
-        11 => 22.990,
-        12 => 24.305,
-        13 => 26.9815385,
-        14 => 28.085,
-        15 => 30.973761998,
-        16 => 32.06,
-        17 => 35.45,
-        18 => 39.948,
-        19 => 39.0983,
-        20 => 40.078,
-        21 => 44.955908,
-        22 => 47.867,
-        23 => 50.9415,
-        24 => 51.9961,
-        25 => 54.938044,
-        26 => 55.845,
-        27 => 58.933194,
-        28 => 58.6934,
-        29 => 63.546,
-        30 => 65.38,
-        31 => 69.723,
-        32 => 72.630,
-        33 => 74.921595,
-        34 => 78.96,
-        35 => 79.904,
-        36 => 83.798,
-        37 => 85.4678,
-        38 => 87.62,
-        39 => 88.90584,
-        40 => 91.224,
-        41 => 92.90637,
-        42 => 95.95,
-        43 => 98.0,
-        44 => 101.07,
-        45 => 102.90550,
-        46 => 106.42,
-        47 => 107.8682,
-        48 => 112.414,
-        49 => 114.818,
-        50 => 118.710,
-        51 => 121.760,
-        52 => 127.60,
-        53 => 126.90447,
-        54 => 131.293,
-        55 => 132.90545196,
-        56 => 137.327,
-        57 => 138.90547,
-        58 => 140.116,
-        59 => 140.90766,
-        60 => 144.242,
-        61 => 145.0,
-        62 => 150.36,
-        63 => 151.964,
-        64 => 157.25,
-        65 => 158.92535,
-        66 => 162.500,
-        67 => 164.93033,
-        68 => 167.259,
-        69 => 168.93422,
-        70 => 173.054,
-        71 => 174.9668,
-        72 => 178.49,
-        73 => 180.94788,
-        74 => 183.84,
-        75 => 186.207,
-        76 => 190.23,
-        77 => 192.217,
-        78 => 195.084,
-        79 => 196.966569,
-        80 => 200.592,
-        81 => 204.38,
-        82 => 207.2,
-        83 => 208.98040,
-        _ => panic!("Unsupported atomic number {}", element.atomic_number),
+impl Element {
+    pub fn atomic_weight(&self) -> f64 {
+        match self.atomic_number {
+            1 => 1.007825,
+            2 => 4.002603,
+            3 => 6.938,
+            4 => 9.0121831,
+            5 => 10.806,
+            6 => 12.011,
+            7 => 14.007,
+            8 => 15.999,
+            9 => 18.998403163,
+            10 => 20.180,
+            11 => 22.990,
+            12 => 24.305,
+            13 => 26.9815385,
+            14 => 28.085,
+            15 => 30.973761998,
+            16 => 32.06,
+            17 => 35.45,
+            18 => 39.948,
+            19 => 39.0983,
+            20 => 40.078,
+            21 => 44.955908,
+            22 => 47.867,
+            23 => 50.9415,
+            24 => 51.9961,
+            25 => 54.938044,
+            26 => 55.845,
+            27 => 58.933194,
+            28 => 58.6934,
+            29 => 63.546,
+            30 => 65.38,
+            31 => 69.723,
+            32 => 72.630,
+            33 => 74.921595,
+            34 => 78.96,
+            35 => 79.904,
+            36 => 83.798,
+            37 => 85.4678,
+            38 => 87.62,
+            39 => 88.90584,
+            40 => 91.224,
+            41 => 92.90637,
+            42 => 95.95,
+            43 => 98.0,
+            44 => 101.07,
+            45 => 102.90550,
+            46 => 106.42,
+            47 => 107.8682,
+            48 => 112.414,
+            49 => 114.818,
+            50 => 118.710,
+            51 => 121.760,
+            52 => 127.60,
+            53 => 126.90447,
+            54 => 131.293,
+            55 => 132.90545196,
+            56 => 137.327,
+            57 => 138.90547,
+            58 => 140.116,
+            59 => 140.90766,
+            60 => 144.242,
+            61 => 145.0,
+            62 => 150.36,
+            63 => 151.964,
+            64 => 157.25,
+            65 => 158.92535,
+            66 => 162.500,
+            67 => 164.93033,
+            68 => 167.259,
+            69 => 168.93422,
+            70 => 173.054,
+            71 => 174.9668,
+            72 => 178.49,
+            73 => 180.94788,
+            74 => 183.84,
+            75 => 186.207,
+            76 => 190.23,
+            77 => 192.217,
+            78 => 195.084,
+            79 => 196.966569,
+            80 => 200.592,
+            81 => 204.38,
+            82 => 207.2,
+            83 => 208.98040,
+            _ => panic!("Unsupported atomic number {}", self.atomic_number),
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::ringo::molecule::model::element::Element;
+
+    #[test]
+    fn test_element() {
+        let element = Element { atomic_number: 1 };
+        assert_eq!(element.atomic_weight(), 1.007825);
     }
 }
diff --git a/src/ringo/molecule/model/molecule.rs b/src/ringo/molecule/model/molecule.rs
index 2ef01df..354d122 100644
--- a/src/ringo/molecule/model/molecule.rs
+++ b/src/ringo/molecule/model/molecule.rs
@@ -1,9 +1,8 @@
+use crate::ringo::fingerprint::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
 use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
 use crate::ringo::molecule::model::atom::Atom;
 use crate::ringo::molecule::model::bond::Bond;
-use crate::ringo::molecule::model::element::atomic_weight;
 use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
-use crate::ringo::ringo::fingerprint::{Fingerprint, FINGERPRINT_SIZE};
 use fixedbitset::FixedBitSet;
 use petgraph::stable_graph::{EdgeIndex, NodeIndex, StableGraph};
 use petgraph::visit::EdgeRef;
@@ -82,7 +81,7 @@ impl Molecule {
     pub fn weight(&self) -> f64 {
         let mut weight: f64 = 0.0;
         for atom in self.graph.node_weights() {
-            weight += atomic_weight(atom.element.borrow())
+            weight += atom.element.atomic_weight();
         }
         weight
     }
diff --git a/src/ringo/ringo.rs b/src/ringo/ringo.rs
index df87b4d..34de02b 100644
--- a/src/ringo/ringo.rs
+++ b/src/ringo/ringo.rs
@@ -1,4 +1,2 @@
-pub(crate) mod fingerprint;
 mod index;
-mod index_item;
 mod search;
diff --git a/src/ringo/ringo/index.rs b/src/ringo/ringo/index.rs
index a06ccc5..3612268 100644
--- a/src/ringo/ringo/index.rs
+++ b/src/ringo/ringo/index.rs
@@ -1,41 +1,3 @@
-use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
-use crate::ringo::ringo::index_item::IndexItem;
-use std::fs::File;
-use std::io::{BufRead, BufWriter, Write};
-use bincode::{encode_into_slice};
-use crate::ringo::ringo::fingerprint::FINGERPRINT_SIZE;
-
-#[cfg(windows)]
-const LINE_ENDING_LENGTH: usize = 2;
-#[cfg(not(windows))]
-const LINE_ENDING_LENGTH: usize = 1;
-
-fn index(smiles_file: &str) {
-    // open file for reading
-    let fi = File::open(smiles_file).expect("Could not open file");
-
-    // open binary file for index
-    let mut offset = 0;
-    let fo = File::create(smiles_file.to_owned() + ".fp");
-    let mut writer = BufWriter::new(fo.unwrap());
-
-    for line in std::io::BufReader::new(fi).lines() {
-        let line = line.unwrap();
-        let molecule = parse_molecule(&line).unwrap().1;
-        let index_item = IndexItem {
-            position: offset,
-            fingerprint: molecule.ecfp(2, 512),
-        };
-        offset += line.len() + LINE_ENDING_LENGTH;
-
-        let mut buf = vec![0u8; FINGERPRINT_SIZE / 8 + 8];
-
-        encode_into_slice(&index_item, buf.as_mut_slice(), bincode::config::standard()).unwrap();
-        writer.write(&buf).unwrap();
-    }
-}
-
-#[test]
-fn test_index() {
-    index("molecules.smi");
-}
+pub(crate) mod index;
+pub mod index_item;
+mod main;
diff --git a/src/ringo/ringo/index/index.rs b/src/ringo/ringo/index/index.rs
new file mode 100644
index 0000000..55e6419
--- /dev/null
+++ b/src/ringo/ringo/index/index.rs
@@ -0,0 +1,46 @@
+use crate::ringo::fingerprint::fingerprint::FINGERPRINT_SIZE;
+use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
+use crate::ringo::ringo::index::index_item::IndexItem;
+use bincode::encode_into_slice;
+use std::fs::File;
+use std::io::{BufRead, BufWriter, Write};
+
+#[cfg(windows)]
+const LINE_ENDING_LENGTH: usize = 2;
+#[cfg(not(windows))]
+const LINE_ENDING_LENGTH: usize = 1;
+
+pub(crate) fn index(smiles_file: &str) {
+    // open file for reading
+    let fi = File::open(smiles_file).expect("Could not open file");
+
+    // open binary file for index
+    let mut offset = 0;
+    let fo = File::create(smiles_file.to_owned() + ".fp");
+    let mut writer = BufWriter::new(fo.unwrap());
+
+    for line in std::io::BufReader::new(fi).lines() {
+        let line = line.unwrap();
+        let molecule = parse_molecule(&line).unwrap().1;
+        let index_item = IndexItem {
+            position: offset,
+            fingerprint: molecule.ecfp(2, 512),
+        };
+        offset += line.len() + LINE_ENDING_LENGTH;
+
+        let mut buf = vec![0u8; FINGERPRINT_SIZE / 8 + 8];
+
+        encode_into_slice(&index_item, buf.as_mut_slice(), bincode::config::standard()).unwrap();
+        writer.write_all(&buf).unwrap();
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::ringo::ringo::index::index::index;
+
+    #[test]
+    fn test_index() {
+        index("molecules.smi");
+    }
+}
diff --git a/src/ringo/ringo/index_item.rs b/src/ringo/ringo/index/index_item.rs
similarity index 83%
rename from src/ringo/ringo/index_item.rs
rename to src/ringo/ringo/index/index_item.rs
index ad33538..74ae080 100644
--- a/src/ringo/ringo/index_item.rs
+++ b/src/ringo/ringo/index/index_item.rs
@@ -1,4 +1,4 @@
-use crate::ringo::ringo::fingerprint::Fingerprint;
+use crate::ringo::fingerprint::fingerprint::Fingerprint;
 use bincode::{Decode, Encode};
 
 #[derive(Debug, Encode, Decode)]
@@ -9,8 +9,8 @@ pub struct IndexItem {
 
 #[cfg(test)]
 mod tests {
-    use crate::ringo::ringo::fingerprint::Fingerprint;
-    use crate::ringo::ringo::index_item::IndexItem;
+    use crate::ringo::fingerprint::fingerprint::Fingerprint;
+    use crate::ringo::ringo::index::index_item::IndexItem;
     use bincode::config::standard;
     use bincode::{decode_from_slice, encode_to_vec};
     use fixedbitset::FixedBitSet;
diff --git a/src/ringo/ringo/index/main.rs b/src/ringo/ringo/index/main.rs
new file mode 100644
index 0000000..4f841dd
--- /dev/null
+++ b/src/ringo/ringo/index/main.rs
@@ -0,0 +1,3 @@
+fn main() {
+    println!("ringo-index");
+}
diff --git a/src/ringo/ringo/search.rs b/src/ringo/ringo/search.rs
index ddc8b65..c6075b6 100644
--- a/src/ringo/ringo/search.rs
+++ b/src/ringo/ringo/search.rs
@@ -1,71 +1,2 @@
-use std::fs::File;
-use std::io::{BufRead, BufReader, Read, Seek};
-use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
-use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
-use crate::ringo::ringo::index_item::IndexItem;
-
-pub struct SearchResult {
-    pub line: String,
-    pub similarity: f32
-}
-
-pub fn similarity_search(smiles_file: &str, query: &str, min_similarity: f32, limit: usize) -> Vec<SearchResult> {
-    let query = parse_molecule(query).unwrap().1;
-    let query_fp = query.ecfp(2, 512);
-
-    // smiles file
-    let fis = File::open(&smiles_file).expect("Could not open file");
-    let mut reader = BufReader::new(fis);
-
-    //fingerprings file
-    let fif = File::open(smiles_file.to_owned() + ".fp").expect("Could not open file");
-    let file_len = fif.metadata().unwrap().len();
-    let index_item_size = 72u8;
-    let index_count = file_len / index_item_size as u64;
-    let mut buf_reader = BufReader::new(fif);
-
-    let mut results = Vec::new();
-
-    for _ in 0..index_count {
-        // read index item from file
-        let mut buf = vec![0u8; index_item_size as usize];
-        buf_reader.read_exact(&mut buf).unwrap();
-
-        // decode index item
-        let index_item: IndexItem = bincode::decode_from_slice(&buf, bincode::config::standard()).unwrap().0;
-
-        // calculate similarity
-        let similarity = tanimoto_bitset(&index_item.fingerprint.0, &query_fp.0);
-
-        // print similarity if it is greater than min_similarity
-        if similarity >= min_similarity {
-            let position = index_item.position;
-            reader.seek(std::io::SeekFrom::Start(position as u64)).unwrap();
-
-            let mut line = String::new();
-            reader.read_line(&mut line).unwrap();
-            // println!("{i} {similarity} {position} {line}");
-            results.push(SearchResult {
-                line: line,
-                similarity: similarity
-            });
-
-            if results.len() >= limit {
-                break;
-            }
-        }
-    }
-
-    results
-}
-
-#[test]
-fn test_similarity_search() {
-    let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.7, 100);
-    assert_eq!(results.len(), 1);
-    assert!(results[0].line.starts_with("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"));
-    assert_eq!(results[0].similarity, 1.0);
-    let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.5, 100);
-    assert_eq!(results.len(), 2);
-
-}
+mod main;
+pub mod search;
diff --git a/src/ringo/ringo/search/main.rs b/src/ringo/ringo/search/main.rs
new file mode 100644
index 0000000..1c46ffe
--- /dev/null
+++ b/src/ringo/ringo/search/main.rs
@@ -0,0 +1,3 @@
+fn main() {
+    println!("ringo-search");
+}
diff --git a/src/ringo/ringo/search/search.rs b/src/ringo/ringo/search/search.rs
new file mode 100644
index 0000000..2179248
--- /dev/null
+++ b/src/ringo/ringo/search/search.rs
@@ -0,0 +1,86 @@
+use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
+use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;
+use crate::ringo::ringo::index::index_item::IndexItem;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Read, Seek};
+
+pub struct SearchResult {
+    pub line: String,
+    pub similarity: f32,
+}
+
+pub fn similarity_search(
+    smiles_file: &str,
+    query: &str,
+    min_similarity: f32,
+    limit: usize,
+) -> Vec<SearchResult> {
+    let query = parse_molecule(query).unwrap().1;
+    let query_fp = query.ecfp(2, 512);
+
+    // smiles file
+    let fis = File::open(&smiles_file).expect("Could not open file");
+    let mut reader = BufReader::new(fis);
+
+    //fingerprings file
+    let fif = File::open(smiles_file.to_owned() + ".fp").expect("Could not open file");
+    let file_len = fif.metadata().unwrap().len();
+    let index_item_size = 72u8;
+    let index_count = file_len / index_item_size as u64;
+    let mut buf_reader = BufReader::new(fif);
+
+    let mut results = Vec::new();
+
+    for _ in 0..index_count {
+        // read index item from file
+        let mut buf = vec![0u8; index_item_size as usize];
+        buf_reader.read_exact(&mut buf).unwrap();
+
+        // decode index item
+        let index_item: IndexItem = bincode::decode_from_slice(&buf, bincode::config::standard())
+            .unwrap()
+            .0;
+
+        // calculate similarity
+        let similarity = tanimoto_bitset(&index_item.fingerprint.0, &query_fp.0);
+
+        // print similarity if it is greater than min_similarity
+        if similarity >= min_similarity {
+            let position = index_item.position;
+            reader
+                .seek(std::io::SeekFrom::Start(position as u64))
+                .unwrap();
+
+            let mut line = String::new();
+            reader.read_line(&mut line).unwrap();
+            // println!("{i} {similarity} {position} {line}");
+            results.push(SearchResult {
+                line: line,
+                similarity: similarity,
+            });
+
+            if results.len() >= limit {
+                break;
+            }
+        }
+    }
+
+    results
+}
+
+#[cfg(test)]
+mod test {
+    use crate::ringo::ringo::index::index::index;
+    use crate::ringo::ringo::search::search::similarity_search;
+
+    #[test]
+    fn test_similarity_search() {
+        index("molecules.smi");
+        let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.7, 100);
+        assert_eq!(results.len(), 1);
+        assert!(results[0].line.starts_with("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"));
+        assert_eq!(results[0].similarity, 1.0);
+        let results = similarity_search("molecules.smi", "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 0.5, 100);
+        assert_eq!(results.len(), 2);
+    }
+}