From c34c596be59b4840a4817d4c91fa7293cc29daf5 Mon Sep 17 00:00:00 2001 From: n4n5 Date: Wed, 5 Jun 2024 09:44:31 +0200 Subject: [PATCH 1/3] simplify and add some hash --- src/data/regex.json | 51 ++++++++++++++++++++ src/identifier/mod.rs | 109 ++++++++++++++++++++---------------------- 2 files changed, 103 insertions(+), 57 deletions(-) diff --git a/src/data/regex.json b/src/data/regex.json index b1f1472..8087200 100644 --- a/src/data/regex.json +++ b/src/data/regex.json @@ -2822,5 +2822,56 @@ ], "Invalid": [] } + }, + { + "Name": "sha1", + "Regex": "^[0-9a-fA-F]{40}$", + "plural_name": false, + "Description": "sha1 hash", + "Rarity": 1, + "URL": null, + "Tags": [ + "hash" + ], + "Examples": { + "Valid": [ + "5511f894571b58ad5fe231297ed88be4c410c34c" + ], + "Invalid": [] + } + }, + { + "Name": "sha256", + "Regex": "^[0-9a-fA-F]{64}$", + "plural_name": false, + "Description": "sha256 hash", + "Rarity": 1, + "URL": null, + "Tags": [ + "hash" + ], + "Examples": { + "Valid": [ + "7c13517e2bd3f4d4dc4c44be73f56c4a7f15caf3776cd68c33f7d6054f2ee919" + ], + "Invalid": [] + } + }, + { + "Name": "md5", + "Regex": "^[0-9a-fA-F]{32}$", + "plural_name": false, + "Description": "md5 hash", + "Rarity": 1, + "URL": null, + "Tags": [ + "hash" + ], + "Examples": { + "Valid": [ + "c528dcb56c26016a869f89090e1c80eb" + ], + "Invalid": [] + } } ] diff --git a/src/identifier/mod.rs b/src/identifier/mod.rs index c647a99..94bad3e 100644 --- a/src/identifier/mod.rs +++ b/src/identifier/mod.rs @@ -9,6 +9,7 @@ use { pub mod bytes; use once_cell::sync::Lazy; +use rayon::iter::IndexedParallelIterator; use regex::Regex; use serde::Serialize; @@ -125,38 +126,39 @@ impl Identifier { ®EX }; - if self.file_support && is_file(text) { - let strings = read_file_to_strings(text); - - strings - .par_iter() - .map(|text| { - DATA.iter() - .enumerate() - .filter_map(|(i, e)| { - if is_valid_filter(self, e) && regexes[i].is_match(text) { - Some(Match::new(text.to_owned(), e.clone())) - } else { - None - } - }) - .collect::>() - }) - .flatten() - .collect() - } else { - // iter has almost same or sometimes better performance than par_iter for single text! - DATA.iter() - .enumerate() + let check_fn = |data: rayon::slice::Iter, text: &String| { + data.enumerate() .filter_map(|(i, e)| { - if is_valid_filter(self, e) && regexes[i].is_match(text) { + let validity = is_valid_filter(self, e); + if !validity.is_valid() { + // eprintln!("{} is not valid: {:?}", e.name, validity); + None + } else if regexes[i].is_match(text) { + // eprintln!("{:?} matched", regexes[i]); Some(Match::new(text.to_owned(), e.clone())) } else { None } }) .collect::>() + }; + #[cfg(not(target_arch = "wasm32"))] + match (self.file_support, is_file(text)) { + (true, true) => { + let strings = read_file_to_strings(text); + + strings + .par_iter() + .map(|text| check_fn(DATA.par_iter(), text)) + .flatten() + .collect() + } + + _ => check_fn(DATA.par_iter(), &text.to_string()), } + + #[cfg(target_arch = "wasm32")] + check_fn(DATA.to_vec(), &text.to_string()) } /// This returns the first identification. @@ -188,7 +190,7 @@ impl Identifier { for (i, x) in DATA .iter() .enumerate() - .filter(|(_, x)| is_valid_filter(self, x)) + .filter(|(_, x)| is_valid_filter(self, x).is_valid()) { // only consider the regex which compiles! if regexes[i].is_match(text) { @@ -200,32 +202,6 @@ impl Identifier { } } -// Identifier implementation for wasm -#[cfg(target_arch = "wasm32")] -impl Identifier { - // There is no file system on the web, so we are not reading strings from file. - // let the user perform the I/O and read the file, then pass the content of it. - pub fn identify(&self, text: &[String]) -> Vec { - let regexes = if self.boundaryless { - &BOUNDARYLESS_REGEX - } else { - ®EX - }; - - text.iter() - .flat_map(|text| { - DATA.iter().enumerate().filter_map(|(i, e)| { - if is_valid_filter(self, e) && regexes[i].is_match(text) { - Some(Match::new(text.to_owned(), e.clone())) - } else { - None - } - }) - }) - .collect() - } -} - // Output Implementation impl Identifier { /// Convert [`Vec`] to JSON @@ -266,12 +242,31 @@ fn is_file(name: &str) -> bool { } } -fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> bool { +/// Validation filter +#[derive(Debug)] +pub enum ValidationFilter { + DataLowerRarity, + DataHigherRarity, + DataInTags, + DataInExclude, + Good, +} + +impl ValidationFilter { + pub fn is_valid(&self) -> bool { + match self { + ValidationFilter::Good => true, + _ => false, + } + } +} + +pub fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> ValidationFilter { if regex_data.rarity < configs.min_rarity { - return false; + return ValidationFilter::DataLowerRarity; } if regex_data.rarity > configs.max_rarity { - return false; + return ValidationFilter::DataHigherRarity; } if configs @@ -279,17 +274,17 @@ fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> bool { .iter() .any(|y| !regex_data.tags.iter().any(|x| x == y)) { - return false; + return ValidationFilter::DataInTags; } if configs .exclude_tags .iter() .any(|y| regex_data.tags.iter().any(|x| x == y)) { - return false; + return ValidationFilter::DataInExclude; } - true + ValidationFilter::Good } #[cfg(not(target_arch = "wasm32"))] From 9da6f7f6536d8d90e9da898764f143286d6c2b61 Mon Sep 17 00:00:00 2001 From: n4n5 Date: Wed, 5 Jun 2024 09:50:09 +0200 Subject: [PATCH 2/3] add inline --- src/identifier/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/identifier/mod.rs b/src/identifier/mod.rs index 94bad3e..aafbf40 100644 --- a/src/identifier/mod.rs +++ b/src/identifier/mod.rs @@ -253,6 +253,7 @@ pub enum ValidationFilter { } impl ValidationFilter { + #[inline] pub fn is_valid(&self) -> bool { match self { ValidationFilter::Good => true, @@ -261,6 +262,7 @@ impl ValidationFilter { } } +#[inline] pub fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> ValidationFilter { if regex_data.rarity < configs.min_rarity { return ValidationFilter::DataLowerRarity; From 82bf0503e64568dbd44da2a631480da1e261be25 Mon Sep 17 00:00:00 2001 From: n4n5 Date: Wed, 5 Jun 2024 10:29:19 +0200 Subject: [PATCH 3/3] reverse & cleaner --- src/identifier/mod.rs | 76 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/src/identifier/mod.rs b/src/identifier/mod.rs index aafbf40..4cf0c11 100644 --- a/src/identifier/mod.rs +++ b/src/identifier/mod.rs @@ -142,23 +142,17 @@ impl Identifier { }) .collect::>() }; - #[cfg(not(target_arch = "wasm32"))] - match (self.file_support, is_file(text)) { - (true, true) => { - let strings = read_file_to_strings(text); - - strings - .par_iter() - .map(|text| check_fn(DATA.par_iter(), text)) - .flatten() - .collect() - } - - _ => check_fn(DATA.par_iter(), &text.to_string()), + if self.file_support && is_file(text) { + let strings = read_file_to_strings(text); + + strings + .par_iter() + .map(|text| check_fn(DATA.par_iter(), text)) + .flatten() + .collect() + } else { + check_fn(DATA.par_iter(), &text.to_string()) } - - #[cfg(target_arch = "wasm32")] - check_fn(DATA.to_vec(), &text.to_string()) } /// This returns the first identification. @@ -201,6 +195,31 @@ impl Identifier { None } } +// Identifier implementation for wasm +#[cfg(target_arch = "wasm32")] +impl Identifier { + // There is no file system on the web, so we are not reading strings from file. + // let the user perform the I/O and read the file, then pass the content of it. + pub fn identify(&self, text: &[String]) -> Vec { + let regexes = if self.boundaryless { + &BOUNDARYLESS_REGEX + } else { + ®EX + }; + + text.iter() + .flat_map(|text| { + DATA.iter().enumerate().filter_map(|(i, e)| { + if is_valid_filter(self, e).is_valid() && regexes[i].is_match(text) { + Some(Match::new(text.to_owned(), e.clone())) + } else { + None + } + }) + }) + .collect() + } +} // Output Implementation impl Identifier { @@ -245,30 +264,27 @@ fn is_file(name: &str) -> bool { /// Validation filter #[derive(Debug)] pub enum ValidationFilter { - DataLowerRarity, - DataHigherRarity, - DataInTags, - DataInExclude, - Good, + InvalidDataRarityTooLow, + InvalidDataRarityTooHigh, + InvalidDataNotInTags, + InvalidDataInExclude, + Valid, } impl ValidationFilter { #[inline] pub fn is_valid(&self) -> bool { - match self { - ValidationFilter::Good => true, - _ => false, - } + matches!(self, ValidationFilter::Valid) } } #[inline] pub fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> ValidationFilter { if regex_data.rarity < configs.min_rarity { - return ValidationFilter::DataLowerRarity; + return ValidationFilter::InvalidDataRarityTooLow; } if regex_data.rarity > configs.max_rarity { - return ValidationFilter::DataHigherRarity; + return ValidationFilter::InvalidDataRarityTooHigh; } if configs @@ -276,17 +292,17 @@ pub fn is_valid_filter(configs: &Identifier, regex_data: &Data) -> ValidationFil .iter() .any(|y| !regex_data.tags.iter().any(|x| x == y)) { - return ValidationFilter::DataInTags; + return ValidationFilter::InvalidDataNotInTags; } if configs .exclude_tags .iter() .any(|y| regex_data.tags.iter().any(|x| x == y)) { - return ValidationFilter::DataInExclude; + return ValidationFilter::InvalidDataInExclude; } - ValidationFilter::Good + ValidationFilter::Valid } #[cfg(not(target_arch = "wasm32"))]