diff --git a/Cargo.lock b/Cargo.lock index 9516fd50..f3c7b020 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -278,6 +278,20 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.10", + "rayon", +] + [[package]] name = "dirs" version = "5.0.1" @@ -374,6 +388,12 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + [[package]] name = "fixedbitset" version = "0.4.2" @@ -511,10 +531,12 @@ dependencies = [ "chrono", "clap", "content_inspector", + "dashmap", "dirs", "dotenv", "duct", "env_logger", + "finl_unicode", "graphql_client", "hipcheck-macros", "indexmap 2.2.6", @@ -545,7 +567,6 @@ dependencies = [ "termcolor", "toml", "unicode-normalization", - "unicode-segmentation", "ureq", "url", "walkdir", @@ -796,7 +817,7 @@ checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" dependencies = [ "instant", "lock_api", - "parking_lot_core", + "parking_lot_core 0.8.6", ] [[package]] @@ -808,11 +829,24 @@ dependencies = [ "cfg-if", "instant", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "winapi", ] +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.2", + "smallvec", + "windows-targets 0.52.5", +] + [[package]] name = "paste" version = "1.0.15" @@ -952,6 +986,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.5.0", +] + [[package]] name = "redox_users" version = "0.4.5" diff --git a/hipcheck/Cargo.toml b/hipcheck/Cargo.toml index 9cd654bd..1dcdfa12 100644 --- a/hipcheck/Cargo.toml +++ b/hipcheck/Cargo.toml @@ -54,7 +54,6 @@ spdx-rs = "0.5.0" termcolor = "1.1.3" toml = "0.8.14" unicode-normalization = "0.1.19" -unicode-segmentation = "1.9.0" ureq = { version = "2.9.7", default-features = false, features = [ "json", "tls", @@ -66,6 +65,8 @@ xml-rs = "0.8.20" tempdir = "0.3.7" rayon = "1.10.0" indexmap = "2.2.6" +dashmap = { version = "5.5.3", features = ["rayon", "inline"] } +finl_unicode = { version = "1.2.0", default-features = false, features = ["grapheme_clusters"] } # Windows-specific dependencies [target.'cfg(windows)'.dependencies] diff --git a/hipcheck/src/cli.rs b/hipcheck/src/cli.rs index e7ebd02d..9fbd938f 100644 --- a/hipcheck/src/cli.rs +++ b/hipcheck/src/cli.rs @@ -396,9 +396,6 @@ pub enum Commands { Unstable(UnstableArgs), } -// If no subcommand matched, default to use of '-t Result { - let grapheme_freqs = GraphemeFreqCalculator::for_diff(&commit_diff.diff, db)?.calculate(); + // Dashmap (fast concurrent hashmap) to store counts for each grapheme. + let grapheme_table: DashMap = DashMap::new(); + + // Use this variable to track the total number of graphemes accross all patches in this commit diff. + let mut total_graphemes: usize = 0; + + // Iterate over the file diffs by reference. + for file_diff in &commit_diff.diff.file_diffs { + // Filter out any that are probably not source files. + if db + .is_likely_source_file(Arc::clone(&file_diff.file_name))? + .not() + { + continue; + } + + // Filter out any that are empty. + if file_diff.patch.is_empty() { + continue; + } + + // Count the number of graphemes in this patch, add it to the tortal, + // and track the number of each grapheme. + total_graphemes += file_diff + .patch + // Iterate in parallel over the lines of the patch. + .par_lines() + // Normalize each line. + // See https://en.wikipedia.org/wiki/Unicode_equivalence. + .map(|line: &str| line.chars().nfc().collect::()) + // Count the graphemes in each normalized line. + // Also update the graphemes table here. + // We'll sum these counts to get the total number of graphemes. + .map(|normalized_line: String| { + // Create an iterator over the graphemes in the line. + Graphemes::new(&normalized_line) + // Update the graphemes table. + .map(|grapheme: &str| { + // Use this if statement to avoid allocating a new string unless needed. + if let Some(mut count) = grapheme_table.get_mut(grapheme) { + *count += 1; + } else { + grapheme_table.insert(grapheme.to_owned(), 1); + } + }) + // get the grapheme count for this normalized line. + .count() + }) + .sum::(); + } + // Transform out table (dashmap) of graphemes and their frequencies into a list to return. + let grapheme_freqs = grapheme_table + // Iterate in parallel for performance. + .into_par_iter() + .map(|(grapheme, count)| GraphemeFreq { + grapheme, + freq: count as f64 / total_graphemes as f64, + }) + .collect(); + + // Return the collected list of graphemes and their frequencies for this commit diff. Ok(CommitGraphemeFreq { commit: Arc::clone(&commit_diff.commit), grapheme_freqs, @@ -228,61 +289,3 @@ fn z_scores(mut commit_entropies: Vec) -> Result, - /// The total number of graphemes in the diff. - grapheme_total: u64, -} - -impl GraphemeFreqCalculator { - /// Initialize the calculator with data from a specific diff, filtering out - /// non-source-file changes. - fn for_diff(diff: &Diff, db: &dyn MetricProvider) -> Result { - let mut cgf = GraphemeFreqCalculator::default(); - - for file_diff in &diff.file_diffs { - if db.is_likely_source_file(Arc::clone(&file_diff.file_name))? - && file_diff.patch.is_empty().not() - { - for line in file_diff.patch.lines() { - cgf.add_line(line); - } - } - } - - Ok(cgf) - } - - /// Used by the constructor to add individual diff line data to the calculator. - #[allow(clippy::suspicious_map)] - fn add_line(&mut self, line: &str) { - let line = line.chars().nfc().collect::(); - - self.grapheme_total += line - .graphemes(true) - .map(|grapheme| { - let grapheme = grapheme.to_owned(); - let entry = self.grapheme_counts.entry(grapheme).or_insert(0); - *entry += 1; - }) - .count() as u64; - } - - /// Calculate the grapheme frequencies based on the data collected in the calculator. - fn calculate(self) -> Vec { - let mut grapheme_freqs = Vec::new(); - - for (grapheme, count) in self.grapheme_counts { - grapheme_freqs.push(GraphemeFreq { - grapheme, - freq: count as f64 / self.grapheme_total as f64, - }); - } - - grapheme_freqs - } -}