analyze: borrowck: cache results of polonius runs on disk (#1056)
The Polonius stage of borrow checking takes a long time to run on
certain functions, such as lighttpd's `li_MD5Transform`. Worse, we often
run Polonius multiple times on the same function as the interprocedural
analysis iterates to reach a fixpoint. This branch speeds up the
analysis by caching Polonius results on disk.

The caching logic is fairly simple: the core Polonius analysis is
effectively a pure function from input facts to output facts, so we hash
the input facts before each call and check whether a file named after
that hash is present in the cache directory. There's no need to factor
in any details of the crate, MIR, permissions, etc. If the current
Polonius query has the same input facts as a previous query, it will
necessarily produce the same output facts, regardless of how those input
facts were computed.
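
In outline, the lookup is content-addressed: hash the serialized input facts, then use the hash as a file name. A minimal sketch of that flow (not the code from the diff below; `cached_path`, `load_or_compute`, and the `Vec<u8>` stand-in for the real Polonius output type are illustrative):

    use std::fs;
    use std::path::PathBuf;

    const CACHE_DIR: &str = "polonius_cache";

    // Content-addressed lookup: identical input facts always map to the same file.
    fn cached_path(facts_hash: &str) -> PathBuf {
        PathBuf::from(CACHE_DIR).join(format!("{facts_hash}.output"))
    }

    fn load_or_compute(facts_hash: &str, compute: impl FnOnce() -> Vec<u8>) -> Vec<u8> {
        let path = cached_path(facts_hash);
        if let Ok(bytes) = fs::read(&path) {
            return bytes; // cache hit: skip the expensive Polonius run
        }
        let bytes = compute(); // cache miss: run the analysis...
        fs::create_dir_all(CACHE_DIR).ok();
        fs::write(&path, &bytes).ok(); // ...and store the result for next time
        bytes
    }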

Computing the input facts still has a nontrivial cost for some
functions, but this branch provides significant speedups on `algo_md5`
and `lighttpd_rust_amalgamated` once `c2rust-analyze` has run once to
populate the cache.
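
For the hashing itself, the diff below routes Rust's std::hash::Hash machinery into a SHA-256 digest (and, along the way, switches the solver from Algorithm::Naive to Algorithm::DatafrogOpt). A self-contained sketch of that adapter pattern, assuming the sha2 crate's Digest API; `sha256_of` and the `main` demo are illustrative, not part of the commit:

    use sha2::{Digest, Sha256};
    use std::hash::{Hash, Hasher};

    // Adapter that feeds the byte stream produced by `Hash::hash` into a SHA-256 digest.
    struct Sha256Hasher(Sha256);

    impl Hasher for Sha256Hasher {
        fn write(&mut self, bytes: &[u8]) {
            self.0.update(bytes);
        }
        fn finish(&self) -> u64 {
            // Only `write` is ever used; a 64-bit result would truncate the digest.
            panic!("Sha256Hasher doesn't support finish()");
        }
    }

    fn sha256_of<T: Hash>(x: &T) -> [u8; 32] {
        let mut hasher = Sha256Hasher(Sha256::new());
        x.hash(&mut hasher);
        hasher.0.finalize().as_slice().try_into().unwrap()
    }

    fn main() {
        // Equal inputs yield equal cache keys, which is all the caching relies on.
        assert_eq!(sha256_of(&("facts", 1u32)), sha256_of(&("facts", 1u32)));
    }
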
spernsteiner authored Jan 19, 2024 · 2 parents 21907d8 + 09cb658 · commit d818e4d
Showing 4 changed files with 218 additions and 8 deletions.
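
One detail worth knowing before reading the diff: Polonius's Output struct has 17 fields and doesn't implement Serialize, so the commit unpacks it into nested tuples before handing it to bincode (the in-code comments note that the tuple impls only go up to a fixed arity, well below 17). A standalone sketch of the same workaround; the toy `roundtrip` function and its u8 fields are illustrative, not part of the commit:

    // Nesting sub-tuples keeps each individual tuple within the arity serde supports.
    fn roundtrip() -> Result<(), bincode::Error> {
        let raw: ((u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8), (u8,)) =
            ((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), (13,));
        let bytes = bincode::serialize(&raw)?;
        let back: ((u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8), (u8,)) =
            bincode::deserialize(&bytes)?;
        assert_eq!(raw, back);
        Ok(())
    }
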
9 changes: 5 additions & 4 deletions Cargo.lock

(Diff for the generated Cargo.lock file is not rendered.)

1 change: 1 addition & 0 deletions c2rust-analyze/Cargo.toml
@@ -29,6 +29,7 @@ clap = { version = "4.2.7", features = ["derive"] }
 fs-err = "2.9.0"
 anyhow = "1.0.75"
 toml_edit = "0.19.8"
+sha2 = "0.10.8"
 
 [build-dependencies]
 c2rust-build-paths = { path = "../c2rust-build-paths", version = "0.18.0" }
5 changes: 4 additions & 1 deletion c2rust-analyze/src/borrowck/atoms.rs
@@ -1,12 +1,15 @@
 use polonius_engine::{self, Atom, FactTypes};
 use rustc_middle::mir::{BasicBlock, Local, Location, Place, PlaceElem};
 use rustc_middle::ty::TyCtxt;
+use serde::{Deserialize, Serialize};
 use std::collections::hash_map::{Entry, HashMap};
 use std::hash::Hash;
 
 macro_rules! define_atom_type {
     ($Atom:ident) => {
-        #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
+        #[derive(
+            Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Serialize, Deserialize,
+        )]
         pub struct $Atom(usize);
 
         impl From<usize> for $Atom {
211 changes: 208 additions & 3 deletions c2rust-analyze/src/borrowck/mod.rs
@@ -14,9 +14,11 @@ use rustc_middle::ty::{
     TyKind,
 };
 use rustc_type_ir::RegionKind::ReEarlyBound;
+use sha2::{Digest, Sha256};
 use std::collections::HashMap;
-use std::fmt::{Debug, Formatter};
-use std::hash::Hash;
+use std::fmt::{Debug, Formatter, Write as _};
+use std::fs::{self, File};
+use std::hash::{Hash, Hasher};
 
 mod atoms;
 mod def_use;
@@ -358,12 +360,215 @@ fn run_polonius<'tcx>(
 
     dump::dump_facts_to_dir(&facts, &maps, format!("inspect/{}", name)).unwrap();
 
-    let output = polonius_engine::Output::compute(&facts, polonius_engine::Algorithm::Naive, true);
+    eprintln!("running polonius analysis on {name}");
+    let facts_hash = bytes_to_hex_string(&hash_facts(&facts));
+    let output = match try_load_cached_output(&facts_hash) {
+        Some(output) => output,
+        None => {
+            let output = polonius_engine::Output::compute(
+                &facts,
+                polonius_engine::Algorithm::DatafrogOpt,
+                true,
+            );
+            save_cached_output(&facts_hash, &output).unwrap();
+            output
+        }
+    };
     dump::dump_output_to_dir(&output, &maps, format!("inspect/{}", name)).unwrap();
 
     (facts, maps, output)
 }
+
+fn try_load_cached_output(facts_hash: &str) -> Option<Output> {
+    let path = format!("polonius_cache/{}.output", facts_hash);
+
+    let f = File::open(&path).ok()?;
+    let raw = match bincode::deserialize_from(f) {
+        Ok(x) => x,
+        Err(e) => {
+            log::warn!("failed to parse polonius cache file {path:?}: {e}");
+            return None;
+        }
+    };
+    // The Polonius `Output` type doesn't implement `Serialize`. Rather than define a local
+    // wrapper or proxy type and implement `Serialize` on that, we just unpack the struct into a
+    // tuple and serialize that instead. However, tuples only implement `Deserialize` up to
+    // length 12, so we have to split up this 17-element tuple into several pieces.
+    let (
+        (
+            errors,
+            subset_errors,
+            move_errors,
+            dump_enabled,
+            loan_live_at,
+            origin_contains_loan_at,
+            origin_contains_loan_anywhere,
+            origin_live_on_entry,
+            loan_invalidated_at,
+            subset,
+            subset_anywhere,
+            var_live_on_entry,
+        ),
+        (
+            var_drop_live_on_entry,
+            path_maybe_initialized_on_exit,
+            path_maybe_uninitialized_on_exit,
+            known_contains,
+            var_maybe_partly_initialized_on_exit,
+        ),
+    ) = raw;
+
+    eprintln!("loaded cached facts from {}", path);
+
+    Some(Output {
+        errors,
+        subset_errors,
+        move_errors,
+        dump_enabled,
+        loan_live_at,
+        origin_contains_loan_at,
+        origin_contains_loan_anywhere,
+        origin_live_on_entry,
+        loan_invalidated_at,
+        subset,
+        subset_anywhere,
+        var_live_on_entry,
+        var_drop_live_on_entry,
+        path_maybe_initialized_on_exit,
+        path_maybe_uninitialized_on_exit,
+        known_contains,
+        var_maybe_partly_initialized_on_exit,
+    })
+}
+
+fn save_cached_output(facts_hash: &str, output: &Output) -> Result<(), bincode::Error> {
+    fs::create_dir_all("polonius_cache")?;
+    let path = format!("polonius_cache/{}.output", facts_hash);
+
+    let Output {
+        ref errors,
+        ref subset_errors,
+        ref move_errors,
+        ref dump_enabled,
+        ref loan_live_at,
+        ref origin_contains_loan_at,
+        ref origin_contains_loan_anywhere,
+        ref origin_live_on_entry,
+        ref loan_invalidated_at,
+        ref subset,
+        ref subset_anywhere,
+        ref var_live_on_entry,
+        ref var_drop_live_on_entry,
+        ref path_maybe_initialized_on_exit,
+        ref path_maybe_uninitialized_on_exit,
+        ref known_contains,
+        ref var_maybe_partly_initialized_on_exit,
+    } = *output;
+
+    // Split the tuple into several pieces, as described in `try_load_cached_output`. The tuple
+    // format used here must match the one in `try_load_cached_output`.
+    let raw = (
+        (
+            errors,
+            subset_errors,
+            move_errors,
+            dump_enabled,
+            loan_live_at,
+            origin_contains_loan_at,
+            origin_contains_loan_anywhere,
+            origin_live_on_entry,
+            loan_invalidated_at,
+            subset,
+            subset_anywhere,
+            var_live_on_entry,
+        ),
+        (
+            var_drop_live_on_entry,
+            path_maybe_initialized_on_exit,
+            path_maybe_uninitialized_on_exit,
+            known_contains,
+            var_maybe_partly_initialized_on_exit,
+        ),
+    );
+
+    let f = File::create(path)?;
+    bincode::serialize_into(f, &raw)
+}
+
+fn bytes_to_hex_string(b: &[u8]) -> String {
+    let mut s = String::with_capacity(b.len() * 2);
+    for &x in b {
+        write!(s, "{:02x}", x).unwrap();
+    }
+    s
+}
+
+fn hash_facts(facts: &AllFacts) -> [u8; 32] {
+    let AllFacts {
+        ref loan_issued_at,
+        ref universal_region,
+        ref cfg_edge,
+        ref loan_killed_at,
+        ref subset_base,
+        ref loan_invalidated_at,
+        ref var_used_at,
+        ref var_defined_at,
+        ref var_dropped_at,
+        ref use_of_var_derefs_origin,
+        ref drop_of_var_derefs_origin,
+        ref child_path,
+        ref path_is_var,
+        ref path_assigned_at_base,
+        ref path_moved_at_base,
+        ref path_accessed_at_base,
+        ref known_placeholder_subset,
+        ref placeholder,
+    } = *facts;
+
+    // Only tuples up to size 12 implement `Hash`, so we break up this list into nested tuples.
+    sha256_hash(&(
+        (
+            loan_issued_at,
+            universal_region,
+            cfg_edge,
+            loan_killed_at,
+            subset_base,
+            loan_invalidated_at,
+            var_used_at,
+            var_defined_at,
+            var_dropped_at,
+            use_of_var_derefs_origin,
+            drop_of_var_derefs_origin,
+            child_path,
+        ),
+        (
+            path_is_var,
+            path_assigned_at_base,
+            path_moved_at_base,
+            path_accessed_at_base,
+            known_placeholder_subset,
+            placeholder,
+        ),
+    ))
+}
+
+fn sha256_hash<T: Hash>(x: &T) -> [u8; 32] {
+    struct Sha256Hasher(Sha256);
+    impl Hasher for Sha256Hasher {
+        fn write(&mut self, bytes: &[u8]) {
+            self.0.update(bytes);
+        }
+        fn finish(&self) -> u64 {
+            panic!("Sha256Hasher doesn't support finish()");
+        }
+    }
+
+    let mut hasher = Sha256Hasher(Sha256::new());
+    x.hash(&mut hasher);
+    let digest = hasher.0.finalize();
+    digest.as_slice().try_into().unwrap()
+}
 
 fn construct_adt_origins<'tcx>(
     ltcx: &LTyCtxt<'tcx>,
     adt_metadata: &AdtMetadataTable,
