From 9975fe9056d6faae83905cd63dc6d1d4c33f5fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fahrican=20Ko=C5=9Far?= Date: Thu, 24 Mar 2022 19:21:19 -0400 Subject: [PATCH] dump net decomposition code --- .../libtatum/tatum/analyzer_factory.hpp | 20 +- .../libtatum/tatum/analyzer_factory_fwd.hpp | 2 +- .../AdaptiveSetupHoldTimingAnalyzer.hpp | 202 +++ .../analyzers/IncrHoldTimingAnalyzer.hpp | 2 +- .../analyzers/IncrSetupHoldTimingAnalyzer.hpp | 1 + .../analyzers/IncrSetupTimingAnalyzer.hpp | 2 +- .../graph_walkers/ParallelLevelizedWalker.hpp | 2 +- .../tatum/graph_walkers/SerialIncrWalker.hpp | 6 +- libs/librrgraph/src/base/rr_graph_storage.h | 5 - libs/libvtrutil/src/vtr_math.h | 12 + utils/route_diag/src/main.cpp | 17 +- vpr/src/base/SetupVPR.cpp | 3 + vpr/src/base/read_netlist.cpp | 2 + vpr/src/base/vpr_api.cpp | 12 +- vpr/src/base/vpr_types.h | 2 + vpr/src/route/connection_router.cpp | 250 +-- vpr/src/route/connection_router.h | 68 +- vpr/src/route/connection_router_interface.h | 20 +- vpr/src/route/partition_tree.cpp | 82 +- vpr/src/route/partition_tree.h | 18 +- vpr/src/route/route_common.cpp | 42 +- vpr/src/route/route_common.h | 39 +- vpr/src/route/route_parallel.cpp | 1391 +++++++++++++++-- vpr/src/route/route_samplers.cpp | 69 + vpr/src/route/route_samplers.h | 503 ++++++ vpr/src/route/route_timing.cpp | 337 ++-- vpr/src/route/route_timing.h | 94 +- vpr/src/route/route_tree.cpp | 12 +- vpr/src/route/router_delay_profiling.cpp | 5 +- vpr/src/route/spatial_route_tree_lookup.cpp | 2 +- vpr/src/route/virtual_net.h | 21 + vpr/src/timing/NetPinTimingInvalidator.h | 30 +- vpr/src/timing/concrete_timing_info.h | 5 +- vpr/src/timing/net_delay.cpp | 36 +- vpr/test/test_connection_router.cpp | 3 +- vpr/test/test_net_decomp.cpp | 30 + vtr_flow/scripts/python_libs/vtr/task.py | 8 +- vtr_flow/scripts/python_libs/vtr/util.py | 9 +- 38 files changed, 2721 insertions(+), 643 deletions(-) create mode 100644 libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp create mode 100644 vpr/src/route/route_samplers.cpp create mode 100644 vpr/src/route/route_samplers.h create mode 100644 vpr/src/route/virtual_net.h create mode 100644 vpr/test/test_net_decomp.cpp diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp index 9ac444bc61f..a36e7cfe299 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp @@ -7,6 +7,7 @@ #include "tatum/TimingGraphFwd.hpp" #include "tatum/TimingConstraintsFwd.hpp" +#include "tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp" #include "tatum/graph_walkers.hpp" #include "tatum/timing_analyzers.hpp" #include "tatum/analyzers/full_timing_analyzers.hpp" @@ -55,9 +56,9 @@ namespace tatum { ///Factor class to construct timing analyzers /// ///\tparam Visitor The analysis type visitor (e.g. SetupAnalysis) -///\tparam GraphWalker The graph walker to use (defaults to serial traversals) +///\tparam GraphWalker The graph walker to use template + class... 
GraphWalkers> struct AnalyzerFactory { //We use the dependent_false template to detect if the un-specialized AnalyzerFactory @@ -176,6 +177,21 @@ struct AnalyzerFactory { } }; +template<> +struct AnalyzerFactory { + + static std::unique_ptr make(const TimingGraph& timing_graph, + const TimingConstraints& timing_constraints, + const DelayCalculator& delay_calc) { + return std::unique_ptr( + new detail::AdaptiveSetupHoldTimingAnalyzer( + timing_graph, + timing_constraints, + delay_calc) + ); + } +}; + } //namespace #endif diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp index 3628ec1700b..0dff0883550 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp @@ -10,7 +10,7 @@ namespace tatum { ///\tparam Visitor The analysis type visitor (e.g. SetupAnalysis) ///\tparam GraphWalker The graph walker to use (defaults to serial traversals) template + class... GraphWalkers> struct AnalyzerFactory; } //namespace diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp new file mode 100644 index 00000000000..d7fc315aaed --- /dev/null +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp @@ -0,0 +1,202 @@ +#pragma once +#include "tatum/TimingGraphFwd.hpp" +#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" +#include "tatum/SetupHoldAnalysis.hpp" +#include "tatum/analyzers/SetupHoldTimingAnalyzer.hpp" +#include "tatum/base/validate_timing_graph_constraints.hpp" +#include "tatum/graph_walkers/TimingGraphWalker.hpp" + +namespace tatum { namespace detail { + +/** Threshold for AdaptiveSetupHoldTimingAnalyzer to use full updates. +* Expressed as a fraction of all edges in the timing graph. */ +constexpr float full_update_threshold = 0.1; + +/** + * A concrete implementation of a SetupHoldTimingAnalyzer. + * + * This is an adaptive analyzer: it can do incremental updates while the number of invalidated + * edges is small, and falls back to a full update past a certain threshold to avoid the overhead. + */ +template +class AdaptiveSetupHoldTimingAnalyzer : public SetupHoldTimingAnalyzer { + public: + AdaptiveSetupHoldTimingAnalyzer(const TimingGraph& timing_graph, const TimingConstraints& timing_constraints, const DelayCalculator& delay_calculator) + : SetupHoldTimingAnalyzer() + , timing_graph_(timing_graph) + , timing_constraints_(timing_constraints) + , delay_calculator_(delay_calculator) + , setup_hold_visitor_(timing_graph_.nodes().size(), timing_graph_.edges().size()) { + validate_timing_graph_constraints(timing_graph_, timing_constraints_); + + //Initialize profiling data.
Use the full walker to store data for both modes + full_walker_.set_profiling_data("total_analysis_sec", 0.); + full_walker_.set_profiling_data("analysis_sec", 0.); + full_walker_.set_profiling_data("num_full_updates", 0.); + full_walker_.set_profiling_data("num_incr_updates", 0.); + + mode_ = Mode::INCR; + n_modified_edges_ = 0; + max_modified_edges_ = timing_graph_.edges().size() * full_update_threshold; + } + + protected: + //Update both setup and hold simultaneously (this is more efficient than updating them sequentially) + virtual void update_timing_impl() override { + auto start_time = Clock::now(); + + if(mode_ == Mode::INCR) + update_timing_incr_(setup_hold_visitor_); + else + update_timing_full_(setup_hold_visitor_); + + clear_timing_incr_(); + + double analysis_sec = std::chrono::duration_cast(Clock::now() - start_time).count(); + + //Record profiling data (stored on the full walker; an arbitrary choice) + double total_analysis_sec = analysis_sec + full_walker_.get_profiling_data("total_analysis_sec"); + full_walker_.set_profiling_data("total_analysis_sec", total_analysis_sec); + full_walker_.set_profiling_data("analysis_sec", analysis_sec); + if(mode_ == Mode::INCR) + full_walker_.set_profiling_data("num_incr_updates", full_walker_.get_profiling_data("num_incr_updates") + 1); + else + full_walker_.set_profiling_data("num_full_updates", full_walker_.get_profiling_data("num_full_updates") + 1); + + mode_ = Mode::INCR; /* We did our update, try to use incr until too many edges are modified */ + } + + //Update only setup timing + virtual void update_setup_timing_impl() override { + auto& setup_visitor = setup_hold_visitor_.setup_visitor(); + + if(mode_ == Mode::INCR) + update_timing_incr_(setup_visitor); + else + update_timing_full_(setup_visitor); + } + + //Update only hold timing + virtual void update_hold_timing_impl() override { + auto& hold_visitor = setup_hold_visitor_.hold_visitor(); + + if(mode_ == Mode::INCR) + update_timing_incr_(hold_visitor); + else + update_timing_full_(hold_visitor); + } + + virtual void invalidate_edge_impl(const EdgeId edge) override { + if(mode_ == Mode::FULL) + return; + incr_walker_.invalidate_edge(edge); + n_modified_edges_++; + if(n_modified_edges_ > max_modified_edges_) + mode_ = Mode::FULL; + } + + virtual node_range modified_nodes_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.modified_nodes(); + else + return incr_walker_.modified_nodes(); + } + + double get_profiling_data_impl(std::string key) const override { + return full_walker_.get_profiling_data(key); + } + + size_t num_unconstrained_startpoints_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.num_unconstrained_startpoints(); + else + return incr_walker_.num_unconstrained_startpoints(); + } + + size_t num_unconstrained_endpoints_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.num_unconstrained_endpoints(); + else + return incr_walker_.num_unconstrained_endpoints(); + } + + TimingTags::tag_range setup_tags_impl(NodeId node_id) const override { return setup_hold_visitor_.setup_tags(node_id); } + TimingTags::tag_range setup_tags_impl(NodeId node_id, TagType type) const override { return setup_hold_visitor_.setup_tags(node_id, type); } +#ifdef TATUM_CALCULATE_EDGE_SLACKS + TimingTags::tag_range setup_edge_slacks_impl(EdgeId edge_id) const override { return setup_hold_visitor_.setup_edge_slacks(edge_id); } +#endif + TimingTags::tag_range setup_node_slacks_impl(NodeId node_id) const override { return
setup_hold_visitor_.setup_node_slacks(node_id); } + + TimingTags::tag_range hold_tags_impl(NodeId node_id) const override { return setup_hold_visitor_.hold_tags(node_id); } + TimingTags::tag_range hold_tags_impl(NodeId node_id, TagType type) const override { return setup_hold_visitor_.hold_tags(node_id, type); } +#ifdef TATUM_CALCULATE_EDGE_SLACKS + TimingTags::tag_range hold_edge_slacks_impl(EdgeId edge_id) const override { return setup_hold_visitor_.hold_edge_slacks(edge_id); } +#endif + TimingTags::tag_range hold_node_slacks_impl(NodeId node_id) const override { return setup_hold_visitor_.hold_node_slacks(node_id); } + + private: + /** Update using the full walker */ + void update_timing_full_(GraphVisitor& visitor){ + full_walker_.do_reset(timing_graph_, visitor); + + full_walker_.do_arrival_pre_traversal(timing_graph_, timing_constraints_, visitor); + full_walker_.do_arrival_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + full_walker_.do_required_pre_traversal(timing_graph_, timing_constraints_, visitor); + full_walker_.do_required_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + full_walker_.do_update_slack(timing_graph_, delay_calculator_, visitor); + } + + /** Update using the incremental walker */ + void update_timing_incr_(GraphVisitor& visitor){ + if (never_updated_incr_) { + //Invalidate all edges + for (EdgeId edge : timing_graph_.edges()) { + incr_walker_.invalidate_edge(edge); + } + + //Only need to pre-traverse the first update + incr_walker_.do_arrival_pre_traversal(timing_graph_, timing_constraints_, visitor); + } + + incr_walker_.do_arrival_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + if (never_updated_incr_) { + //Only need to pre-traverse the first update + incr_walker_.do_required_pre_traversal(timing_graph_, timing_constraints_, visitor); + } + + incr_walker_.do_required_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + incr_walker_.do_update_slack(timing_graph_, delay_calculator_, visitor); + } + + /* Clear incremental timing info */ + void clear_timing_incr_(){ + incr_walker_.clear_invalidated_edges(); + + n_modified_edges_ = 0; + never_updated_incr_ = false; + } + + const TimingGraph& timing_graph_; + const TimingConstraints& timing_constraints_; + const DelayCalculator& delay_calculator_; + SetupHoldAnalysis setup_hold_visitor_; + + FullWalker full_walker_; + IncrWalker incr_walker_; + enum class Mode { FULL, INCR }; + Mode mode_; + + bool never_updated_incr_ = true; + size_t max_modified_edges_; + std::atomic_size_t n_modified_edges_ = 0; + + typedef std::chrono::duration dsec; + typedef std::chrono::high_resolution_clock Clock; +}; + +}} //namespace diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp index 6f6de43e788..d1acf608985 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp @@ -1,5 +1,5 @@ #pragma once -#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/HoldAnalysis.hpp" #include "tatum/analyzers/HoldTimingAnalyzer.hpp" #include "tatum/base/validate_timing_graph_constraints.hpp"
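For reference, the adaptive full/incremental switch implemented by the new analyzer above boils down to a few lines. The sketch below models only the mode-switching policy; AdaptivePolicy is an illustrative name (not part of tatum), and the real class additionally owns the two walkers, the visitor, and the profiling counters:

#include <cstddef>

// Condensed model of AdaptiveSetupHoldTimingAnalyzer's update policy: count
// invalidated edges, and once more than a fixed fraction of all edges is
// dirty, give up on incremental updating for this round.
class AdaptivePolicy {
  public:
    AdaptivePolicy(size_t num_edges, float threshold = 0.1f)
        : max_modified_edges_(size_t(num_edges * threshold)) {}

    // Mirrors invalidate_edge_impl(): invalidations become no-ops once a
    // full update has already been decided on.
    void invalidate_edge() {
        if (full_) return;
        if (++n_modified_edges_ > max_modified_edges_) full_ = true;
    }

    // Mirrors update_timing_impl(): run the chosen update, then reset and
    // optimistically try incremental again next round.
    template<class FullFn, class IncrFn>
    void update(FullFn do_full, IncrFn do_incr) {
        if (full_) do_full(); else do_incr();
        n_modified_edges_ = 0;
        full_ = false;
    }

  private:
    size_t max_modified_edges_;
    size_t n_modified_edges_ = 0;
    bool full_ = false;
};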
diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp index 844e146ce4c..ffc541bdf21 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp @@ -1,4 +1,5 @@ #pragma once +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/graph_walkers/SerialWalker.hpp" #include "tatum/SetupHoldAnalysis.hpp" #include "tatum/analyzers/SetupHoldTimingAnalyzer.hpp" diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp index 0ad0b5203fc..57c16afe63c 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp @@ -1,5 +1,5 @@ #pragma once -#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/SetupAnalysis.hpp" #include "tatum/analyzers/SetupTimingAnalyzer.hpp" #include "tatum/base/validate_timing_graph_constraints.hpp" diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp index 0cbf1a5863b..0104d10d3e3 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp @@ -11,7 +11,7 @@ namespace tatum { /** - * A parallel timing analyzer which traveres the timing graph in a levelized + * A parallel timing analyzer which traverses the timing graph in a levelized * manner. However nodes within each level are processed in parallel using * Thread Building Blocks (TBB). If TBB is not available it operates serially and is * equivalent to the SerialWalker. diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp index 8ece8e44f9a..4eba704df29 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp @@ -17,14 +17,14 @@ namespace tatum { * * If TATUM_INCR_BLOCK_INVALIDATION is defined: * All of a node's tags associated with an invalidated edge are invalidated. - * This is a robust but pessimisitc approach (it invalidates more tags than + * This is a robust but pessimistic approach (it invalidates more tags than * strictly required). As a result all nodes processed will report having been * modified, meaning their descendants/predecessors will also be invalidated * even if in reality the recalculated tags are identical to the previous ones * (i.e. nothing has really changed). * * Otherwise, the analyzer performs edge invalidation: - * Only node tags which are dominanted by an invalidated edge are invalidated. + * Only node tags which are dominated by an invalidated edge are invalidated. * This is a less pessimistic approach, and means that when processed, nodes which * don't have any changed tags will report as being unmodified. This significantly * prunes the amount of the timing graph which needs to be updated (as unmodified @@ -37,7 +37,7 @@ namespace tatum { * manner. Unlike SerialWalker it attempts to incrementally (rather than * fully) update based on invalidated edges.
* - * To performan an incremental traversal, the st of invalidated edges + * To perform an incremental traversal, the set of invalidated edges * is processed to identify nodes which will need to be re-evaluated for * the arrival and/or required traversals. * diff --git a/libs/librrgraph/src/base/rr_graph_storage.h b/libs/librrgraph/src/base/rr_graph_storage.h index 7e4f21b5968..2ccbf325834 100644 --- a/libs/librrgraph/src/base/rr_graph_storage.h +++ b/libs/librrgraph/src/base/rr_graph_storage.h @@ -667,11 +667,6 @@ class t_rr_graph_storage { static inline Direction get_node_direction( vtr::array_view_id node_storage, RRNodeId id) { - auto& node_data = node_storage[id]; - if (node_data.type_ != CHANX && node_data.type_ != CHANY) { - VTR_LOG_ERROR("Attempted to access RR node 'direction' for non-channel type '%s'", - rr_node_typename[node_data.type_]); - } return node_storage[id].dir_side_.direction; } diff --git a/libs/libvtrutil/src/vtr_math.h b/libs/libvtrutil/src/vtr_math.h index 74b4ccebf58..199b15ac71b 100644 --- a/libs/libvtrutil/src/vtr_math.h +++ b/libs/libvtrutil/src/vtr_math.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "vtr_assert.h" @@ -163,6 +164,17 @@ bool isclose(T a, T b) { return isclose(a, b, DEFAULT_REL_TOL, DEFAULT_ABS_TOL); } +/** Log2, round down. + * From https://stackoverflow.com/a/51351885 */ +static inline uint64_t log2_floor(uint64_t x) { + return 63U - __builtin_clzl(x); +} + +/** Log2, round up */ +static inline uint64_t log2_ceil(uint64_t x) { + return log2_floor(x - 1) + 1; +} + } // namespace vtr #endif diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index 571c17c30e6..7f4d50eef28 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -117,13 +117,16 @@ static void do_one_route(const Netlist<>& net_list, -1, false, std::unordered_map()); - std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - true); + std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree.root(), + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + true + ); if (found_path) { VTR_ASSERT(cheapest.index == sink_node); diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index b5492a1f8ec..5e9b5d35657 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -281,6 +281,9 @@ void SetupVPR(const t_options* Options, /* Set seed for pseudo-random placement, default seed to 1 */ vtr::srandom(PlacerOpts->seed); + /* Make num_workers available to the router */ + RouterOpts->num_workers = vpr_setup->num_workers; + { vtr::ScopedStartFinishTimer t("Building complex block graph"); alloc_and_load_all_pb_graphs(PowerOpts->do_power, RouterOpts->flat_routing); diff --git a/vpr/src/base/read_netlist.cpp b/vpr/src/base/read_netlist.cpp index 6aee712a04b..3f243d122b4 100644 --- a/vpr/src/base/read_netlist.cpp +++ b/vpr/src/base/read_netlist.cpp @@ -1057,11 +1057,13 @@ static void load_external_nets_and_cb(ClusteredNetlist& clb_nlist) { int logical_pin = clb_nlist.pin_logical_index(pin_id); int physical_pin = get_physical_pin(tile_type, block_type, logical_pin); + /* XXX: Silence warning if (tile_type->is_ignored_pin[physical_pin] != is_ignored_net) { VTR_LOG_WARN( "Netlist connects net %s to both global and non-global pins.\n", clb_nlist.net_name(net_id).c_str()); } + */ } }
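One caveat on the log2 helpers added to vtr_math.h above: __builtin_clzl has an undefined result for an argument of 0, so log2_floor(0), and log2_ceil(1) (which evaluates log2_floor(0)), must not be called. A small self-contained check of the intended behavior for valid inputs, written against a local copy of the two functions:

#include <cassert>
#include <cstdint>

// Local copies of the vtr_math.h helpers, for a standalone check.
static inline uint64_t log2_floor(uint64_t x) { return 63U - __builtin_clzl(x); }
static inline uint64_t log2_ceil(uint64_t x) { return log2_floor(x - 1) + 1; }

int main() {
    assert(log2_floor(1) == 0);
    assert(log2_floor(9) == 3);  // rounds down
    assert(log2_ceil(8) == 3);   // exact powers of two stay exact
    assert(log2_ceil(9) == 4);   // rounds up
    // Not valid: log2_floor(0) and log2_ceil(1), since __builtin_clzl(0)
    // is undefined behavior.
    return 0;
}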
diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index 9f379f84e42..46999356e5a 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -301,6 +301,9 @@ void vpr_init_with_options(const t_options* options, t_vpr_setup* vpr_setup, t_a &vpr_setup->PowerOpts, vpr_setup); + /* XXX: Hardcode parallel router for testing */ + //vpr_setup->RouterOpts.router_algorithm = PARALLEL; + /* Check inputs are reasonable */ CheckArch(*arch); @@ -906,19 +909,12 @@ RouteStatus vpr_route_fixed_W(const Netlist<>& net_list, std::shared_ptr delay_calc, NetPinsMatrix& net_delay, bool is_flat) { - get_cached_router_lookahead( - vpr_setup.RoutingArch, - vpr_setup.RouterOpts.lookahead_type, - vpr_setup.RouterOpts.write_router_lookahead, - vpr_setup.RouterOpts.read_router_lookahead, - vpr_setup.Segments, - is_flat); - vtr::ScopedStartFinishTimer timer("Routing"); if (NO_FIXED_CHANNEL_WIDTH == fixed_channel_width || fixed_channel_width <= 0) { VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Fixed channel width must be specified when routing at fixed channel width (was %d)", fixed_channel_width); } + bool status = false; status = try_route(net_list, fixed_channel_width, diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 93ef759bb88..a51f6548d2d 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1393,6 +1393,8 @@ struct t_router_opts { bool flat_routing; bool has_choking_spot; + size_t num_workers; + // Options related to rr_node reordering, for testing and possible cache optimization e_rr_node_reorder_algorithm reorder_rr_graph_nodes_algorithm = DONT_REORDER; int reorder_rr_graph_nodes_threshold = 0; diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp index 62db70ed31f..82ebe70d329 100644 --- a/vpr/src/route/connection_router.cpp +++ b/vpr/src/route/connection_router.cpp @@ -1,4 +1,5 @@ #include "connection_router.h" +#include "route_common.h" #include "rr_graph.h" #include "binary_heap.h" @@ -61,10 +62,11 @@ inline void update_router_stats(const DeviceContext& device_ctx, /** return tuple */ template std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) { @@ -73,7 +75,7 @@ std::tuple ConnectionRouter::timing_driven_route_conne bool retry = false; t_heap* cheapest; - std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(rt_root, sink_node, cost_params, bounding_box, can_grow_bb); + std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(tree, source_node, sink_node, cost_params, bounding_box, can_grow_bb); if (cheapest != nullptr) { rcv_path_manager.update_route_tree_set(cheapest->path_data); @@ -94,25 +96,25 @@ std::tuple ConnectionRouter::timing_driven_route_conne /** Return */ template std::tuple ConnectionRouter::timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, bool can_grow_bb) { //Re-add route nodes from the existing route tree to the heap. //They need to be repushed onto the heap since each node's cost is target specific.
- add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); + VTR_LOG("Bounding box: %d,%dx%d,%d\n", bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); return std::make_tuple(false, nullptr); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as normal net (BB: %d,%d x %d,%d)\n", sink_node, + VTR_LOGV_DEBUG(router_debug_, " %p Routing to %d as normal net (BB: %d,%d x %d,%d)\n", this, sink_node, bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); @@ -172,7 +174,7 @@ std::tuple ConnectionRouter::timing_driven_route_connection //Re-initialize the heap since it was emptied by the previous call to //timing_driven_route_connection_from_heap() - add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, full_device_bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything //Try finding the path again with the relaxed bounding box @@ -196,10 +198,11 @@ std::tuple ConnectionRouter::timing_driven_route_connection // Returns a tuple of */ template std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb net_bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& net_bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, @@ -210,19 +213,17 @@ std::tuple ConnectionRouter::timing_driven_route_conne // re-explore route tree from root to add any new nodes (buildheap afterwards) // route tree needs to be repushed onto the heap since each node's cost is target specific router_stats_->add_high_fanout_rt++; - t_bb high_fanout_bb = add_high_fanout_route_tree_to_heap(rt_root, sink_node, cost_params, spatial_rt_lookup, net_bounding_box); + t_bb high_fanout_bb = add_high_fanout_route_tree_to_heap(tree.root(), sink_node, cost_params, spatial_rt_lookup, net_bounding_box); heap_.build_heap(); - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); return std::make_tuple(false, false, t_heap()); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", sink_node, - high_fanout_bb.xmin, high_fanout_bb.ymin, - high_fanout_bb.xmax, high_fanout_bb.ymax); + VTR_LOGV_DEBUG(router_debug_, " %p Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", this, sink_node, + net_bounding_box.xmin, net_bounding_box.ymin, + net_bounding_box.xmax, net_bounding_box.ymax); bool retry_with_full_bb = false; t_heap* cheapest; @@ -234,17 +235,21 @@ std::tuple ConnectionRouter::timing_driven_route_conne //Found no path, that may be due to an unlucky choice of existing route tree sub-set, //try again with the full route tree to be sure this is not an artifact of high-fanout routing VTR_LOG_WARN("No routing path found in high-fanout mode for net connection (to sink_rr %d), retrying with full route tree\n", sink_node); + 
VTR_LOG_WARN("high_fanout_bb=%d,%dx%d,%d\n", high_fanout_bb.xmin, high_fanout_bb.ymin, high_fanout_bb.xmax, high_fanout_bb.ymax); + VTR_LOG_WARN("net_bb=%d,%dx%d,%d\n", net_bounding_box.xmin, net_bounding_box.ymin, net_bounding_box.xmax, net_bounding_box.ymax); //Reset any previously recorded node costs so timing_driven_route_connection() //starts over from scratch. reset_path_costs(); modified_rr_node_inf_.clear(); - std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup(rt_root, - sink_node, - cost_params, - net_bounding_box, - can_grow_bb); + std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup( + tree, + source_node, + sink_node, + cost_params, + net_bounding_box, + can_grow_bb); } if (cheapest == nullptr) { @@ -273,14 +278,12 @@ std::tuple ConnectionRouter::timing_driven_route_conne // Returns either the last element of the path, or nullptr if no path is found template t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { VTR_ASSERT_SAFE(heap_.is_valid()); - //std::cout << "using this: " << (void *)this << "\n"; - //std::cout << "using heap: " << heap_.get_ptr() << "\n"; if (heap_.is_empty_heap()) { //No source - VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Initial heap empty (no source)\n", this); } const auto& device_ctx = g_vpr_ctx.device(); @@ -297,8 +300,8 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI false); RRNodeId inode = cheapest->index; - VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + VTR_LOGV_DEBUG(router_debug_, " %p Popping node %d (cost: %g)\n", + this, inode, cheapest->cost); // Have we found the target? if (inode == sink_node) { @@ -308,7 +311,7 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI if (rcv_path_manager.is_enabled()) { rcv_path_manager.insert_backwards_path_into_traceback(cheapest->path_data, cheapest->cost, cheapest->backward_path_cost, route_ctx); } - VTR_LOGV_DEBUG(router_debug_, " Found target %8d (%s)\n", inode, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); + VTR_LOGV_DEBUG(router_debug_, " %p Found target %8d (%s)\n", this, inode, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); break; } @@ -329,7 +332,7 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI } if (cheapest == nullptr) { /* Impossible routing. No path for net. 
*/ - VTR_LOGV_DEBUG(router_debug_, " Empty heap (no path found)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Empty heap (no path found)\n", this); return nullptr; } @@ -339,9 +342,9 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI // Find shortest paths from specified route tree to all nodes in the RR graph template vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) { router_stats_ = &router_stats; @@ -349,7 +352,7 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // Add the route tree to the heap with no specific target node RRNodeId target_node = RRNodeId::INVALID(); - add_route_tree_to_heap(rt_root, target_node, cost_params, false); + add_route_tree_to_heap(tree.root(), target_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything auto res = timing_driven_find_all_shortest_paths_from_heap(cost_params, bounding_box); @@ -367,14 +370,14 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // no-operation lookahead which always returns zero. template vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_heap( - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { vtr::vector cheapest_paths(rr_nodes_.size()); VTR_ASSERT_SAFE(heap_.is_valid()); if (heap_.is_empty_heap()) { // No source - VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Initial heap empty (no source)\n", this); } while (!heap_.is_empty_heap()) { @@ -387,8 +390,8 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho false); RRNodeId inode = cheapest->index; - VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + VTR_LOGV_DEBUG(router_debug_, " %p Popping node %d (cost: %g)\n", + this, inode, cheapest->cost); // Since we want to find shortest paths to all nodes in the graph // we do not specify a target node. 
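The surrounding routine pops heap entries until the heap is exhausted because, with no target node, every reachable node must be relaxed. As a rough standalone sketch of that pattern (generic Dijkstra with the same post-heap prune; types and names here are illustrative, not the VPR heap API):

#include <functional>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

struct Item {
    int node;
    float cost;
    bool operator>(const Item& o) const { return cost > o.cost; }
};

// Relax every node reachable from the seed items. With no target node the
// loop only terminates when the heap empties; each node keeps the cheapest
// cost it was ever popped with.
std::vector<float> all_shortest_paths(int num_nodes,
                                      const std::vector<std::vector<std::pair<int, float>>>& adj,
                                      std::vector<Item> seeds) {
    std::vector<float> best(num_nodes, std::numeric_limits<float>::infinity());
    std::priority_queue<Item, std::vector<Item>, std::greater<Item>> heap(
        std::greater<Item>(), std::move(seeds));
    while (!heap.empty()) {
        Item cur = heap.top();
        heap.pop();
        if (cur.cost >= best[cur.node]) continue; // post-heap prune, as above
        best[cur.node] = cur.cost;
        for (const auto& [to, w] : adj[cur.node]) heap.push({to, cur.cost + w});
    }
    return best;
}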
@@ -403,10 +406,10 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho bounding_box); if (cheapest_paths[inode].index == RRNodeId::INVALID() || cheapest_paths[inode].cost >= cheapest->cost) { - VTR_LOGV_DEBUG(router_debug_, " Better cost to node %d: %g (was %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); + VTR_LOGV_DEBUG(router_debug_, " %p Better cost to node %d: %g (was %g)\n", this, inode, cheapest->cost, cheapest_paths[inode].cost); cheapest_paths[inode] = *cheapest; } else { - VTR_LOGV_DEBUG(router_debug_, " Worse cost to node %d: %g (better %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); + VTR_LOGV_DEBUG(router_debug_, " %p Worse cost to node %d: %g (better %g)\n", this, inode, cheapest->cost, cheapest_paths[inode].cost); } rcv_path_manager.free_path_struct(cheapest->path_data); @@ -419,8 +422,8 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho template void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, RRNodeId target_node, - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { RRNodeId inode = cheapest->index; t_rr_node_route_inf* route_inf = &rr_node_route_inf_[inode]; @@ -441,10 +444,11 @@ void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, if (best_total_cost > new_total_cost && ((rcv_path_manager.is_enabled()) || best_back_cost > new_back_cost)) { // Explore from this node, since the current/new partial path has the best cost // found so far - VTR_LOGV_DEBUG(router_debug_, " Better cost to %d\n", inode); - VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); - VTR_LOGV_DEBUG(router_debug_, " Setting path costs for associated node %d (from %d edge %zu)\n", + VTR_LOGV_DEBUG(router_debug_, " %p Better cost to %d\n", this, inode); + VTR_LOGV_DEBUG(router_debug_, " %p New total cost: %g\n", this, new_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New back cost: %g\n", this, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Setting path costs for associated node %d (from %d edge %zu)\n", + this, cheapest->index, cheapest->prev_node(), size_t(cheapest->prev_edge())); @@ -456,18 +460,18 @@ void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, } else { // Post-heap prune, do not re-explore from the current/new partial path as it // has worse cost than the best partial path to this node found so far - VTR_LOGV_DEBUG(router_debug_, " Worse cost to %d\n", inode); - VTR_LOGV_DEBUG(router_debug_, " Old total cost: %g\n", best_total_cost); - VTR_LOGV_DEBUG(router_debug_, " Old back cost: %g\n", best_back_cost); - VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Worse cost to %d\n", this, inode); + VTR_LOGV_DEBUG(router_debug_, " %p Old total cost: %g\n", this, best_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Old back cost: %g\n", this, best_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New total cost: %g\n", this, new_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New back cost: %g\n", this, new_back_cost); } } template void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node) { /* Puts all the rr_nodes adjacent to 
current on the heap. */ @@ -529,29 +533,28 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, RRNodeId from_node, RREdgeId from_edge, RRNodeId to_node, - const t_conn_cost_params cost_params, - const t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node, - const t_bb target_bb) { + const t_bb& target_bb) { int to_xlow = rr_graph_->node_xlow(to_node); int to_ylow = rr_graph_->node_ylow(to_node); - int to_xhigh = rr_graph_->node_xhigh(to_node); - int to_yhigh = rr_graph_->node_yhigh(to_node); // BB-pruning // Disable BB-pruning if RCV is enabled, as this can make it harder for circuits with high negative hold slack to resolve this // TODO: Only disable pruning if the net has negative hold slack, maybe go off budgets - if ((to_xhigh < bounding_box.xmin // Strictly left of BB left-edge - || to_xlow > bounding_box.xmax // Strictly right of BB right-edge - || to_yhigh < bounding_box.ymin // Strictly below BB bottom-edge - || to_ylow > bounding_box.ymax) // Strictly above BB top-edge + // Parallel router change: only expand if to_node is inside BB + if ((to_xlow < bounding_box.xmin + || to_ylow < bounding_box.ymin + || to_xlow > bounding_box.xmax + || to_ylow > bounding_box.ymax) && !rcv_path_manager.is_enabled()) { VTR_LOGV_DEBUG(router_debug_, - " Pruned expansion of node %d edge %zu -> %d" + " %p Pruned expansion of node %d edge %zu -> %d" " (to node location %d,%dx%d,%d outside of expanded" " net bounding box %d,%dx%d,%d)\n", - from_node, size_t(from_edge), size_t(to_node), - to_xlow, to_ylow, to_xhigh, to_yhigh, + this, from_node, size_t(from_edge), size_t(to_node), + to_xlow, to_ylow, rr_graph_->node_xhigh(to_node), rr_graph_->node_yhigh(to_node), bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); return; /* Node is outside (expanded) bounding box. 
*/ } @@ -565,15 +568,17 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, if (to_type == IPIN) { // Check if this IPIN leads to the target block // IPIN's of the target block should be contained within it's bounding box + int to_xhigh = rr_graph_->node_xhigh(to_node); + int to_yhigh = rr_graph_->node_yhigh(to_node); if (to_xlow < target_bb.xmin || to_ylow < target_bb.ymin || to_xhigh > target_bb.xmax || to_yhigh > target_bb.ymax) { VTR_LOGV_DEBUG(router_debug_, - " Pruned expansion of node %d edge %zu -> %d" + " %p Pruned expansion of node %d edge %zu -> %d" " (to node is IPIN at %d,%dx%d,%d which does not" " lead to target block %d,%dx%d,%d)\n", - from_node, size_t(from_edge), size_t(to_node), + this, from_node, size_t(from_edge), size_t(to_node), to_xlow, to_ylow, to_xhigh, to_yhigh, target_bb.xmin, target_bb.ymin, target_bb.xmax, target_bb.ymax); return; @@ -581,8 +586,8 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, } } - VTR_LOGV_DEBUG(router_debug_, " Expanding node %d edge %zu -> %d\n", - from_node, size_t(from_edge), size_t(to_node)); + VTR_LOGV_DEBUG(router_debug_, " %p Expanding node %d edge %zu -> %d\n", + this, from_node, size_t(from_edge), size_t(to_node)); // Check if the node exists in the route tree when RCV is enabled // Other pruning methods have been disabled when RCV is on, so this method is required to prevent "loops" from being created @@ -604,11 +609,11 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, // Add to_node to the heap, and also add any nodes which are connected by non-configurable edges template -void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params cost_params, +void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params& cost_params, const t_heap* current, RRNodeId from_node, RRNodeId to_node, - const RREdgeId from_edge, + RREdgeId from_edge, RRNodeId target_node) { const auto& device_ctx = g_vpr_ctx.device(); t_heap next; @@ -642,14 +647,14 @@ void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params float new_back_cost = next.backward_path_cost; if (new_total_cost < best_total_cost && ((rcv_path_manager.is_enabled()) || (new_back_cost < best_back_cost))) { - VTR_LOGV_DEBUG(router_debug_, " Expanding to node %d (%s)\n", to_node, + VTR_LOGV_DEBUG(router_debug_, " %p Expanding to node %d (%s)\n", this, to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_) .c_str()); - VTR_LOGV_DEBUG(router_debug_, " New Total Cost %g New back Cost %g\n", new_total_cost, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New Total Cost %g New back Cost %g\n", this, new_total_cost, new_back_cost); //Add node to the heap only if the cost via the current partial path is less than the //best known cost, since there is no reason for the router to expand more expensive paths. 
// @@ -683,9 +688,9 @@ void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params true); } else { - VTR_LOGV_DEBUG(router_debug_, " Didn't expand to %d (%s)\n", to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str()); - VTR_LOGV_DEBUG(router_debug_, " Prev Total Cost %g Prev back Cost %g \n", best_total_cost, best_back_cost); - VTR_LOGV_DEBUG(router_debug_, " New Total Cost %g New back Cost %g \n", new_total_cost, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Didn't expand to %d (%s)\n", this, to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str()); + VTR_LOGV_DEBUG(router_debug_, " %p Prev Total Cost %g Prev back Cost %g \n", this, best_total_cost, best_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New Total Cost %g New back Cost %g \n", this, new_total_cost, new_back_cost); } if (rcv_path_manager.is_enabled() && next.path_data != nullptr) { @@ -713,7 +718,7 @@ static bool same_non_config_node_set(RRNodeId from_node, RRNodeId to_node) { #endif template -float ConnectionRouter::compute_node_cost_using_rcv(const t_conn_cost_params cost_params, +float ConnectionRouter::compute_node_cost_using_rcv(const t_conn_cost_params& cost_params, RRNodeId to_node, RRNodeId target_node, float backwards_delay, @@ -767,7 +772,7 @@ void ConnectionRouter::set_rcv_enabled(bool enable) { //Calculates the cost of reaching to_node template void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, RRNodeId from_node, RRNodeId to_node, RREdgeId from_edge, @@ -877,7 +882,8 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, cost_params, to->R_upstream); VTR_LOGV_DEBUG(router_debug_ && !std::isfinite(expected_cost), - " Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", + " %p Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", + this, rr_node_arch_name(to_node, is_flat_).c_str(), describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str(), rr_node_arch_name(target_node, is_flat_).c_str(), @@ -910,16 +916,16 @@ void ConnectionRouter::empty_heap_annotating_node_route_inf() { //Adds the route tree rooted at rt_node to the heap, preparing it to be //used as branch-points for further routing. +/* Puts the entire partial routing below and including rt_node onto the heap + * (except for those parts marked as not to be expanded) by calling itself + * recursively. */ template void ConnectionRouter::add_route_tree_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool from_high_fanout) { - /* Puts the entire partial routing below and including rt_node onto the heap * - * (except for those parts marked as not to be expanded) by calling itself * - * recursively. 
*/ - if (from_high_fanout) { router_stats_->add_all_rt_from_high_fanout++; } else { @@ -931,6 +937,7 @@ void ConnectionRouter::add_route_tree_to_heap( if (rt_node.re_expand) { add_route_tree_node_to_heap(rt_node, target_node, + bounding_box, cost_params, false); } @@ -942,12 +949,14 @@ void ConnectionRouter::add_route_tree_to_heap( target_node)) { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } } else { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } @@ -962,13 +971,18 @@ template void ConnectionRouter::add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool is_high_fanout) { const auto& device_ctx = g_vpr_ctx.device(); const RRNodeId inode = rt_node.inode; float backward_path_cost = cost_params.criticality * rt_node.Tdel; float R_upstream = rt_node.R_upstream; + /* don't include if not in BB */ + if (!inside_bb(rt_node.inode, bounding_box)) + return; + // after budgets are loaded, calculate delay cost as described by RCV paper /* R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While * Repairing Short-Path Violations," in IEEE Transactions on Computer-Aided Design of @@ -983,7 +997,8 @@ void ConnectionRouter::add_route_tree_node_to_heap( target_node, cost_params, R_upstream); - VTR_LOGV_DEBUG(router_debug_, " Adding node %8d to heap from init route tree with cost %g (%s)\n", + VTR_LOGV_DEBUG(router_debug_, " %p Adding node %8d to heap from init route tree with cost %g (%s)\n", + this, inode, tot_cost, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); @@ -1012,25 +1027,30 @@ void ConnectionRouter::add_route_tree_node_to_heap( } } -static t_bb adjust_highfanout_bounding_box(t_bb highfanout_bb) { - t_bb bb = highfanout_bb; +/* Expand bb by inode's extents and clip against net_bb */ +inline void expand_highfanout_bounding_box(t_bb& bb, RRNodeId inode, const t_bb& net_bb, const RRGraphView* rr_graph) { + bb.xmin = std::max(net_bb.xmin, std::min(bb.xmin, rr_graph->node_xlow(inode))); + bb.ymin = std::max(net_bb.ymin, std::min(bb.ymin, rr_graph->node_ylow(inode))); + bb.xmax = std::min(net_bb.xmax, std::max(bb.xmax, rr_graph->node_xhigh(inode))); + bb.ymax = std::min(net_bb.ymax, std::max(bb.ymax, rr_graph->node_yhigh(inode))); +} +/* Expand bb by HIGH_FANOUT_BB_FAC and clip against net_bb */ +inline void adjust_highfanout_bounding_box(t_bb& bb, const t_bb& net_bb) { constexpr int HIGH_FANOUT_BB_FAC = 3; - bb.xmin -= HIGH_FANOUT_BB_FAC; - bb.ymin -= HIGH_FANOUT_BB_FAC; - bb.xmax += HIGH_FANOUT_BB_FAC; - bb.ymax += HIGH_FANOUT_BB_FAC; - - return bb; + bb.xmin = std::max(net_bb.xmin, bb.xmin - HIGH_FANOUT_BB_FAC); + bb.ymin = std::max(net_bb.ymin, bb.ymin - HIGH_FANOUT_BB_FAC); + bb.xmax = std::min(net_bb.xmax, bb.xmax + HIGH_FANOUT_BB_FAC); + bb.ymax = std::min(net_bb.ymax, bb.ymax + HIGH_FANOUT_BB_FAC); } template t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( const RouteTreeNode& rt_root, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const SpatialRouteTreeLookup& spatial_rt_lookup, - t_bb net_bounding_box) { + const t_bb& net_bounding_box) { //For high fanout nets we only add those route tree nodes which are spatially close //to the sink. 
// @@ -1070,6 +1090,7 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( for (const RouteTreeNode& rt_node : spatial_rt_lookup[bin_x][bin_y]) { if (!rt_node.re_expand) continue; //Some nodes (like IPINs) shouldn't be re-expanded + RRNodeId rr_node_to_add = rt_node.inode; if (is_flat_) { @@ -1077,14 +1098,16 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( continue; } - // Put the node onto the heap - add_route_tree_node_to_heap(rt_node, target_node, cost_params, true); + /* In case we are using a net-wide lookup with a clipped BB (decomposed net) */ + if (!inside_bb(rr_node_to_add, net_bounding_box)) + continue; + + /* Put the node onto the heap (here it can be net_bounding_box, it's inside anyway) */ + add_route_tree_node_to_heap(rt_node, target_node, net_bounding_box, cost_params, true); + + /* Expand bounding box by this node's extents (clips by net_bounding_box) */ + expand_highfanout_bounding_box(highfanout_bb, rr_node_to_add, net_bounding_box, rr_graph_); - // Update Bounding Box - highfanout_bb.xmin = std::min(highfanout_bb.xmin, rr_graph_->node_xlow(rr_node_to_add)); - highfanout_bb.ymin = std::min(highfanout_bb.ymin, rr_graph_->node_ylow(rr_node_to_add)); - highfanout_bb.xmax = std::max(highfanout_bb.xmax, rr_graph_->node_xhigh(rr_node_to_add)); - highfanout_bb.ymax = std::max(highfanout_bb.ymax, rr_graph_->node_yhigh(rr_node_to_add)); if (is_flat_) { if (rr_graph_->node_type(rr_node_to_add) == CHANY || rr_graph_->node_type(rr_node_to_add) == CHANX) { chan_nodes_added++; @@ -1110,15 +1133,14 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( if (done) break; } - t_bb bounding_box = net_bounding_box; if (nodes_added == 0) { //If the target bin, and it's surrounding bins were empty, just add the full route tree - add_route_tree_to_heap(rt_root, target_node, cost_params, true); + add_route_tree_to_heap(rt_root, target_node, net_bounding_box, cost_params, true); + return net_bounding_box; } else { //We found nearby routing, replace original bounding box to be localized around that routing - bounding_box = adjust_highfanout_bounding_box(highfanout_bb); + adjust_highfanout_bounding_box(highfanout_bb, net_bounding_box); + return highfanout_bb; } - - return bounding_box; } std::unique_ptr make_connection_router(e_heap_type heap_type, diff --git a/vpr/src/route/connection_router.h b/vpr/src/route/connection_router.h index 5834e852409..f514941981a 100644 --- a/vpr/src/route/connection_router.h +++ b/vpr/src/route/connection_router.h @@ -69,10 +69,11 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? (only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) final; @@ -88,10 +89,11 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? 
(only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb net_bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& net_bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, @@ -107,9 +109,9 @@ class ConnectionRouter : public ConnectionRouterInterface { // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) final; @@ -156,17 +158,19 @@ class ConnectionRouter : public ConnectionRouterInterface { * timing_driven_route_connection_from_route_tree_high_fanout for running * the connection router. * @param[in] rt_root RouteTreeNode describing the current routing state + * @param[in] source_node Source node ID to route from * @param[in] sink_node Sink node ID to route to * @param[in] cost_params * @param[in] bounding_box Keep search confined to this bounding box * @param[in] can_grow_bb Can this fn grow the given bounding box? - * @return bool Signal to retry this connection with a full-device bounding box, + * @return bool Signal to retry this connection with a full-device bounding box. * @return t_heap* Heap element describing the path found. */ std::tuple timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, bool can_grow_bb); // Finds a path to sink_node, starting from the elements currently in the @@ -180,21 +184,21 @@ class ConnectionRouter : public ConnectionRouterInterface { // found t_heap* timing_driven_route_connection_from_heap( RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); // Expand this current node if it is a cheaper path. void timing_driven_expand_cheapest( t_heap* cheapest, RRNodeId target_node, - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); // Expand each neighbor of the current node. 
void timing_driven_expand_neighbours( t_heap* current, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node); // Conditionally adds to_node to the router heap (via path from from_node @@ -207,15 +211,15 @@ class ConnectionRouter : public ConnectionRouterInterface { RRNodeId from_node, RREdgeId from_edge, RRNodeId to_node, - const t_conn_cost_params cost_params, - const t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node, - const t_bb target_bb); + const t_bb& target_bb); // Add to_node to the heap, and also add any nodes which are connected by // non-configurable edges void timing_driven_add_to_heap( - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const t_heap* current, RRNodeId from_node, RRNodeId to_node, @@ -225,7 +229,7 @@ class ConnectionRouter : public ConnectionRouterInterface { // Calculates the cost of reaching to_node void evaluate_timing_driven_node_costs( t_heap* to, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, RRNodeId from_node, RRNodeId to_node, RREdgeId from_edge, @@ -233,8 +237,8 @@ class ConnectionRouter : public ConnectionRouterInterface { // Find paths from current heap to all nodes in the RR graph vtr::vector timing_driven_find_all_shortest_paths_from_heap( - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); void empty_heap_annotating_node_route_inf(); @@ -242,11 +246,12 @@ class ConnectionRouter : public ConnectionRouterInterface { //used as branch-points for further routing. void add_route_tree_to_heap(const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool from_high_fanout); // Evaluate node costs using the RCV algorith - float compute_node_cost_using_rcv(const t_conn_cost_params cost_params, + float compute_node_cost_using_rcv(const t_conn_cost_params& cost_params, RRNodeId to_node, RRNodeId target_node, float backwards_delay, @@ -260,15 +265,16 @@ class ConnectionRouter : public ConnectionRouterInterface { void add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool is_high_fanout); t_bb add_high_fanout_route_tree_to_heap( const RouteTreeNode& rt_root, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const SpatialRouteTreeLookup& spatial_route_tree_lookup, - t_bb net_bounding_box); + const t_bb& net_bounding_box); const DeviceGrid& grid_; const RouterLookahead& router_lookahead_; diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h index 2180dbe76f3..803114a6639 100644 --- a/vpr/src/route/connection_router_interface.h +++ b/vpr/src/route/connection_router_interface.h @@ -53,10 +53,11 @@ class ConnectionRouterInterface { * bool: should retry with full bounding box? 
(only used in parallel routing) * t_heap: heap element of cheapest path */ virtual std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) = 0; @@ -93,9 +95,9 @@ // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. virtual vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) = 0; diff --git a/vpr/src/route/partition_tree.cpp b/vpr/src/route/partition_tree.cpp index d3d895493b5..ab294eddf4e 100644 --- a/vpr/src/route/partition_tree.cpp +++ b/vpr/src/route/partition_tree.cpp @@ -1,21 +1,31 @@ #include "partition_tree.h" #include +#include #include -PartitionTree::PartitionTree(const Netlist<>& netlist) { +/** Arbitrary limit to stop partitioning nets. At a certain point, the quality lost due to disturbed net ordering + * and the task creation overhead outweigh the advantage of partitioning, so we stop doing it. */ +constexpr size_t MIN_NETS_TO_PARTITION = 256; + +PartitionTree::PartitionTree(const Netlist<>& netlist, const vtr::vector& scores) { const auto& device_ctx = g_vpr_ctx.device(); auto all_nets = std::vector(netlist.nets().begin(), netlist.nets().end()); - _root = build_helper(netlist, all_nets, 0, 0, device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + _root = build_helper(all_nets, scores, 0, 0, device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); } -std::unique_ptr PartitionTree::build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2) { +std::unique_ptr PartitionTree::build_helper(const std::vector& nets, const vtr::vector& scores, int x1, int y1, int x2, int y2) { if (nets.empty()) return nullptr; const auto& route_ctx = g_vpr_ctx.routing(); auto out = std::make_unique(); + if (nets.size() < MIN_NETS_TO_PARTITION) { + out->nets = nets; + return out; + } + /* Build ParaDRo-ish prefix sum lookup for each bin (coordinate) in the device. * Do this for every step with only the given nets, because each cutline takes some nets out * of the game, so if we just built a global lookup it wouldn't yield accurate results. @@ -29,61 +39,72 @@ std::unique_ptr PartitionTree::build_helper(const Netlist<>& /* Cutlines are placed between integral coordinates. * For instance, x_total_before[0] assumes a cutline at x=0.5, so fanouts at x=0 are included but not * x=1.
It's similar for x_total_after[0], which excludes fanouts at x=0 and includes x=1. - * Note that we have W-1 possible cutlines for a W-wide box. */ - std::vector x_total_before(W - 1, 0), x_total_after(W - 1, 0); - std::vector y_total_before(H - 1, 0), y_total_after(H - 1, 0); + * Note that we have W-1 possible cutlines for a W-wide box. + * + * Here, *_total_before holds total score of nets before the cutline and not intersecting it. + * In ParaDRo this would be total_before + total_on. (same for total_after)*/ + std::vector x_total_before(W - 1, 0), x_total_after(W - 1, 0), x_total_on(W - 1, 0); + std::vector y_total_before(H - 1, 0), y_total_after(H - 1, 0), y_total_on(H - 1, 0); for (auto net_id : nets) { t_bb bb = route_ctx.route_bb[net_id]; - size_t fanouts = netlist.net_sinks(net_id).size(); + uint64_t score = scores[net_id]; /* Inclusive start and end coords of the bbox relative to x1. Clamp to [x1, x2]. */ int x_start = std::max(x1, bb.xmin) - x1; int x_end = std::min(bb.xmax, x2) - x1; - /* Fill in the lookups assuming a cutline at x + 0.5. */ - for (int x = x_start; x < W - 1; x++) { - x_total_before[x] += fanouts; + /* Fill in the lookups assuming a cutline at x + 0.5. + * This means total_before includes the max coord of the bbox but + * total_after does not include the min coord. */ + for (int x = x_end; x < W - 1; x++) { + x_total_before[x] += score; } - for (int x = 0; x < x_end; x++) { - x_total_after[x] += fanouts; + for (int x = 0; x < x_start; x++) { + x_total_after[x] += score; + } + for (int x = x_start; x < x_end; x++){ + x_total_on[x] += score; } int y_start = std::max(y1, bb.ymin) - y1; int y_end = std::min(bb.ymax, y2) - y1; - for (int y = y_start; y < H - 1; y++) { - y_total_before[y] += fanouts; + for (int y = y_end; y < H - 1; y++) { + y_total_before[y] += score; + } + for (int y = 0; y < y_start; y++) { + y_total_after[y] += score; } - for (int y = 0; y < y_end; y++) { - y_total_after[y] += fanouts; + for (int y = y_start; y < y_end; y++){ + y_total_on[y] += score; } } - int best_score = std::numeric_limits::max(); + uint64_t best_score = std::numeric_limits::max(); float best_pos = std::numeric_limits::quiet_NaN(); Axis best_axis = Axis::X; - int max_x_before = x_total_before[W - 2]; - int max_x_after = x_total_after[0]; for (int x = 0; x < W - 1; x++) { int before = x_total_before[x]; int after = x_total_after[x]; - if (before == max_x_before || after == max_x_after) /* Cutting here would leave no nets to the left or right */ + if (before == 0 || after == 0) /* Cutting here would leave no nets to the left or right */ continue; - int score = abs(x_total_before[x] - x_total_after[x]); + /* Now get a measure of "critical path": work on cutline + max(work on sides) + * Test: What happens if we discount max(sides)? 
 */
+            uint64_t score = x_total_on[x] + std::max(x_total_before[x], x_total_after[x]);
+            // uint64_t score = std::abs(int(x_total_before[x]) - int(x_total_after[x]));
         if (score < best_score) {
             best_score = score;
             best_pos = x1 + x + 0.5; /* Lookups are relative to (x1, y1) */
             best_axis = Axis::X;
-        }
+        }
     }

-    int max_y_before = y_total_before[H - 2];
-    int max_y_after = y_total_after[0];
     for (int y = 0; y < H - 1; y++) {
         int before = y_total_before[y];
         int after = y_total_after[y];
-        if (before == max_y_before || after == max_y_after) /* Cutting here would leave no nets to the left or right (sideways) */
+        if (before == 0 || after == 0) /* Cutting here would leave no nets to the left or right (sideways) */
             continue;
-        int score = abs(y_total_before[y] - y_total_after[y]);
+        uint64_t score = y_total_on[y] + std::max(y_total_before[y], y_total_after[y]);
+        // uint64_t score = std::abs(int(y_total_before[y]) - int(y_total_after[y]));
         if (score < best_score) {
             best_score = score;
             best_pos = y1 + y + 0.5; /* Lookups are relative to (x1, y1) */
@@ -112,8 +133,8 @@ std::unique_ptr<PartitionTreeNode> PartitionTree::build_helper(const Netlist<>&
         }
     }

-        out->left = build_helper(netlist, left_nets, x1, y1, std::floor(best_pos), y2);
-        out->right = build_helper(netlist, right_nets, std::floor(best_pos + 1), y1, x2, y2);
+        out->left = build_helper(left_nets, scores, x1, y1, std::floor(best_pos), y2);
+        out->right = build_helper(right_nets, scores, std::floor(best_pos + 1), y1, x2, y2);
     } else {
         VTR_ASSERT(best_axis == Axis::Y);
         for (auto net_id : nets) {
@@ -127,10 +148,10 @@ std::unique_ptr<PartitionTreeNode> PartitionTree::build_helper(const Netlist<>&
         }
     }

-        out->left = build_helper(netlist, left_nets, x1, y1, x2, std::floor(best_pos));
-        out->right = build_helper(netlist, right_nets, x1, std::floor(best_pos + 1), x2, y2);
+        out->left = build_helper(left_nets, scores, x1, y1, x2, std::floor(best_pos));
+        out->right = build_helper(right_nets, scores, x1, std::floor(best_pos + 1), x2, y2);
     }

     out->nets = my_nets;
     out->cutline_axis = best_axis;
     out->cutline_pos = best_pos;
diff --git a/vpr/src/route/partition_tree.h b/vpr/src/route/partition_tree.h
index 08eb668a88f..56c929fb43d 100644
--- a/vpr/src/route/partition_tree.h
+++ b/vpr/src/route/partition_tree.h
@@ -2,6 +2,7 @@

 #include "connection_router.h"
 #include "router_stats.h"
+#include "virtual_net.h"

 #include <memory>
 #include <vector>
@@ -27,8 +28,6 @@ inline Side operator!(const Side& rhs) {

 /** Routing iteration results per thread. (for a subset of the input netlist) */
 struct RouteIterResults {
-    /** Are there any connections impossible to route due to a disconnected rr_graph? */
-    bool is_routable = true;
     /** Net IDs for which timing_driven_route_net() actually got called */
     std::vector<ParentNetId> rerouted_nets;
     /** RouterStats collected from my subset of nets */
@@ -44,22 +43,17 @@ struct RouteIterResults {
  * by the cutline. Leaf nodes represent a final set of nets reached by partitioning.
  *
  * To route this in parallel, we first route the nets in the root node, then add
- * its left and right to a task queue, and repeat this for the whole tree.
- *
- * The tree stores some routing results to be later combined, such as is_routable and
- * rerouted_nets. (TODO: do this per thread instead of per node) */
+ * its left and right to a task queue, and repeat this for the whole tree.
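Aside for readers of this hunk: the new cutline scoring picks the position that minimizes an estimated critical path, namely the total score of nets on the cutline plus the heavier of the two sides. A minimal self-contained sketch of that selection rule (simplified names, not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Given per-position totals for a W-wide region (W-1 candidate cutlines),
    // return the index of the best cutline, or -1 if every cut leaves one side empty.
    // score = work on the cutline + max(work before, work after), as in the hunk above.
    int best_cutline(const std::vector<uint64_t>& before,
                     const std::vector<uint64_t>& on,
                     const std::vector<uint64_t>& after) {
        int best = -1;
        uint64_t best_score = UINT64_MAX;
        for (size_t x = 0; x < on.size(); x++) {
            if (before[x] == 0 || after[x] == 0)
                continue; // cutting here would leave no nets on one side
            uint64_t score = on[x] + std::max(before[x], after[x]);
            if (score < best_score) {
                best_score = score;
                best = int(x);
            }
        }
        return best;
    }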
*/ class PartitionTreeNode { public: /** Nets claimed by this node (intersected by cutline if branch, nets in final region if leaf) */ std::vector nets; + /** Virtual nets delegated to this node by the parent */ + std::vector virtual_nets; /** Left subtree. */ std::unique_ptr left = nullptr; /** Right subtree. */ std::unique_ptr right = nullptr; - /** Are there any connections impossible to route due to a disconnected rr_graph? */ - bool is_routable = false; - /** Net IDs for which timing_driven_route_net() actually got called */ - std::vector rerouted_nets; /* Axis of the cutline. */ Axis cutline_axis = Axis::X; /* Position of the cutline. It's a float, because cutlines are considered to be "between" integral coordinates. */ @@ -78,14 +72,14 @@ class PartitionTree { PartitionTree& operator=(PartitionTree&&) = default; /** Can only be built from a netlist */ - PartitionTree(const Netlist<>& netlist); + PartitionTree(const Netlist<>& netlist, const vtr::vector& scores); /** Access root. Shouldn't cause a segfault, because PartitionTree constructor always makes a _root */ inline PartitionTreeNode& root(void) { return *_root; } private: std::unique_ptr _root; - std::unique_ptr build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2); + std::unique_ptr build_helper(const std::vector& nets, const vtr::vector& scores, int x1, int y1, int x2, int y2); }; #ifdef VPR_DEBUG_PARTITION_TREE diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp index 99d116b0de6..e9359923a58 100644 --- a/vpr/src/route/route_common.cpp +++ b/vpr/src/route/route_common.cpp @@ -190,6 +190,12 @@ void try_graph(int width_fac, is_flat); } +/** Attempts a routing via an iterated maze router algorithm. \p width_fac + * specifies the relative width of the channels, while the members of + * router_opts determine the value of the costs assigned to routing + * resource node, etc. det_routing_arch describes the detailed routing + * architecture (connection and switch boxes) of the FPGA; it is used + * only if a DETAILED routing has been selected. */ bool try_route(const Netlist<>& net_list, int width_fac, const t_router_opts& router_opts, @@ -204,12 +210,6 @@ bool try_route(const Netlist<>& net_list, int num_directs, ScreenUpdatePriority first_iteration_priority, bool is_flat) { - /* Attempts a routing via an iterated maze router algorithm. Width_fac * - * specifies the relative width of the channels, while the members of * - * router_opts determine the value of the costs assigned to routing * - * resource node, etc. det_routing_arch describes the detailed routing * - * architecture (connection and switch boxes) of the FPGA; it is used * - * only if a DETAILED routing has been selected. */ auto& device_ctx = g_vpr_ctx.mutable_device(); auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -309,11 +309,10 @@ bool try_route(const Netlist<>& net_list, return (success); } +/** This routine checks to see if this is a resource-feasible routing. + * That is, are all rr_node capacity limitations respected? It assumes + * that the occupancy arrays are up to date when it is called. */ bool feasible_routing() { - /* This routine checks to see if this is a resource-feasible routing. * - * That is, are all rr_node capacity limitations respected? It assumes * - * that the occupancy arrays are up to date when it is called. 
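Aside: the capacity check described above reduces to a single scan once occupancies are known. A free-standing sketch (hypothetical NodeUse type; the real routine reads VPR's global routing context):

    #include <vector>

    struct NodeUse {
        int occ;      // current occupancy from the routing
        int capacity; // rr_node capacity
    };

    // Feasible iff no node is used beyond its capacity.
    bool feasible_routing_sketch(const std::vector<NodeUse>& nodes) {
        for (const NodeUse& n : nodes) {
            if (n.occ > n.capacity)
                return false;
        }
        return true;
    }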
 */
-
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
     auto& route_ctx = g_vpr_ctx.routing();
@@ -387,8 +386,27 @@ void pathfinder_update_acc_cost_and_overuse_info(float acc_fac, OveruseInfo& ove
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
     auto& route_ctx = g_vpr_ctx.mutable_routing();

-    size_t overused_nodes = 0, total_overuse = 0, worst_overuse = 0;
+#ifdef VPR_USE_TBB
+    tbb::combinable<size_t> overused_nodes(0), total_overuse(0), worst_overuse(0);
+    tbb::parallel_for_each(rr_graph.nodes().begin(), rr_graph.nodes().end(), [&](RRNodeId rr_id) {
+        int overuse = route_ctx.rr_node_route_inf[rr_id].occ() - rr_graph.node_capacity(rr_id);
+
+        // If overused, update the acc_cost and add this node to the overuse info
+        // If not, do nothing
+        if (overuse > 0) {
+            route_ctx.rr_node_route_inf[rr_id].acc_cost += overuse * acc_fac;
+
+            ++overused_nodes.local();
+            total_overuse.local() += overuse;
+            worst_overuse.local() = std::max(worst_overuse.local(), size_t(overuse));
+        }
+    });
+    overuse_info.overused_nodes = overused_nodes.combine(std::plus<size_t>());
+    overuse_info.total_overuse = total_overuse.combine(std::plus<size_t>());
+    overuse_info.worst_overuse = worst_overuse.combine([](size_t a, size_t b) { return std::max(a, b); });
+#else
+    size_t overused_nodes = 0, total_overuse = 0, worst_overuse = 0;
     for (const RRNodeId& rr_id : rr_graph.nodes()) {
         int overuse = route_ctx.rr_node_route_inf[rr_id].occ() - rr_graph.node_capacity(rr_id);
@@ -402,11 +420,11 @@ void pathfinder_update_acc_cost_and_overuse_info(float acc_fac, OveruseInfo& ove
             worst_overuse = std::max(worst_overuse, size_t(overuse));
         }
     }
-
     // Update overuse info
     overuse_info.overused_nodes = overused_nodes;
     overuse_info.total_overuse = total_overuse;
     overuse_info.worst_overuse = worst_overuse;
+#endif
 }

 /** Update pathfinder cost of all nodes rooted at rt_node, including rt_node itself */
diff --git a/vpr/src/route/route_common.h b/vpr/src/route/route_common.h
index 68e525e10b0..76b224c0ec5 100644
--- a/vpr/src/route/route_common.h
+++ b/vpr/src/route/route_common.h
@@ -2,6 +2,7 @@
 #pragma once
 #include <vector>
 #include "clustered_netlist.h"
+#include "rr_node_types.h"
 #include "vtr_vector.h"
 #include "heap_type.h"
 #include "rr_node_fwd.h"
@@ -155,7 +156,7 @@ t_heap* prepare_to_add_node_to_heap(
     return hptr;
 }

-/* Puts an rr_node on the heap if it is the cheapest path. */
+/** Puts an rr_node on the heap if it is the cheapest path. */
 template<typename T>
 void add_node_to_heap(
     T* heap,
@@ -221,3 +222,39 @@ void push_back_node_with_info(

     heap->push_back(hptr);
 }
+
+/** Is \p inode inside this bounding box?
+ * For the parallel router we only need a single reference point per node, so we
+ * take (xlow, ylow). A direction-aware variant (using the driving side of
+ * directional nodes) is kept below for reference, but is currently disabled. */
+inline bool inside_bb(RRNodeId inode, const t_bb& bb) {
+    auto& device_ctx = g_vpr_ctx.device();
+    const auto& rr_graph = device_ctx.rr_graph;
+
+    /* Direction-aware variant, currently disabled:
+    int x, y;
+    if (rr_graph.node_direction(inode) == Direction::DEC) {
+        x = rr_graph.node_xhigh(inode);
+        y = rr_graph.node_yhigh(inode);
+    } else {
+        x = rr_graph.node_xlow(inode);
+        y = rr_graph.node_ylow(inode);
+    } */
+    int x, y;
+    x = rr_graph.node_xlow(inode);
+    y = rr_graph.node_ylow(inode);
+
+    return x >= bb.xmin && x <= bb.xmax && y >= bb.ymin && y <= bb.ymax;
+}
+
+/** When RCV is enabled, it may be necessary to rip up high fanout nets entirely
+ * if there is still negative hold slack. Normally the router only prunes the
+ * illegal branches of high fanout nets; returning true here bypasses that
+ * pruning so the whole net can be ripped up. */
+inline bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) {
+    if (router_opts.routing_budgets_algorithm != YOYO) {
+        return false;
+    } else if (worst_neg_slack != 0) {
+        return true;
+    }
+    return false;
+}
diff --git a/vpr/src/route/route_parallel.cpp b/vpr/src/route/route_parallel.cpp
index 96e6464f62b..269d3b53ba2 100644
--- a/vpr/src/route/route_parallel.cpp
+++ b/vpr/src/route/route_parallel.cpp
@@ -1,8 +1,10 @@
 /** @file Functions specific to parallel routing.
  * Reuse code from route_timing.cpp where possible. */
+#include <atomic>
 #include
 #include
+#include <numeric>
 #include
 #include
 #include
@@ -18,19 +20,28 @@
 #include "netlist_fwd.h"
 #include "partition_tree.h"
 #include "read_route.h"
-#include "route_export.h"
 #include "route_common.h"
-#include "route_timing.h"
+#include "route_export.h"
 #include "route_parallel.h"
 // all functions in profiling:: namespace, which are only activated if PROFILE is defined
 #include "route_profiling.h"
+#include "route_samplers.h"
+#include "route_timing.h"
+#include "rr_graph_fwd.h"
+#include "rr_node_types.h"
 #include "timing_util.h"
+#include "virtual_net.h"
+#include "vpr_error.h"
+#include "vpr_types.h"
+#include "vtr_assert.h"
+#include "vtr_math.h"
 #include "vtr_time.h"

 #include "NetPinTimingInvalidator.h"

 #ifdef VPR_USE_TBB
+#    include "tbb/concurrent_vector.h"
 #    include "tbb/enumerable_thread_specific.h"
 #    include "tbb/task_group.h"

@@ -39,14 +50,13 @@
 template<typename ConnectionRouter>
 class RouteIterCtx {
   public:
-    tbb::enumerable_thread_specific<ConnectionRouter> routers;
+    tbb::enumerable_thread_specific<ConnectionRouter>& routers;
     const Netlist<>& net_list;
     int itry;
     float pres_fac;
     const t_router_opts& router_opts;
     CBRR& connections_inf;
-    tbb::enumerable_thread_specific<RouterStats> router_stats;
-    tbb::enumerable_thread_specific<timing_driven_route_structs> route_structs;
+    tbb::enumerable_thread_specific<RouterStats>& router_stats;
     NetPinsMatrix<float>& net_delay;
     const ClusteredPinAtomPinsLookup& netlist_pin_lookup;
     std::shared_ptr<SetupHoldTimingInfo> timing_info;
@@ -55,11 +65,25 @@ class RouteIterCtx {
     float worst_negative_slack;
     const RoutingPredictor& routing_predictor;
     const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots;
+    tbb::concurrent_vector<ParentNetId>& nets_to_retry;
+    vtr::vector<ParentNetId, bool>& is_decomp_disabled;
+    /** Are there any connections impossible to route due to a disconnected rr_graph? */
+    std::atomic_bool is_routable = true;
+    /** Net IDs for which timing_driven_route_net() actually got called */
+    tbb::enumerable_thread_specific<std::vector<ParentNetId>>& rerouted_nets;
+    /** "Scores" for building a PartitionTree (estimated workload) */
+    vtr::vector<ParentNetId, uint64_t>& net_scores;
+    /** Sink indices known to fail when routed after decomposition. Always route these serially */
+    vtr::vector<ParentNetId, std::vector<int>>& net_known_samples;
     bool is_flat;
 };

-/** Helper for reduce_partition_tree. Traverse \p node's subtree and collect results into \p results */
-static void reduce_partition_tree_helper(const PartitionTreeNode& node, RouteIterResults& results);
+/** Don't try to decompose nets if # of iterations > this. */
+constexpr int MAX_DECOMP_ITER = 5;
+
+/** Don't try to decompose a regular net more than this many times.
+ * For instance, max_decomp_depth=2 means one regular net can become 4 virtual nets at max. */
+constexpr int MAX_DECOMP_DEPTH = 2;

 /**
  * Try to route in parallel with the given ConnectionRouter.
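Aside: RouteIterCtx now holds references to tbb::enumerable_thread_specific containers, so each worker asks for its lazily created private copy with .local() and the copies are walked or combined after the parallel phase. A minimal sketch of that TBB pattern (illustrative Stats type, not part of the patch):

    #include <cstdio>
    #include "tbb/enumerable_thread_specific.h"
    #include "tbb/parallel_for.h"

    struct Stats { long connections_routed = 0; };

    int main() {
        tbb::enumerable_thread_specific<Stats> stats; // one Stats per worker thread
        tbb::parallel_for(0, 1000, [&](int) {
            stats.local().connections_routed++; // no locking needed: thread-private
        });
        long total = 0;
        for (const Stats& s : stats) // iterate over every thread's copy
            total += s.connections_routed;
        std::printf("total connections: %ld\n", total);
    }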
@@ -214,13 +238,17 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, route_budgets budgeting_inf(net_list, is_flat); + const RouterLookahead* router_lookahead; + + { + vtr::ScopedStartFinishTimer timer("Obtaining lookahead"); // This needs to be called before filling intra-cluster lookahead maps to ensure that the intra-cluster lookahead maps are initialized. - const RouterLookahead* router_lookahead = get_cached_router_lookahead(det_routing_arch, - router_opts.lookahead_type, - router_opts.write_router_lookahead, - router_opts.read_router_lookahead, - segment_inf, - is_flat); + router_lookahead = get_cached_router_lookahead(det_routing_arch, + router_opts.lookahead_type, + router_opts.write_router_lookahead, + router_opts.read_router_lookahead, + segment_inf, + is_flat); if (is_flat) { // If is_flat is true, the router lookahead maps related to intra-cluster resources should be initialized since @@ -245,6 +273,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, router_lookahead->write_intra_cluster(router_opts.write_intra_cluster_router_lookahead); } } + } VTR_ASSERT(router_lookahead != nullptr); @@ -346,7 +375,26 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, route_ctx.rr_node_route_inf, is_flat)); /* Here we provide an "exemplar" to copy for each thread */ auto router_stats_thread = tbb::enumerable_thread_specific(); - auto route_structs = tbb::enumerable_thread_specific(net_list); + tbb::concurrent_vector nets_to_retry; + auto rerouted_nets = tbb::enumerable_thread_specific>(); + + /* Should I decompose this net? */ + vtr::vector is_decomp_disabled(net_list.nets().size(), false); + + /* Keep track of workload per net */ + std::deque net_empirical_workloads(net_list.nets().size()); + std::fill(net_empirical_workloads.begin(), net_empirical_workloads.end(), 0); + + /* Scores: initially fanouts, later can be changed by route_with_partition_tree */ + vtr::vector net_scores(net_list.nets().size()); + + /* Populate with initial scores */ + tbb::parallel_for_each(net_list.nets(), [&](ParentNetId net_id){ + net_scores[net_id] = route_ctx.net_rr_terminals[net_id].size() - 1; + }); + + /* "Known samples" for each net: ones known to not route after decomp */ + vtr::vector> net_known_samples(net_list.nets().size()); RouterStats router_stats; float prev_iter_cumm_time = 0; @@ -402,7 +450,6 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_stats_thread, - route_structs, net_delay, netlist_pin_lookup, route_timing_info, @@ -411,13 +458,19 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, worst_negative_slack, routing_predictor, choking_spots, + nets_to_retry, + is_decomp_disabled, + true, + rerouted_nets, + net_scores, + net_known_samples, is_flat}; vtr::Timer net_routing_timer; - RouteIterResults iter_results = route_with_partition_tree(tbb_task_group, iter_ctx); + RouteIterResults iter_results = decompose_route_with_partition_tree(tbb_task_group, iter_ctx); PartitionTreeDebug::log("Routing all nets took " + std::to_string(net_routing_timer.elapsed_sec()) + " s"); - if (!iter_results.is_routable) { + if (!iter_ctx.is_routable) { return false; // Impossible to route } @@ -501,7 +554,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, /* * Are we finished? 
         */
-        if (is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) {
+        if (iter_ctx.nets_to_retry.empty() && is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) {
             auto& router_ctx = g_vpr_ctx.routing();

             if (is_better_quality_routing(best_routing, best_routing_metrics, wirelength_info, timing_info)) {
@@ -639,19 +692,20 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list,
             // the router to route around otherwise congested regions
             // (at the cost of high run-time).

-            //Increase the size of the net bounding boxes to give the router more
-            //freedom to find alternate paths.
-            //
-            //In the case of routing conflicts there are multiple connections competing
-            //for the same resources which can not resolve the congestion themselves.
-            //In normal routing mode we try to keep the bounding boxes small to minimize
-            //run-time, but this can limits how far signals can detour (i.e. they can't
-            //route outside the bounding box), which can cause conflicts to oscillate back
-            //and forth without resolving.
-            //
-            //By scaling the bounding boxes here, we slowly increase the router's search
-            //space in hopes of it allowing signals to move further out of the way to
-            //alleviate the conflicts.
+            /* Increase the size of the net bounding boxes to give the router more
+             * freedom to find alternate paths.
+             *
+             * In the case of routing conflicts there are multiple connections competing
+             * for the same resources which cannot resolve the congestion themselves.
+             * In normal routing mode we try to keep the bounding boxes small to minimize
+             * run-time, but this can limit how far signals can detour (i.e. they can't
+             * route outside the bounding box), which can cause conflicts to oscillate back
+             * and forth without resolving.
+             *
+             * By scaling the bounding boxes here, we slowly increase the router's search
+             * space, in the hope of letting signals move further out of the way to
+             * alleviate the conflicts. */
+
             if (itry_conflicted_mode % BB_SCALE_ITER_COUNT == 0) {
                 //We scale the bounding boxes by BB_SCALE_FACTOR,
                 //every BB_SCALE_ITER_COUNT iterations. This ensures
@@ -799,6 +853,24 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list,
     return routing_is_successful;
 }

+/** Apparently we need a few more checks around should_route_net. TODO: smush this function into should_route_net */
+static bool should_really_route_net(const Netlist<>& net_list, ParentNetId net_id, route_budgets& budgeting_inf, CBRR& connections_inf, float worst_negative_slack) {
+    auto& route_ctx = g_vpr_ctx.mutable_routing();
+
+    bool reroute_for_hold = false;
+    if (budgeting_inf.if_set()) {
+        reroute_for_hold = budgeting_inf.get_should_reroute(net_id);
+        reroute_for_hold &= (worst_negative_slack != 0);
+    }
+    if (route_ctx.net_status.is_fixed(net_id)) /* Skip pre-routed nets. */
+        return false;
+    else if (net_list.net_is_ignored(net_id)) /* Skip ignored nets. */
+        return false;
+    else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true))
+        return false;
+    return true;
+}
+
 /** Try routing a net. This calls timing_driven_route_net.
  * The only difference is that it returns a "retry_net" flag, which means that the net
  * couldn't be routed with the default bounding box and needs a full-device BB.
 *
 * The single-thread router just retries with a full-device BB and does not need to notify the caller.
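Aside: the scaling step described in the comment above can be pictured as growing each bounding box span by a factor and clamping to the device edges. A sketch under those assumptions (hypothetical scale_bb helper and t_bb_sketch type; the patch's actual scaling code sits in the elided context):

    #include <algorithm>

    struct t_bb_sketch { int xmin, ymin, xmax, ymax; };

    // Grow bb's span by `factor` in each direction, clamped to [0, grid_w-1] x [0, grid_h-1].
    t_bb_sketch scale_bb(t_bb_sketch bb, float factor, int grid_w, int grid_h) {
        int grow_x = int((bb.xmax - bb.xmin) * (factor - 1.0f) / 2.0f) + 1;
        int grow_y = int((bb.ymax - bb.ymin) * (factor - 1.0f) / 2.0f) + 1;
        bb.xmin = std::max(0, bb.xmin - grow_x);
        bb.ymin = std::max(0, bb.ymin - grow_y);
        bb.xmax = std::min(grid_w - 1, bb.xmax + grow_x);
        bb.ymax = std::min(grid_h - 1, bb.ymax + grow_y);
        return bb;
    }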
* TODO: make the serial router follow this execution path to decrease code duplication */ template -NetResultFlags try_parallel_route_net(ConnectionRouter& router, - const Netlist<>& net_list, - const ParentNetId& net_id, - int itry, - float pres_fac, - const t_router_opts& router_opts, - CBRR& connections_inf, - RouterStats& router_stats, - std::vector& pin_criticality, - NetPinsMatrix& net_delay, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info, - NetPinTimingInvalidator* pin_timing_invalidator, - route_budgets& budgeting_inf, - float worst_negative_slack, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags try_parallel_route_net(ParentNetId net_id, RouteIterCtx& ctx) { auto& route_ctx = g_vpr_ctx.mutable_routing(); NetResultFlags flags; - bool reroute_for_hold = false; - if (budgeting_inf.if_set()) { - reroute_for_hold = (budgeting_inf.get_should_reroute(net_id)); - reroute_for_hold &= worst_negative_slack != 0; - } - if (route_ctx.net_status.is_fixed(net_id)) { /* Skip pre-routed nets. */ + /* Just return success if we don't need to route this one */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) { flags.success = true; - } else if (net_list.net_is_ignored(net_id)) { /* Skip ignored nets. */ - flags.success = true; - } else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true)) { - flags.success = true; - } else { - // track time spent vs fanout - profiling::net_fanout_start(); - - vtr::Timer routing_timer; - flags = timing_driven_route_net(router, - net_list, - net_id, - itry, - pres_fac, - router_opts, - connections_inf, - router_stats, - pin_criticality, - net_delay[net_id].data(), - netlist_pin_lookup, - timing_info, - pin_timing_invalidator, - budgeting_inf, - worst_negative_slack, - routing_predictor, - choking_spots, - is_flat); - - profiling::net_fanout_end(net_list.net_sinks(net_id).size()); - - /* Impossible to route? (disconnected rr_graph) */ - if (flags.success) { - route_ctx.net_status.set_is_routed(net_id, true); - } else { - VTR_LOG("Routing failed for net %d\n", net_id); - } + return flags; + } - flags.was_rerouted = true; //Flag to record whether routing was actually changed + // track time spent vs fanout + profiling::net_fanout_start(); + + vtr::Timer routing_timer; + flags = timing_driven_route_net(ctx.routers.local(), + ctx.net_list, + net_id, + ctx.itry, + ctx.pres_fac, + ctx.router_opts, + ctx.connections_inf, + ctx.router_stats.local(), + ctx.net_delay[net_id].data(), + ctx.netlist_pin_lookup, + ctx.timing_info, + ctx.pin_timing_invalidator, + ctx.budgeting_inf, + ctx.worst_negative_slack, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat); + + profiling::net_fanout_end(ctx.net_list.net_sinks(net_id).size()); + + /* Impossible to route? 
(disconnected rr_graph) */ + if (flags.success) { + route_ctx.net_status.set_is_routed(net_id, true); + } else { + VTR_LOG("Routing failed for net %d\n", net_id); } + flags.was_rerouted = true; //Flag to record whether routing was actually changed return flags; } @@ -889,36 +935,15 @@ void route_partition_tree_helper(tbb::task_group& g, return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); }); - node.is_routable = true; - node.rerouted_nets.clear(); - vtr::Timer t; for (auto net_id : node.nets) { - auto flags = try_parallel_route_net( - ctx.routers.local(), - ctx.net_list, - net_id, - ctx.itry, - ctx.pres_fac, - ctx.router_opts, - ctx.connections_inf, - ctx.router_stats.local(), - ctx.route_structs.local().pin_criticality, - ctx.net_delay, - ctx.netlist_pin_lookup, - ctx.timing_info, - ctx.pin_timing_invalidator, - ctx.budgeting_inf, - ctx.worst_negative_slack, - ctx.routing_predictor, - ctx.choking_spots[net_id], - ctx.is_flat); + auto flags = try_parallel_route_net(net_id, ctx); if (!flags.success && !flags.retry_with_full_bb) { - node.is_routable = false; + ctx.is_routable = false; } if (flags.was_rerouted) { - node.rerouted_nets.push_back(net_id); + ctx.rerouted_nets.local().push_back(net_id); } /* If we need to retry this net with full-device BB, it will go up to the top * of the tree, so remove it from this node and keep track of it */ @@ -943,18 +968,6 @@ void route_partition_tree_helper(tbb::task_group& g, } } -/** Reduce results from partition tree into a single RouteIterResults */ -static void reduce_partition_tree_helper(const PartitionTreeNode& node, RouteIterResults& results) { - results.is_routable &= node.is_routable; - const std::vector& rerouted = node.rerouted_nets; - results.rerouted_nets.insert(results.rerouted_nets.end(), rerouted.begin(), rerouted.end()); - - if (node.left) - reduce_partition_tree_helper(*node.left, results); - if (node.right) - reduce_partition_tree_helper(*node.right, results); -} - /** Route all nets in parallel using the partitioning information in the PartitionTree. * * @param[in, out] g TBB task group to dispatch tasks. 
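Aside: the dispatch pattern this doc comment describes is a plain recursive walk, routing the cutline nets of a node before its two independent halves are handed to the task group. A self-contained sketch (illustrative Node type, not part of the patch):

    #include <memory>
    #include "tbb/task_group.h"

    struct Node {
        std::unique_ptr<Node> left, right;
        void route_nets() { /* work for this region */ }
    };

    void route_tree(tbb::task_group& g, Node& node) {
        node.route_nets(); // nets crossing the cutline: must finish before both halves
        if (node.left && node.right) {
            g.run([&] { route_tree(g, *node.left); });
            g.run([&] { route_tree(g, *node.right); });
        }
    }

    // usage: tbb::task_group g; route_tree(g, root); g.wait();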
@@ -991,7 +1004,9 @@ RouteIterResults route_partition_tree(tbb::task_group& g, } RouteIterResults out; - reduce_partition_tree_helper(tree.root(), out); + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + out.rerouted_nets.insert(out.rerouted_nets.begin(), thread_rerouted_nets.begin(), thread_rerouted_nets.end()); + } for (auto& thread_stats : ctx.router_stats) { update_router_stats(out.stats, thread_stats); } @@ -1029,8 +1044,6 @@ static RouteIterResults route_without_partition_tree(std::vector& n ctx.router_opts, ctx.connections_inf, ctx.router_stats.local(), - ctx.route_structs.local().pin_criticality, - ctx.route_structs.local().rt_node_of_sink, ctx.net_delay, ctx.netlist_pin_lookup, ctx.timing_info, @@ -1042,7 +1055,7 @@ static RouteIterResults route_without_partition_tree(std::vector& n ctx.is_flat); if (!flags.success) { - out.is_routable = false; + ctx.is_routable = false; } if (flags.was_rerouted) { out.rerouted_nets.push_back(net_id); @@ -1054,4 +1067,1118 @@ static RouteIterResults route_without_partition_tree(std::vector& n return out; } +tbb::enumerable_thread_specific nets_too_deep = 0; +tbb::enumerable_thread_specific nets_clock = 0; +tbb::enumerable_thread_specific nets_retry_limit = 0; +tbb::enumerable_thread_specific nets_thin_strip = 0; +tbb::enumerable_thread_specific nets_cut_thin_strip = 0; +tbb::enumerable_thread_specific nets_few_fanouts = 0; +tbb::enumerable_thread_specific nets_set_to_decompose = 0; + +/** Get all "sink pin indices" for a given VirtualNet. We often work with that + * index, because it is used in a lot of lookups and is impossible to get back once + * converted to a ParentPinId or RRNodeId. */ +int get_vnet_num_sinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + size_t parent_num_sinks = route_ctx.route_trees[vnet.net_id]->num_sinks(); + int out = 0; + /* 1-indexed. Yes, I know... */ + for (size_t isink = 1; isink <= parent_num_sinks; ++isink) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out++; + } + return out; +} + +/** Should we decompose this net? We should probably leave it alone if: + * - it's a clock net + * - we decomposed nets for enough levels and should have good thread utilization by now + * - decomposing this net doesn't result in any parallelism + * - TODO: Don't decompose nets with full-device bounding box (don't want to clip their BB) */ +template +bool should_decompose_net(ParentNetId net_id, const PartitionTreeNode& node, const RouteIterCtx& ctx) { + /* Node doesn't have branches */ + if (!node.left || !node.right) + return false; + /* Clock net */ + if (ctx.net_list.net_is_global(net_id) && ctx.router_opts.two_stage_clock_routing){ + nets_clock.local()++; + return false; + } + /* Decomposition is disabled for net */ + if (ctx.is_decomp_disabled[net_id]){ + nets_retry_limit.local()++; + return false; + } + /* We are past the iteration to try decomposition */ + if (ctx.itry > MAX_DECOMP_ITER){ + nets_retry_limit.local()++; + return false; + } + int num_sinks = ctx.net_list.net_sinks(net_id).size(); + if(num_sinks < 8){ + nets_few_fanouts.local()++; + return false; + } + + nets_set_to_decompose.local()++; + return true; +} + +/** Should we decompose this vnet? 
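Aside: which_side() is used heavily below but defined elsewhere in this patch. A sketch consistent with its uses here, assuming the (xlow, ylow) reference point of inside_bb and cutlines sitting at cutline_pos + 0.5 (names are illustrative):

    // Which side of the cutline is this coordinate on?
    // LEFT means "below" for a Y-axis cutline; cutlines sit between
    // integral coordinates, at cutline_pos + 0.5.
    enum class SideSketch { LEFT, RIGHT };

    SideSketch which_side_sketch(int x, int y, int cutline_pos, bool x_axis) {
        int coord = x_axis ? x : y;
        return (coord <= cutline_pos) ? SideSketch::LEFT : SideSketch::RIGHT;
    }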
*/ +bool should_decompose_vnet(const VirtualNet& vnet, const PartitionTreeNode& node) { + /* Node doesn't have branches */ + if (!node.left || !node.right) + return false; + + if(vnet.times_decomposed >= MAX_DECOMP_DEPTH) + return false; + + /* Cutline doesn't go through vnet (a valid case: it wasn't there when partition tree was being built) */ + if(node.cutline_axis == Axis::X){ + if(vnet.clipped_bb.xmin > node.cutline_pos || vnet.clipped_bb.xmax < node.cutline_pos) + return false; + }else{ + if(vnet.clipped_bb.ymin > node.cutline_pos || vnet.clipped_bb.ymax < node.cutline_pos) + return false; + } + + int num_sinks = get_vnet_num_sinks(vnet); + if(num_sinks < 8){ + nets_few_fanouts.local()++; + return false; + } + + nets_set_to_decompose.local()++; + return true; +} + +/** Clip bb to one side of the cutline given the axis and position of the cutline. + * Note that cutlines are assumed to be at axis = cutline_pos + 0.5. */ +t_bb clip_to_side(const t_bb& bb, Axis axis, int cutline_pos, Side side) { + t_bb out = bb; + if (axis == Axis::X && side == Side::LEFT) + out.xmax = cutline_pos; + else if (axis == Axis::X && side == Side::RIGHT) + out.xmin = cutline_pos + 1; + else if (axis == Axis::Y && side == Side::LEFT) + out.ymax = cutline_pos; + else if (axis == Axis::Y && side == Side::RIGHT) + out.ymin = cutline_pos + 1; + else + VTR_ASSERT_MSG(false, "Unreachable"); + return out; +} + +/** Break a net into two given the partition tree node and virtual source. + * @param net_id: The net in question. + * @param node: The PartitionTreeNode which owns this net, fully or partially. + * @param virtual_source: The source node. Virtual source for the sink side, real source for the source side. + * @param sink_side: Which side of the cutline has the virtual source? + * @return Left and right halves of the net as VirtualNets. */ +std::tuple make_decomposed_pair(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + auto& route_ctx = g_vpr_ctx.routing(); + + Side source_side = which_side(route_ctx.route_trees[net_id]->root().inode, cutline_pos, cutline_axis); + VirtualNet source_half, sink_half; + t_bb bb = route_ctx.route_bb[net_id]; + source_half.net_id = net_id; + source_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, source_side); + sink_half.net_id = net_id; + sink_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, !source_side); + source_half.times_decomposed = 1; + sink_half.times_decomposed = 1; + if (source_side == Side::RIGHT) + return std::make_tuple(sink_half, source_half); + else + return std::make_tuple(source_half, sink_half); +} + +/** Does the current routing of \p net_id cross the cutline at cutline_axis = cutline_pos? */ +bool is_routing_over_cutline(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + RRNodeId rr_source = tree.root().inode; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + + for (auto isink : tree.get_reached_isinks()) { + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + Side sink_side = which_side(rr_sink, cutline_pos, cutline_axis); + if (source_side != sink_side) + return true; + } + + return false; +} + +/** Is \p inode too close to this cutline? 
+ * We assign some "thickness" to the node and check for collision */ +bool is_close_to_cutline(RRNodeId inode, int cutline_pos, Axis cutline_axis, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + /* Cutlines are considered to be at x + 0.5, set a thickness of +1 here by checking for equality */ + if(cutline_axis == Axis::X){ + return rr_graph.node_xlow(inode) - thickness <= cutline_pos && rr_graph.node_xhigh(inode) + thickness >= cutline_pos; + } else { + return rr_graph.node_ylow(inode) - thickness <= cutline_pos && rr_graph.node_yhigh(inode) + thickness >= cutline_pos; + } +} + +/** Is \p inode too close to this bb? (Assuming it's inside) + * We assign some "thickness" to the node and check for collision */ +bool is_close_to_bb(RRNodeId inode, const t_bb& bb, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + int xlow = rr_graph.node_xlow(inode) - thickness; + int ylow = rr_graph.node_ylow(inode) - thickness; + int xhigh = rr_graph.node_xhigh(inode) + thickness; + int yhigh = rr_graph.node_yhigh(inode) + thickness; + + return (xlow <= bb.xmin && xhigh >= bb.xmin) + || (ylow <= bb.ymin && yhigh >= bb.ymin) + || (xlow <= bb.xmax && xhigh >= bb.xmax) + || (ylow <= bb.ymax && yhigh >= bb.ymax); +} + +/** Is this net divided very unevenly? If so, put all sinks in the small side into \p out and return true */ +bool get_reduction_isinks(ParentNetId net_id, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + int num_sinks = tree.num_sinks(); + std::vector sinks; + int all_sinks = 0; + + Side source_side = which_side(tree.root().inode, cutline_pos, cutline_axis); + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb sink_side_bb = clip_to_side(net_bb, cutline_axis, cutline_pos, !source_side); + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Get sinks on the sink side */ + for(int isink=1; isink +std::vector get_decomposition_isinks(ParentNetId net_id, int cutline_pos, Axis cutline_axis, const RouteIterCtx& ctx) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + + // std::vector sampled = convex_hull_downsample(net_id); + // std::vector sampled = sample_single_sink(net_id, pin_criticality, cutline_pos, cutline_axis); + + std::set sampled_set; + + /* Sometimes cutlines divide a net very unevenly. In that case, just route to all + * sinks in the small side and unblock. Stick with convex hull sampling if source + * is close to cutline. */ + bool is_reduced = get_reduction_isinks(net_id, cutline_pos, cutline_axis, sampled_set); + bool source_on_cutline = is_close_to_cutline(tree.root().inode, cutline_pos, cutline_axis, 1); + if(!is_reduced || source_on_cutline) + convex_hull_downsample(net_id, sampled_set); + + auto& is_isink_reached = tree.get_is_isink_reached(); + + /* Always sample "known samples": sinks known to fail to route */ + for(int isink: ctx.net_known_samples[net_id]){ + if(is_isink_reached[isink]) + continue; + + sampled_set.insert(isink); + } + + /* Sample if a sink is too close to the cutline (and unreached). + * Those sinks are likely to fail routing */ + for(size_t isink=1; isink out(sampled_set.begin(), sampled_set.end()); + + return out; +} + +/** Get all "sink pin indices" for a given VirtualNet. 
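Aside: convex_hull_downsample() comes from route_samplers.h, added by this patch but outside this excerpt. The idea is to route only the sinks on the convex hull of the sink locations first, so a few extreme connections stake out the net before decomposition. A standalone sketch of such a sampler using Andrew's monotone chain (illustrative Pt type, not the actual route_samplers code):

    #include <algorithm>
    #include <vector>

    struct Pt { int x, y, isink; };

    static long cross(const Pt& o, const Pt& a, const Pt& b) {
        return long(a.x - o.x) * (b.y - o.y) - long(a.y - o.y) * (b.x - o.x);
    }

    // Return the subset of points on the convex hull (monotone chain).
    std::vector<Pt> convex_hull(std::vector<Pt> pts) {
        std::sort(pts.begin(), pts.end(), [](const Pt& a, const Pt& b) {
            return a.x < b.x || (a.x == b.x && a.y < b.y);
        });
        if (pts.size() < 3) return pts;
        std::vector<Pt> hull(2 * pts.size());
        size_t k = 0;
        for (size_t i = 0; i < pts.size(); i++) { // lower hull
            while (k >= 2 && cross(hull[k - 2], hull[k - 1], pts[i]) <= 0) k--;
            hull[k++] = pts[i];
        }
        for (size_t i = pts.size() - 1, t = k + 1; i-- > 0;) { // upper hull
            while (k >= t && cross(hull[k - 2], hull[k - 1], pts[i]) <= 0) k--;
            hull[k++] = pts[i];
        }
        hull.resize(k - 1);
        return hull;
    }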
We often work with that + * index, because it is used in a lot of lookups and is impossible to get back once + * converted to a ParentPinId or RRNodeId. */ +std::vector get_vnet_isinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + size_t num_sinks = route_ctx.route_trees[vnet.net_id]->num_sinks(); + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + /* 1-indexed. Yes, I know... */ + for (size_t isink = 1; isink <= num_sinks; ++isink) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + +/** Break a vnet into two from the cutline. */ +std::tuple make_decomposed_pair_from_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + VirtualNet left_half, right_half; + left_half.net_id = vnet.net_id; + left_half.clipped_bb = clip_to_side(vnet.clipped_bb, cutline_axis, cutline_pos, Side::LEFT); + right_half.net_id = vnet.net_id; + right_half.clipped_bb = clip_to_side(vnet.clipped_bb, cutline_axis, cutline_pos, Side::RIGHT); + left_half.times_decomposed = vnet.times_decomposed + 1; + right_half.times_decomposed = vnet.times_decomposed + 1; + return std::make_tuple(left_half, right_half); +} + +/* Is this net divided very unevenly? If so, put all sinks in the small side into out. + * Since this is a vnet, there's a chance that both sides are small: then return all sinks */ +int get_reduction_isinks_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + int num_sinks = tree.num_sinks(); + const t_bb& net_bb = vnet.clipped_bb; + + t_bb left_side = clip_to_side(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_side = clip_to_side(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + auto& is_isink_reached = tree.get_is_isink_reached(); + + int reduced_sides = 0; + + for(const t_bb& side_bb: {left_side, right_side}){ + std::vector sinks; + int all_sinks = 0; + + const int MIN_WIDTH = 10; + int W = side_bb.xmax - side_bb.xmin + 1; + int H = side_bb.ymax - side_bb.ymin + 1; + bool is_narrow = (W < MIN_WIDTH || H < MIN_WIDTH); + bool should_reduce = true; + + const int MIN_SINKS = 4; + + for(int isink=1; isink MIN_SINKS){ + should_reduce = false; + break; + } + } + + if(!should_reduce) /* We found enough sinks and the box is not narrow */ + continue; + + /* Either we have a narrow box, or too few unique sink locations. 
Just route to every sink on this side */ + out.insert(sinks.begin(), sinks.end()); + reduced_sides++; + } + + return reduced_sides; +} + +/** Reduce only one side if vnet has source */ +bool get_reduction_isinks_vnet_with_source(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + int num_sinks = tree.num_sinks(); + std::vector sinks; + int all_sinks = 0; + + Side source_side = which_side(tree.root().inode, cutline_pos, cutline_axis); + const t_bb& net_bb = vnet.clipped_bb; + t_bb sink_side_bb = clip_to_side(net_bb, cutline_axis, cutline_pos, !source_side); + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Get sinks on the sink side */ + for(int isink=1; isink get_decomposition_isinks_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + std::set sampled_set; + + /* Sometimes cutlines divide a net very unevenly. In that case, just route to all + * sinks in the small side and unblock. Add convex hull since we are in a vnet which + * may not have a source at all */ + if(inside_bb(tree.root().inode, vnet.clipped_bb)){ /* We have source, no need to sample after reduction in most cases */ + bool is_reduced = get_reduction_isinks_vnet_with_source(vnet, cutline_pos, cutline_axis, sampled_set); + bool source_on_cutline = is_close_to_cutline(tree.root().inode, cutline_pos, cutline_axis, 1); + if(!is_reduced || source_on_cutline) + convex_hull_downsample_vnet(vnet, sampled_set); + }else{ + int reduced_sides = get_reduction_isinks_vnet(vnet, cutline_pos, cutline_axis, sampled_set); + if(reduced_sides < 2){ + convex_hull_downsample_vnet(vnet, sampled_set); + } + } + + std::vector isinks = get_vnet_isinks(vnet); + auto& is_isink_reached = tree.get_is_isink_reached(); + + /* Sample if a sink is too close to the cutline (and unreached). + * Those sinks are likely to fail routing */ + for(int isink: isinks){ + if(is_isink_reached[isink]) + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if(is_close_to_cutline(rr_sink, cutline_pos, cutline_axis, 1)){ + sampled_set.insert(isink); + continue; + } + if(is_close_to_bb(rr_sink, vnet.clipped_bb, 1)) + sampled_set.insert(isink); + } + + std::vector out(sampled_set.begin(), sampled_set.end()); + return out; +} + +/** Decompose a net into a pair of nets. */ +template +vtr::optional> route_and_decompose(ParentNetId net_id, const PartitionTreeNode& node, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + unsigned int num_sinks = ctx.net_list.net_sinks(net_id).size(); + + /* We don't have to route this net, so why bother decomposing it? */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) + return vtr::nullopt; + + setup_routing_resources( + ctx.itry, + net_id, + ctx.net_list, + num_sinks, + ctx.router_opts.min_incremental_reroute_fanout, + ctx.connections_inf, + ctx.router_opts, + check_hold(ctx.router_opts, ctx.worst_negative_slack)); + + VTR_ASSERT(route_ctx.route_trees[net_id]); + RouteTree& tree = route_ctx.route_trees[net_id].value(); + + bool high_fanout = is_high_fanout(num_sinks, ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. 
+ * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. */ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + net_id, + tree.root()); + } + + /* Get the isinks to actually route to */ + std::vector isinks_to_route = get_decomposition_isinks(net_id, node.cutline_pos, node.cutline_axis, ctx); + + /* Get pin criticalities */ + std::vector pin_criticality(num_sinks + 1); + + for (int isink : isinks_to_route) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(net_id, isink); + pin_criticality[isink] = get_net_pin_criticality(ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + net_id, + pin, + ctx.is_flat); + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + /* Sort wrt criticality */ + std::sort(isinks_to_route.begin(), isinks_to_route.end(), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules + * TODO: Not sure what this does and if it's safe to call in parallel */ + update_rr_base_costs(num_sinks); + + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + for (int isink : isinks_to_route) { + /* Fill the necessary forms to route to this sink. */ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + enable_router_debug(ctx.router_opts, net_id, rr_sink, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of net %zu for decomposition\n", size_t(rr_sink), size_t(net_id)); + + /* Route to this sink. */ + NetResultFlags sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + net_id, + 0, /* itarget: only used for debug, so we can lie here */ + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat, + route_ctx.route_bb[net_id]); + + if (!sink_flags.success) /* Couldn't route. It's too much work to backtrack from here, just fail. */ + return vtr::nullopt; + + /* Fill the required forms after routing a connection. 
*/ + ++ctx.router_stats.local().connections_routed; + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[net_id].data(), + tree, + isink, + ctx.net_list, + net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } + + if (ctx.router_opts.update_lower_bound_delays) { + for (int ipin : isinks_to_route) { + ctx.connections_inf.update_lower_bound_connection_delay(net_id, ipin, ctx.net_delay[net_id][ipin]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + return make_decomposed_pair(net_id, node.cutline_pos, node.cutline_axis); +} + +/** Get all "remaining sink pin indices" for a given VirtualNet. For regular nets + * you can get it from the route tree, but we need to spatially filter it here. */ +std::vector get_vnet_remaining_isinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + for (size_t isink : tree.get_remaining_isinks()) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + + +/** Decompose a net into a pair of nets. */ +template +vtr::optional> route_and_decompose_vnet(const VirtualNet& vnet, const PartitionTreeNode& node, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + unsigned int num_sinks = get_vnet_num_sinks(vnet); + RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + /* Get the isinks to actually route to */ + std::vector isinks_to_route = get_decomposition_isinks_vnet(vnet, node.cutline_pos, node.cutline_axis); + + if(isinks_to_route.size() == 0) /* All the sinks we were going to route are already reached -- just break down the net */ + return make_decomposed_pair_from_vnet(vnet, node.cutline_pos, node.cutline_axis); + + /* Get pin criticalities */ + std::vector pin_criticality(tree.num_sinks() + 1); + + for (int isink : isinks_to_route) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(vnet.net_id, isink); + pin_criticality[isink] = get_net_pin_criticality(ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + vnet.net_id, + pin, + ctx.is_flat); + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + /* Sort wrt criticality */ + std::sort(isinks_to_route.begin(), isinks_to_route.end(), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + bool high_fanout = is_high_fanout(tree.num_sinks(), ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. 
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + vnet.net_id, + tree.root()); + } + + /* Update base costs according to fanout and criticality rules + * TODO: Not sure what this does and if it's safe to call in parallel */ + update_rr_base_costs(num_sinks); + + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + for (int isink : isinks_to_route) { + /* Fill the necessary forms to route to this sink. */ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(vnet.net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(vnet.net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(vnet.net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(vnet.net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + enable_router_debug(ctx.router_opts, vnet.net_id, rr_sink, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of net %zu for decomposition\n", size_t(rr_sink), size_t(vnet.net_id)); + + /* Route to this sink. */ + NetResultFlags sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + vnet.net_id, + 0, /* itarget: only used for debug, so we can lie here */ + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[vnet.net_id], + ctx.is_flat, + vnet.clipped_bb); + + if (!sink_flags.success) /* Couldn't route. It's too much work to backtrack from here, just fail. */ + return vtr::nullopt; + + /* Fill the required forms after routing a connection. */ + ++ctx.router_stats.local().connections_routed; + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[vnet.net_id].data(), + tree, + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } + + if (ctx.router_opts.update_lower_bound_delays) { + for (int ipin : isinks_to_route) { + ctx.connections_inf.update_lower_bound_connection_delay(vnet.net_id, ipin, ctx.net_delay[vnet.net_id][ipin]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + return make_decomposed_pair_from_vnet(vnet, node.cutline_pos, node.cutline_axis); +} + + +/* Goes through all the sinks of this virtual net and copies their delay values from + * the route_tree to the net_delay array. 
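Aside: route_and_decompose, route_and_decompose_vnet and route_virtual_net all share one inner shape: score each sink's criticality, route sinks most-critical-first, and stop at the first failure. Stripped of the VPR bookkeeping, the skeleton is (all names illustrative):

    #include <algorithm>
    #include <vector>

    struct SinkRouteResult { bool success; };

    template<typename RouteFn>
    bool route_sinks_by_criticality(std::vector<int> isinks,
                                    const std::vector<float>& pin_criticality,
                                    RouteFn route_one_sink) {
        // Most critical connections get first pick of the routing resources.
        std::sort(isinks.begin(), isinks.end(), [&](int a, int b) {
            return pin_criticality[a] > pin_criticality[b];
        });
        for (int isink : isinks) {
            SinkRouteResult r = route_one_sink(isink);
            if (!r.success)
                return false; // too much work to backtrack from here, just fail
        }
        return true;
    }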
*/ +template +static void update_net_delays_from_vnet(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.routing(); + std::vector sinks = get_vnet_isinks(vnet); + + for (int isink : sinks) { + update_net_delay_from_isink( + ctx.net_delay[vnet.net_id].data(), + *route_ctx.route_trees[vnet.net_id], + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } +} + +inline std::string describe_bbox(const t_bb& bb){ + return std::to_string(bb.xmin) + "," + std::to_string(bb.ymin) + + "x" + std::to_string(bb.xmax) + "," + std::to_string(bb.ymax); +} + +inline std::string describe_rr_coords(RRNodeId inode){ + auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + return std::to_string(rr_graph.node_xlow(inode)) + + "," + std::to_string(rr_graph.node_ylow(inode)) + + " -> " + std::to_string(rr_graph.node_xhigh(inode)) + + "," + std::to_string(rr_graph.node_yhigh(inode)); +} + +/** Build a string describing \p vnet and its existing routing */ +inline std::string describe_vnet(const VirtualNet& vnet){ + const auto& route_ctx = g_vpr_ctx.routing(); + + std::string out = ""; + out += "Virtual net with bbox " + describe_bbox(vnet.clipped_bb) + + " parent net: " + std::to_string(size_t(vnet.net_id)) + + " parent bbox: " + describe_bbox(route_ctx.route_bb[vnet.net_id]) + "\n"; + + RRNodeId source_rr = route_ctx.net_rr_terminals[vnet.net_id][0]; + out += "source: " + describe_rr_coords(source_rr) + ", sinks:"; + for(size_t i=1; iall_nodes(); + for(auto it = all_nodes.begin(); it != all_nodes.end(); ++it){ + if((*it).is_leaf()) { + out += describe_rr_coords((*it).inode) + " END "; + ++it; + if(it == all_nodes.end()) + break; + out += describe_rr_coords((*it).parent()->inode) + " -> "; + out += describe_rr_coords((*it).inode) + " -> "; + } else { + out += describe_rr_coords((*it).inode) + " -> "; + } + } + out += "\n"; + + return out; +} + +/** Build a logarithmic net fanouts histogram */ +std::string describe_fanout_histogram(void){ + const auto& route_ctx = g_vpr_ctx.routing(); + std::vector bins(6); + for(size_t i=0; i(vtr::log2_floor(F), 5); + bins[bin]++; + } + std::string out = "Log fanout histogram:"; + for(int f: bins){ + out += " " + std::to_string(f); + } + out += "\n"; + return out; +} + +/** Route a VirtualNet, which is a portion of a net with a clipped bounding box + * and maybe a virtual source. */ +template +NetResultFlags route_virtual_net(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + std::vector sinks = get_vnet_isinks(vnet); + NetResultFlags flags; + + VTR_ASSERT(route_ctx.route_trees[vnet.net_id]); + RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + /* Use vnet sinks to trigger high fanout code */ + bool high_fanout = is_high_fanout(tree.num_sinks(), ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. 
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + vnet.net_id, + tree.root()); + } + + std::vector remaining_isinks = get_vnet_remaining_isinks(vnet); + + std::vector pin_criticality(tree.num_sinks() + 1); + + /* Sort by decreasing criticality */ + for (int isink : remaining_isinks) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(vnet.net_id, isink); + pin_criticality[isink] = get_net_pin_criticality( + ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + vnet.net_id, + pin, + ctx.is_flat); + + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + // compare the criticality of different sink nodes + sort(begin(remaining_isinks), end(remaining_isinks), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules (TODO: I'm super sure this is not thread safe) */ + update_rr_base_costs(sinks.size()); + + /* Set up the tax forms for routing nets */ + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + /* This isn't exactly thread safe, but here both threads routing this net would be setting this to the same value */ + if (ctx.budgeting_inf.if_set()) { + ctx.budgeting_inf.set_should_reroute(vnet.net_id, false); + } + + /* Route sinks in decreasing order of criticality */ + for (unsigned itarget = 0; itarget < remaining_isinks.size(); ++itarget) { + int isink = remaining_isinks[itarget]; + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + enable_router_debug(ctx.router_opts, vnet.net_id, sink_rr, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of decomposed net %zu, clipped bbox = %d,%d - %d,%d\n", + size_t(sink_rr), size_t(vnet.net_id), vnet.clipped_bb.xmin, vnet.clipped_bb.ymin, vnet.clipped_bb.xmax, vnet.clipped_bb.ymax); + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(vnet.net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(vnet.net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(vnet.net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(vnet.net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + profiling::conn_start(); + + auto sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + vnet.net_id, + itarget, + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[vnet.net_id], + ctx.is_flat, + vnet.clipped_bb); + + flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; + + /* Give up for vnet if we failed to route a sink, since it's likely we will fail others as well. 
*/ + if (!sink_flags.success) { + PartitionTreeDebug::log("Failed to route sink " + std::to_string(isink - 1) + " in decomposed net:\n" + describe_vnet(vnet)); + ctx.net_known_samples[vnet.net_id].push_back(isink); + flags.success = false; + //continue; + return flags; + } + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[vnet.net_id].data(), + tree, + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + + if (ctx.router_opts.update_lower_bound_delays) + ctx.connections_inf.update_lower_bound_connection_delay(vnet.net_id, isink, ctx.net_delay[vnet.net_id][isink]); + + profiling::conn_finish(size_t(route_ctx.net_rr_terminals[vnet.net_id][0]), + size_t(sink_rr), + pin_criticality[isink]); + + ++ctx.router_stats.local().connections_routed; + } // finished all sinks + + /* Return early if we failed to route some sinks */ + if(!flags.success) + return flags; + + ++ctx.router_stats.local().nets_routed; + profiling::net_finish(); + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + flags.success = true; + return flags; +} + +/* Helper for decompose_route_partition_tree(). */ +template +void decompose_route_partition_tree_helper(tbb::task_group& g, + PartitionTreeNode& node, + RouteIterCtx& ctx, + int level) { + vtr::Timer t; + + nets_too_deep.local() = 0; + nets_clock.local() = 0; + nets_retry_limit.local() = 0; + nets_thin_strip.local() = 0; + nets_cut_thin_strip.local() = 0; + nets_few_fanouts.local() = 0; + nets_set_to_decompose.local() = 0; + + /* Sort so net with most sinks is routed first. + * We want to interleave virtual nets with regular ones, so sort an "index vector" + * instead where indices >= node.nets.size() refer to node.virtual_nets. */ + std::vector order(node.nets.size() + node.virtual_nets.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t i, size_t j) -> bool { + ParentNetId id1 = i < node.nets.size() ? node.nets[i] : node.virtual_nets[i - node.nets.size()].net_id; + ParentNetId id2 = j < node.nets.size() ? node.nets[j] : node.virtual_nets[j - node.nets.size()].net_id; + return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); + }); + + /* Route virtual or regular nets, interleaved */ + for(size_t i: order){ + if(i < node.nets.size()){ // regular net + ParentNetId net_id = node.nets[i]; + /* Should I decompose this net? */ + if (should_decompose_net(net_id, node, ctx)) { + auto decomposed_nets = route_and_decompose(net_id, node, ctx); + if (decomposed_nets) { + auto& [left, right] = decomposed_nets.value(); + node.left->virtual_nets.push_back(left); + node.right->virtual_nets.push_back(right); + /* We changed the routing */ + ctx.rerouted_nets.local().push_back(net_id); + continue; /* We are done with this net */ + } + } + /* If not, route it here */ + auto flags = try_parallel_route_net(net_id, ctx); + + if (!flags.success && !flags.retry_with_full_bb) { + ctx.is_routable = false; + } + if (flags.was_rerouted) { + ctx.rerouted_nets.local().push_back(net_id); + } + if (flags.retry_with_full_bb) { + ctx.nets_to_retry.push_back(net_id); + } + } else { // virtual net + VirtualNet& vnet = node.virtual_nets[i - node.nets.size()]; + /* Should we decompose this vnet? 
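The loop above interleaves regular and virtual nets by sorting a single index vector, where indices past node.nets.size() refer to virtual nets. The same trick in isolation, with plain fanout counts standing in for the two net lists:

    #include <algorithm>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> net_fanouts = {4, 40}; // regular nets
        std::vector<int> vnet_fanouts = {10};   // virtual nets
        std::vector<size_t> order(net_fanouts.size() + vnet_fanouts.size());
        std::iota(order.begin(), order.end(), 0);
        std::sort(order.begin(), order.end(), [&](size_t i, size_t j) {
            int fi = i < net_fanouts.size() ? net_fanouts[i] : vnet_fanouts[i - net_fanouts.size()];
            int fj = j < net_fanouts.size() ? net_fanouts[j] : vnet_fanouts[j - net_fanouts.size()];
            return fi > fj; // most sinks first
        });
        // order is now {1, 2, 0}: regular net 1, the virtual net, then regular net 0
    }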
*/ + if (should_decompose_vnet(vnet, node)) { + auto decomposed_nets = route_and_decompose_vnet(vnet, node, ctx); + if (decomposed_nets) { + auto& [left, right] = decomposed_nets.value(); + node.left->virtual_nets.push_back(left); + node.right->virtual_nets.push_back(right); + continue; + } + } + /* Otherwise, route it here. + * We don't care about flags, if there's something truly wrong, + * it will get discovered when decomposition is disabled */ + route_virtual_net(vnet, ctx); + } + } + + PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size()) + + " nets and " + std::to_string(node.virtual_nets.size()) + + " virtual nets routed in " + std::to_string(t.elapsed_sec()) + + " s (level=" + std::to_string(level) + ")"); + + PartitionTreeDebug::log("total: " + std::to_string(node.nets.size()) + + " nets_too_deep: " + std::to_string(nets_too_deep.local()) + + " nets_clock: " + std::to_string(nets_clock.local()) + + " nets_retry_limit: " + std::to_string(nets_retry_limit.local()) + + " nets_thin_strip: " + std::to_string(nets_thin_strip.local()) + + " nets_cut_thin_strip: " + std::to_string(nets_cut_thin_strip.local()) + + " nets_few_fanouts: " + std::to_string(nets_few_fanouts.local()) + + " nets_set_to_decompose: " + std::to_string(nets_set_to_decompose.local())); + + /* add left and right trees to task queue */ + if (node.left && node.right) { + /* Otherwise both try to change the same "level" and garble it */ + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.left, ctx, level + 1); + }); + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.right, ctx, level + 1); + }); + } else { + VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch + } +} + +/** Route all nets in parallel using the partitioning information in the PartitionTree. + * + * @param[in, out] g TBB task group to dispatch tasks. + * @param[in, out] tree The partition tree. Non-const reference because iteration results get written on the nodes. + * @param[in, out] ctx RouteIterCtx containing all the necessary bits of state for routing. + * @return RouteIterResults combined from all threads. + * + * See comments in PartitionTreeNode for how parallel routing works. 
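A stripped-down sketch of the recursive task dispatch this function family uses: every node routes its own work, then spawns both children into the same tbb::task_group, and a single wait() at the root collects all transitively spawned tasks. Node is an illustrative stand-in for PartitionTreeNode:

    #include <cstdio>
    #include <tbb/task_group.h>

    struct Node {
        Node* left = nullptr;
        Node* right = nullptr;
    };

    void route_node(tbb::task_group& g, Node& n, int level) {
        std::printf("routing node at level %d\n", level);
        if (n.left && n.right) { // spawn both children into the same group
            g.run([&, level] { route_node(g, *n.left, level + 1); });
            g.run([&, level] { route_node(g, *n.right, level + 1); });
        }
    }

    int main() {
        Node leaves[2];
        Node root{&leaves[0], &leaves[1]};
        tbb::task_group g;
        route_node(g, root, 0);
        g.wait(); // one wait at the root collects all transitively spawned tasks
    }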
*/ +template +RouteIterResults decompose_route_partition_tree(tbb::task_group& g, + PartitionTree& tree, + RouteIterCtx& ctx) { + auto& device_ctx = g_vpr_ctx.device(); + auto& route_ctx = g_vpr_ctx.mutable_routing(); + PartitionTreeDebug::log(describe_fanout_histogram()); + + ctx.nets_to_retry.clear(); + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + thread_rerouted_nets.clear(); + } + + /* Route all nets */ + decompose_route_partition_tree_helper(g, tree.root(), ctx, 0); + g.wait(); + + /* Grow the bounding box and set to not decompose if a net is set to retry */ + for (ParentNetId net_id : ctx.nets_to_retry) { + route_ctx.route_bb[net_id] = { + 0, + (int)(device_ctx.grid.width() - 1), + 0, + (int)(device_ctx.grid.height() - 1)}; + ctx.is_decomp_disabled[net_id] = true; + } + + RouteIterResults out; + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + out.rerouted_nets.insert(out.rerouted_nets.begin(), thread_rerouted_nets.begin(), thread_rerouted_nets.end()); + } + for (auto& thread_stats : ctx.router_stats) { + update_router_stats(out.stats, thread_stats); + } + return out; +} + +/* Build a partition tree and do a net-decomposing route with it */ +template +static RouteIterResults decompose_route_with_partition_tree(tbb::task_group& g, RouteIterCtx& ctx) { + vtr::Timer t2; + PartitionTree partition_tree(ctx.net_list, ctx.net_scores); + + float total_prep_time = t2.elapsed_sec(); + VTR_LOG("# Built partition tree in %f seconds\n", total_prep_time); + + return decompose_route_partition_tree(g, partition_tree, ctx); +} + #endif // VPR_USE_TBB diff --git a/vpr/src/route/route_samplers.cpp b/vpr/src/route/route_samplers.cpp new file mode 100644 index 00000000000..b8969d06507 --- /dev/null +++ b/vpr/src/route/route_samplers.cpp @@ -0,0 +1,69 @@ +/** Bulky geometry code for route_samplers.h + * TODO: Make the fns available in vtr_geometry.h? */ + +#include "route_samplers.h" + +/** Cross product of v0v1 and v0p */ +constexpr int det(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return (v1.x - v0.x) * (p.y - v0.y) - (v1.y - v0.y) * (p.x - v0.x); +} + +/** Which side of [v0, v1] has p? +1 is right, -1 is left */ +constexpr int which_side(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return det(p, v0, v1) > 0 ? 1 : -1; +} + +/** Perpendicular distance of p to v0v1 assuming |v0v1| = 1 + * (it's not, so only use to compare when v0 and v1 is the same for different p's) */ +constexpr int dist(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return abs(det(p, v0, v1)); +} + +/** Helper for quickhull() */ +void find_hull(std::set& out, const std::vector& points, const SinkPoint& v0, const SinkPoint& v1, int side){ + int max_dist = 0; + const SinkPoint* max_p = nullptr; + for(auto& point: points){ + if(which_side(point, v0, v1) != side){ + continue; + } + int h = dist(point, v0, v1); + if(h > max_dist){ + max_dist = h; + max_p = &point; + } + } + if(!max_p) /* no point */ + return; + out.insert(*max_p); + find_hull(out, points, v0, *max_p, -1); + find_hull(out, points, *max_p, v1, -1); +} + +/** Find convex hull. Doesn't work with <3 points. 
+ * See https://en.wikipedia.org/wiki/Quickhull */ +std::vector quickhull(const std::vector& points){ + if(points.size() < 3) + return std::vector(); + + std::set out; + + int min_x = std::numeric_limits::max(); + int max_x = std::numeric_limits::min(); + const SinkPoint* min_p, *max_p; + for(auto& point: points){ + if(point.x <= min_x){ + min_x = point.x; + min_p = &point; + } + if(point.x >= max_x){ + max_x = point.x; + max_p = &point; + } + } + out.insert(*min_p); + out.insert(*max_p); + find_hull(out, points, *min_p, *max_p, -1); + find_hull(out, points, *min_p, *max_p, 1); + return std::vector(out.begin(), out.end()); +} diff --git a/vpr/src/route/route_samplers.h b/vpr/src/route/route_samplers.h new file mode 100644 index 00000000000..f1d2a222a57 --- /dev/null +++ b/vpr/src/route/route_samplers.h @@ -0,0 +1,503 @@ +/** Sink downsamplers for parallel routing. + * + * These are used to get a "minimal skeleton routing" from the main task. + * Rest of the routing is delegated to child tasks. They will work with a + * strictly limited bounding box, so it's necessary that the initial routing + * provides enough hints while routing to as few sinks as possible to limit + * the serial bottleneck. */ +#pragma once + +#include +#include +#include +#include "globals.h" +#include "partition_tree.h" +#include "route_common.h" +#include "router_lookahead_sampling.h" + +/** Minimum bin size when spatially sampling decomposition sinks. (I know, doesn't make much sense.) + * The parallel router tries to decompose nets by building a "skeleton routing" from the main task + * and then delegating the remaining work to its child tasks. This minimum bin size determines how much + * time the main thread spends building the skeleton. + * Less is more effort -> less speedup, better quality. + * See get_decomposition_isinks() for more info. */ +constexpr size_t MIN_DECOMP_BIN_WIDTH = 5; + +/** Sink container for geometry operations */ +struct SinkPoint { + int x; + int y; + int isink; + + bool operator==(const SinkPoint& rhs) const { + return x == rhs.x && y == rhs.y; + } + bool operator<(const SinkPoint& rhs) const { + if(x < rhs.x) + return true; + if(x > rhs.x) + return false; + if(y < rhs.y) + return true; + if(y > rhs.y) + return false; + return isink < rhs.isink; + } +}; + +/** Find convex hull. Doesn't work with <3 points. + * See https://en.wikipedia.org/wiki/Quickhull */ +std::vector quickhull(const std::vector& points); + +/** Which side of the cutline is this RRNode? + * Cutlines are always assumed to be at cutline_axis = (cutline_pos + 0.5). + * In the context of the parallel router, a RR node is considered to be inside a bounding + * box if its top left corner (xlow, ylow) is inside it. */ +inline Side which_side(RRNodeId inode, int cutline_pos, Axis axis) { + auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + if (axis == Axis::X) { + return Side(rr_graph.node_xlow(inode) > cutline_pos); /* 1 is RIGHT */ + } else { + return Side(rr_graph.node_ylow(inode) > cutline_pos); + } +} + +/** Sample most critical sink in every MIN_DECOMP_BIN_WIDTH-wide bin. Bins are grown to absorb fractional bins. + * Skip a bin if already reached by existing routing. 
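For reference, the determinant side test that quickhull() above is built on, in self-contained form: the sign of the cross product of v0->v1 and v0->p tells which side of the segment p lies on:

    #include <cassert>

    struct Pt {
        int x, y;
    };

    constexpr int det(Pt p, Pt v0, Pt v1) {
        return (v1.x - v0.x) * (p.y - v0.y) - (v1.y - v0.y) * (p.x - v0.x);
    }

    int main() {
        Pt v0{0, 0}, v1{4, 0};
        assert(det(Pt{2, 3}, v0, v1) > 0);  // above the segment
        assert(det(Pt{2, -3}, v0, v1) < 0); // below the segment
        assert(det(Pt{2, 0}, v0, v1) == 0); // collinear
    }

find_hull() recurses with this test to keep only the points outside the current hull edge.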
*/ +inline std::vector min_voxel_downsample(ParentNetId net_id, const std::vector& remaining_targets) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector out; + + /* Set up sampling bins. If we are sampling from W = 22 with minimum width 6, then we have + * 3 bins and real width is 22/3 + 1 = 8. Then x=0 goes to bin 0, x=8 goes to bin 1 etc. */ + const t_bb& net_bb = route_ctx.route_bb[net_id]; + size_t width = net_bb.xmax - net_bb.xmin + 1; + size_t height = net_bb.ymax - net_bb.ymin + 1; + size_t bins_x = width / MIN_DECOMP_BIN_WIDTH; + size_t bins_y = height / MIN_DECOMP_BIN_WIDTH; + size_t samples_to_find = bins_x * bins_y; + size_t bin_width_x = width / bins_x + 1; + size_t bin_width_y = height / bins_y + 1; + + /* The sample for each bin, indexed by [x][y]. Set to -1 if reached by existing routing, + * 0 if not found yet. */ + std::vector> samples(bins_x, std::vector(bins_y)); + constexpr int REACHED = -1; + constexpr int NONE = 0; + + /* Mark bins with already reached sinks. */ + for (int isink : tree.get_reached_isinks()) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] != REACHED) { + samples[x][y] = REACHED; + samples_to_find--; + } + } + + /* Spatially sample remaining targets. This should be already sorted by pin criticality, + * so we sample the most critical sink in the bin right away. */ + for (int isink : remaining_targets) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] == NONE) { + samples[x][y] = isink; + out.push_back(isink); + samples_to_find--; + } + } + + return out; +} + +/** Sample sinks on the convex hull of the set {source + sinks}. Skip sinks if already reached. */ +inline void convex_hull_downsample(ParentNetId net_id, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][i]; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0) /* source */ + continue; + if(!is_isink_reached[point.isink]) + out.insert(point.isink); + } +} + +/** Clip bb to one side of the cutline given the axis and position of the cutline. + * Note that cutlines are assumed to be at axis = cutline_pos + 0.5. 
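The bin sizing arithmetic of min_voxel_downsample() above, worked through on the example from its comment (width 22 with minimum bin width 6 gives 3 bins of effective width 8):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t MIN_BIN_WIDTH = 6; // stand-in for MIN_DECOMP_BIN_WIDTH
        std::size_t width = 22;
        std::size_t bins_x = width / MIN_BIN_WIDTH;   // 3 bins
        std::size_t bin_width_x = width / bins_x + 1; // effective width 22/3 + 1 = 8
        std::size_t xs[] = {0, 7, 8, 21};
        for (std::size_t x : xs)
            std::printf("x=%zu -> bin %zu\n", x, x / bin_width_x); // bins 0, 0, 1, 2
    }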
*/ +inline t_bb clip_to_side2(const t_bb& bb, Axis axis, int cutline_pos, Side side) { + t_bb out = bb; + if (axis == Axis::X && side == Side::LEFT) + out.xmax = cutline_pos; + else if (axis == Axis::X && side == Side::RIGHT) + out.xmin = cutline_pos + 1; + else if (axis == Axis::Y && side == Side::LEFT) + out.ymax = cutline_pos; + else if (axis == Axis::Y && side == Side::RIGHT) + out.ymin = cutline_pos + 1; + else + VTR_ASSERT_MSG(false, "Unreachable"); + return out; +} + +inline int dist2(int x1, int y1, int x2, int y2){ + return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1); +} + +/** Sample one sink closest to each bbox's epicenter. The rationale is that the + * sinks around the cutline will be sampled by the sink thickness rule anyway. */ +inline void sample_both_epicenters(ParentNetId net_id, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + + int num_sinks = tree.num_sinks(); + auto& is_isink_reached = tree.get_is_isink_reached(); + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + int left_epi_x = (left_bb.xmin + left_bb.xmax) / 2; + int left_epi_y = (left_bb.ymin + left_bb.ymax) / 2; + int right_epi_x = (right_bb.xmin + right_bb.xmax) / 2; + int right_epi_y = (right_bb.ymin + right_bb.ymax) / 2; + int best_score_left = std::numeric_limits::max(); + int best_score_right = std::numeric_limits::max(); + int best_left_isink = 0; + int best_right_isink = 0; + + for(int isink=1; isink& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + int num_sinks = tree.num_sinks(); + auto& is_isink_reached = tree.get_is_isink_reached(); + const t_bb& net_bb = vnet.clipped_bb; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + int left_epi_x = (left_bb.xmin + left_bb.xmax) / 2; + int left_epi_y = (left_bb.ymin + left_bb.ymax) / 2; + int right_epi_x = (right_bb.xmin + right_bb.xmax) / 2; + int right_epi_y = (right_bb.ymin + right_bb.ymax) / 2; + int best_score_left = std::numeric_limits::max(); + int best_score_right = std::numeric_limits::max(); + int best_left_isink = 0; + int best_right_isink = 0; + + for(int isink=1; isink& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][i]; + if(!inside_bb(rr_sink, vnet.clipped_bb)) + continue; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0) /* source */ + continue; + if(!is_isink_reached[point.isink]) + 
out.insert(point.isink); + } +} + +/** Sample sinks on the *sink side* of the convex hull of the set {source + sinks}. + * Skip sinks if already reached. */ +inline std::vector half_convex_hull_downsample(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector out; + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][i]; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + RRNodeId rr_source = route_ctx.net_rr_terminals[net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0 || is_isink_reached[point.isink]) /* source or reached */ + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][point.isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) == source_side) /* on source side */ + continue; + out.push_back(point.isink); + } + + return out; +} + +/** Sample sinks on the *sink side* of the convex hull of the set {source + sinks}. + * Skip sinks if already reached. */ +inline std::vector half_convex_hull_downsample_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + std::vector out; + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][i]; + if(!inside_bb(rr_sink, vnet.clipped_bb)) + continue; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + RRNodeId rr_source = route_ctx.net_rr_terminals[vnet.net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0 || is_isink_reached[point.isink]) /* source or reached */ + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][point.isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) == source_side) /* on source side */ + continue; + out.push_back(point.isink); + } + + return out; +} + +/** Sample the most critical sink on the other side of the cutline. + * Sample nothing if that's already reached. 
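All of the half-hull samplers hinge on the cutline side convention: a cutline nominally at cutline_pos + 0.5 puts a node on the RIGHT exactly when its low coordinate exceeds cutline_pos. A minimal sketch (this Side enum mirrors the one assumed from partition_tree.h):

    #include <cassert>

    enum class Side { LEFT = 0, RIGHT = 1 };

    Side which_side(int xlow, int cutline_pos) {
        return xlow > cutline_pos ? Side::RIGHT : Side::LEFT;
    }

    int main() {
        assert(which_side(5, 5) == Side::LEFT);  // 5 < 5.5
        assert(which_side(6, 5) == Side::RIGHT); // 6 > 5.5
    }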
*/ +inline std::vector sample_single_sink(ParentNetId net_id, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + RRNodeId rr_source = route_ctx.net_rr_terminals[net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + for(int isink: isinks){ + if(is_isink_reached[isink]) + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) != source_side){ + if(is_isink_reached[isink]) + return {}; + else + return {isink}; + } + } + + return {}; +} + +inline bool is_close_to_cutline2(RRNodeId inode, int cutline_pos, Axis cutline_axis, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + /* Cutlines are considered to be at x + 0.5, set a thickness of +1 here by checking for equality */ + if(cutline_axis == Axis::X){ + return rr_graph.node_xlow(inode) - thickness <= cutline_pos && rr_graph.node_xhigh(inode) + thickness >= cutline_pos; + } else { + return rr_graph.node_ylow(inode) - thickness <= cutline_pos && rr_graph.node_yhigh(inode) + thickness >= cutline_pos; + } +} + +/** Is \p inode too close to this bb? (Assuming it's inside) + * We assign some "thickness" to the node and check for collision */ +inline bool is_close_to_bb2(RRNodeId inode, const t_bb& bb, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + int xlow = rr_graph.node_xlow(inode) - thickness; + int ylow = rr_graph.node_ylow(inode) - thickness; + int xhigh = rr_graph.node_xhigh(inode) + thickness; + int yhigh = rr_graph.node_yhigh(inode) + thickness; + + return (xlow <= bb.xmin && xhigh >= bb.xmin) + || (ylow <= bb.ymin && yhigh >= bb.ymin) + || (xlow <= bb.xmax && xhigh >= bb.xmax) + || (ylow <= bb.ymax && yhigh >= bb.ymax); +} + +/** Sample the most critical sinks on both sides. Omit reached sinks. 
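The proximity tests above inflate a node's extent by a thickness before checking for overlap; the cutline variant in isolation:

    #include <cassert>

    // Node spans [lo, hi]; inflate both ends by `thickness` and test the cutline.
    bool close_to_cutline(int lo, int hi, int cutline_pos, int thickness) {
        return lo - thickness <= cutline_pos && hi + thickness >= cutline_pos;
    }

    int main() {
        assert(close_to_cutline(7, 7, 5, 2));  // inflated span 5..9 touches 5
        assert(!close_to_cutline(9, 9, 5, 2)); // inflated span 7..11 misses 5
    }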
*/ +inline void sample_two_sinks(ParentNetId net_id, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + int left_isink = -1; + int right_isink = -1; + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + + for(int isink: isinks){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + if(is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3)) + continue; + if(inside_bb(rr_sink, left_bb)){ + left_isink = isink; + }else if(inside_bb(rr_sink, right_bb)){ + right_isink = isink; + } + if(left_isink > -1 && right_isink > -1) + break; + } + + if(left_isink > -1 && !is_isink_reached[left_isink]) + out.insert(left_isink); + if(right_isink > -1 && !is_isink_reached[right_isink]) + out.insert(right_isink); +} + +/** Sample the most critical sinks on both sides. Omit reached sinks. */ +inline void sample_two_sinks_vnet(const VirtualNet& vnet, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + int left_isink = -1; + int right_isink = -1; + const t_bb& net_bb = vnet.clipped_bb; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + + for(int isink: isinks){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if(inside_bb(rr_sink, left_bb) && !is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3) && !is_close_to_bb2(rr_sink, left_bb, 1)){ + left_isink = isink; + }else if(inside_bb(rr_sink, right_bb) && !is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3) && !is_close_to_bb2(rr_sink, right_bb, 1)){ + right_isink = isink; + } + if(left_isink > -1 && right_isink > -1) + break; + } + + if(left_isink > -1 && !is_isink_reached[left_isink]) + out.insert(left_isink); + if(right_isink > -1 && !is_isink_reached[right_isink]) + out.insert(right_isink); +} \ No newline at end of file diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp index 62930ad2555..059e80e69d4 100644 --- a/vpr/src/route/route_timing.cpp +++ b/vpr/src/route/route_timing.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -65,42 +66,6 @@ static int num_routing_failed = 0; /******************** Subroutines local to route_timing.cpp ********************/ -/** Attempt to route a single sink (target_pin) in a net. - * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree - * for this net. 
- * - * @param router The ConnectionRouter instance - * @param net_list Input netlist - * @param net_id - * @param itarget # of this connection in the net (only used for debug output) - * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) - * @param cost_params - * @param router_opts - * @param[in, out] tree RouteTree describing the current routing state - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes - * @param spatial_rt_lookup - * @param router_stats - * @param budgeting_inf - * @param routing_predictor - * @param choking_spots - * @param is_flat - * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ -template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat); - /** Return tuple of: * bool: Did we find a path for each sink in this net? * bool: Should the caller retry with a full-device bounding box? */ @@ -117,42 +82,6 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo bool is_flat, bool can_grow_bb); -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets); - -static void update_net_delays_from_route_tree(float* net_delay, - const Netlist<>& net_list, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator); - -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack); - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat); - -struct more_sinks_than { - const Netlist<>& net_list_; - more_sinks_than(const Netlist<>& net_list) - : net_list_(net_list) {} - inline bool operator()(const ParentNetId& net_index1, const ParentNetId& net_index2) { - return net_list_.net_sinks(net_index1).size() > net_list_.net_sinks(net_index2).size(); - } -}; - -static bool is_high_fanout(int fanout, int fanout_threshold); - // The reason that try_timing_driven_route_tmpl (and descendents) are being // templated over is because using a virtual interface instead fully templating // the router results in a 5% runtime increase. @@ -253,7 +182,10 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, //sort so net with most sinks is routed first. 
auto sorted_nets = std::vector(net_list.nets().begin(), net_list.nets().end()); - std::sort(sorted_nets.begin(), sorted_nets.end(), more_sinks_than(net_list)); + + std::sort(sorted_nets.begin(), sorted_nets.end(), [&](const ParentNetId id1, const ParentNetId id2) -> bool { + return net_list.net_sinks(id1).size() > net_list.net_sinks(id2).size(); + }); /* * Configure the routing predictor @@ -413,7 +345,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, RouterStats router_stats; init_router_stats(router_stats); - timing_driven_route_structs route_structs(net_list); float prev_iter_cumm_time = 0; vtr::Timer iteration_timer; int num_net_bounding_boxes_updated = 0; @@ -465,7 +396,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_iteration_stats, - route_structs.pin_criticality, net_delay, netlist_pin_lookup, route_timing_info, @@ -863,7 +793,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -901,7 +830,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, router_opts, connections_inf, router_stats, - pin_criticality, net_delay[net_id].data(), netlist_pin_lookup, timing_info, @@ -946,7 +874,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -993,18 +920,20 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, // remaining_targets from this point on are the **pin indices** that have yet to be routed std::vector remaining_targets(tree.get_remaining_isinks().begin(), tree.get_remaining_isinks().end()); + std::vector pin_criticality(num_sinks + 1); + // calculate criticality of remaining target pins for (int ipin : remaining_targets) { if (timing_info) { auto pin = net_list.net_pin(net_id, ipin); - pin_criticality[ipin] = get_net_pin_criticality(timing_info, - netlist_pin_lookup, - router_opts.max_criticality, - router_opts.criticality_exp, - net_id, - pin, - is_flat); - + pin_criticality[ipin] = get_net_pin_criticality( + timing_info, + netlist_pin_lookup, + router_opts.max_criticality, + router_opts.criticality_exp, + net_id, + pin, + is_flat); } else { //No timing info, implies we want a min delay routing, so use criticality of 1. 
pin_criticality[ipin] = 1.; @@ -1012,7 +941,7 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, } // compare the criticality of different sink nodes - sort(begin(remaining_targets), end(remaining_targets), [&](int a, int b) { + std::sort(remaining_targets.begin(), remaining_targets.end(), [&](int a, int b) { return pin_criticality[a] > pin_criticality[b]; }); @@ -1044,17 +973,18 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, * routers handle this in the same way */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); - std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root(router, - net_id, - net_list, - sink_node, - cost_params, - router_opts.high_fanout_threshold, - tree, - spatial_route_tree_lookup, - router_stats, - is_flat, - can_grow_bb); + std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root( + router, + net_id, + net_list, + sink_node, + cost_params, + router_opts.high_fanout_threshold, + tree, + spatial_route_tree_lookup, + router_stats, + is_flat, + can_grow_bb); return flags; } @@ -1084,20 +1014,22 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, profiling::conn_start(); // build a branch in the route tree to the target - auto sink_flags = timing_driven_route_sink(router, - net_list, - net_id, - itarget, - target_pin, - cost_params, - router_opts, - tree, - spatial_route_tree_lookup, - router_stats, - budgeting_inf, - routing_predictor, - choking_spots, - is_flat); + auto sink_flags = timing_driven_route_sink( + router, + net_list, + net_id, + itarget, + target_pin, + cost_params, + router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + router_stats, + budgeting_inf, + routing_predictor, + choking_spots, + is_flat, + route_ctx.route_bb[net_id]); flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; @@ -1173,7 +1105,8 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo std::unordered_map()); std::tie(found_path, retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -1229,20 +1162,21 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo } template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup* spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box) { const auto& device_ctx = g_vpr_ctx.device(); auto& route_ctx = g_vpr_ctx.mutable_routing(); @@ -1257,14 +1191,13 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, bool found_path; t_heap cheapest; - t_bb bounding_box = route_ctx.route_bb[net_id]; /* Is the 
connection router allowed to grow the bounding box? That's not the case * when routing in parallel, so disallow it. */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); bool net_is_global = net_list.net_is_global(net_id); - bool high_fanout = is_high_fanout(net_list.net_sinks(net_id).size(), router_opts.high_fanout_threshold); + bool high_fanout = (spatial_rt_lookup != nullptr); constexpr float HIGH_FANOUT_CRITICALITY_THRESHOLD = 0.9; bool sink_critical = (cost_params.criticality > HIGH_FANOUT_CRITICALITY_THRESHOLD); bool net_is_clock = route_ctx.is_clock_net[net_id] != 0; @@ -1276,22 +1209,26 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, //However, if the current sink is 'critical' from a timing perspective, we put the entire route tree back onto //the heap to ensure it has more flexibility to find the best path. if (high_fanout && !sink_critical && !net_is_global && !net_is_clock && -routing_predictor.get_slope() > router_opts.high_fanout_max_slope) { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout(tree.root(), - sink_node, - cost_params, - bounding_box, - spatial_rt_lookup, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + *spatial_rt_lookup, + router_stats, + conn_params, + can_grow_bb); } else { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + can_grow_bb); } if (!found_path) { @@ -1305,6 +1242,8 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, if (f_router_debug) { update_screen(ScreenUpdatePriority::MAJOR, "Unable to route connection.", ROUTING, nullptr); } + /* Reset path costs since routing may go on after a failure */ + router.reset_path_costs(); flags.success = false; return flags; } @@ -1315,9 +1254,9 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, route_ctx.rr_node_route_inf[inode].target_flag--; /* Connected to this SINK. */ vtr::optional new_branch, new_sink; - std::tie(new_branch, new_sink) = tree.update_from_heap(&cheapest, target_pin, ((high_fanout) ? 
&spatial_rt_lookup : nullptr), is_flat); + std::tie(new_branch, new_sink) = tree.update_from_heap(&cheapest, target_pin, spatial_rt_lookup, is_flat); - VTR_ASSERT_DEBUG(!high_fanout || validate_route_tree_spatial_lookup(tree.root(), spatial_rt_lookup)); + VTR_ASSERT_DEBUG(!high_fanout || validate_route_tree_spatial_lookup(tree.root(), *spatial_rt_lookup)); if (f_router_debug) { std::string msg = vtr::string_fmt("Routed Net %zu connection %d to RR node %d successfully", size_t(net_id), itarget, sink_node); @@ -1343,18 +1282,14 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, return flags; } -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets) { - /* Build and return a partial route tree from the legal connections from last iteration. - * along the way do: - * update pathfinder costs to be accurate to the partial route tree - * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets) { auto& route_ctx = g_vpr_ctx.mutable_routing(); /* "tree" points to this net's spot in the global context here, so re-initializing it etc. changes the global state */ @@ -1367,12 +1302,12 @@ static void setup_routing_resources(int itry, /* rip up the whole net */ if (tree) - pathfinder_update_cost_from_route_tree(tree.value().root(), -1); + pathfinder_update_cost_from_route_tree(tree->root(), -1); tree = vtr::nullopt; /* re-initialize net */ tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); // since all connections will be rerouted for this net, clear all of net's forced reroute flags connections_inf.clear_force_reroute_for_net(net_id); @@ -1386,7 +1321,7 @@ static void setup_routing_resources(int itry, if (!tree) { tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); } /* copy the existing routing @@ -1415,31 +1350,33 @@ static void setup_routing_resources(int itry, // Initialize only to source tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); } + profiling::net_rebuild_end(num_sinks, tree->get_remaining_isinks().size()); profiling::net_rebuild_end(num_sinks, tree->get_remaining_isinks().size()); // still need to calculate the tree's time delay - tree.value().reload_timing(); + tree->reload_timing(); // check for R_upstream C_downstream and edge correctness - VTR_ASSERT_SAFE(tree.value().is_valid()); + VTR_ASSERT_SAFE(tree->is_valid()); // congestion should've been pruned away - VTR_ASSERT_SAFE(tree.value().is_uncongested()); + VTR_ASSERT_SAFE(tree->is_uncongested()); // mark remaining ends mark_remaining_ends(net_id); // mark the lookup (rr_node_route_inf) for existing tree elements as NO_PREVIOUS so add_to_path stops when it reaches one of them - update_rr_route_inf_from_tree(tree.value().root()); + update_rr_route_inf_from_tree(tree->root()); } // completed constructing the partial route tree and updated all other data structures to match } -/** Change the 
base costs of rr_nodes according to # of fanouts */ +/** Change the base costs of rr_nodes according to # of fanouts + * TODO: Doesn't seem very thread safe? */ void update_rr_base_costs(int fanout) { auto& device_ctx = g_vpr_ctx.mutable_device(); @@ -1511,11 +1448,11 @@ bool timing_driven_check_net_delays(const Netlist<>& net_list, NetPinsMatrix& net_list, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator) { +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator) { auto& route_ctx = g_vpr_ctx.routing(); const RouteTree& tree = route_ctx.route_trees[inet].value(); @@ -1524,7 +1461,7 @@ static void update_net_delays_from_route_tree(float* net_delay, } } -/* Detect if net should be routed or not */ +/** Detect if \p net_id should be routed or not. */ bool should_route_net(ParentNetId net_id, CBRR& connections_inf, bool if_force_reroute) { @@ -1576,25 +1513,13 @@ bool early_exit_heuristic(const t_router_opts& router_opts, const WirelengthInfo return false; } -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) { - /* When RCV is enabled, it's necessary to be able to completely ripup high fanout nets if there is still negative hold slack - * Normally the router will prune the illegal branches of high fanout nets, this will bypass this */ - - if (router_opts.routing_budgets_algorithm != YOYO) { - return false; - } else if (worst_neg_slack != 0) { - return true; - } - return false; -} - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat) { +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId pin_id, + bool is_flat) { float pin_criticality = 0.0; const auto& route_ctx = g_vpr_ctx.routing(); @@ -1646,6 +1571,23 @@ WirelengthInfo calculate_wirelength_info(const Netlist<>& net_list, size_t avail auto& route_ctx = g_vpr_ctx.routing(); +#ifdef VPR_USE_TBB + tbb::combinable thread_used_wirelength(0); + + tbb::parallel_for_each(net_list.nets().begin(), net_list.nets().end(), [&](ParentNetId net_id){ + if (!net_list.net_is_ignored(net_id) + && net_list.net_sinks(net_id).size() != 0 /* Globals don't count. */ + && route_ctx.route_trees[net_id]) { + int bends, wirelength, segments; + bool is_absorbed; + get_num_bends_and_length(net_id, &bends, &wirelength, &segments, &is_absorbed); + + thread_used_wirelength.local() += wirelength; + } + }); + + used_wirelength = thread_used_wirelength.combine(std::plus()); +#else for (auto net_id : net_list.nets()) { if (!net_list.net_is_ignored(net_id) && net_list.net_sinks(net_id).size() != 0 /* Globals don't count. 
*/ @@ -1657,6 +1599,7 @@ WirelengthInfo calculate_wirelength_info(const Netlist<>& net_list, size_t avail used_wirelength += wirelength; } } +#endif return WirelengthInfo(available_wirelength, used_wirelength); } @@ -1789,12 +1732,6 @@ void print_overused_nodes_status(const t_router_opts& router_opts, const Overuse VTR_LOG("\n"); } -//Returns true if the specified net fanout is classified as high fanout -static bool is_high_fanout(int fanout, int fanout_threshold) { - if (fanout_threshold < 0 || fanout < fanout_threshold) return false; - return true; -} - // In heavily congested designs a static bounding box (BB) can // become problematic for routability (it effectively enforces a // hard blockage restricting where a net can route). @@ -1850,7 +1787,9 @@ size_t dynamic_update_bounding_boxes(const std::vector& updated_net //use different bounding boxes based on the target location. // //This ensures that the delta values calculated below are always non-negative - if (is_high_fanout(net_list.net_sinks(net).size(), high_fanout_threshold)) continue; + //EXPERIMENT: Do it anyway -- we now clip BBs of HF nets + //if (is_high_fanout(net_list.net_sinks(net).size(), high_fanout_threshold)) + // continue; t_bb curr_bb = calc_current_bb(route_ctx.route_trees[net].value()); t_bb& router_bb = route_ctx.route_bb[net]; diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h index 38495bb806b..01d1228cf7a 100644 --- a/vpr/src/route/route_timing.h +++ b/vpr/src/route/route_timing.h @@ -21,7 +21,6 @@ extern bool f_router_debug; -/** TODO: remove timing_driven_route_structs together with this fn */ int get_max_pins_per_net(const Netlist<>& net_list); /** Types and defines common to timing_driven and parallel routers */ @@ -62,20 +61,6 @@ struct RoutingMetrics { tatum::TimingPathInfo critical_path; }; -/* Data while timing driven route is active */ -class timing_driven_route_structs { - public: - std::vector pin_criticality; /* [1..max_pins_per_net-1] */ - - timing_driven_route_structs(const Netlist<>& net_list) { - int max_sinks = std::max(get_max_pins_per_net(net_list) - 1, 0); - pin_criticality.resize(max_sinks + 1); - - /* Set element 0 to invalid values */ - pin_criticality[0] = std::numeric_limits::quiet_NaN(); - } -}; - /** Returns the bounding box of a net's used routing resources */ t_bb calc_current_bb(const RouteTree& tree); @@ -109,6 +94,12 @@ void generate_route_timing_reports(const t_router_opts& router_opts, const RoutingDelayCalculator& delay_calc, bool is_flat); +/** Returns true if the specified net fanout is classified as high fanout. */ +inline bool is_high_fanout(int fanout, int fanout_threshold) { + if (fanout_threshold < 0 || fanout < fanout_threshold) return false; + return true; +} + /** Initialize net_delay based on best-case delay estimates from the router lookahead. */ void init_net_delay_from_lookahead(const RouterLookahead& router_lookahead, const Netlist<>& net_list, @@ -196,6 +187,67 @@ bool try_timing_driven_route(const Netlist<>& net_list, ScreenUpdatePriority first_iteration_priority, bool is_flat); +/** Calculate pin criticality for \p pin_id of \p net_id. */ +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId pin_id, + bool is_flat); + +/** Build and return a partial route tree from the legal connections from last iteration. 
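The VPR_USE_TBB branch of calculate_wirelength_info() above follows the standard combinable/parallel_for_each reduction pattern: each thread accumulates into a private counter, and the partial sums are combined once at the end. A self-contained version with dummy per-net wirelengths:

    #include <cstdio>
    #include <functional>
    #include <vector>
    #include <tbb/combinable.h>
    #include <tbb/parallel_for_each.h>

    int main() {
        std::vector<int> wirelengths(1000, 2); // pretend per-net wirelengths
        tbb::combinable<size_t> local_sum([] { return size_t(0); });
        tbb::parallel_for_each(wirelengths.begin(), wirelengths.end(), [&](int wl) {
            local_sum.local() += wl; // one accumulator per thread, no locking
        });
        size_t total = local_sum.combine(std::plus<size_t>());
        std::printf("total wirelength: %zu\n", total); // 2000
    }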
+ * along the way do: + * update pathfinder costs to be accurate to the partial route tree + * find and store the pins that still need to be reached in connections_inf.remaining_targets + * find and store the rt nodes that have been reached in connections_inf.reached_rt_sinks + * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets); + +/** Attempt to route a single sink (target_pin) in a net. + * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree + * for this net. + * + * @param router The ConnectionRouter instance + * @param net_list Input netlist + * @param net_id + * @param itarget # of this connection in the net (only used for debug output) + * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) + * @param cost_params + * @param router_opts + * @param[in, out] tree RouteTree describing the current routing state + * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes + * @param spatial_rt_lookup + * @param router_stats + * @param budgeting_inf + * @param routing_predictor + * @param choking_spots + * @param is_flat + * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ +template +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup* spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box); + /** Attempt to route a single net. * * @param router The ConnectionRouter instance @@ -207,7 +259,6 @@ bool try_timing_driven_route(const Netlist<>& net_list, * @param connections_inf * @param router_stats * @param pin_criticality - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes * @param net_delay * @param netlist_pin_lookup * @param timing_info @@ -227,7 +278,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -247,7 +297,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -278,6 +327,15 @@ inline void update_net_delay_from_isink(float* net_delay, net_delay[isink] = new_delay; } +/* Goes through all the sinks of this net and copies their delay values from + * the route_tree to the net_delay array. */ +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator); + +/** Combine \p router_iteration_stats into \p router_stats. 
*/ void update_router_stats(RouterStats& router_stats, RouterStats& router_iteration_stats); #ifndef NO_GRAPHICS diff --git a/vpr/src/route/route_tree.cpp b/vpr/src/route/route_tree.cpp index 36f37461527..01667d8ace2 100644 --- a/vpr/src/route/route_tree.cpp +++ b/vpr/src/route/route_tree.cpp @@ -40,9 +40,13 @@ void RouteTreeNode::print_x(int depth) const { auto& device_ctx = g_vpr_ctx.device(); const auto& rr_graph = device_ctx.rr_graph; - VTR_LOG("%srt_node: %d (%s) \t ipin: %d \t R: %g \t C: %g \t delay: %g \t", + VTR_LOG("%srt_node: %d (%d, %d) -> (%d, %d) (%s) ipin: %d R: %g C: %g delay: %g ", indent.c_str(), inode, + rr_graph.node_xlow(inode), + rr_graph.node_ylow(inode), + rr_graph.node_xhigh(inode), + rr_graph.node_yhigh(inode), rr_graph.node_type_string(inode), net_pin_index, R_upstream, @@ -50,7 +54,7 @@ void RouteTreeNode::print_x(int depth) const { Tdel); if (_parent) { - VTR_LOG("parent: %d \t parent_switch: %d", _parent->inode, parent_switch); + VTR_LOG("parent: %d parent_switch: %d", _parent->inode, parent_switch); bool parent_edge_configurable = rr_graph.rr_switch_inf(parent_switch).configurable(); if (!parent_edge_configurable) { VTR_LOG("*"); @@ -288,7 +292,7 @@ RouteTree::update_unbuffered_ancestors_C_downstream(RouteTreeNode& from_node) { /* Having set the value of C_downstream_addition, we must check whether the parent switch * is a buffered or unbuffered switch with the if statement below. If the parent switch is - * a buffered switch, then the parent node's downsteam capacitance is increased by the + * a buffered switch, then the parent node's downstream capacitance is increased by the * value of the parent switch's internal capacitance in the if statement below. * Correspondingly, the ancestors' downstream capacitance will be updated by the same * value through the while loop. 
Otherwise, if the parent switch is unbuffered, then @@ -301,6 +305,8 @@ RouteTree::update_unbuffered_ancestors_C_downstream(RouteTreeNode& from_node) { if (rr_graph.rr_switch_inf(iswitch).buffered() == true) { C_downstream_addition = rr_graph.rr_switch_inf(iswitch).Cinternal; + if(C_downstream_addition == 0) /* This switch has Cinternal = 0, no need to update parent */ + return from_node; last_node = parent_rt_node; last_node->C_downstream += C_downstream_addition; parent_rt_node = last_node->_parent; diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index 4e2274c406f..51d5a21d972 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -72,7 +72,8 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_no false, std::unordered_map()); std::tie(found_path, std::ignore, cheapest) = router_.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -144,7 +145,7 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src is_flat); RouterStats router_stats; ConnectionParameters conn_params(ParentNetId::INVALID(), OPEN, false, std::unordered_map()); - vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree.root(), + vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree, cost_params, bounding_box, router_stats, diff --git a/vpr/src/route/spatial_route_tree_lookup.cpp b/vpr/src/route/spatial_route_tree_lookup.cpp index 3d3f7a25460..e03fe8f291e 100644 --- a/vpr/src/route/spatial_route_tree_lookup.cpp +++ b/vpr/src/route/spatial_route_tree_lookup.cpp @@ -48,7 +48,7 @@ void update_route_tree_spatial_lookup_recur(const RouteTreeNode& rt_node, Spatia // // TODO: Depending on bin size, long wires may end up being added only to bins at // their start/end and may pass through bins along their length to which they - // are not added. If this becomes an issues, reconsider how we add nodes to + // are not added. If this becomes an issue, reconsider how we add nodes to // bins if (bin_xhigh != bin_xlow || bin_yhigh != bin_ylow) { spatial_lookup[bin_xhigh][bin_yhigh].push_back(rt_node); diff --git a/vpr/src/route/virtual_net.h b/vpr/src/route/virtual_net.h new file mode 100644 index 00000000000..4c0cff5e4ba --- /dev/null +++ b/vpr/src/route/virtual_net.h @@ -0,0 +1,21 @@ +#pragma once + +#include "netlist_fwd.h" +#include "route_tree_fwd.h" +#include "vpr_types.h" + +/** A net decomposed by routing a connection through the partitioning + * cutline and dividing the bounding box into two. When routing, the connection + * router will receive a smaller-than-usual bounding box and will have to + * filter the existing routing spatially. */ +class VirtualNet { + public: + /** The net in question. */ + ParentNetId net_id; + /** Clipped bounding box. This is needed to enable decomposing a net multiple times. + * Otherwise we would need a history of side types and cutlines to compute the bbox. */ + t_bb clipped_bb; + /** Times decomposed -- don't decompose vnets too deeply or + * it disturbs net ordering when it's eventually disabled & creates a runtime bump. 
diff --git a/vpr/src/timing/NetPinTimingInvalidator.h b/vpr/src/timing/NetPinTimingInvalidator.h
index f452b95bd7a..ded51f11560 100644
--- a/vpr/src/timing/NetPinTimingInvalidator.h
+++ b/vpr/src/timing/NetPinTimingInvalidator.h
@@ -11,9 +11,8 @@
 # include <tbb/concurrent_unordered_set.h>
 #endif
 
-/** Make NetPinTimingInvalidator a virtual class since it does nothing for the general case of non-incremental
- * timing updates. It should really be templated to not pay the cost for vtable lookups, but this is the
- * best approach without putting a template on every function which uses this machine. */
+/** Adapter code to tell TimingInfo about invalidated connections. Can be no-op in
+ * the case of full timing updates. */
 class NetPinTimingInvalidator {
   public:
     typedef vtr::Range<const tatum::EdgeId*> tedge_range;
@@ -83,19 +82,13 @@ class IncrNetPinTimingInvalidator : public NetPinTimingInvalidator {
      * driving the specified pin.
      * Is concurrently safe. */
     void invalidate_connection(ParentPinId pin, TimingInfo* timing_info) {
-        if (invalidated_pins_.count(pin)) return; //Already invalidated
-
         for (tatum::EdgeId edge : pin_timing_edges(pin)) {
             timing_info->invalidate_delay(edge);
         }
-
-        invalidated_pins_.insert(pin);
     }
 
-    /** Resets invalidation state for this class
-     * Not concurrently safe! */
+    /** Resets invalidation state for this class (no-op) */
     void reset() {
-        invalidated_pins_.clear();
     }
 
   private:
@@ -129,14 +122,6 @@ class IncrNetPinTimingInvalidator : public NetPinTimingInvalidator {
   private:
     std::vector<int> pin_first_edge_; //Indices into timing_edges corresponding
    std::vector<tatum::EdgeId> timing_edges_;
-
-    /** Cache for invalidated pins. Use concurrent set when TBB is turned on, since the
-     * invalidator may be shared between threads */
-#ifdef VPR_USE_TBB
-    tbb::concurrent_unordered_set<ParentPinId> invalidated_pins_;
-#else
-    vtr::vec_id_set<ParentPinId> invalidated_pins_;
-#endif
 };
 
 /** NetPinTimingInvalidator is only a rube goldberg machine when incremental timing analysis
@@ -155,7 +140,8 @@ class NoopNetPinTimingInvalidator : public NetPinTimingInvalidator {
     }
 };
 
-/** Make a NetPinTimingInvalidator depending on update_type. Will return a NoopInvalidator if it's not INCREMENTAL. */
+/** Make a NetPinTimingInvalidator depending on update_type. Will return a NoopInvalidator
+ * if it's not INCREMENTAL or AUTO (adaptive). */
 inline std::unique_ptr<NetPinTimingInvalidator> make_net_pin_timing_invalidator(
     e_timing_update_type update_type,
     const Netlist<>& net_list,
@@ -164,10 +150,10 @@ inline std::unique_ptr<NetPinTimingInvalidator> make_net_pin_timing_invalidator(
     const AtomLookup& atom_lookup,
     const tatum::TimingGraph& timing_graph,
     bool is_flat) {
-    if (update_type == e_timing_update_type::FULL || update_type == e_timing_update_type::AUTO) {
+    if (update_type == e_timing_update_type::FULL) {
         return std::make_unique<NoopNetPinTimingInvalidator>();
     } else {
-        VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL);
+        VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL || update_type == e_timing_update_type::AUTO);
         return std::make_unique<IncrNetPinTimingInvalidator>(net_list, clb_atom_pin_lookup, atom_nlist, atom_lookup, timing_graph, is_flat);
     }
-}
\ No newline at end of file
+}
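For orientation, the factory above is the only entry point routing code needs. A hedged usage sketch follows; it is not from the patch, the caller is assumed to already own the netlist and lookup objects in the factory's signature, and only names visible in this header are used.

    #include "NetPinTimingInvalidator.h"

    /* Sketch: build an invalidator and flag one rerouted connection as stale.
     * With AUTO, this patch now hands back the incremental invalidator, so the
     * analyzer only reconsiders the affected timing edges. */
    static void invalidate_one_pin(const Netlist<>& net_list,
                                   const ClusteredPinAtomPinsLookup& clb_atom_pin_lookup,
                                   const AtomNetlist& atom_nlist,
                                   const AtomLookup& atom_lookup,
                                   const tatum::TimingGraph& timing_graph,
                                   bool is_flat,
                                   ParentPinId pin,
                                   TimingInfo* timing_info) {
        auto invalidator = make_net_pin_timing_invalidator(
            e_timing_update_type::AUTO, net_list, clb_atom_pin_lookup,
            atom_nlist, atom_lookup, timing_graph, is_flat);
        invalidator->invalidate_connection(pin, timing_info);
        invalidator->reset(); // a no-op after this patch, kept for the interface
    }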
diff --git a/vpr/src/timing/concrete_timing_info.h b/vpr/src/timing/concrete_timing_info.h
index 9aaae0d82ff..d733c6c03c4 100644
--- a/vpr/src/timing/concrete_timing_info.h
+++ b/vpr/src/timing/concrete_timing_info.h
@@ -1,6 +1,7 @@
 #ifndef VPR_CONCRETE_TIMING_INFO_H
 #define VPR_CONCRETE_TIMING_INFO_H
 
+#include "tatum/analyzer_factory.hpp"
 #include "vtr_log.h"
 #include "timing_info.h"
 #include "timing_util.h"
@@ -490,8 +491,10 @@ std::unique_ptr<SetupHoldTimingInfo> make_setup_hold_timing_info(std::shared_ptr
     auto& timing_ctx = g_vpr_ctx.timing();
 
     std::shared_ptr<tatum::SetupHoldTimingAnalyzer> analyzer;
-    if (update_type == e_timing_update_type::FULL || update_type == e_timing_update_type::AUTO) {
+    if (update_type == e_timing_update_type::FULL) {
         analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
+    } else if (update_type == e_timing_update_type::AUTO) { /* Create adaptive analyzer */
+        analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis, tatum::SerialWalker, tatum::SerialIncrWalker>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
     } else {
         VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL);
         analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis, tatum::SerialIncrWalker>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
diff --git a/vpr/src/timing/net_delay.cpp b/vpr/src/timing/net_delay.cpp
index 5420c197769..d5d1ce52152 100644
--- a/vpr/src/timing/net_delay.cpp
+++ b/vpr/src/timing/net_delay.cpp
@@ -45,13 +45,13 @@ static void load_one_constant_net_delay(const Netlist<>& net_list,
                                         float delay_value);
 
 /*************************** Subroutine definitions **************************/
 
-void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>& net_delay) {
-    /* This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry   *
-     * is the Elmore delay from the net source to the appropriate sink. Both       *
-     * the rr_graph and the routing traceback must be completely constructed       *
-     * before this routine is called, and the net_delay array must have been       *
-     * allocated.                                                                  */
+/** This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry
+ * is the Elmore delay from the net source to the appropriate sink. Both
+ * the rr_graph and the routing traceback must be completely constructed
+ * before this routine is called, and the net_delay array must have been
+ * allocated. */
+void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>& net_delay) {
     for (auto net_id : net_list.nets()) {
         if (net_list.net_is_ignored(net_id)) {
             load_one_constant_net_delay(net_list, net_delay, net_id, 0.);
@@ -61,18 +61,17 @@ void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>
     }
 }
 
+/** This routine loads delay values for one net in
+ * net_delay[net_id][1..num_pins-1]. First, from the traceback, it
+ * constructs the route tree and computes its values for R, C, and Tdel.
+ * Next, it walks the route tree recursively, storing the time delays for
+ * each sink into the map ipin_to_Tdel. Then, while looping through the
+ * net_delay array we search for the pin index in the map, and
+ * correspondingly update the entry in net_delay. Finally, it frees the
+ * route tree and clears the ipin_to_Tdel_map associated with that net. */
 static void load_one_net_delay(const Netlist<>& net_list,
                                NetPinsMatrix<float>& net_delay,
                                ParentNetId net_id) {
-    /* This routine loads delay values for one net in                               *
-     * net_delay[net_id][1..num_pins-1]. First, from the traceback, it              *
-     * constructs the route tree and computes its values for R, C, and Tdel.        *
-     * Next, it walks the route tree recursively, storing the time delays for       *
-     * each sink into the map ipin_to_Tdel. Then, while looping through the         *
-     * net_delay array we search for the pin index in the map, and                  *
-     * correspondingly update the entry in net_delay. Finally, it frees the         *
-     * route tree and clears the ipin_to_Tdel_map associated with that net.         */
-
     auto& route_ctx = g_vpr_ctx.mutable_routing();
 
     if (!route_ctx.route_trees[net_id]) {
@@ -92,9 +91,9 @@ static void load_one_net_delay(const Netlist<>& net_list,
     ipin_to_Tdel_map.clear(); // clear the map
 }
 
+/** This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes
+ * into the map. */
 static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId net_id) {
-    /* This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes *
-     * into the map.                                                                                 */
     if (rt_node.net_pin_index != OPEN) { // value of OPEN indicates a non-SINK
         ipin_to_Tdel_map[rt_node.net_pin_index] = rt_node.Tdel; // add to the map, process current sink-type node
     }
@@ -104,12 +103,11 @@ static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId
     }
 }
 
+/** Sets each entry of the net_delay array for net inet to delay_value. */
 static void load_one_constant_net_delay(const Netlist<>& net_list,
                                         NetPinsMatrix<float>& net_delay,
                                         ParentNetId net_id,
                                         float delay_value) {
-    /* Sets each entry of the net_delay array for net inet to delay_value. */
-
     for (unsigned int ipin = 1; ipin < net_list.net_pins(net_id).size(); ipin++)
         net_delay[net_id][ipin] = delay_value;
 }
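As background for the delays loaded above: the Elmore delay to a sink in an RC tree is the sum, over every resistance on the source-to-sink path, of that resistance times the total capacitance downstream of it. The snippet below is a generic, self-contained illustration of that formula on a hand-built tree; RCNode and both helpers are invented for the example, and this is not VPR's route-tree code.

    #include <vector>

    struct RCNode {
        float R;                   // resistance of the edge feeding this node
        float C;                   // capacitance at this node
        std::vector<int> children; // indices into the tree vector
    };

    /* Total capacitance hanging at or below node i. */
    static float c_downstream(const std::vector<RCNode>& tree, int i) {
        float c = tree[i].C;
        for (int child : tree[i].children)
            c += c_downstream(tree, child);
        return c;
    }

    /* Elmore delay along `path` (node indices from the source, whose R is 0,
     * down to a sink): the sum of R_k * C_downstream(k). */
    static float elmore_delay(const std::vector<RCNode>& tree, const std::vector<int>& path) {
        float tdel = 0.0f;
        for (int k : path)
            tdel += tree[k].R * c_downstream(tree, k);
        return tdel;
    }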
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 6c1b54734e3..c28fb66694d 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -68,7 +68,8 @@ static float do_one_route(RRNodeId source_node,
                                      -1,
                                      false,
                                      std::unordered_map<RRNodeId, int>());
-    std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(),
+    std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree,
+                                                                                                        source_node,
                                                                                                         sink_node,
                                                                                                         cost_params,
                                                                                                         bounding_box,
diff --git a/vpr/test/test_net_decomp.cpp b/vpr/test/test_net_decomp.cpp
new file mode 100644
index 00000000000..12a0a969f4d
--- /dev/null
+++ b/vpr/test/test_net_decomp.cpp
@@ -0,0 +1,30 @@
+#include "catch2/catch_test_macros.hpp"
+#include "catch2/matchers/catch_matchers_all.hpp"
+
+#include "route_samplers.h"
+
+namespace {
+
+TEST_CASE("test_convex_hull", "[vpr]") {
+    /* Smoke test for the convex hull algorithm in the sampler */
+    std::vector<SinkPoint> points1 {
+        {0, 0, 0}, {0, 1, 0}, {1, 1, 0}
+    };
+    std::vector<SinkPoint> expected_hull1(points1);
+    std::vector<SinkPoint> hull1 = quickhull(points1);
+    REQUIRE_THAT(hull1, Catch::Matchers::UnorderedEquals(expected_hull1));
+
+    std::vector<SinkPoint> points2 {
+        {113,148,0}, {113,143,0}, {113,145,0}, {114,146,0}, {111,138,0}, {110,139,0},
+        {112,138,0}, {108,146,0}, {111,145,0}, {103,142,0}, {103,148,0}, {116,142,0},
+        {116,141,0}, {110,148,0}, {106,146,0}
+    };
+    std::vector<SinkPoint> expected_hull2 {
+        {111,138,0}, {116,141,0}, {112,138,0}, {103,148,0}, {103,142,0}, {116,142,0},
+        {113,148,0}
+    };
+    std::vector<SinkPoint> hull2 = quickhull(points2);
+    REQUIRE_THAT(hull2, Catch::Matchers::UnorderedEquals(expected_hull2));
+}
+
+} // namespace
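The smoke test above exercises quickhull() from route_samplers.h, whose implementation is not shown in this excerpt. For reference, a self-contained 2D quickhull that matches the test's conventions (a strict hull, so collinear boundary points such as {110,148,0} are dropped) could look like the sketch below; Point and every name here are illustrative stand-ins, not the patch's actual sampler types.

    #include <algorithm>
    #include <tuple>
    #include <vector>

    struct Point { int x, y, isink; }; // stand-in for the sampler's point type

    /* Twice the signed area of (a, b, c): positive if c is left of a->b. */
    static long long cross(const Point& a, const Point& b, const Point& c) {
        return (long long)(b.x - a.x) * (c.y - a.y) - (long long)(b.y - a.y) * (c.x - a.x);
    }

    /* Emit hull vertices strictly left of a->b, ending with b. */
    static void hull_side(const std::vector<Point>& pts, const Point& a, const Point& b,
                          std::vector<Point>& out) {
        long long best_d = 0;
        int best = -1;
        for (size_t i = 0; i < pts.size(); ++i) {
            long long d = cross(a, b, pts[i]);
            if (d > best_d) { best_d = d; best = (int)i; }
        }
        if (best < 0) { // nothing outside this edge: b is the next hull vertex
            out.push_back(b);
            return;
        }
        hull_side(pts, a, pts[best], out);
        hull_side(pts, pts[best], b, out);
    }

    static std::vector<Point> quickhull_sketch(const std::vector<Point>& pts) {
        if (pts.size() < 3) return pts;
        auto cmp = [](const Point& p, const Point& q) { return std::tie(p.x, p.y) < std::tie(q.x, q.y); };
        const Point lo = *std::min_element(pts.begin(), pts.end(), cmp);
        const Point hi = *std::max_element(pts.begin(), pts.end(), cmp);
        std::vector<Point> hull;
        hull_side(pts, lo, hi, hull); // vertices above the lo->hi chord
        hull_side(pts, hi, lo, hull); // vertices below it
        return hull;
    }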
""" - tokens = [w.strip() for w in inp.split(",")] + tokens = [w.strip() for w in x.split(",")] tokens = [w for w in tokens if len(w)] out = [] for w in tokens: r = re.fullmatch(r"(\w+):(\w+)", w) if not r: - raise argparse.ArgumentTypeError("Invalid input to -use_previous: %s" % w) + raise argparse.ArgumentError("Invalid input to -use_previous: %s" % w) if not REUSABLE_FILES.get(r.group(2)): - raise argparse.ArgumentTypeError( + raise argparse.ArgumentError( "Unknown file type to use_previous: %s, available types: %s" % (r.group(2), ",".join(REUSABLE_FILES.keys())) )