From 9975fe9056d6faae83905cd63dc6d1d4c33f5fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fahrican=20Ko=C5=9Far?= Date: Thu, 24 Mar 2022 19:21:19 -0400 Subject: [PATCH] dump net decomposition code --- .../libtatum/tatum/analyzer_factory.hpp | 20 +- .../libtatum/tatum/analyzer_factory_fwd.hpp | 2 +- .../AdaptiveSetupHoldTimingAnalyzer.hpp | 202 +++ .../analyzers/IncrHoldTimingAnalyzer.hpp | 2 +- .../analyzers/IncrSetupHoldTimingAnalyzer.hpp | 1 + .../analyzers/IncrSetupTimingAnalyzer.hpp | 2 +- .../graph_walkers/ParallelLevelizedWalker.hpp | 2 +- .../tatum/graph_walkers/SerialIncrWalker.hpp | 6 +- libs/librrgraph/src/base/rr_graph_storage.h | 5 - libs/libvtrutil/src/vtr_math.h | 12 + utils/route_diag/src/main.cpp | 17 +- vpr/src/base/SetupVPR.cpp | 3 + vpr/src/base/read_netlist.cpp | 2 + vpr/src/base/vpr_api.cpp | 12 +- vpr/src/base/vpr_types.h | 2 + vpr/src/route/connection_router.cpp | 250 +-- vpr/src/route/connection_router.h | 68 +- vpr/src/route/connection_router_interface.h | 20 +- vpr/src/route/partition_tree.cpp | 82 +- vpr/src/route/partition_tree.h | 18 +- vpr/src/route/route_common.cpp | 42 +- vpr/src/route/route_common.h | 39 +- vpr/src/route/route_parallel.cpp | 1391 +++++++++++++++-- vpr/src/route/route_samplers.cpp | 69 + vpr/src/route/route_samplers.h | 503 ++++++ vpr/src/route/route_timing.cpp | 337 ++-- vpr/src/route/route_timing.h | 94 +- vpr/src/route/route_tree.cpp | 12 +- vpr/src/route/router_delay_profiling.cpp | 5 +- vpr/src/route/spatial_route_tree_lookup.cpp | 2 +- vpr/src/route/virtual_net.h | 21 + vpr/src/timing/NetPinTimingInvalidator.h | 30 +- vpr/src/timing/concrete_timing_info.h | 5 +- vpr/src/timing/net_delay.cpp | 36 +- vpr/test/test_connection_router.cpp | 3 +- vpr/test/test_net_decomp.cpp | 30 + vtr_flow/scripts/python_libs/vtr/task.py | 8 +- vtr_flow/scripts/python_libs/vtr/util.py | 9 +- 38 files changed, 2721 insertions(+), 643 deletions(-) create mode 100644 libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp create mode 100644 vpr/src/route/route_samplers.cpp create mode 100644 vpr/src/route/route_samplers.h create mode 100644 vpr/src/route/virtual_net.h create mode 100644 vpr/test/test_net_decomp.cpp diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp index 9ac444bc61f..a36e7cfe299 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory.hpp @@ -7,6 +7,7 @@ #include "tatum/TimingGraphFwd.hpp" #include "tatum/TimingConstraintsFwd.hpp" +#include "tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp" #include "tatum/graph_walkers.hpp" #include "tatum/timing_analyzers.hpp" #include "tatum/analyzers/full_timing_analyzers.hpp" @@ -55,9 +56,9 @@ namespace tatum { ///Factor class to construct timing analyzers /// ///\tparam Visitor The analysis type visitor (e.g. SetupAnalysis) -///\tparam GraphWalker The graph walker to use (defaults to serial traversals) +///\tparam GraphWalker The graph walker to use template + class... 
GraphWalkers> struct AnalyzerFactory { //We use the dependent_false template to detect if the un-specialized AnalyzerFactory @@ -176,6 +177,21 @@ struct AnalyzerFactory { } }; +template<> +struct AnalyzerFactory { + + static std::unique_ptr make(const TimingGraph& timing_graph, + const TimingConstraints& timing_constraints, + const DelayCalculator& delay_calc) { + return std::unique_ptr( + new detail::AdaptiveSetupHoldTimingAnalyzer( + timing_graph, + timing_constraints, + delay_calc) + ); + } +}; + } //namespace #endif diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp index 3628ec1700b..0dff0883550 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzer_factory_fwd.hpp @@ -10,7 +10,7 @@ namespace tatum { ///\tparam Visitor The analysis type visitor (e.g. SetupAnalysis) ///\tparam GraphWalker The graph walker to use (defaults to serial traversals) template + class... GraphWalkers> struct AnalyzerFactory; } //namespace diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp new file mode 100644 index 00000000000..d7fc315aaed --- /dev/null +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/AdaptiveSetupHoldTimingAnalyzer.hpp @@ -0,0 +1,202 @@ +#pragma once +#include "tatum/TimingGraphFwd.hpp" +#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" +#include "tatum/SetupHoldAnalysis.hpp" +#include "tatum/analyzers/SetupHoldTimingAnalyzer.hpp" +#include "tatum/base/validate_timing_graph_constraints.hpp" +#include "tatum/graph_walkers/TimingGraphWalker.hpp" + +namespace tatum { namespace detail { + +/** Threshold for AdaptiveSetupHoldTimingAnalyzer to use full updates. +* Expressed as a fraction of all edges in the timing graph. */ +constexpr float full_update_threshold = 0.1; + +/** + * A concrete implementation of a SetupHoldTimingAnalyzer. + * + * This is an adaptive analyzer: it can do incremental updates while the number of invalidated + * edges is small, and falls back to a full update past a certain threshold to avoid the overhead. + */ +template +class AdaptiveSetupHoldTimingAnalyzer : public SetupHoldTimingAnalyzer { + public: + AdaptiveSetupHoldTimingAnalyzer(const TimingGraph& timing_graph, const TimingConstraints& timing_constraints, const DelayCalculator& delay_calculator) + : SetupHoldTimingAnalyzer() + , timing_graph_(timing_graph) + , timing_constraints_(timing_constraints) + , delay_calculator_(delay_calculator) + , setup_hold_visitor_(timing_graph_.nodes().size(), timing_graph_.edges().size()) { + validate_timing_graph_constraints(timing_graph_, timing_constraints_); + + //Initialize profiling data.
Use the full walker to store data for both modes + full_walker_.set_profiling_data("total_analysis_sec", 0.); + full_walker_.set_profiling_data("analysis_sec", 0.); + full_walker_.set_profiling_data("num_full_updates", 0.); + full_walker_.set_profiling_data("num_incr_updates", 0.); + + mode_ = Mode::INCR; + n_modified_edges_ = 0; + max_modified_edges_ = timing_graph_.edges().size() * full_update_threshold; + } + + protected: + //Update both setup and hold simultaneously (this is more efficient than updating them sequentially) + virtual void update_timing_impl() override { + auto start_time = Clock::now(); + + if(mode_ == Mode::INCR) + update_timing_incr_(setup_hold_visitor_); + else + update_timing_full_(setup_hold_visitor_); + + clear_timing_incr_(); + + double analysis_sec = std::chrono::duration_cast(Clock::now() - start_time).count(); + + //Record profiling data (stored on the full walker; an arbitrary choice) + double total_analysis_sec = analysis_sec + full_walker_.get_profiling_data("total_analysis_sec"); + full_walker_.set_profiling_data("total_analysis_sec", total_analysis_sec); + full_walker_.set_profiling_data("analysis_sec", analysis_sec); + if(mode_ == Mode::INCR) + full_walker_.set_profiling_data("num_incr_updates", full_walker_.get_profiling_data("num_incr_updates") + 1); + else + full_walker_.set_profiling_data("num_full_updates", full_walker_.get_profiling_data("num_full_updates") + 1); + + mode_ = Mode::INCR; /* We did our update, try to use incr until too many edges are modified */ + } + + //Update only setup timing + virtual void update_setup_timing_impl() override { + auto& setup_visitor = setup_hold_visitor_.setup_visitor(); + + if(mode_ == Mode::INCR) + update_timing_incr_(setup_visitor); + else + update_timing_full_(setup_visitor); + } + + //Update only hold timing + virtual void update_hold_timing_impl() override { + auto& hold_visitor = setup_hold_visitor_.hold_visitor(); + + if(mode_ == Mode::INCR) + update_timing_incr_(hold_visitor); + else + update_timing_full_(hold_visitor); + } + + virtual void invalidate_edge_impl(const EdgeId edge) override { + if(mode_ == Mode::FULL) + return; + incr_walker_.invalidate_edge(edge); + n_modified_edges_++; + if(n_modified_edges_ > max_modified_edges_) + mode_ = Mode::FULL; + } + + virtual node_range modified_nodes_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.modified_nodes(); + else + return incr_walker_.modified_nodes(); + } + + double get_profiling_data_impl(std::string key) const override { + return full_walker_.get_profiling_data(key); + } + + size_t num_unconstrained_startpoints_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.num_unconstrained_startpoints(); + else + return incr_walker_.num_unconstrained_startpoints(); + } + + size_t num_unconstrained_endpoints_impl() const override { + if(mode_ == Mode::FULL) + return full_walker_.num_unconstrained_endpoints(); + else + return incr_walker_.num_unconstrained_endpoints(); + } + + TimingTags::tag_range setup_tags_impl(NodeId node_id) const override { return setup_hold_visitor_.setup_tags(node_id); } + TimingTags::tag_range setup_tags_impl(NodeId node_id, TagType type) const override { return setup_hold_visitor_.setup_tags(node_id, type); } +#ifdef TATUM_CALCULATE_EDGE_SLACKS + TimingTags::tag_range setup_edge_slacks_impl(EdgeId edge_id) const override { return setup_hold_visitor_.setup_edge_slacks(edge_id); } +#endif + TimingTags::tag_range setup_node_slacks_impl(NodeId node_id) const override { return
setup_hold_visitor_.setup_node_slacks(node_id); } + + TimingTags::tag_range hold_tags_impl(NodeId node_id) const override { return setup_hold_visitor_.hold_tags(node_id); } + TimingTags::tag_range hold_tags_impl(NodeId node_id, TagType type) const override { return setup_hold_visitor_.hold_tags(node_id, type); } +#ifdef TATUM_CALCULATE_EDGE_SLACKS + TimingTags::tag_range hold_edge_slacks_impl(EdgeId edge_id) const override { return setup_hold_visitor_.hold_edge_slacks(edge_id); } +#endif + TimingTags::tag_range hold_node_slacks_impl(NodeId node_id) const override { return setup_hold_visitor_.hold_node_slacks(node_id); } + + private: + /** Update using the full walker */ + void update_timing_full_(GraphVisitor& visitor){ + full_walker_.do_reset(timing_graph_, visitor); + + full_walker_.do_arrival_pre_traversal(timing_graph_, timing_constraints_, visitor); + full_walker_.do_arrival_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + full_walker_.do_required_pre_traversal(timing_graph_, timing_constraints_, visitor); + full_walker_.do_required_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + full_walker_.do_update_slack(timing_graph_, delay_calculator_, visitor); + } + + /** Update using the incremental walker */ + void update_timing_incr_(GraphVisitor& visitor){ + if (never_updated_incr_) { + //Invalidate all edges + for (EdgeId edge : timing_graph_.edges()) { + incr_walker_.invalidate_edge(edge); + } + + //Only need to pre-traverse the first update + incr_walker_.do_arrival_pre_traversal(timing_graph_, timing_constraints_, visitor); + } + + incr_walker_.do_arrival_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + if (never_updated_incr_) { + //Only need to pre-traverse the first update + incr_walker_.do_required_pre_traversal(timing_graph_, timing_constraints_, visitor); + } + + incr_walker_.do_required_traversal(timing_graph_, timing_constraints_, delay_calculator_, visitor); + + incr_walker_.do_update_slack(timing_graph_, delay_calculator_, visitor); + } + + /* Clear incremental timing info */ + void clear_timing_incr_(){ + incr_walker_.clear_invalidated_edges(); + + n_modified_edges_ = 0; + never_updated_incr_ = false; + } + + const TimingGraph& timing_graph_; + const TimingConstraints& timing_constraints_; + const DelayCalculator& delay_calculator_; + SetupHoldAnalysis setup_hold_visitor_; + + FullWalker full_walker_; + IncrWalker incr_walker_; + enum class Mode { FULL, INCR }; + Mode mode_; + + bool never_updated_incr_ = true; + size_t max_modified_edges_; + std::atomic_size_t n_modified_edges_ = 0; + + typedef std::chrono::duration dsec; + typedef std::chrono::high_resolution_clock Clock; +}; + +}} //namespace diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp index 6f6de43e788..d1acf608985 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrHoldTimingAnalyzer.hpp @@ -1,5 +1,5 @@ #pragma once -#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/HoldAnalysis.hpp" #include "tatum/analyzers/HoldTimingAnalyzer.hpp" #include "tatum/base/validate_timing_graph_constraints.hpp"
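For reference, the adaptive full/incremental switch implemented by the new analyzer above boils down to a few lines. The sketch below models only the mode-switching policy; AdaptivePolicy is an illustrative name (not part of tatum), and the real class additionally owns the two walkers, the visitor, and the profiling counters:

#include <cstddef>

// Condensed model of AdaptiveSetupHoldTimingAnalyzer's update policy: count
// invalidated edges, and once more than a fixed fraction of all edges is
// dirty, give up on incremental updating for this round.
class AdaptivePolicy {
  public:
    AdaptivePolicy(size_t num_edges, float threshold = 0.1f)
        : max_modified_edges_(size_t(num_edges * threshold)) {}

    // Mirrors invalidate_edge_impl(): invalidations become no-ops once a
    // full update has already been decided on.
    void invalidate_edge() {
        if (full_) return;
        if (++n_modified_edges_ > max_modified_edges_) full_ = true;
    }

    // Mirrors update_timing_impl(): run the chosen update, then reset and
    // optimistically try incremental again next round.
    template<class FullFn, class IncrFn>
    void update(FullFn do_full, IncrFn do_incr) {
        if (full_) do_full(); else do_incr();
        n_modified_edges_ = 0;
        full_ = false;
    }

  private:
    size_t max_modified_edges_;
    size_t n_modified_edges_ = 0;
    bool full_ = false;
};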
diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp index 844e146ce4c..ffc541bdf21 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupHoldTimingAnalyzer.hpp @@ -1,4 +1,5 @@ #pragma once +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/graph_walkers/SerialWalker.hpp" #include "tatum/SetupHoldAnalysis.hpp" #include "tatum/analyzers/SetupHoldTimingAnalyzer.hpp" diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp index 0ad0b5203fc..57c16afe63c 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/analyzers/IncrSetupTimingAnalyzer.hpp @@ -1,5 +1,5 @@ #pragma once -#include "tatum/graph_walkers/SerialWalker.hpp" +#include "tatum/graph_walkers/SerialIncrWalker.hpp" #include "tatum/SetupAnalysis.hpp" #include "tatum/analyzers/SetupTimingAnalyzer.hpp" #include "tatum/base/validate_timing_graph_constraints.hpp" diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp index 0cbf1a5863b..0104d10d3e3 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/ParallelLevelizedWalker.hpp @@ -11,7 +11,7 @@ namespace tatum { /** - * A parallel timing analyzer which traveres the timing graph in a levelized + * A parallel timing analyzer which traverses the timing graph in a levelized * manner. However nodes within each level are processed in parallel using * Thread Building Blocks (TBB). If TBB is not available it operates serially and is * equivalent to the SerialWalker. diff --git a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp index 8ece8e44f9a..4eba704df29 100644 --- a/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp +++ b/libs/EXTERNAL/libtatum/libtatum/tatum/graph_walkers/SerialIncrWalker.hpp @@ -17,14 +17,14 @@ namespace tatum { * * If TATUM_INCR_BLOCK_INVALIDATION is defined: * All of a node's tags associated with an invalidated edge are invalidated. - * This is a robust but pessimisitc approach (it invalidates more tags than + * This is a robust but pessimistic approach (it invalidates more tags than * strictly required). As a result all nodes processed will report having been * modified, meaning their descendants/predecessors will also be invalidated * even if in reality the recalculated tags are identical to the previous ones * (i.e. nothing has really changed). * * Otherwise, the analyzer performs edge invalidation: - * Only node tags which are dominanted by an invalidated edge are invalidated. + * Only node tags which are dominated by an invalidated edge are invalidated. * This is a less pessimistic approach, and means that when processed, nodes which * don't have any changed tags will report as being unmodified. This significantly * prunes the amount of the timing graph which needs to be updated (as unmodified @@ -37,7 +37,7 @@ namespace tatum { * manner. Unlike SerialWalker it attempts to incrementally (rather than * fully) update based on invalidated edges.
* - * To performan an incremental traversal, the st of invalidated edges + * To perform an incremental traversal, the set of invalidated edges * is processed to identify nodes which will need to be re-evaluated for * the arrival and/or required traversals. * diff --git a/libs/librrgraph/src/base/rr_graph_storage.h b/libs/librrgraph/src/base/rr_graph_storage.h index 7e4f21b5968..2ccbf325834 100644 --- a/libs/librrgraph/src/base/rr_graph_storage.h +++ b/libs/librrgraph/src/base/rr_graph_storage.h @@ -667,11 +667,6 @@ class t_rr_graph_storage { static inline Direction get_node_direction( vtr::array_view_id node_storage, RRNodeId id) { - auto& node_data = node_storage[id]; - if (node_data.type_ != CHANX && node_data.type_ != CHANY) { - VTR_LOG_ERROR("Attempted to access RR node 'direction' for non-channel type '%s'", - rr_node_typename[node_data.type_]); - } return node_storage[id].dir_side_.direction; } diff --git a/libs/libvtrutil/src/vtr_math.h b/libs/libvtrutil/src/vtr_math.h index 74b4ccebf58..199b15ac71b 100644 --- a/libs/libvtrutil/src/vtr_math.h +++ b/libs/libvtrutil/src/vtr_math.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "vtr_assert.h" @@ -163,6 +164,17 @@ bool isclose(T a, T b) { return isclose(a, b, DEFAULT_REL_TOL, DEFAULT_ABS_TOL); } +/** Log2, round down. + * From https://stackoverflow.com/a/51351885 */ +static inline uint64_t log2_floor(uint64_t x) { + return 63U - __builtin_clzl(x); +} + +/** Log2, round up */ +static inline uint64_t log2_ceil(uint64_t x) { + return log2_floor(x - 1) + 1; +} + } // namespace vtr #endif diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index 571c17c30e6..7f4d50eef28 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -117,13 +117,16 @@ static void do_one_route(const Netlist<>& net_list, -1, false, std::unordered_map()); - std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - true); + std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree.root(), + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + true + ); if (found_path) { VTR_ASSERT(cheapest.index == sink_node); diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index b5492a1f8ec..5e9b5d35657 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -281,6 +281,9 @@ void SetupVPR(const t_options* Options, /* Set seed for pseudo-random placement, default seed to 1 */ vtr::srandom(PlacerOpts->seed); + /* Make num_workers available to the router */ + RouterOpts->num_workers = vpr_setup->num_workers; + { vtr::ScopedStartFinishTimer t("Building complex block graph"); alloc_and_load_all_pb_graphs(PowerOpts->do_power, RouterOpts->flat_routing); diff --git a/vpr/src/base/read_netlist.cpp b/vpr/src/base/read_netlist.cpp index 6aee712a04b..3f243d122b4 100644 --- a/vpr/src/base/read_netlist.cpp +++ b/vpr/src/base/read_netlist.cpp @@ -1057,11 +1057,13 @@ static void load_external_nets_and_cb(ClusteredNetlist& clb_nlist) { int logical_pin = clb_nlist.pin_logical_index(pin_id); int physical_pin = get_physical_pin(tile_type, block_type, logical_pin); + /* XXX: Silence warning if (tile_type->is_ignored_pin[physical_pin] != is_ignored_net) { VTR_LOG_WARN( "Netlist connects net %s to both global and non-global pins.\n", clb_nlist.net_name(net_id).c_str()); } + */ } }
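One caveat on the log2 helpers added to vtr_math.h above: __builtin_clzl has an undefined result for an argument of 0, so log2_floor(0), and log2_ceil(1) (which evaluates log2_floor(0)), must not be called. A small self-contained check of the intended behavior for valid inputs, written against a local copy of the two functions:

#include <cassert>
#include <cstdint>

// Local copies of the vtr_math.h helpers, for a standalone check.
static inline uint64_t log2_floor(uint64_t x) { return 63U - __builtin_clzl(x); }
static inline uint64_t log2_ceil(uint64_t x) { return log2_floor(x - 1) + 1; }

int main() {
    assert(log2_floor(1) == 0);
    assert(log2_floor(9) == 3);  // rounds down
    assert(log2_ceil(8) == 3);   // exact powers of two stay exact
    assert(log2_ceil(9) == 4);   // rounds up
    // Not valid: log2_floor(0) and log2_ceil(1), since __builtin_clzl(0)
    // is undefined behavior.
    return 0;
}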
diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index 9f379f84e42..46999356e5a 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -301,6 +301,9 @@ void vpr_init_with_options(const t_options* options, t_vpr_setup* vpr_setup, t_a &vpr_setup->PowerOpts, vpr_setup); + /* XXX: Hardcode parallel router for testing */ + //vpr_setup->RouterOpts.router_algorithm = PARALLEL; + /* Check inputs are reasonable */ CheckArch(*arch); @@ -906,19 +909,12 @@ RouteStatus vpr_route_fixed_W(const Netlist<>& net_list, std::shared_ptr delay_calc, NetPinsMatrix& net_delay, bool is_flat) { - get_cached_router_lookahead( - vpr_setup.RoutingArch, - vpr_setup.RouterOpts.lookahead_type, - vpr_setup.RouterOpts.write_router_lookahead, - vpr_setup.RouterOpts.read_router_lookahead, - vpr_setup.Segments, - is_flat); - vtr::ScopedStartFinishTimer timer("Routing"); if (NO_FIXED_CHANNEL_WIDTH == fixed_channel_width || fixed_channel_width <= 0) { VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Fixed channel width must be specified when routing at fixed channel width (was %d)", fixed_channel_width); } + bool status = false; status = try_route(net_list, fixed_channel_width, diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 93ef759bb88..a51f6548d2d 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1393,6 +1393,8 @@ struct t_router_opts { bool flat_routing; bool has_choking_spot; + size_t num_workers; + // Options related to rr_node reordering, for testing and possible cache optimization e_rr_node_reorder_algorithm reorder_rr_graph_nodes_algorithm = DONT_REORDER; int reorder_rr_graph_nodes_threshold = 0; diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp index 62db70ed31f..82ebe70d329 100644 --- a/vpr/src/route/connection_router.cpp +++ b/vpr/src/route/connection_router.cpp @@ -1,4 +1,5 @@ #include "connection_router.h" +#include "route_common.h" #include "rr_graph.h" #include "binary_heap.h" @@ -61,10 +62,11 @@ inline void update_router_stats(const DeviceContext& device_ctx, /** return tuple */ template std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) { @@ -73,7 +75,7 @@ std::tuple ConnectionRouter::timing_driven_route_conne bool retry = false; t_heap* cheapest; - std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(rt_root, sink_node, cost_params, bounding_box, can_grow_bb); + std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(tree, source_node, sink_node, cost_params, bounding_box, can_grow_bb); if (cheapest != nullptr) { rcv_path_manager.update_route_tree_set(cheapest->path_data); @@ -94,25 +96,25 @@ std::tuple ConnectionRouter::timing_driven_route_conne /** Return */ template std::tuple ConnectionRouter::timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, bool can_grow_bb) { //Re-add route nodes from the existing route tree to the heap. //They need to be repushed onto the heap since each node's cost is target specific.
- add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); + VTR_LOG("Bounding box: %d,%dx%d,%d\n", bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); return std::make_tuple(false, nullptr); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as normal net (BB: %d,%d x %d,%d)\n", sink_node, + VTR_LOGV_DEBUG(router_debug_, " %p Routing to %d as normal net (BB: %d,%d x %d,%d)\n", this, sink_node, bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); @@ -172,7 +174,7 @@ std::tuple ConnectionRouter::timing_driven_route_connection //Re-initialize the heap since it was emptied by the previous call to //timing_driven_route_connection_from_heap() - add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, full_device_bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything //Try finding the path again with the relaxed bounding box @@ -196,10 +198,11 @@ std::tuple ConnectionRouter::timing_driven_route_connection // Returns a tuple of */ template std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb net_bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& net_bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, @@ -210,19 +213,17 @@ std::tuple ConnectionRouter::timing_driven_route_conne // re-explore route tree from root to add any new nodes (buildheap afterwards) // route tree needs to be repushed onto the heap since each node's cost is target specific router_stats_->add_high_fanout_rt++; - t_bb high_fanout_bb = add_high_fanout_route_tree_to_heap(rt_root, sink_node, cost_params, spatial_rt_lookup, net_bounding_box); + t_bb high_fanout_bb = add_high_fanout_route_tree_to_heap(tree.root(), sink_node, cost_params, spatial_rt_lookup, net_bounding_box); heap_.build_heap(); - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); return std::make_tuple(false, false, t_heap()); } - VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", sink_node, - high_fanout_bb.xmin, high_fanout_bb.ymin, - high_fanout_bb.xmax, high_fanout_bb.ymax); + VTR_LOGV_DEBUG(router_debug_, " %p Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", this, sink_node, + net_bounding_box.xmin, net_bounding_box.ymin, + net_bounding_box.xmax, net_bounding_box.ymax); bool retry_with_full_bb = false; t_heap* cheapest; @@ -234,17 +235,21 @@ std::tuple ConnectionRouter::timing_driven_route_conne //Found no path, that may be due to an unlucky choice of existing route tree sub-set, //try again with the full route tree to be sure this is not an artifact of high-fanout routing VTR_LOG_WARN("No routing path found in high-fanout mode for net connection (to sink_rr %d), retrying with full route tree\n", sink_node); + 
VTR_LOG_WARN("high_fanout_bb=%d,%dx%d,%d\n", high_fanout_bb.xmin, high_fanout_bb.ymin, high_fanout_bb.xmax, high_fanout_bb.ymax); + VTR_LOG_WARN("net_bb=%d,%dx%d,%d\n", net_bounding_box.xmin, net_bounding_box.ymin, net_bounding_box.xmax, net_bounding_box.ymax); //Reset any previously recorded node costs so timing_driven_route_connection() //starts over from scratch. reset_path_costs(); modified_rr_node_inf_.clear(); - std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup(rt_root, - sink_node, - cost_params, - net_bounding_box, - can_grow_bb); + std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup( + tree, + source_node, + sink_node, + cost_params, + net_bounding_box, + can_grow_bb); } if (cheapest == nullptr) { @@ -273,14 +278,12 @@ std::tuple ConnectionRouter::timing_driven_route_conne // Returns either the last element of the path, or nullptr if no path is found template t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { VTR_ASSERT_SAFE(heap_.is_valid()); - //std::cout << "using this: " << (void *)this << "\n"; - //std::cout << "using heap: " << heap_.get_ptr() << "\n"; if (heap_.is_empty_heap()) { //No source - VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Initial heap empty (no source)\n", this); } const auto& device_ctx = g_vpr_ctx.device(); @@ -297,8 +300,8 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI false); RRNodeId inode = cheapest->index; - VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + VTR_LOGV_DEBUG(router_debug_, " %p Popping node %d (cost: %g)\n", + this, inode, cheapest->cost); // Have we found the target? if (inode == sink_node) { @@ -308,7 +311,7 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI if (rcv_path_manager.is_enabled()) { rcv_path_manager.insert_backwards_path_into_traceback(cheapest->path_data, cheapest->cost, cheapest->backward_path_cost, route_ctx); } - VTR_LOGV_DEBUG(router_debug_, " Found target %8d (%s)\n", inode, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); + VTR_LOGV_DEBUG(router_debug_, " %p Found target %8d (%s)\n", this, inode, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); break; } @@ -329,7 +332,7 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI } if (cheapest == nullptr) { /* Impossible routing. No path for net. 
*/ - VTR_LOGV_DEBUG(router_debug_, " Empty heap (no path found)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Empty heap (no path found)\n", this); return nullptr; } @@ -339,9 +342,9 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI // Find shortest paths from specified route tree to all nodes in the RR graph template vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) { router_stats_ = &router_stats; @@ -349,7 +352,7 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // Add the route tree to the heap with no specific target node RRNodeId target_node = RRNodeId::INVALID(); - add_route_tree_to_heap(rt_root, target_node, cost_params, false); + add_route_tree_to_heap(tree.root(), target_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything auto res = timing_driven_find_all_shortest_paths_from_heap(cost_params, bounding_box); @@ -367,14 +370,14 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // no-operation lookahead which always returns zero. template vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_heap( - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { vtr::vector cheapest_paths(rr_nodes_.size()); VTR_ASSERT_SAFE(heap_.is_valid()); if (heap_.is_empty_heap()) { // No source - VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); + VTR_LOGV_DEBUG(router_debug_, " %p Initial heap empty (no source)\n", this); } while (!heap_.is_empty_heap()) { @@ -387,8 +390,8 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho false); RRNodeId inode = cheapest->index; - VTR_LOGV_DEBUG(router_debug_, " Popping node %d (cost: %g)\n", - inode, cheapest->cost); + VTR_LOGV_DEBUG(router_debug_, " %p Popping node %d (cost: %g)\n", + this, inode, cheapest->cost); // Since we want to find shortest paths to all nodes in the graph // we do not specify a target node. 
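The surrounding routine pops heap entries until the heap is exhausted because, with no target node, every reachable node must be relaxed. As a rough standalone sketch of that pattern (generic Dijkstra with the same post-heap prune; types and names here are illustrative, not the VPR heap API):

#include <functional>
#include <limits>
#include <queue>
#include <utility>
#include <vector>

struct Item {
    int node;
    float cost;
    bool operator>(const Item& o) const { return cost > o.cost; }
};

// Relax every node reachable from the seed items. With no target node the
// loop only terminates when the heap empties; each node keeps the cheapest
// cost it was ever popped with.
std::vector<float> all_shortest_paths(int num_nodes,
                                      const std::vector<std::vector<std::pair<int, float>>>& adj,
                                      std::vector<Item> seeds) {
    std::vector<float> best(num_nodes, std::numeric_limits<float>::infinity());
    std::priority_queue<Item, std::vector<Item>, std::greater<Item>> heap(
        std::greater<Item>(), std::move(seeds));
    while (!heap.empty()) {
        Item cur = heap.top();
        heap.pop();
        if (cur.cost >= best[cur.node]) continue; // post-heap prune, as above
        best[cur.node] = cur.cost;
        for (const auto& [to, w] : adj[cur.node]) heap.push({to, cur.cost + w});
    }
    return best;
}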
@@ -403,10 +406,10 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho bounding_box); if (cheapest_paths[inode].index == RRNodeId::INVALID() || cheapest_paths[inode].cost >= cheapest->cost) { - VTR_LOGV_DEBUG(router_debug_, " Better cost to node %d: %g (was %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); + VTR_LOGV_DEBUG(router_debug_, " %p Better cost to node %d: %g (was %g)\n", this, inode, cheapest->cost, cheapest_paths[inode].cost); cheapest_paths[inode] = *cheapest; } else { - VTR_LOGV_DEBUG(router_debug_, " Worse cost to node %d: %g (better %g)\n", inode, cheapest->cost, cheapest_paths[inode].cost); + VTR_LOGV_DEBUG(router_debug_, " %p Worse cost to node %d: %g (better %g)\n", this, inode, cheapest->cost, cheapest_paths[inode].cost); } rcv_path_manager.free_path_struct(cheapest->path_data); @@ -419,8 +422,8 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho template void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, RRNodeId target_node, - const t_conn_cost_params cost_params, - t_bb bounding_box) { + const t_conn_cost_params& cost_params, + const t_bb& bounding_box) { RRNodeId inode = cheapest->index; t_rr_node_route_inf* route_inf = &rr_node_route_inf_[inode]; @@ -441,10 +444,11 @@ void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, if (best_total_cost > new_total_cost && ((rcv_path_manager.is_enabled()) || best_back_cost > new_back_cost)) { // Explore from this node, since the current/new partial path has the best cost // found so far - VTR_LOGV_DEBUG(router_debug_, " Better cost to %d\n", inode); - VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); - VTR_LOGV_DEBUG(router_debug_, " Setting path costs for associated node %d (from %d edge %zu)\n", + VTR_LOGV_DEBUG(router_debug_, " %p Better cost to %d\n", this, inode); + VTR_LOGV_DEBUG(router_debug_, " %p New total cost: %g\n", this, new_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New back cost: %g\n", this, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Setting path costs for associated node %d (from %d edge %zu)\n", + this, cheapest->index, cheapest->prev_node(), size_t(cheapest->prev_edge())); @@ -456,18 +460,18 @@ void ConnectionRouter::timing_driven_expand_cheapest(t_heap* cheapest, } else { // Post-heap prune, do not re-explore from the current/new partial path as it // has worse cost than the best partial path to this node found so far - VTR_LOGV_DEBUG(router_debug_, " Worse cost to %d\n", inode); - VTR_LOGV_DEBUG(router_debug_, " Old total cost: %g\n", best_total_cost); - VTR_LOGV_DEBUG(router_debug_, " Old back cost: %g\n", best_back_cost); - VTR_LOGV_DEBUG(router_debug_, " New total cost: %g\n", new_total_cost); - VTR_LOGV_DEBUG(router_debug_, " New back cost: %g\n", new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Worse cost to %d\n", this, inode); + VTR_LOGV_DEBUG(router_debug_, " %p Old total cost: %g\n", this, best_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Old back cost: %g\n", this, best_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New total cost: %g\n", this, new_total_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New back cost: %g\n", this, new_back_cost); } } template void ConnectionRouter::timing_driven_expand_neighbours(t_heap* current, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node) { /* Puts all the rr_nodes adjacent to 
current on the heap. */ @@ -529,29 +533,28 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, RRNodeId from_node, RREdgeId from_edge, RRNodeId to_node, - const t_conn_cost_params cost_params, - const t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node, - const t_bb target_bb) { + const t_bb& target_bb) { int to_xlow = rr_graph_->node_xlow(to_node); int to_ylow = rr_graph_->node_ylow(to_node); - int to_xhigh = rr_graph_->node_xhigh(to_node); - int to_yhigh = rr_graph_->node_yhigh(to_node); // BB-pruning // Disable BB-pruning if RCV is enabled, as this can make it harder for circuits with high negative hold slack to resolve this // TODO: Only disable pruning if the net has negative hold slack, maybe go off budgets - if ((to_xhigh < bounding_box.xmin // Strictly left of BB left-edge - || to_xlow > bounding_box.xmax // Strictly right of BB right-edge - || to_yhigh < bounding_box.ymin // Strictly below BB bottom-edge - || to_ylow > bounding_box.ymax) // Strictly above BB top-edge + // Parallel router change: only expand if to_node is inside BB + if ((to_xlow < bounding_box.xmin + || to_ylow < bounding_box.ymin + || to_xlow > bounding_box.xmax + || to_ylow > bounding_box.ymax) && !rcv_path_manager.is_enabled()) { VTR_LOGV_DEBUG(router_debug_, - " Pruned expansion of node %d edge %zu -> %d" + " %p Pruned expansion of node %d edge %zu -> %d" " (to node location %d,%dx%d,%d outside of expanded" " net bounding box %d,%dx%d,%d)\n", - from_node, size_t(from_edge), size_t(to_node), - to_xlow, to_ylow, to_xhigh, to_yhigh, + this, from_node, size_t(from_edge), size_t(to_node), + to_xlow, to_ylow, rr_graph_->node_xhigh(to_node), rr_graph_->node_yhigh(to_node), bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); return; /* Node is outside (expanded) bounding box. 
*/ } @@ -565,15 +568,17 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, if (to_type == IPIN) { // Check if this IPIN leads to the target block // IPIN's of the target block should be contained within it's bounding box + int to_xhigh = rr_graph_->node_xhigh(to_node); + int to_yhigh = rr_graph_->node_yhigh(to_node); if (to_xlow < target_bb.xmin || to_ylow < target_bb.ymin || to_xhigh > target_bb.xmax || to_yhigh > target_bb.ymax) { VTR_LOGV_DEBUG(router_debug_, - " Pruned expansion of node %d edge %zu -> %d" + " %p Pruned expansion of node %d edge %zu -> %d" " (to node is IPIN at %d,%dx%d,%d which does not" " lead to target block %d,%dx%d,%d)\n", - from_node, size_t(from_edge), size_t(to_node), + this, from_node, size_t(from_edge), size_t(to_node), to_xlow, to_ylow, to_xhigh, to_yhigh, target_bb.xmin, target_bb.ymin, target_bb.xmax, target_bb.ymax); return; @@ -581,8 +586,8 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, } } - VTR_LOGV_DEBUG(router_debug_, " Expanding node %d edge %zu -> %d\n", - from_node, size_t(from_edge), size_t(to_node)); + VTR_LOGV_DEBUG(router_debug_, " %p Expanding node %d edge %zu -> %d\n", + this, from_node, size_t(from_edge), size_t(to_node)); // Check if the node exists in the route tree when RCV is enabled // Other pruning methods have been disabled when RCV is on, so this method is required to prevent "loops" from being created @@ -604,11 +609,11 @@ void ConnectionRouter::timing_driven_expand_neighbour(t_heap* current, // Add to_node to the heap, and also add any nodes which are connected by non-configurable edges template -void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params cost_params, +void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params& cost_params, const t_heap* current, RRNodeId from_node, RRNodeId to_node, - const RREdgeId from_edge, + RREdgeId from_edge, RRNodeId target_node) { const auto& device_ctx = g_vpr_ctx.device(); t_heap next; @@ -642,14 +647,14 @@ void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params float new_back_cost = next.backward_path_cost; if (new_total_cost < best_total_cost && ((rcv_path_manager.is_enabled()) || (new_back_cost < best_back_cost))) { - VTR_LOGV_DEBUG(router_debug_, " Expanding to node %d (%s)\n", to_node, + VTR_LOGV_DEBUG(router_debug_, " %p Expanding to node %d (%s)\n", this, to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_) .c_str()); - VTR_LOGV_DEBUG(router_debug_, " New Total Cost %g New back Cost %g\n", new_total_cost, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New Total Cost %g New back Cost %g\n", this, new_total_cost, new_back_cost); //Add node to the heap only if the cost via the current partial path is less than the //best known cost, since there is no reason for the router to expand more expensive paths. 
// @@ -683,9 +688,9 @@ void ConnectionRouter::timing_driven_add_to_heap(const t_conn_cost_params true); } else { - VTR_LOGV_DEBUG(router_debug_, " Didn't expand to %d (%s)\n", to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str()); - VTR_LOGV_DEBUG(router_debug_, " Prev Total Cost %g Prev back Cost %g \n", best_total_cost, best_back_cost); - VTR_LOGV_DEBUG(router_debug_, " New Total Cost %g New back Cost %g \n", new_total_cost, new_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p Didn't expand to %d (%s)\n", this, to_node, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str()); + VTR_LOGV_DEBUG(router_debug_, " %p Prev Total Cost %g Prev back Cost %g \n", this, best_total_cost, best_back_cost); + VTR_LOGV_DEBUG(router_debug_, " %p New Total Cost %g New back Cost %g \n", this, new_total_cost, new_back_cost); } if (rcv_path_manager.is_enabled() && next.path_data != nullptr) { @@ -713,7 +718,7 @@ static bool same_non_config_node_set(RRNodeId from_node, RRNodeId to_node) { #endif template -float ConnectionRouter::compute_node_cost_using_rcv(const t_conn_cost_params cost_params, +float ConnectionRouter::compute_node_cost_using_rcv(const t_conn_cost_params& cost_params, RRNodeId to_node, RRNodeId target_node, float backwards_delay, @@ -767,7 +772,7 @@ void ConnectionRouter::set_rcv_enabled(bool enable) { //Calculates the cost of reaching to_node template void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, RRNodeId from_node, RRNodeId to_node, RREdgeId from_edge, @@ -877,7 +882,8 @@ void ConnectionRouter::evaluate_timing_driven_node_costs(t_heap* to, cost_params, to->R_upstream); VTR_LOGV_DEBUG(router_debug_ && !std::isfinite(expected_cost), - " Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", + " %p Lookahead from %s (%s) to %s (%s) is non-finite, expected_cost = %f, to->R_upstream = %f\n", + this, rr_node_arch_name(to_node, is_flat_).c_str(), describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, to_node, is_flat_).c_str(), rr_node_arch_name(target_node, is_flat_).c_str(), @@ -910,16 +916,16 @@ void ConnectionRouter::empty_heap_annotating_node_route_inf() { //Adds the route tree rooted at rt_node to the heap, preparing it to be //used as branch-points for further routing. +/* Puts the entire partial routing below and including rt_node onto the heap + * (except for those parts marked as not to be expanded) by calling itself + * recursively. */ template void ConnectionRouter::add_route_tree_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool from_high_fanout) { - /* Puts the entire partial routing below and including rt_node onto the heap * - * (except for those parts marked as not to be expanded) by calling itself * - * recursively. 
*/ - if (from_high_fanout) { router_stats_->add_all_rt_from_high_fanout++; } else { @@ -931,6 +937,7 @@ void ConnectionRouter::add_route_tree_to_heap( if (rt_node.re_expand) { add_route_tree_node_to_heap(rt_node, target_node, + bounding_box, cost_params, false); } @@ -942,12 +949,14 @@ void ConnectionRouter::add_route_tree_to_heap( target_node)) { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } } else { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } @@ -962,13 +971,18 @@ template void ConnectionRouter::add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool is_high_fanout) { const auto& device_ctx = g_vpr_ctx.device(); const RRNodeId inode = rt_node.inode; float backward_path_cost = cost_params.criticality * rt_node.Tdel; float R_upstream = rt_node.R_upstream; + /* don't include if not in BB */ + if (!inside_bb(rt_node.inode, bounding_box)) + return; + // after budgets are loaded, calculate delay cost as described by RCV paper /* R. Fung, V. Betz and W. Chow, "Slack Allocation and Routing to Improve FPGA Timing While * Repairing Short-Path Violations," in IEEE Transactions on Computer-Aided Design of @@ -983,7 +997,8 @@ void ConnectionRouter::add_route_tree_node_to_heap( target_node, cost_params, R_upstream); - VTR_LOGV_DEBUG(router_debug_, " Adding node %8d to heap from init route tree with cost %g (%s)\n", + VTR_LOGV_DEBUG(router_debug_, " %p Adding node %8d to heap from init route tree with cost %g (%s)\n", + this, inode, tot_cost, describe_rr_node(device_ctx.rr_graph, device_ctx.grid, device_ctx.rr_indexed_data, inode, is_flat_).c_str()); @@ -1012,25 +1027,30 @@ void ConnectionRouter::add_route_tree_node_to_heap( } } -static t_bb adjust_highfanout_bounding_box(t_bb highfanout_bb) { - t_bb bb = highfanout_bb; +/* Expand bb by inode's extents and clip against net_bb */ +inline void expand_highfanout_bounding_box(t_bb& bb, RRNodeId inode, const t_bb& net_bb, const RRGraphView* rr_graph) { + bb.xmin = std::max(net_bb.xmin, std::min(bb.xmin, rr_graph->node_xlow(inode))); + bb.ymin = std::max(net_bb.ymin, std::min(bb.ymin, rr_graph->node_ylow(inode))); + bb.xmax = std::min(net_bb.xmax, std::max(bb.xmax, rr_graph->node_xhigh(inode))); + bb.ymax = std::min(net_bb.ymax, std::max(bb.ymax, rr_graph->node_yhigh(inode))); +} +/* Expand bb by HIGH_FANOUT_BB_FAC and clip against net_bb */ +inline void adjust_highfanout_bounding_box(t_bb& bb, const t_bb& net_bb) { constexpr int HIGH_FANOUT_BB_FAC = 3; - bb.xmin -= HIGH_FANOUT_BB_FAC; - bb.ymin -= HIGH_FANOUT_BB_FAC; - bb.xmax += HIGH_FANOUT_BB_FAC; - bb.ymax += HIGH_FANOUT_BB_FAC; - - return bb; + bb.xmin = std::max(net_bb.xmin, bb.xmin - HIGH_FANOUT_BB_FAC); + bb.ymin = std::max(net_bb.ymin, bb.ymin - HIGH_FANOUT_BB_FAC); + bb.xmax = std::min(net_bb.xmax, bb.xmax + HIGH_FANOUT_BB_FAC); + bb.ymax = std::min(net_bb.ymax, bb.ymax + HIGH_FANOUT_BB_FAC); } template t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( const RouteTreeNode& rt_root, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const SpatialRouteTreeLookup& spatial_rt_lookup, - t_bb net_bounding_box) { + const t_bb& net_bounding_box) { //For high fanout nets we only add those route tree nodes which are spatially close //to the sink. 
// @@ -1070,6 +1090,7 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( for (const RouteTreeNode& rt_node : spatial_rt_lookup[bin_x][bin_y]) { if (!rt_node.re_expand) continue; //Some nodes (like IPINs) shouldn't be re-expanded + RRNodeId rr_node_to_add = rt_node.inode; if (is_flat_) { @@ -1077,14 +1098,16 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( continue; } - // Put the node onto the heap - add_route_tree_node_to_heap(rt_node, target_node, cost_params, true); + /* In case we are using a net-wide lookup with a clipped BB (decomposed net) */ + if (!inside_bb(rr_node_to_add, net_bounding_box)) + continue; + + /* Put the node onto the heap (here it can be net_bounding_box, it's inside anyway) */ + add_route_tree_node_to_heap(rt_node, target_node, net_bounding_box, cost_params, true); + + /* Expand bounding box by this node's extents (clips by net_bounding_box) */ + expand_highfanout_bounding_box(highfanout_bb, rr_node_to_add, net_bounding_box, rr_graph_); - // Update Bounding Box - highfanout_bb.xmin = std::min(highfanout_bb.xmin, rr_graph_->node_xlow(rr_node_to_add)); - highfanout_bb.ymin = std::min(highfanout_bb.ymin, rr_graph_->node_ylow(rr_node_to_add)); - highfanout_bb.xmax = std::max(highfanout_bb.xmax, rr_graph_->node_xhigh(rr_node_to_add)); - highfanout_bb.ymax = std::max(highfanout_bb.ymax, rr_graph_->node_yhigh(rr_node_to_add)); if (is_flat_) { if (rr_graph_->node_type(rr_node_to_add) == CHANY || rr_graph_->node_type(rr_node_to_add) == CHANX) { chan_nodes_added++; @@ -1110,15 +1133,14 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( if (done) break; } - t_bb bounding_box = net_bounding_box; if (nodes_added == 0) { //If the target bin, and it's surrounding bins were empty, just add the full route tree - add_route_tree_to_heap(rt_root, target_node, cost_params, true); + add_route_tree_to_heap(rt_root, target_node, net_bounding_box, cost_params, true); + return net_bounding_box; } else { //We found nearby routing, replace original bounding box to be localized around that routing - bounding_box = adjust_highfanout_bounding_box(highfanout_bb); + adjust_highfanout_bounding_box(highfanout_bb, net_bounding_box); + return highfanout_bb; } - - return bounding_box; } std::unique_ptr make_connection_router(e_heap_type heap_type, diff --git a/vpr/src/route/connection_router.h b/vpr/src/route/connection_router.h index 5834e852409..f514941981a 100644 --- a/vpr/src/route/connection_router.h +++ b/vpr/src/route/connection_router.h @@ -69,10 +69,11 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? (only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) final; @@ -88,10 +89,11 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? 
(only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb net_bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& net_bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, @@ -107,9 +109,9 @@ class ConnectionRouter : public ConnectionRouterInterface { // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) final; @@ -156,17 +158,19 @@ class ConnectionRouter : public ConnectionRouterInterface { * timing_driven_route_connection_from_route_tree_high_fanout for running * the connection router. * @param[in] rt_root RouteTreeNode describing the current routing state + * @param[in] source_node Source node ID to route from * @param[in] sink_node Sink node ID to route to * @param[in] cost_params * @param[in] bounding_box Keep search confined to this bounding box * @param[in] can_grow_bb Can this fn grow the given bounding box? - * @return bool Signal to retry this connection with a full-device bounding box, + * @return bool Signal to retry this connection with a full-device bounding box. * @return t_heap* Heap element describing the path found. */ std::tuple timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, bool can_grow_bb); // Finds a path to sink_node, starting from the elements currently in the @@ -180,21 +184,21 @@ class ConnectionRouter : public ConnectionRouterInterface { // found t_heap* timing_driven_route_connection_from_heap( RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); // Expand this current node if it is a cheaper path. void timing_driven_expand_cheapest( t_heap* cheapest, RRNodeId target_node, - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); // Expand each neighbor of the current node. 
void timing_driven_expand_neighbours( t_heap* current, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node); // Conditionally adds to_node to the router heap (via path from from_node @@ -207,15 +211,15 @@ class ConnectionRouter : public ConnectionRouterInterface { RRNodeId from_node, RREdgeId from_edge, RRNodeId to_node, - const t_conn_cost_params cost_params, - const t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RRNodeId target_node, - const t_bb target_bb); + const t_bb& target_bb); // Add to_node to the heap, and also add any nodes which are connected by // non-configurable edges void timing_driven_add_to_heap( - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const t_heap* current, RRNodeId from_node, RRNodeId to_node, @@ -225,7 +229,7 @@ class ConnectionRouter : public ConnectionRouterInterface { // Calculates the cost of reaching to_node void evaluate_timing_driven_node_costs( t_heap* to, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, RRNodeId from_node, RRNodeId to_node, RREdgeId from_edge, @@ -233,8 +237,8 @@ class ConnectionRouter : public ConnectionRouterInterface { // Find paths from current heap to all nodes in the RR graph vtr::vector timing_driven_find_all_shortest_paths_from_heap( - const t_conn_cost_params cost_params, - t_bb bounding_box); + const t_conn_cost_params& cost_params, + const t_bb& bounding_box); void empty_heap_annotating_node_route_inf(); @@ -242,11 +246,12 @@ class ConnectionRouter : public ConnectionRouterInterface { //used as branch-points for further routing. void add_route_tree_to_heap(const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool from_high_fanout); // Evaluate node costs using the RCV algorith - float compute_node_cost_using_rcv(const t_conn_cost_params cost_params, + float compute_node_cost_using_rcv(const t_conn_cost_params& cost_params, RRNodeId to_node, RRNodeId target_node, float backwards_delay, @@ -260,15 +265,16 @@ class ConnectionRouter : public ConnectionRouterInterface { void add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_bb& bounding_box, + const t_conn_cost_params& cost_params, bool is_high_fanout); t_bb add_high_fanout_route_tree_to_heap( const RouteTreeNode& rt_root, RRNodeId target_node, - const t_conn_cost_params cost_params, + const t_conn_cost_params& cost_params, const SpatialRouteTreeLookup& spatial_route_tree_lookup, - t_bb net_bounding_box); + const t_bb& net_bounding_box); const DeviceGrid& grid_; const RouterLookahead& router_lookahead_; diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h index 2180dbe76f3..803114a6639 100644 --- a/vpr/src/route/connection_router_interface.h +++ b/vpr/src/route/connection_router_interface.h @@ -53,10 +53,11 @@ class ConnectionRouterInterface { * bool: should retry with full bounding box? 
(only used in parallel routing) * t_heap: heap element of cheapest path */ virtual std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, const SpatialRouteTreeLookup& spatial_rt_lookup, RouterStats& router_stats, const ConnectionParameters& conn_params, bool can_grow_bb) = 0; @@ -93,9 +95,9 @@ // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. virtual vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, - const t_conn_cost_params cost_params, - t_bb bounding_box, + const RouteTree& tree, + const t_conn_cost_params& cost_params, + const t_bb& bounding_box, RouterStats& router_stats, const ConnectionParameters& conn_params) = 0; diff --git a/vpr/src/route/partition_tree.cpp b/vpr/src/route/partition_tree.cpp index d3d895493b5..ab294eddf4e 100644 --- a/vpr/src/route/partition_tree.cpp +++ b/vpr/src/route/partition_tree.cpp @@ -1,21 +1,31 @@ #include "partition_tree.h" #include +#include #include -PartitionTree::PartitionTree(const Netlist<>& netlist) { +/** Arbitrary limit to stop partitioning nets. At a certain point, the quality lost due to disturbed net ordering + * and the task creation overhead outweigh the advantage of partitioning, so we stop doing it. */ +constexpr size_t MIN_NETS_TO_PARTITION = 256; + +PartitionTree::PartitionTree(const Netlist<>& netlist, const vtr::vector& scores) { const auto& device_ctx = g_vpr_ctx.device(); auto all_nets = std::vector(netlist.nets().begin(), netlist.nets().end()); - _root = build_helper(netlist, all_nets, 0, 0, device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); + _root = build_helper(all_nets, scores, 0, 0, device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); } -std::unique_ptr PartitionTree::build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2) { +std::unique_ptr PartitionTree::build_helper(const std::vector& nets, const vtr::vector& scores, int x1, int y1, int x2, int y2) { if (nets.empty()) return nullptr; const auto& route_ctx = g_vpr_ctx.routing(); auto out = std::make_unique(); + if (nets.size() < MIN_NETS_TO_PARTITION) { + out->nets = nets; + return out; + } + /* Build ParaDRo-ish prefix sum lookup for each bin (coordinate) in the device. * Do this for every step with only the given nets, because each cutline takes some nets out * of the game, so if we just built a global lookup it wouldn't yield accurate results. @@ -29,61 +39,72 @@ std::unique_ptr PartitionTree::build_helper(const Netlist<>& /* Cutlines are placed between integral coordinates. * For instance, x_total_before[0] assumes a cutline at x=0.5, so fanouts at x=0 are included but not * x=1.
It's similar for x_total_after[0], which excludes fanouts at x=0 and includes x=1. - * Note that we have W-1 possible cutlines for a W-wide box. */ - std::vector x_total_before(W - 1, 0), x_total_after(W - 1, 0); - std::vector y_total_before(H - 1, 0), y_total_after(H - 1, 0); + * Note that we have W-1 possible cutlines for a W-wide box. + * + * Here, *_total_before holds total score of nets before the cutline and not intersecting it. + * In ParaDRo this would be total_before + total_on. (same for total_after)*/ + std::vector x_total_before(W - 1, 0), x_total_after(W - 1, 0), x_total_on(W - 1, 0); + std::vector y_total_before(H - 1, 0), y_total_after(H - 1, 0), y_total_on(H - 1, 0); for (auto net_id : nets) { t_bb bb = route_ctx.route_bb[net_id]; - size_t fanouts = netlist.net_sinks(net_id).size(); + uint64_t score = scores[net_id]; /* Inclusive start and end coords of the bbox relative to x1. Clamp to [x1, x2]. */ int x_start = std::max(x1, bb.xmin) - x1; int x_end = std::min(bb.xmax, x2) - x1; - /* Fill in the lookups assuming a cutline at x + 0.5. */ - for (int x = x_start; x < W - 1; x++) { - x_total_before[x] += fanouts; + /* Fill in the lookups assuming a cutline at x + 0.5. + * This means total_before includes the max coord of the bbox but + * total_after does not include the min coord. */ + for (int x = x_end; x < W - 1; x++) { + x_total_before[x] += score; } - for (int x = 0; x < x_end; x++) { - x_total_after[x] += fanouts; + for (int x = 0; x < x_start; x++) { + x_total_after[x] += score; + } + for (int x = x_start; x < x_end; x++){ + x_total_on[x] += score; } int y_start = std::max(y1, bb.ymin) - y1; int y_end = std::min(bb.ymax, y2) - y1; - for (int y = y_start; y < H - 1; y++) { - y_total_before[y] += fanouts; + for (int y = y_end; y < H - 1; y++) { + y_total_before[y] += score; + } + for (int y = 0; y < y_start; y++) { + y_total_after[y] += score; } - for (int y = 0; y < y_end; y++) { - y_total_after[y] += fanouts; + for (int y = y_start; y < y_end; y++){ + y_total_on[y] += score; } } - int best_score = std::numeric_limits::max(); + uint64_t best_score = std::numeric_limits::max(); float best_pos = std::numeric_limits::quiet_NaN(); Axis best_axis = Axis::X; - int max_x_before = x_total_before[W - 2]; - int max_x_after = x_total_after[0]; for (int x = 0; x < W - 1; x++) { int before = x_total_before[x]; int after = x_total_after[x]; - if (before == max_x_before || after == max_x_after) /* Cutting here would leave no nets to the left or right */ + if (before == 0 || after == 0) /* Cutting here would leave no nets to the left or right */ continue; - int score = abs(x_total_before[x] - x_total_after[x]); + /* Now get a measure of "critical path": work on cutline + max(work on sides) + * Test: What happens if we discount max(sides)? 
 */
+            uint64_t score = x_total_on[x] + std::max(x_total_before[x], x_total_after[x]);
+            // uint64_t score = std::abs(int(x_total_before[x]) - int(x_total_after[x]));
         if (score < best_score) {
             best_score = score;
             best_pos = x1 + x + 0.5; /* Lookups are relative to (x1, y1) */
             best_axis = Axis::X;
-        }
+        }
     }

-    int max_y_before = y_total_before[H - 2];
-    int max_y_after = y_total_after[0];
     for (int y = 0; y < H - 1; y++) {
         int before = y_total_before[y];
         int after = y_total_after[y];
-        if (before == max_y_before || after == max_y_after) /* Cutting here would leave no nets to the left or right (sideways) */
+        if (before == 0 || after == 0) /* Cutting here would leave no nets to the left or right (sideways) */
             continue;
-        int score = abs(y_total_before[y] - y_total_after[y]);
+        uint64_t score = y_total_on[y] + std::max(y_total_before[y], y_total_after[y]);
+        // uint64_t score = std::abs(int(y_total_before[y]) - int(y_total_after[y]));
         if (score < best_score) {
             best_score = score;
             best_pos = y1 + y + 0.5; /* Lookups are relative to (x1, y1) */
@@ -112,8 +133,8 @@ std::unique_ptr<PartitionTreeNode> PartitionTree::build_helper(const Netlist<>&
         }
     }

-        out->left = build_helper(netlist, left_nets, x1, y1, std::floor(best_pos), y2);
-        out->right = build_helper(netlist, right_nets, std::floor(best_pos + 1), y1, x2, y2);
+        out->left = build_helper(left_nets, scores, x1, y1, std::floor(best_pos), y2);
+        out->right = build_helper(right_nets, scores, std::floor(best_pos + 1), y1, x2, y2);
     } else {
         VTR_ASSERT(best_axis == Axis::Y);
         for (auto net_id : nets) {
@@ -127,10 +148,10 @@ std::unique_ptr<PartitionTreeNode> PartitionTree::build_helper(const Netlist<>&
         }
     }

-        out->left = build_helper(netlist, left_nets, x1, y1, x2, std::floor(best_pos));
-        out->right = build_helper(netlist, right_nets, x1, std::floor(best_pos + 1), x2, y2);
+        out->left = build_helper(left_nets, scores, x1, y1, x2, std::floor(best_pos));
+        out->right = build_helper(right_nets, scores, x1, std::floor(best_pos + 1), x2, y2);
     }

     out->nets = my_nets;
     out->cutline_axis = best_axis;
     out->cutline_pos = best_pos;
diff --git a/vpr/src/route/partition_tree.h b/vpr/src/route/partition_tree.h
index 08eb668a88f..56c929fb43d 100644
--- a/vpr/src/route/partition_tree.h
+++ b/vpr/src/route/partition_tree.h
@@ -2,6 +2,7 @@

 #include "connection_router.h"
 #include "router_stats.h"
+#include "virtual_net.h"

 #include <memory>
 #include <vector>
@@ -27,8 +28,6 @@ inline Side operator!(const Side& rhs) {

 /** Routing iteration results per thread. (for a subset of the input netlist) */
 struct RouteIterResults {
-    /** Are there any connections impossible to route due to a disconnected rr_graph? */
-    bool is_routable = true;
     /** Net IDs for which timing_driven_route_net() actually got called */
     std::vector<ParentNetId> rerouted_nets;
     /** RouterStats collected from my subset of nets */
@@ -44,22 +43,17 @@ struct RouteIterResults {
  * by the cutline. Leaf nodes represent a final set of nets reached by partitioning.
  *
  * To route this in parallel, we first route the nets in the root node, then add
- * its left and right to a task queue, and repeat this for the whole tree.
- *
- * The tree stores some routing results to be later combined, such as is_routable and
- * rerouted_nets. (TODO: do this per thread instead of per node) */
+ * its left and right to a task queue, and repeat this for the whole tree.
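Aside for readers of this hunk: the new cutline scoring picks the position that minimizes an estimated critical path, namely the total score of nets on the cutline plus the heavier of the two sides. A minimal self-contained sketch of that selection rule (simplified names, not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Given per-position totals for a W-wide region (W-1 candidate cutlines),
    // return the index of the best cutline, or -1 if every cut leaves one side empty.
    // score = work on the cutline + max(work before, work after), as in the hunk above.
    int best_cutline(const std::vector<uint64_t>& before,
                     const std::vector<uint64_t>& on,
                     const std::vector<uint64_t>& after) {
        int best = -1;
        uint64_t best_score = UINT64_MAX;
        for (size_t x = 0; x < on.size(); x++) {
            if (before[x] == 0 || after[x] == 0)
                continue; // cutting here would leave no nets on one side
            uint64_t score = on[x] + std::max(before[x], after[x]);
            if (score < best_score) {
                best_score = score;
                best = int(x);
            }
        }
        return best;
    }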
*/ class PartitionTreeNode { public: /** Nets claimed by this node (intersected by cutline if branch, nets in final region if leaf) */ std::vector nets; + /** Virtual nets delegated to this node by the parent */ + std::vector virtual_nets; /** Left subtree. */ std::unique_ptr left = nullptr; /** Right subtree. */ std::unique_ptr right = nullptr; - /** Are there any connections impossible to route due to a disconnected rr_graph? */ - bool is_routable = false; - /** Net IDs for which timing_driven_route_net() actually got called */ - std::vector rerouted_nets; /* Axis of the cutline. */ Axis cutline_axis = Axis::X; /* Position of the cutline. It's a float, because cutlines are considered to be "between" integral coordinates. */ @@ -78,14 +72,14 @@ class PartitionTree { PartitionTree& operator=(PartitionTree&&) = default; /** Can only be built from a netlist */ - PartitionTree(const Netlist<>& netlist); + PartitionTree(const Netlist<>& netlist, const vtr::vector& scores); /** Access root. Shouldn't cause a segfault, because PartitionTree constructor always makes a _root */ inline PartitionTreeNode& root(void) { return *_root; } private: std::unique_ptr _root; - std::unique_ptr build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2); + std::unique_ptr build_helper(const std::vector& nets, const vtr::vector& scores, int x1, int y1, int x2, int y2); }; #ifdef VPR_DEBUG_PARTITION_TREE diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp index 99d116b0de6..e9359923a58 100644 --- a/vpr/src/route/route_common.cpp +++ b/vpr/src/route/route_common.cpp @@ -190,6 +190,12 @@ void try_graph(int width_fac, is_flat); } +/** Attempts a routing via an iterated maze router algorithm. \p width_fac + * specifies the relative width of the channels, while the members of + * router_opts determine the value of the costs assigned to routing + * resource node, etc. det_routing_arch describes the detailed routing + * architecture (connection and switch boxes) of the FPGA; it is used + * only if a DETAILED routing has been selected. */ bool try_route(const Netlist<>& net_list, int width_fac, const t_router_opts& router_opts, @@ -204,12 +210,6 @@ bool try_route(const Netlist<>& net_list, int num_directs, ScreenUpdatePriority first_iteration_priority, bool is_flat) { - /* Attempts a routing via an iterated maze router algorithm. Width_fac * - * specifies the relative width of the channels, while the members of * - * router_opts determine the value of the costs assigned to routing * - * resource node, etc. det_routing_arch describes the detailed routing * - * architecture (connection and switch boxes) of the FPGA; it is used * - * only if a DETAILED routing has been selected. */ auto& device_ctx = g_vpr_ctx.mutable_device(); auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -309,11 +309,10 @@ bool try_route(const Netlist<>& net_list, return (success); } +/** This routine checks to see if this is a resource-feasible routing. + * That is, are all rr_node capacity limitations respected? It assumes + * that the occupancy arrays are up to date when it is called. */ bool feasible_routing() { - /* This routine checks to see if this is a resource-feasible routing. * - * That is, are all rr_node capacity limitations respected? It assumes * - * that the occupancy arrays are up to date when it is called. 
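Aside: the capacity check described above reduces to a single scan once occupancies are known. A free-standing sketch (hypothetical NodeUse type; the real routine reads VPR's global routing context):

    #include <vector>

    struct NodeUse {
        int occ;      // current occupancy from the routing
        int capacity; // rr_node capacity
    };

    // Feasible iff no node is used beyond its capacity.
    bool feasible_routing_sketch(const std::vector<NodeUse>& nodes) {
        for (const NodeUse& n : nodes) {
            if (n.occ > n.capacity)
                return false;
        }
        return true;
    }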
 */
-
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
     auto& route_ctx = g_vpr_ctx.routing();
@@ -387,8 +386,27 @@ void pathfinder_update_acc_cost_and_overuse_info(float acc_fac, OveruseInfo& ove
     auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
     auto& route_ctx = g_vpr_ctx.mutable_routing();

-    size_t overused_nodes = 0, total_overuse = 0, worst_overuse = 0;
+#ifdef VPR_USE_TBB
+    tbb::combinable<size_t> overused_nodes(0), total_overuse(0), worst_overuse(0);
+    tbb::parallel_for_each(rr_graph.nodes().begin(), rr_graph.nodes().end(), [&](RRNodeId rr_id) {
+        int overuse = route_ctx.rr_node_route_inf[rr_id].occ() - rr_graph.node_capacity(rr_id);
+
+        // If overused, update the acc_cost and add this node to the overuse info
+        // If not, do nothing
+        if (overuse > 0) {
+            route_ctx.rr_node_route_inf[rr_id].acc_cost += overuse * acc_fac;
+
+            ++overused_nodes.local();
+            total_overuse.local() += overuse;
+            worst_overuse.local() = std::max(worst_overuse.local(), size_t(overuse));
+        }
+    });
+    overuse_info.overused_nodes = overused_nodes.combine(std::plus<size_t>());
+    overuse_info.total_overuse = total_overuse.combine(std::plus<size_t>());
+    overuse_info.worst_overuse = worst_overuse.combine([](size_t a, size_t b) { return std::max(a, b); });
+#else
+    size_t overused_nodes = 0, total_overuse = 0, worst_overuse = 0;
     for (const RRNodeId& rr_id : rr_graph.nodes()) {
         int overuse = route_ctx.rr_node_route_inf[rr_id].occ() - rr_graph.node_capacity(rr_id);
@@ -402,11 +420,11 @@ void pathfinder_update_acc_cost_and_overuse_info(float acc_fac, OveruseInfo& ove
             worst_overuse = std::max(worst_overuse, size_t(overuse));
         }
     }
-
     // Update overuse info
     overuse_info.overused_nodes = overused_nodes;
     overuse_info.total_overuse = total_overuse;
     overuse_info.worst_overuse = worst_overuse;
+#endif
 }

 /** Update pathfinder cost of all nodes rooted at rt_node, including rt_node itself */
diff --git a/vpr/src/route/route_common.h b/vpr/src/route/route_common.h
index 68e525e10b0..76b224c0ec5 100644
--- a/vpr/src/route/route_common.h
+++ b/vpr/src/route/route_common.h
@@ -2,6 +2,7 @@
 #pragma once
 #include <vector>
 #include "clustered_netlist.h"
+#include "rr_node_types.h"
 #include "vtr_vector.h"
 #include "heap_type.h"
 #include "rr_node_fwd.h"
@@ -155,7 +156,7 @@ t_heap* prepare_to_add_node_to_heap(
     return hptr;
 }

-/* Puts an rr_node on the heap if it is the cheapest path. */
+/** Puts an rr_node on the heap if it is the cheapest path. */
 template<typename T>
 void add_node_to_heap(
     T* heap,
@@ -221,3 +222,39 @@ void push_back_node_with_info(

     heap->push_back(hptr);
 }
+
+/** Is \p inode inside this bounding box?
+ * For the parallel router we only need a single reference point per node, so we
+ * take (xlow, ylow). A direction-aware variant (using the driving side of
+ * directional nodes) is kept below for reference, but is currently disabled. */
+inline bool inside_bb(RRNodeId inode, const t_bb& bb) {
+    auto& device_ctx = g_vpr_ctx.device();
+    const auto& rr_graph = device_ctx.rr_graph;
+
+    /* Direction-aware variant, currently disabled:
+    int x, y;
+    if (rr_graph.node_direction(inode) == Direction::DEC) {
+        x = rr_graph.node_xhigh(inode);
+        y = rr_graph.node_yhigh(inode);
+    } else {
+        x = rr_graph.node_xlow(inode);
+        y = rr_graph.node_ylow(inode);
+    } */
+    int x, y;
+    x = rr_graph.node_xlow(inode);
+    y = rr_graph.node_ylow(inode);
+
+    return x >= bb.xmin && x <= bb.xmax && y >= bb.ymin && y <= bb.ymax;
+}
+
+/** When RCV is enabled, it may be necessary to rip up high fanout nets entirely
+ * if there is still negative hold slack. Normally the router only prunes the
+ * illegal branches of high fanout nets; returning true here bypasses that
+ * pruning so the whole net can be ripped up. */
+inline bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) {
+    if (router_opts.routing_budgets_algorithm != YOYO) {
+        return false;
+    } else if (worst_neg_slack != 0) {
+        return true;
+    }
+    return false;
+}
diff --git a/vpr/src/route/route_parallel.cpp b/vpr/src/route/route_parallel.cpp
index 96e6464f62b..269d3b53ba2 100644
--- a/vpr/src/route/route_parallel.cpp
+++ b/vpr/src/route/route_parallel.cpp
@@ -1,8 +1,10 @@
 /** @file Functions specific to parallel routing.
  * Reuse code from route_timing.cpp where possible. */
+#include <atomic>
 #include
 #include
+#include <numeric>
 #include
 #include
 #include
@@ -18,19 +20,28 @@
 #include "netlist_fwd.h"
 #include "partition_tree.h"
 #include "read_route.h"
-#include "route_export.h"
 #include "route_common.h"
-#include "route_timing.h"
+#include "route_export.h"
 #include "route_parallel.h"
 // all functions in profiling:: namespace, which are only activated if PROFILE is defined
 #include "route_profiling.h"
+#include "route_samplers.h"
+#include "route_timing.h"
+#include "rr_graph_fwd.h"
+#include "rr_node_types.h"
 #include "timing_util.h"
+#include "virtual_net.h"
+#include "vpr_error.h"
+#include "vpr_types.h"
+#include "vtr_assert.h"
+#include "vtr_math.h"
 #include "vtr_time.h"

 #include "NetPinTimingInvalidator.h"

 #ifdef VPR_USE_TBB
+#    include "tbb/concurrent_vector.h"
 #    include "tbb/enumerable_thread_specific.h"
 #    include "tbb/task_group.h"

@@ -39,14 +50,13 @@
 template<typename ConnectionRouter>
 class RouteIterCtx {
   public:
-    tbb::enumerable_thread_specific<ConnectionRouter> routers;
+    tbb::enumerable_thread_specific<ConnectionRouter>& routers;
     const Netlist<>& net_list;
     int itry;
     float pres_fac;
     const t_router_opts& router_opts;
     CBRR& connections_inf;
-    tbb::enumerable_thread_specific<RouterStats> router_stats;
-    tbb::enumerable_thread_specific<timing_driven_route_structs> route_structs;
+    tbb::enumerable_thread_specific<RouterStats>& router_stats;
     NetPinsMatrix<float>& net_delay;
     const ClusteredPinAtomPinsLookup& netlist_pin_lookup;
     std::shared_ptr<SetupHoldTimingInfo> timing_info;
@@ -55,11 +65,25 @@ class RouteIterCtx {
     float worst_negative_slack;
     const RoutingPredictor& routing_predictor;
     const vtr::vector<ParentNetId, std::vector<std::unordered_map<RRNodeId, int>>>& choking_spots;
+    tbb::concurrent_vector<ParentNetId>& nets_to_retry;
+    vtr::vector<ParentNetId, bool>& is_decomp_disabled;
+    /** Are there any connections impossible to route due to a disconnected rr_graph? */
+    std::atomic_bool is_routable = true;
+    /** Net IDs for which timing_driven_route_net() actually got called */
+    tbb::enumerable_thread_specific<std::vector<ParentNetId>>& rerouted_nets;
+    /** "Scores" for building a PartitionTree (estimated workload) */
+    vtr::vector<ParentNetId, uint64_t>& net_scores;
+    /** Sink indices known to fail when routed after decomposition. Always route these serially */
+    vtr::vector<ParentNetId, std::vector<int>>& net_known_samples;
     bool is_flat;
 };

-/** Helper for reduce_partition_tree. Traverse \p node's subtree and collect results into \p results */
-static void reduce_partition_tree_helper(const PartitionTreeNode& node, RouteIterResults& results);
+/** Don't try to decompose nets if # of iterations > this. */
+constexpr int MAX_DECOMP_ITER = 5;
+
+/** Don't try to decompose a regular net more than this many times.
+ * For instance, max_decomp_depth=2 means one regular net can become 4 virtual nets at max. */
+constexpr int MAX_DECOMP_DEPTH = 2;

 /**
  * Try to route in parallel with the given ConnectionRouter.
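Aside: RouteIterCtx now holds references to tbb::enumerable_thread_specific containers, so each worker asks for its lazily created private copy with .local() and the copies are walked or combined after the parallel phase. A minimal sketch of that TBB pattern (illustrative Stats type, not part of the patch):

    #include <cstdio>
    #include "tbb/enumerable_thread_specific.h"
    #include "tbb/parallel_for.h"

    struct Stats { long connections_routed = 0; };

    int main() {
        tbb::enumerable_thread_specific<Stats> stats; // one Stats per worker thread
        tbb::parallel_for(0, 1000, [&](int) {
            stats.local().connections_routed++; // no locking needed: thread-private
        });
        long total = 0;
        for (const Stats& s : stats) // iterate over every thread's copy
            total += s.connections_routed;
        std::printf("total connections: %ld\n", total);
    }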
@@ -214,13 +238,17 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, route_budgets budgeting_inf(net_list, is_flat); + const RouterLookahead* router_lookahead; + + { + vtr::ScopedStartFinishTimer timer("Obtaining lookahead"); // This needs to be called before filling intra-cluster lookahead maps to ensure that the intra-cluster lookahead maps are initialized. - const RouterLookahead* router_lookahead = get_cached_router_lookahead(det_routing_arch, - router_opts.lookahead_type, - router_opts.write_router_lookahead, - router_opts.read_router_lookahead, - segment_inf, - is_flat); + router_lookahead = get_cached_router_lookahead(det_routing_arch, + router_opts.lookahead_type, + router_opts.write_router_lookahead, + router_opts.read_router_lookahead, + segment_inf, + is_flat); if (is_flat) { // If is_flat is true, the router lookahead maps related to intra-cluster resources should be initialized since @@ -245,6 +273,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, router_lookahead->write_intra_cluster(router_opts.write_intra_cluster_router_lookahead); } } + } VTR_ASSERT(router_lookahead != nullptr); @@ -346,7 +375,26 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, route_ctx.rr_node_route_inf, is_flat)); /* Here we provide an "exemplar" to copy for each thread */ auto router_stats_thread = tbb::enumerable_thread_specific(); - auto route_structs = tbb::enumerable_thread_specific(net_list); + tbb::concurrent_vector nets_to_retry; + auto rerouted_nets = tbb::enumerable_thread_specific>(); + + /* Should I decompose this net? */ + vtr::vector is_decomp_disabled(net_list.nets().size(), false); + + /* Keep track of workload per net */ + std::deque net_empirical_workloads(net_list.nets().size()); + std::fill(net_empirical_workloads.begin(), net_empirical_workloads.end(), 0); + + /* Scores: initially fanouts, later can be changed by route_with_partition_tree */ + vtr::vector net_scores(net_list.nets().size()); + + /* Populate with initial scores */ + tbb::parallel_for_each(net_list.nets(), [&](ParentNetId net_id){ + net_scores[net_id] = route_ctx.net_rr_terminals[net_id].size() - 1; + }); + + /* "Known samples" for each net: ones known to not route after decomp */ + vtr::vector> net_known_samples(net_list.nets().size()); RouterStats router_stats; float prev_iter_cumm_time = 0; @@ -402,7 +450,6 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_stats_thread, - route_structs, net_delay, netlist_pin_lookup, route_timing_info, @@ -411,13 +458,19 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, worst_negative_slack, routing_predictor, choking_spots, + nets_to_retry, + is_decomp_disabled, + true, + rerouted_nets, + net_scores, + net_known_samples, is_flat}; vtr::Timer net_routing_timer; - RouteIterResults iter_results = route_with_partition_tree(tbb_task_group, iter_ctx); + RouteIterResults iter_results = decompose_route_with_partition_tree(tbb_task_group, iter_ctx); PartitionTreeDebug::log("Routing all nets took " + std::to_string(net_routing_timer.elapsed_sec()) + " s"); - if (!iter_results.is_routable) { + if (!iter_ctx.is_routable) { return false; // Impossible to route } @@ -501,7 +554,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, /* * Are we finished? 
         */
-        if (is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) {
+        if (iter_ctx.nets_to_retry.empty() && is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) {
             auto& router_ctx = g_vpr_ctx.routing();

             if (is_better_quality_routing(best_routing, best_routing_metrics, wirelength_info, timing_info)) {
@@ -639,19 +692,20 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list,
             // the router to route around otherwise congested regions
             // (at the cost of high run-time).

-            //Increase the size of the net bounding boxes to give the router more
-            //freedom to find alternate paths.
-            //
-            //In the case of routing conflicts there are multiple connections competing
-            //for the same resources which can not resolve the congestion themselves.
-            //In normal routing mode we try to keep the bounding boxes small to minimize
-            //run-time, but this can limits how far signals can detour (i.e. they can't
-            //route outside the bounding box), which can cause conflicts to oscillate back
-            //and forth without resolving.
-            //
-            //By scaling the bounding boxes here, we slowly increase the router's search
-            //space in hopes of it allowing signals to move further out of the way to
-            //alleviate the conflicts.
+            /* Increase the size of the net bounding boxes to give the router more
+             * freedom to find alternate paths.
+             *
+             * In the case of routing conflicts there are multiple connections competing
+             * for the same resources which cannot resolve the congestion themselves.
+             * In normal routing mode we try to keep the bounding boxes small to minimize
+             * run-time, but this can limit how far signals can detour (i.e. they can't
+             * route outside the bounding box), which can cause conflicts to oscillate back
+             * and forth without resolving.
+             *
+             * By scaling the bounding boxes here, we slowly increase the router's search
+             * space, in the hope of letting signals move further out of the way to
+             * alleviate the conflicts. */
+
             if (itry_conflicted_mode % BB_SCALE_ITER_COUNT == 0) {
                 //We scale the bounding boxes by BB_SCALE_FACTOR,
                 //every BB_SCALE_ITER_COUNT iterations. This ensures
@@ -799,6 +853,24 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list,
     return routing_is_successful;
 }

+/** Apparently we need a few more checks around should_route_net. TODO: smush this function into should_route_net */
+static bool should_really_route_net(const Netlist<>& net_list, ParentNetId net_id, route_budgets& budgeting_inf, CBRR& connections_inf, float worst_negative_slack) {
+    auto& route_ctx = g_vpr_ctx.mutable_routing();
+
+    bool reroute_for_hold = false;
+    if (budgeting_inf.if_set()) {
+        reroute_for_hold = budgeting_inf.get_should_reroute(net_id);
+        reroute_for_hold &= (worst_negative_slack != 0);
+    }
+    if (route_ctx.net_status.is_fixed(net_id)) /* Skip pre-routed nets. */
+        return false;
+    else if (net_list.net_is_ignored(net_id)) /* Skip ignored nets. */
+        return false;
+    else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true))
+        return false;
+    return true;
+}
+
 /** Try routing a net. This calls timing_driven_route_net.
  * The only difference is that it returns a "retry_net" flag, which means that the net
  * couldn't be routed with the default bounding box and needs a full-device BB.
 *
 * The single-thread router just retries with a full-device BB and does not need to notify the caller.
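Aside: the scaling step described in the comment above can be pictured as growing each bounding box span by a factor and clamping to the device edges. A sketch under those assumptions (hypothetical scale_bb helper and t_bb_sketch type; the patch's actual scaling code sits in the elided context):

    #include <algorithm>

    struct t_bb_sketch { int xmin, ymin, xmax, ymax; };

    // Grow bb's span by `factor` in each direction, clamped to [0, grid_w-1] x [0, grid_h-1].
    t_bb_sketch scale_bb(t_bb_sketch bb, float factor, int grid_w, int grid_h) {
        int grow_x = int((bb.xmax - bb.xmin) * (factor - 1.0f) / 2.0f) + 1;
        int grow_y = int((bb.ymax - bb.ymin) * (factor - 1.0f) / 2.0f) + 1;
        bb.xmin = std::max(0, bb.xmin - grow_x);
        bb.ymin = std::max(0, bb.ymin - grow_y);
        bb.xmax = std::min(grid_w - 1, bb.xmax + grow_x);
        bb.ymax = std::min(grid_h - 1, bb.ymax + grow_y);
        return bb;
    }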
* TODO: make the serial router follow this execution path to decrease code duplication */ template -NetResultFlags try_parallel_route_net(ConnectionRouter& router, - const Netlist<>& net_list, - const ParentNetId& net_id, - int itry, - float pres_fac, - const t_router_opts& router_opts, - CBRR& connections_inf, - RouterStats& router_stats, - std::vector& pin_criticality, - NetPinsMatrix& net_delay, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info, - NetPinTimingInvalidator* pin_timing_invalidator, - route_budgets& budgeting_inf, - float worst_negative_slack, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags try_parallel_route_net(ParentNetId net_id, RouteIterCtx& ctx) { auto& route_ctx = g_vpr_ctx.mutable_routing(); NetResultFlags flags; - bool reroute_for_hold = false; - if (budgeting_inf.if_set()) { - reroute_for_hold = (budgeting_inf.get_should_reroute(net_id)); - reroute_for_hold &= worst_negative_slack != 0; - } - if (route_ctx.net_status.is_fixed(net_id)) { /* Skip pre-routed nets. */ + /* Just return success if we don't need to route this one */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) { flags.success = true; - } else if (net_list.net_is_ignored(net_id)) { /* Skip ignored nets. */ - flags.success = true; - } else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true)) { - flags.success = true; - } else { - // track time spent vs fanout - profiling::net_fanout_start(); - - vtr::Timer routing_timer; - flags = timing_driven_route_net(router, - net_list, - net_id, - itry, - pres_fac, - router_opts, - connections_inf, - router_stats, - pin_criticality, - net_delay[net_id].data(), - netlist_pin_lookup, - timing_info, - pin_timing_invalidator, - budgeting_inf, - worst_negative_slack, - routing_predictor, - choking_spots, - is_flat); - - profiling::net_fanout_end(net_list.net_sinks(net_id).size()); - - /* Impossible to route? (disconnected rr_graph) */ - if (flags.success) { - route_ctx.net_status.set_is_routed(net_id, true); - } else { - VTR_LOG("Routing failed for net %d\n", net_id); - } + return flags; + } - flags.was_rerouted = true; //Flag to record whether routing was actually changed + // track time spent vs fanout + profiling::net_fanout_start(); + + vtr::Timer routing_timer; + flags = timing_driven_route_net(ctx.routers.local(), + ctx.net_list, + net_id, + ctx.itry, + ctx.pres_fac, + ctx.router_opts, + ctx.connections_inf, + ctx.router_stats.local(), + ctx.net_delay[net_id].data(), + ctx.netlist_pin_lookup, + ctx.timing_info, + ctx.pin_timing_invalidator, + ctx.budgeting_inf, + ctx.worst_negative_slack, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat); + + profiling::net_fanout_end(ctx.net_list.net_sinks(net_id).size()); + + /* Impossible to route? 
(disconnected rr_graph) */ + if (flags.success) { + route_ctx.net_status.set_is_routed(net_id, true); + } else { + VTR_LOG("Routing failed for net %d\n", net_id); } + flags.was_rerouted = true; //Flag to record whether routing was actually changed return flags; } @@ -889,36 +935,15 @@ void route_partition_tree_helper(tbb::task_group& g, return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); }); - node.is_routable = true; - node.rerouted_nets.clear(); - vtr::Timer t; for (auto net_id : node.nets) { - auto flags = try_parallel_route_net( - ctx.routers.local(), - ctx.net_list, - net_id, - ctx.itry, - ctx.pres_fac, - ctx.router_opts, - ctx.connections_inf, - ctx.router_stats.local(), - ctx.route_structs.local().pin_criticality, - ctx.net_delay, - ctx.netlist_pin_lookup, - ctx.timing_info, - ctx.pin_timing_invalidator, - ctx.budgeting_inf, - ctx.worst_negative_slack, - ctx.routing_predictor, - ctx.choking_spots[net_id], - ctx.is_flat); + auto flags = try_parallel_route_net(net_id, ctx); if (!flags.success && !flags.retry_with_full_bb) { - node.is_routable = false; + ctx.is_routable = false; } if (flags.was_rerouted) { - node.rerouted_nets.push_back(net_id); + ctx.rerouted_nets.local().push_back(net_id); } /* If we need to retry this net with full-device BB, it will go up to the top * of the tree, so remove it from this node and keep track of it */ @@ -943,18 +968,6 @@ void route_partition_tree_helper(tbb::task_group& g, } } -/** Reduce results from partition tree into a single RouteIterResults */ -static void reduce_partition_tree_helper(const PartitionTreeNode& node, RouteIterResults& results) { - results.is_routable &= node.is_routable; - const std::vector& rerouted = node.rerouted_nets; - results.rerouted_nets.insert(results.rerouted_nets.end(), rerouted.begin(), rerouted.end()); - - if (node.left) - reduce_partition_tree_helper(*node.left, results); - if (node.right) - reduce_partition_tree_helper(*node.right, results); -} - /** Route all nets in parallel using the partitioning information in the PartitionTree. * * @param[in, out] g TBB task group to dispatch tasks. 
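Aside: the dispatch pattern this doc comment describes is a plain recursive walk, routing the cutline nets of a node before its two independent halves are handed to the task group. A self-contained sketch (illustrative Node type, not part of the patch):

    #include <memory>
    #include "tbb/task_group.h"

    struct Node {
        std::unique_ptr<Node> left, right;
        void route_nets() { /* work for this region */ }
    };

    void route_tree(tbb::task_group& g, Node& node) {
        node.route_nets(); // nets crossing the cutline: must finish before both halves
        if (node.left && node.right) {
            g.run([&] { route_tree(g, *node.left); });
            g.run([&] { route_tree(g, *node.right); });
        }
    }

    // usage: tbb::task_group g; route_tree(g, root); g.wait();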
@@ -991,7 +1004,9 @@ RouteIterResults route_partition_tree(tbb::task_group& g, } RouteIterResults out; - reduce_partition_tree_helper(tree.root(), out); + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + out.rerouted_nets.insert(out.rerouted_nets.begin(), thread_rerouted_nets.begin(), thread_rerouted_nets.end()); + } for (auto& thread_stats : ctx.router_stats) { update_router_stats(out.stats, thread_stats); } @@ -1029,8 +1044,6 @@ static RouteIterResults route_without_partition_tree(std::vector& n ctx.router_opts, ctx.connections_inf, ctx.router_stats.local(), - ctx.route_structs.local().pin_criticality, - ctx.route_structs.local().rt_node_of_sink, ctx.net_delay, ctx.netlist_pin_lookup, ctx.timing_info, @@ -1042,7 +1055,7 @@ static RouteIterResults route_without_partition_tree(std::vector& n ctx.is_flat); if (!flags.success) { - out.is_routable = false; + ctx.is_routable = false; } if (flags.was_rerouted) { out.rerouted_nets.push_back(net_id); @@ -1054,4 +1067,1118 @@ static RouteIterResults route_without_partition_tree(std::vector& n return out; } +tbb::enumerable_thread_specific nets_too_deep = 0; +tbb::enumerable_thread_specific nets_clock = 0; +tbb::enumerable_thread_specific nets_retry_limit = 0; +tbb::enumerable_thread_specific nets_thin_strip = 0; +tbb::enumerable_thread_specific nets_cut_thin_strip = 0; +tbb::enumerable_thread_specific nets_few_fanouts = 0; +tbb::enumerable_thread_specific nets_set_to_decompose = 0; + +/** Get all "sink pin indices" for a given VirtualNet. We often work with that + * index, because it is used in a lot of lookups and is impossible to get back once + * converted to a ParentPinId or RRNodeId. */ +int get_vnet_num_sinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + size_t parent_num_sinks = route_ctx.route_trees[vnet.net_id]->num_sinks(); + int out = 0; + /* 1-indexed. Yes, I know... */ + for (size_t isink = 1; isink <= parent_num_sinks; ++isink) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out++; + } + return out; +} + +/** Should we decompose this net? We should probably leave it alone if: + * - it's a clock net + * - we decomposed nets for enough levels and should have good thread utilization by now + * - decomposing this net doesn't result in any parallelism + * - TODO: Don't decompose nets with full-device bounding box (don't want to clip their BB) */ +template +bool should_decompose_net(ParentNetId net_id, const PartitionTreeNode& node, const RouteIterCtx& ctx) { + /* Node doesn't have branches */ + if (!node.left || !node.right) + return false; + /* Clock net */ + if (ctx.net_list.net_is_global(net_id) && ctx.router_opts.two_stage_clock_routing){ + nets_clock.local()++; + return false; + } + /* Decomposition is disabled for net */ + if (ctx.is_decomp_disabled[net_id]){ + nets_retry_limit.local()++; + return false; + } + /* We are past the iteration to try decomposition */ + if (ctx.itry > MAX_DECOMP_ITER){ + nets_retry_limit.local()++; + return false; + } + int num_sinks = ctx.net_list.net_sinks(net_id).size(); + if(num_sinks < 8){ + nets_few_fanouts.local()++; + return false; + } + + nets_set_to_decompose.local()++; + return true; +} + +/** Should we decompose this vnet? 
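Aside: which_side() is used heavily below but defined elsewhere in this patch. A sketch consistent with its uses here, assuming the (xlow, ylow) reference point of inside_bb and cutlines sitting at cutline_pos + 0.5 (names are illustrative):

    // Which side of the cutline is this coordinate on?
    // LEFT means "below" for a Y-axis cutline; cutlines sit between
    // integral coordinates, at cutline_pos + 0.5.
    enum class SideSketch { LEFT, RIGHT };

    SideSketch which_side_sketch(int x, int y, int cutline_pos, bool x_axis) {
        int coord = x_axis ? x : y;
        return (coord <= cutline_pos) ? SideSketch::LEFT : SideSketch::RIGHT;
    }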
*/ +bool should_decompose_vnet(const VirtualNet& vnet, const PartitionTreeNode& node) { + /* Node doesn't have branches */ + if (!node.left || !node.right) + return false; + + if(vnet.times_decomposed >= MAX_DECOMP_DEPTH) + return false; + + /* Cutline doesn't go through vnet (a valid case: it wasn't there when partition tree was being built) */ + if(node.cutline_axis == Axis::X){ + if(vnet.clipped_bb.xmin > node.cutline_pos || vnet.clipped_bb.xmax < node.cutline_pos) + return false; + }else{ + if(vnet.clipped_bb.ymin > node.cutline_pos || vnet.clipped_bb.ymax < node.cutline_pos) + return false; + } + + int num_sinks = get_vnet_num_sinks(vnet); + if(num_sinks < 8){ + nets_few_fanouts.local()++; + return false; + } + + nets_set_to_decompose.local()++; + return true; +} + +/** Clip bb to one side of the cutline given the axis and position of the cutline. + * Note that cutlines are assumed to be at axis = cutline_pos + 0.5. */ +t_bb clip_to_side(const t_bb& bb, Axis axis, int cutline_pos, Side side) { + t_bb out = bb; + if (axis == Axis::X && side == Side::LEFT) + out.xmax = cutline_pos; + else if (axis == Axis::X && side == Side::RIGHT) + out.xmin = cutline_pos + 1; + else if (axis == Axis::Y && side == Side::LEFT) + out.ymax = cutline_pos; + else if (axis == Axis::Y && side == Side::RIGHT) + out.ymin = cutline_pos + 1; + else + VTR_ASSERT_MSG(false, "Unreachable"); + return out; +} + +/** Break a net into two given the partition tree node and virtual source. + * @param net_id: The net in question. + * @param node: The PartitionTreeNode which owns this net, fully or partially. + * @param virtual_source: The source node. Virtual source for the sink side, real source for the source side. + * @param sink_side: Which side of the cutline has the virtual source? + * @return Left and right halves of the net as VirtualNets. */ +std::tuple make_decomposed_pair(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + auto& route_ctx = g_vpr_ctx.routing(); + + Side source_side = which_side(route_ctx.route_trees[net_id]->root().inode, cutline_pos, cutline_axis); + VirtualNet source_half, sink_half; + t_bb bb = route_ctx.route_bb[net_id]; + source_half.net_id = net_id; + source_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, source_side); + sink_half.net_id = net_id; + sink_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, !source_side); + source_half.times_decomposed = 1; + sink_half.times_decomposed = 1; + if (source_side == Side::RIGHT) + return std::make_tuple(sink_half, source_half); + else + return std::make_tuple(source_half, sink_half); +} + +/** Does the current routing of \p net_id cross the cutline at cutline_axis = cutline_pos? */ +bool is_routing_over_cutline(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + RRNodeId rr_source = tree.root().inode; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + + for (auto isink : tree.get_reached_isinks()) { + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + Side sink_side = which_side(rr_sink, cutline_pos, cutline_axis); + if (source_side != sink_side) + return true; + } + + return false; +} + +/** Is \p inode too close to this cutline? 
+ * We assign some "thickness" to the node and check for collision */ +bool is_close_to_cutline(RRNodeId inode, int cutline_pos, Axis cutline_axis, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + /* Cutlines are considered to be at x + 0.5, set a thickness of +1 here by checking for equality */ + if(cutline_axis == Axis::X){ + return rr_graph.node_xlow(inode) - thickness <= cutline_pos && rr_graph.node_xhigh(inode) + thickness >= cutline_pos; + } else { + return rr_graph.node_ylow(inode) - thickness <= cutline_pos && rr_graph.node_yhigh(inode) + thickness >= cutline_pos; + } +} + +/** Is \p inode too close to this bb? (Assuming it's inside) + * We assign some "thickness" to the node and check for collision */ +bool is_close_to_bb(RRNodeId inode, const t_bb& bb, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + int xlow = rr_graph.node_xlow(inode) - thickness; + int ylow = rr_graph.node_ylow(inode) - thickness; + int xhigh = rr_graph.node_xhigh(inode) + thickness; + int yhigh = rr_graph.node_yhigh(inode) + thickness; + + return (xlow <= bb.xmin && xhigh >= bb.xmin) + || (ylow <= bb.ymin && yhigh >= bb.ymin) + || (xlow <= bb.xmax && xhigh >= bb.xmax) + || (ylow <= bb.ymax && yhigh >= bb.ymax); +} + +/** Is this net divided very unevenly? If so, put all sinks in the small side into \p out and return true */ +bool get_reduction_isinks(ParentNetId net_id, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + int num_sinks = tree.num_sinks(); + std::vector sinks; + int all_sinks = 0; + + Side source_side = which_side(tree.root().inode, cutline_pos, cutline_axis); + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb sink_side_bb = clip_to_side(net_bb, cutline_axis, cutline_pos, !source_side); + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Get sinks on the sink side */ + for(int isink=1; isink +std::vector get_decomposition_isinks(ParentNetId net_id, int cutline_pos, Axis cutline_axis, const RouteIterCtx& ctx) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + + // std::vector sampled = convex_hull_downsample(net_id); + // std::vector sampled = sample_single_sink(net_id, pin_criticality, cutline_pos, cutline_axis); + + std::set sampled_set; + + /* Sometimes cutlines divide a net very unevenly. In that case, just route to all + * sinks in the small side and unblock. Stick with convex hull sampling if source + * is close to cutline. */ + bool is_reduced = get_reduction_isinks(net_id, cutline_pos, cutline_axis, sampled_set); + bool source_on_cutline = is_close_to_cutline(tree.root().inode, cutline_pos, cutline_axis, 1); + if(!is_reduced || source_on_cutline) + convex_hull_downsample(net_id, sampled_set); + + auto& is_isink_reached = tree.get_is_isink_reached(); + + /* Always sample "known samples": sinks known to fail to route */ + for(int isink: ctx.net_known_samples[net_id]){ + if(is_isink_reached[isink]) + continue; + + sampled_set.insert(isink); + } + + /* Sample if a sink is too close to the cutline (and unreached). + * Those sinks are likely to fail routing */ + for(size_t isink=1; isink out(sampled_set.begin(), sampled_set.end()); + + return out; +} + +/** Get all "sink pin indices" for a given VirtualNet. 
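Aside: convex_hull_downsample() comes from route_samplers.h, added by this patch but outside this excerpt. The idea is to route only the sinks on the convex hull of the sink locations first, so a few extreme connections stake out the net before decomposition. A standalone sketch of such a sampler using Andrew's monotone chain (illustrative Pt type, not the actual route_samplers code):

    #include <algorithm>
    #include <vector>

    struct Pt { int x, y, isink; };

    static long cross(const Pt& o, const Pt& a, const Pt& b) {
        return long(a.x - o.x) * (b.y - o.y) - long(a.y - o.y) * (b.x - o.x);
    }

    // Return the subset of points on the convex hull (monotone chain).
    std::vector<Pt> convex_hull(std::vector<Pt> pts) {
        std::sort(pts.begin(), pts.end(), [](const Pt& a, const Pt& b) {
            return a.x < b.x || (a.x == b.x && a.y < b.y);
        });
        if (pts.size() < 3) return pts;
        std::vector<Pt> hull(2 * pts.size());
        size_t k = 0;
        for (size_t i = 0; i < pts.size(); i++) { // lower hull
            while (k >= 2 && cross(hull[k - 2], hull[k - 1], pts[i]) <= 0) k--;
            hull[k++] = pts[i];
        }
        for (size_t i = pts.size() - 1, t = k + 1; i-- > 0;) { // upper hull
            while (k >= t && cross(hull[k - 2], hull[k - 1], pts[i]) <= 0) k--;
            hull[k++] = pts[i];
        }
        hull.resize(k - 1);
        return hull;
    }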
We often work with that + * index, because it is used in a lot of lookups and is impossible to get back once + * converted to a ParentPinId or RRNodeId. */ +std::vector get_vnet_isinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + size_t num_sinks = route_ctx.route_trees[vnet.net_id]->num_sinks(); + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + /* 1-indexed. Yes, I know... */ + for (size_t isink = 1; isink <= num_sinks; ++isink) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + +/** Break a vnet into two from the cutline. */ +std::tuple make_decomposed_pair_from_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + VirtualNet left_half, right_half; + left_half.net_id = vnet.net_id; + left_half.clipped_bb = clip_to_side(vnet.clipped_bb, cutline_axis, cutline_pos, Side::LEFT); + right_half.net_id = vnet.net_id; + right_half.clipped_bb = clip_to_side(vnet.clipped_bb, cutline_axis, cutline_pos, Side::RIGHT); + left_half.times_decomposed = vnet.times_decomposed + 1; + right_half.times_decomposed = vnet.times_decomposed + 1; + return std::make_tuple(left_half, right_half); +} + +/* Is this net divided very unevenly? If so, put all sinks in the small side into out. + * Since this is a vnet, there's a chance that both sides are small: then return all sinks */ +int get_reduction_isinks_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + int num_sinks = tree.num_sinks(); + const t_bb& net_bb = vnet.clipped_bb; + + t_bb left_side = clip_to_side(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_side = clip_to_side(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + auto& is_isink_reached = tree.get_is_isink_reached(); + + int reduced_sides = 0; + + for(const t_bb& side_bb: {left_side, right_side}){ + std::vector sinks; + int all_sinks = 0; + + const int MIN_WIDTH = 10; + int W = side_bb.xmax - side_bb.xmin + 1; + int H = side_bb.ymax - side_bb.ymin + 1; + bool is_narrow = (W < MIN_WIDTH || H < MIN_WIDTH); + bool should_reduce = true; + + const int MIN_SINKS = 4; + + for(int isink=1; isink MIN_SINKS){ + should_reduce = false; + break; + } + } + + if(!should_reduce) /* We found enough sinks and the box is not narrow */ + continue; + + /* Either we have a narrow box, or too few unique sink locations. 
Just route to every sink on this side */ + out.insert(sinks.begin(), sinks.end()); + reduced_sides++; + } + + return reduced_sides; +} + +/** Reduce only one side if vnet has source */ +bool get_reduction_isinks_vnet_with_source(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis, std::set& out){ + const auto& route_ctx = g_vpr_ctx.routing(); + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + int num_sinks = tree.num_sinks(); + std::vector sinks; + int all_sinks = 0; + + Side source_side = which_side(tree.root().inode, cutline_pos, cutline_axis); + const t_bb& net_bb = vnet.clipped_bb; + t_bb sink_side_bb = clip_to_side(net_bb, cutline_axis, cutline_pos, !source_side); + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Get sinks on the sink side */ + for(int isink=1; isink get_decomposition_isinks_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + std::set sampled_set; + + /* Sometimes cutlines divide a net very unevenly. In that case, just route to all + * sinks in the small side and unblock. Add convex hull since we are in a vnet which + * may not have a source at all */ + if(inside_bb(tree.root().inode, vnet.clipped_bb)){ /* We have source, no need to sample after reduction in most cases */ + bool is_reduced = get_reduction_isinks_vnet_with_source(vnet, cutline_pos, cutline_axis, sampled_set); + bool source_on_cutline = is_close_to_cutline(tree.root().inode, cutline_pos, cutline_axis, 1); + if(!is_reduced || source_on_cutline) + convex_hull_downsample_vnet(vnet, sampled_set); + }else{ + int reduced_sides = get_reduction_isinks_vnet(vnet, cutline_pos, cutline_axis, sampled_set); + if(reduced_sides < 2){ + convex_hull_downsample_vnet(vnet, sampled_set); + } + } + + std::vector isinks = get_vnet_isinks(vnet); + auto& is_isink_reached = tree.get_is_isink_reached(); + + /* Sample if a sink is too close to the cutline (and unreached). + * Those sinks are likely to fail routing */ + for(int isink: isinks){ + if(is_isink_reached[isink]) + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if(is_close_to_cutline(rr_sink, cutline_pos, cutline_axis, 1)){ + sampled_set.insert(isink); + continue; + } + if(is_close_to_bb(rr_sink, vnet.clipped_bb, 1)) + sampled_set.insert(isink); + } + + std::vector out(sampled_set.begin(), sampled_set.end()); + return out; +} + +/** Decompose a net into a pair of nets. */ +template +vtr::optional> route_and_decompose(ParentNetId net_id, const PartitionTreeNode& node, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + unsigned int num_sinks = ctx.net_list.net_sinks(net_id).size(); + + /* We don't have to route this net, so why bother decomposing it? */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) + return vtr::nullopt; + + setup_routing_resources( + ctx.itry, + net_id, + ctx.net_list, + num_sinks, + ctx.router_opts.min_incremental_reroute_fanout, + ctx.connections_inf, + ctx.router_opts, + check_hold(ctx.router_opts, ctx.worst_negative_slack)); + + VTR_ASSERT(route_ctx.route_trees[net_id]); + RouteTree& tree = route_ctx.route_trees[net_id].value(); + + bool high_fanout = is_high_fanout(num_sinks, ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. 
+ * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. */ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + net_id, + tree.root()); + } + + /* Get the isinks to actually route to */ + std::vector isinks_to_route = get_decomposition_isinks(net_id, node.cutline_pos, node.cutline_axis, ctx); + + /* Get pin criticalities */ + std::vector pin_criticality(num_sinks + 1); + + for (int isink : isinks_to_route) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(net_id, isink); + pin_criticality[isink] = get_net_pin_criticality(ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + net_id, + pin, + ctx.is_flat); + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + /* Sort wrt criticality */ + std::sort(isinks_to_route.begin(), isinks_to_route.end(), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules + * TODO: Not sure what this does and if it's safe to call in parallel */ + update_rr_base_costs(num_sinks); + + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + for (int isink : isinks_to_route) { + /* Fill the necessary forms to route to this sink. */ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + enable_router_debug(ctx.router_opts, net_id, rr_sink, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of net %zu for decomposition\n", size_t(rr_sink), size_t(net_id)); + + /* Route to this sink. */ + NetResultFlags sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + net_id, + 0, /* itarget: only used for debug, so we can lie here */ + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat, + route_ctx.route_bb[net_id]); + + if (!sink_flags.success) /* Couldn't route. It's too much work to backtrack from here, just fail. */ + return vtr::nullopt; + + /* Fill the required forms after routing a connection. 
*/ + ++ctx.router_stats.local().connections_routed; + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[net_id].data(), + tree, + isink, + ctx.net_list, + net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } + + if (ctx.router_opts.update_lower_bound_delays) { + for (int ipin : isinks_to_route) { + ctx.connections_inf.update_lower_bound_connection_delay(net_id, ipin, ctx.net_delay[net_id][ipin]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + return make_decomposed_pair(net_id, node.cutline_pos, node.cutline_axis); +} + +/** Get all "remaining sink pin indices" for a given VirtualNet. For regular nets + * you can get it from the route tree, but we need to spatially filter it here. */ +std::vector get_vnet_remaining_isinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + for (size_t isink : tree.get_remaining_isinks()) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + + +/** Decompose a net into a pair of nets. */ +template +vtr::optional> route_and_decompose_vnet(const VirtualNet& vnet, const PartitionTreeNode& node, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + unsigned int num_sinks = get_vnet_num_sinks(vnet); + RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + /* Get the isinks to actually route to */ + std::vector isinks_to_route = get_decomposition_isinks_vnet(vnet, node.cutline_pos, node.cutline_axis); + + if(isinks_to_route.size() == 0) /* All the sinks we were going to route are already reached -- just break down the net */ + return make_decomposed_pair_from_vnet(vnet, node.cutline_pos, node.cutline_axis); + + /* Get pin criticalities */ + std::vector pin_criticality(tree.num_sinks() + 1); + + for (int isink : isinks_to_route) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(vnet.net_id, isink); + pin_criticality[isink] = get_net_pin_criticality(ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + vnet.net_id, + pin, + ctx.is_flat); + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + /* Sort wrt criticality */ + std::sort(isinks_to_route.begin(), isinks_to_route.end(), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + bool high_fanout = is_high_fanout(tree.num_sinks(), ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. 
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + vnet.net_id, + tree.root()); + } + + /* Update base costs according to fanout and criticality rules + * TODO: Not sure what this does and if it's safe to call in parallel */ + update_rr_base_costs(num_sinks); + + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + for (int isink : isinks_to_route) { + /* Fill the necessary forms to route to this sink. */ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(vnet.net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(vnet.net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(vnet.net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(vnet.net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + enable_router_debug(ctx.router_opts, vnet.net_id, rr_sink, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of net %zu for decomposition\n", size_t(rr_sink), size_t(vnet.net_id)); + + /* Route to this sink. */ + NetResultFlags sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + vnet.net_id, + 0, /* itarget: only used for debug, so we can lie here */ + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[vnet.net_id], + ctx.is_flat, + vnet.clipped_bb); + + if (!sink_flags.success) /* Couldn't route. It's too much work to backtrack from here, just fail. */ + return vtr::nullopt; + + /* Fill the required forms after routing a connection. */ + ++ctx.router_stats.local().connections_routed; + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[vnet.net_id].data(), + tree, + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } + + if (ctx.router_opts.update_lower_bound_delays) { + for (int ipin : isinks_to_route) { + ctx.connections_inf.update_lower_bound_connection_delay(vnet.net_id, ipin, ctx.net_delay[vnet.net_id][ipin]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + return make_decomposed_pair_from_vnet(vnet, node.cutline_pos, node.cutline_axis); +} + + +/* Goes through all the sinks of this virtual net and copies their delay values from + * the route_tree to the net_delay array. 
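Aside: route_and_decompose, route_and_decompose_vnet and route_virtual_net all share one inner shape: score each sink's criticality, route sinks most-critical-first, and stop at the first failure. Stripped of the VPR bookkeeping, the skeleton is (all names illustrative):

    #include <algorithm>
    #include <vector>

    struct SinkRouteResult { bool success; };

    template<typename RouteFn>
    bool route_sinks_by_criticality(std::vector<int> isinks,
                                    const std::vector<float>& pin_criticality,
                                    RouteFn route_one_sink) {
        // Most critical connections get first pick of the routing resources.
        std::sort(isinks.begin(), isinks.end(), [&](int a, int b) {
            return pin_criticality[a] > pin_criticality[b];
        });
        for (int isink : isinks) {
            SinkRouteResult r = route_one_sink(isink);
            if (!r.success)
                return false; // too much work to backtrack from here, just fail
        }
        return true;
    }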
*/ +template +static void update_net_delays_from_vnet(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.routing(); + std::vector sinks = get_vnet_isinks(vnet); + + for (int isink : sinks) { + update_net_delay_from_isink( + ctx.net_delay[vnet.net_id].data(), + *route_ctx.route_trees[vnet.net_id], + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } +} + +inline std::string describe_bbox(const t_bb& bb){ + return std::to_string(bb.xmin) + "," + std::to_string(bb.ymin) + + "x" + std::to_string(bb.xmax) + "," + std::to_string(bb.ymax); +} + +inline std::string describe_rr_coords(RRNodeId inode){ + auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + return std::to_string(rr_graph.node_xlow(inode)) + + "," + std::to_string(rr_graph.node_ylow(inode)) + + " -> " + std::to_string(rr_graph.node_xhigh(inode)) + + "," + std::to_string(rr_graph.node_yhigh(inode)); +} + +/** Build a string describing \p vnet and its existing routing */ +inline std::string describe_vnet(const VirtualNet& vnet){ + const auto& route_ctx = g_vpr_ctx.routing(); + + std::string out = ""; + out += "Virtual net with bbox " + describe_bbox(vnet.clipped_bb) + + " parent net: " + std::to_string(size_t(vnet.net_id)) + + " parent bbox: " + describe_bbox(route_ctx.route_bb[vnet.net_id]) + "\n"; + + RRNodeId source_rr = route_ctx.net_rr_terminals[vnet.net_id][0]; + out += "source: " + describe_rr_coords(source_rr) + ", sinks:"; + for(size_t i=1; iall_nodes(); + for(auto it = all_nodes.begin(); it != all_nodes.end(); ++it){ + if((*it).is_leaf()) { + out += describe_rr_coords((*it).inode) + " END "; + ++it; + if(it == all_nodes.end()) + break; + out += describe_rr_coords((*it).parent()->inode) + " -> "; + out += describe_rr_coords((*it).inode) + " -> "; + } else { + out += describe_rr_coords((*it).inode) + " -> "; + } + } + out += "\n"; + + return out; +} + +/** Build a logarithmic net fanouts histogram */ +std::string describe_fanout_histogram(void){ + const auto& route_ctx = g_vpr_ctx.routing(); + std::vector bins(6); + for(size_t i=0; i(vtr::log2_floor(F), 5); + bins[bin]++; + } + std::string out = "Log fanout histogram:"; + for(int f: bins){ + out += " " + std::to_string(f); + } + out += "\n"; + return out; +} + +/** Route a VirtualNet, which is a portion of a net with a clipped bounding box + * and maybe a virtual source. */ +template +NetResultFlags route_virtual_net(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + std::vector sinks = get_vnet_isinks(vnet); + NetResultFlags flags; + + VTR_ASSERT(route_ctx.route_trees[vnet.net_id]); + RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + /* Use vnet sinks to trigger high fanout code */ + bool high_fanout = is_high_fanout(tree.num_sinks(), ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice. 
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + vnet.net_id, + tree.root()); + } + + std::vector remaining_isinks = get_vnet_remaining_isinks(vnet); + + std::vector pin_criticality(tree.num_sinks() + 1); + + /* Sort by decreasing criticality */ + for (int isink : remaining_isinks) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(vnet.net_id, isink); + pin_criticality[isink] = get_net_pin_criticality( + ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + vnet.net_id, + pin, + ctx.is_flat); + + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + // compare the criticality of different sink nodes + sort(begin(remaining_isinks), end(remaining_isinks), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules (TODO: I'm super sure this is not thread safe) */ + update_rr_base_costs(sinks.size()); + + /* Set up the tax forms for routing nets */ + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ? &conn_delay_budget : nullptr); + + /* This isn't exactly thread safe, but here both threads routing this net would be setting this to the same value */ + if (ctx.budgeting_inf.if_set()) { + ctx.budgeting_inf.set_should_reroute(vnet.net_id, false); + } + + /* Route sinks in decreasing order of criticality */ + for (unsigned itarget = 0; itarget < remaining_isinks.size(); ++itarget) { + int isink = remaining_isinks[itarget]; + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + enable_router_debug(ctx.router_opts, vnet.net_id, sink_rr, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of decomposed net %zu, clipped bbox = %d,%d - %d,%d\n", + size_t(sink_rr), size_t(vnet.net_id), vnet.clipped_bb.xmin, vnet.clipped_bb.ymin, vnet.clipped_bb.xmax, vnet.clipped_bb.ymax); + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(vnet.net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(vnet.net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(vnet.net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(vnet.net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + profiling::conn_start(); + + auto sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + vnet.net_id, + itarget, + isink, + cost_params, + ctx.router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[vnet.net_id], + ctx.is_flat, + vnet.clipped_bb); + + flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; + + /* Give up for vnet if we failed to route a sink, since it's likely we will fail others as well. 
*/ + if (!sink_flags.success) { + PartitionTreeDebug::log("Failed to route sink " + std::to_string(isink - 1) + " in decomposed net:\n" + describe_vnet(vnet)); + ctx.net_known_samples[vnet.net_id].push_back(isink); + flags.success = false; + //continue; + return flags; + } + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[vnet.net_id].data(), + tree, + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + + if (ctx.router_opts.update_lower_bound_delays) + ctx.connections_inf.update_lower_bound_connection_delay(vnet.net_id, isink, ctx.net_delay[vnet.net_id][isink]); + + profiling::conn_finish(size_t(route_ctx.net_rr_terminals[vnet.net_id][0]), + size_t(sink_rr), + pin_criticality[isink]); + + ++ctx.router_stats.local().connections_routed; + } // finished all sinks + + /* Return early if we failed to route some sinks */ + if(!flags.success) + return flags; + + ++ctx.router_stats.local().nets_routed; + profiling::net_finish(); + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + flags.success = true; + return flags; +} + +/* Helper for decompose_route_partition_tree(). */ +template +void decompose_route_partition_tree_helper(tbb::task_group& g, + PartitionTreeNode& node, + RouteIterCtx& ctx, + int level) { + vtr::Timer t; + + nets_too_deep.local() = 0; + nets_clock.local() = 0; + nets_retry_limit.local() = 0; + nets_thin_strip.local() = 0; + nets_cut_thin_strip.local() = 0; + nets_few_fanouts.local() = 0; + nets_set_to_decompose.local() = 0; + + /* Sort so net with most sinks is routed first. + * We want to interleave virtual nets with regular ones, so sort an "index vector" + * instead where indices >= node.nets.size() refer to node.virtual_nets. */ + std::vector order(node.nets.size() + node.virtual_nets.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t i, size_t j) -> bool { + ParentNetId id1 = i < node.nets.size() ? node.nets[i] : node.virtual_nets[i - node.nets.size()].net_id; + ParentNetId id2 = j < node.nets.size() ? node.nets[j] : node.virtual_nets[j - node.nets.size()].net_id; + return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); + }); + + /* Route virtual or regular nets, interleaved */ + for(size_t i: order){ + if(i < node.nets.size()){ // regular net + ParentNetId net_id = node.nets[i]; + /* Should I decompose this net? */ + if (should_decompose_net(net_id, node, ctx)) { + auto decomposed_nets = route_and_decompose(net_id, node, ctx); + if (decomposed_nets) { + auto& [left, right] = decomposed_nets.value(); + node.left->virtual_nets.push_back(left); + node.right->virtual_nets.push_back(right); + /* We changed the routing */ + ctx.rerouted_nets.local().push_back(net_id); + continue; /* We are done with this net */ + } + } + /* If not, route it here */ + auto flags = try_parallel_route_net(net_id, ctx); + + if (!flags.success && !flags.retry_with_full_bb) { + ctx.is_routable = false; + } + if (flags.was_rerouted) { + ctx.rerouted_nets.local().push_back(net_id); + } + if (flags.retry_with_full_bb) { + ctx.nets_to_retry.push_back(net_id); + } + } else { // virtual net + VirtualNet& vnet = node.virtual_nets[i - node.nets.size()]; + /* Should we decompose this vnet? 
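The loop above interleaves regular and virtual nets by sorting a single index vector, where indices past node.nets.size() refer to virtual nets. The same trick in isolation, with plain fanout counts standing in for the two net lists:

    #include <algorithm>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<int> net_fanouts = {4, 40}; // regular nets
        std::vector<int> vnet_fanouts = {10};   // virtual nets
        std::vector<size_t> order(net_fanouts.size() + vnet_fanouts.size());
        std::iota(order.begin(), order.end(), 0);
        std::sort(order.begin(), order.end(), [&](size_t i, size_t j) {
            int fi = i < net_fanouts.size() ? net_fanouts[i] : vnet_fanouts[i - net_fanouts.size()];
            int fj = j < net_fanouts.size() ? net_fanouts[j] : vnet_fanouts[j - net_fanouts.size()];
            return fi > fj; // most sinks first
        });
        // order is now {1, 2, 0}: regular net 1, the virtual net, then regular net 0
    }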
*/ + if (should_decompose_vnet(vnet, node)) { + auto decomposed_nets = route_and_decompose_vnet(vnet, node, ctx); + if (decomposed_nets) { + auto& [left, right] = decomposed_nets.value(); + node.left->virtual_nets.push_back(left); + node.right->virtual_nets.push_back(right); + continue; + } + } + /* Otherwise, route it here. + * We don't care about flags, if there's something truly wrong, + * it will get discovered when decomposition is disabled */ + route_virtual_net(vnet, ctx); + } + } + + PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size()) + + " nets and " + std::to_string(node.virtual_nets.size()) + + " virtual nets routed in " + std::to_string(t.elapsed_sec()) + + " s (level=" + std::to_string(level) + ")"); + + PartitionTreeDebug::log("total: " + std::to_string(node.nets.size()) + + " nets_too_deep: " + std::to_string(nets_too_deep.local()) + + " nets_clock: " + std::to_string(nets_clock.local()) + + " nets_retry_limit: " + std::to_string(nets_retry_limit.local()) + + " nets_thin_strip: " + std::to_string(nets_thin_strip.local()) + + " nets_cut_thin_strip: " + std::to_string(nets_cut_thin_strip.local()) + + " nets_few_fanouts: " + std::to_string(nets_few_fanouts.local()) + + " nets_set_to_decompose: " + std::to_string(nets_set_to_decompose.local())); + + /* add left and right trees to task queue */ + if (node.left && node.right) { + /* Otherwise both try to change the same "level" and garble it */ + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.left, ctx, level + 1); + }); + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.right, ctx, level + 1); + }); + } else { + VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch + } +} + +/** Route all nets in parallel using the partitioning information in the PartitionTree. + * + * @param[in, out] g TBB task group to dispatch tasks. + * @param[in, out] tree The partition tree. Non-const reference because iteration results get written on the nodes. + * @param[in, out] ctx RouteIterCtx containing all the necessary bits of state for routing. + * @return RouteIterResults combined from all threads. + * + * See comments in PartitionTreeNode for how parallel routing works. 
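A stripped-down sketch of the recursive task dispatch this function family uses: every node routes its own work, then spawns both children into the same tbb::task_group, and a single wait() at the root collects all transitively spawned tasks. Node is an illustrative stand-in for PartitionTreeNode:

    #include <cstdio>
    #include <tbb/task_group.h>

    struct Node {
        Node* left = nullptr;
        Node* right = nullptr;
    };

    void route_node(tbb::task_group& g, Node& n, int level) {
        std::printf("routing node at level %d\n", level);
        if (n.left && n.right) { // spawn both children into the same group
            g.run([&, level] { route_node(g, *n.left, level + 1); });
            g.run([&, level] { route_node(g, *n.right, level + 1); });
        }
    }

    int main() {
        Node leaves[2];
        Node root{&leaves[0], &leaves[1]};
        tbb::task_group g;
        route_node(g, root, 0);
        g.wait(); // one wait at the root collects all transitively spawned tasks
    }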
*/ +template +RouteIterResults decompose_route_partition_tree(tbb::task_group& g, + PartitionTree& tree, + RouteIterCtx& ctx) { + auto& device_ctx = g_vpr_ctx.device(); + auto& route_ctx = g_vpr_ctx.mutable_routing(); + PartitionTreeDebug::log(describe_fanout_histogram()); + + ctx.nets_to_retry.clear(); + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + thread_rerouted_nets.clear(); + } + + /* Route all nets */ + decompose_route_partition_tree_helper(g, tree.root(), ctx, 0); + g.wait(); + + /* Grow the bounding box and set to not decompose if a net is set to retry */ + for (ParentNetId net_id : ctx.nets_to_retry) { + route_ctx.route_bb[net_id] = { + 0, + (int)(device_ctx.grid.width() - 1), + 0, + (int)(device_ctx.grid.height() - 1)}; + ctx.is_decomp_disabled[net_id] = true; + } + + RouteIterResults out; + for (auto& thread_rerouted_nets: ctx.rerouted_nets){ + out.rerouted_nets.insert(out.rerouted_nets.begin(), thread_rerouted_nets.begin(), thread_rerouted_nets.end()); + } + for (auto& thread_stats : ctx.router_stats) { + update_router_stats(out.stats, thread_stats); + } + return out; +} + +/* Build a partition tree and do a net-decomposing route with it */ +template +static RouteIterResults decompose_route_with_partition_tree(tbb::task_group& g, RouteIterCtx& ctx) { + vtr::Timer t2; + PartitionTree partition_tree(ctx.net_list, ctx.net_scores); + + float total_prep_time = t2.elapsed_sec(); + VTR_LOG("# Built partition tree in %f seconds\n", total_prep_time); + + return decompose_route_partition_tree(g, partition_tree, ctx); +} + #endif // VPR_USE_TBB diff --git a/vpr/src/route/route_samplers.cpp b/vpr/src/route/route_samplers.cpp new file mode 100644 index 00000000000..b8969d06507 --- /dev/null +++ b/vpr/src/route/route_samplers.cpp @@ -0,0 +1,69 @@ +/** Bulky geometry code for route_samplers.h + * TODO: Make the fns available in vtr_geometry.h? */ + +#include "route_samplers.h" + +/** Cross product of v0v1 and v0p */ +constexpr int det(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return (v1.x - v0.x) * (p.y - v0.y) - (v1.y - v0.y) * (p.x - v0.x); +} + +/** Which side of [v0, v1] has p? +1 is right, -1 is left */ +constexpr int which_side(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return det(p, v0, v1) > 0 ? 1 : -1; +} + +/** Perpendicular distance of p to v0v1 assuming |v0v1| = 1 + * (it's not, so only use to compare when v0 and v1 is the same for different p's) */ +constexpr int dist(const SinkPoint& p, const SinkPoint& v0, const SinkPoint& v1){ + return abs(det(p, v0, v1)); +} + +/** Helper for quickhull() */ +void find_hull(std::set& out, const std::vector& points, const SinkPoint& v0, const SinkPoint& v1, int side){ + int max_dist = 0; + const SinkPoint* max_p = nullptr; + for(auto& point: points){ + if(which_side(point, v0, v1) != side){ + continue; + } + int h = dist(point, v0, v1); + if(h > max_dist){ + max_dist = h; + max_p = &point; + } + } + if(!max_p) /* no point */ + return; + out.insert(*max_p); + find_hull(out, points, v0, *max_p, -1); + find_hull(out, points, *max_p, v1, -1); +} + +/** Find convex hull. Doesn't work with <3 points. 
+ * See https://en.wikipedia.org/wiki/Quickhull */ +std::vector quickhull(const std::vector& points){ + if(points.size() < 3) + return std::vector(); + + std::set out; + + int min_x = std::numeric_limits::max(); + int max_x = std::numeric_limits::min(); + const SinkPoint* min_p, *max_p; + for(auto& point: points){ + if(point.x <= min_x){ + min_x = point.x; + min_p = &point; + } + if(point.x >= max_x){ + max_x = point.x; + max_p = &point; + } + } + out.insert(*min_p); + out.insert(*max_p); + find_hull(out, points, *min_p, *max_p, -1); + find_hull(out, points, *min_p, *max_p, 1); + return std::vector(out.begin(), out.end()); +} diff --git a/vpr/src/route/route_samplers.h b/vpr/src/route/route_samplers.h new file mode 100644 index 00000000000..f1d2a222a57 --- /dev/null +++ b/vpr/src/route/route_samplers.h @@ -0,0 +1,503 @@ +/** Sink downsamplers for parallel routing. + * + * These are used to get a "minimal skeleton routing" from the main task. + * Rest of the routing is delegated to child tasks. They will work with a + * strictly limited bounding box, so it's necessary that the initial routing + * provides enough hints while routing to as few sinks as possible to limit + * the serial bottleneck. */ +#pragma once + +#include +#include +#include +#include "globals.h" +#include "partition_tree.h" +#include "route_common.h" +#include "router_lookahead_sampling.h" + +/** Minimum bin size when spatially sampling decomposition sinks. (I know, doesn't make much sense.) + * The parallel router tries to decompose nets by building a "skeleton routing" from the main task + * and then delegating the remaining work to its child tasks. This minimum bin size determines how much + * time the main thread spends building the skeleton. + * Less is more effort -> less speedup, better quality. + * See get_decomposition_isinks() for more info. */ +constexpr size_t MIN_DECOMP_BIN_WIDTH = 5; + +/** Sink container for geometry operations */ +struct SinkPoint { + int x; + int y; + int isink; + + bool operator==(const SinkPoint& rhs) const { + return x == rhs.x && y == rhs.y; + } + bool operator<(const SinkPoint& rhs) const { + if(x < rhs.x) + return true; + if(x > rhs.x) + return false; + if(y < rhs.y) + return true; + if(y > rhs.y) + return false; + return isink < rhs.isink; + } +}; + +/** Find convex hull. Doesn't work with <3 points. + * See https://en.wikipedia.org/wiki/Quickhull */ +std::vector quickhull(const std::vector& points); + +/** Which side of the cutline is this RRNode? + * Cutlines are always assumed to be at cutline_axis = (cutline_pos + 0.5). + * In the context of the parallel router, a RR node is considered to be inside a bounding + * box if its top left corner (xlow, ylow) is inside it. */ +inline Side which_side(RRNodeId inode, int cutline_pos, Axis axis) { + auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + if (axis == Axis::X) { + return Side(rr_graph.node_xlow(inode) > cutline_pos); /* 1 is RIGHT */ + } else { + return Side(rr_graph.node_ylow(inode) > cutline_pos); + } +} + +/** Sample most critical sink in every MIN_DECOMP_BIN_WIDTH-wide bin. Bins are grown to absorb fractional bins. + * Skip a bin if already reached by existing routing. 
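For reference, the determinant side test that quickhull() above is built on, in self-contained form: the sign of the cross product of v0->v1 and v0->p tells which side of the segment p lies on:

    #include <cassert>

    struct Pt {
        int x, y;
    };

    constexpr int det(Pt p, Pt v0, Pt v1) {
        return (v1.x - v0.x) * (p.y - v0.y) - (v1.y - v0.y) * (p.x - v0.x);
    }

    int main() {
        Pt v0{0, 0}, v1{4, 0};
        assert(det(Pt{2, 3}, v0, v1) > 0);  // above the segment
        assert(det(Pt{2, -3}, v0, v1) < 0); // below the segment
        assert(det(Pt{2, 0}, v0, v1) == 0); // collinear
    }

find_hull() recurses with this test to keep only the points outside the current hull edge.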
*/ +inline std::vector min_voxel_downsample(ParentNetId net_id, const std::vector& remaining_targets) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector out; + + /* Set up sampling bins. If we are sampling from W = 22 with minimum width 6, then we have + * 3 bins and real width is 22/3 + 1 = 8. Then x=0 goes to bin 0, x=8 goes to bin 1 etc. */ + const t_bb& net_bb = route_ctx.route_bb[net_id]; + size_t width = net_bb.xmax - net_bb.xmin + 1; + size_t height = net_bb.ymax - net_bb.ymin + 1; + size_t bins_x = width / MIN_DECOMP_BIN_WIDTH; + size_t bins_y = height / MIN_DECOMP_BIN_WIDTH; + size_t samples_to_find = bins_x * bins_y; + size_t bin_width_x = width / bins_x + 1; + size_t bin_width_y = height / bins_y + 1; + + /* The sample for each bin, indexed by [x][y]. Set to -1 if reached by existing routing, + * 0 if not found yet. */ + std::vector> samples(bins_x, std::vector(bins_y)); + constexpr int REACHED = -1; + constexpr int NONE = 0; + + /* Mark bins with already reached sinks. */ + for (int isink : tree.get_reached_isinks()) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] != REACHED) { + samples[x][y] = REACHED; + samples_to_find--; + } + } + + /* Spatially sample remaining targets. This should be already sorted by pin criticality, + * so we sample the most critical sink in the bin right away. */ + for (int isink : remaining_targets) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] == NONE) { + samples[x][y] = isink; + out.push_back(isink); + samples_to_find--; + } + } + + return out; +} + +/** Sample sinks on the convex hull of the set {source + sinks}. Skip sinks if already reached. */ +inline void convex_hull_downsample(ParentNetId net_id, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][i]; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0) /* source */ + continue; + if(!is_isink_reached[point.isink]) + out.insert(point.isink); + } +} + +/** Clip bb to one side of the cutline given the axis and position of the cutline. + * Note that cutlines are assumed to be at axis = cutline_pos + 0.5. 
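The bin sizing arithmetic of min_voxel_downsample() above, worked through on the example from its comment (width 22 with minimum bin width 6 gives 3 bins of effective width 8):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t MIN_BIN_WIDTH = 6; // stand-in for MIN_DECOMP_BIN_WIDTH
        std::size_t width = 22;
        std::size_t bins_x = width / MIN_BIN_WIDTH;   // 3 bins
        std::size_t bin_width_x = width / bins_x + 1; // effective width 22/3 + 1 = 8
        std::size_t xs[] = {0, 7, 8, 21};
        for (std::size_t x : xs)
            std::printf("x=%zu -> bin %zu\n", x, x / bin_width_x); // bins 0, 0, 1, 2
    }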
*/ +inline t_bb clip_to_side2(const t_bb& bb, Axis axis, int cutline_pos, Side side) { + t_bb out = bb; + if (axis == Axis::X && side == Side::LEFT) + out.xmax = cutline_pos; + else if (axis == Axis::X && side == Side::RIGHT) + out.xmin = cutline_pos + 1; + else if (axis == Axis::Y && side == Side::LEFT) + out.ymax = cutline_pos; + else if (axis == Axis::Y && side == Side::RIGHT) + out.ymin = cutline_pos + 1; + else + VTR_ASSERT_MSG(false, "Unreachable"); + return out; +} + +inline int dist2(int x1, int y1, int x2, int y2){ + return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1); +} + +/** Sample one sink closest to each bbox's epicenter. The rationale is that the + * sinks around the cutline will be sampled by the sink thickness rule anyway. */ +inline void sample_both_epicenters(ParentNetId net_id, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + + int num_sinks = tree.num_sinks(); + auto& is_isink_reached = tree.get_is_isink_reached(); + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + int left_epi_x = (left_bb.xmin + left_bb.xmax) / 2; + int left_epi_y = (left_bb.ymin + left_bb.ymax) / 2; + int right_epi_x = (right_bb.xmin + right_bb.xmax) / 2; + int right_epi_y = (right_bb.ymin + right_bb.ymax) / 2; + int best_score_left = std::numeric_limits::max(); + int best_score_right = std::numeric_limits::max(); + int best_left_isink = 0; + int best_right_isink = 0; + + for(int isink=1; isink& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + int num_sinks = tree.num_sinks(); + auto& is_isink_reached = tree.get_is_isink_reached(); + const t_bb& net_bb = vnet.clipped_bb; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + int left_epi_x = (left_bb.xmin + left_bb.xmax) / 2; + int left_epi_y = (left_bb.ymin + left_bb.ymax) / 2; + int right_epi_x = (right_bb.xmin + right_bb.xmax) / 2; + int right_epi_y = (right_bb.ymin + right_bb.ymax) / 2; + int best_score_left = std::numeric_limits::max(); + int best_score_right = std::numeric_limits::max(); + int best_left_isink = 0; + int best_right_isink = 0; + + for(int isink=1; isink& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][i]; + if(!inside_bb(rr_sink, vnet.clipped_bb)) + continue; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0) /* source */ + continue; + if(!is_isink_reached[point.isink]) + 
out.insert(point.isink); + } +} + +/** Sample sinks on the *sink side* of the convex hull of the set {source + sinks}. + * Skip sinks if already reached. */ +inline std::vector half_convex_hull_downsample(ParentNetId net_id, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector out; + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][i]; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + RRNodeId rr_source = route_ctx.net_rr_terminals[net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0 || is_isink_reached[point.isink]) /* source or reached */ + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][point.isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) == source_side) /* on source side */ + continue; + out.push_back(point.isink); + } + + return out; +} + +/** Sample sinks on the *sink side* of the convex hull of the set {source + sinks}. + * Skip sinks if already reached. */ +inline std::vector half_convex_hull_downsample_vnet(const VirtualNet& vnet, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + std::vector out; + std::vector sink_points; + + /* i = 0 corresponds to the source */ + for(size_t i = 0; i < tree.num_sinks()+1; i++){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][i]; + if(!inside_bb(rr_sink, vnet.clipped_bb)) + continue; + SinkPoint point {rr_graph.node_xlow(rr_sink), rr_graph.node_ylow(rr_sink), int(i)}; + sink_points.push_back(point); + } + + auto hull = quickhull(sink_points); + + auto& is_isink_reached = tree.get_is_isink_reached(); + RRNodeId rr_source = route_ctx.net_rr_terminals[vnet.net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + /* Sample if not reached and not source */ + for(auto& point: hull){ + if(point.isink == 0 || is_isink_reached[point.isink]) /* source or reached */ + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][point.isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) == source_side) /* on source side */ + continue; + out.push_back(point.isink); + } + + return out; +} + +/** Sample the most critical sink on the other side of the cutline. + * Sample nothing if that's already reached. 
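All of the half-hull samplers hinge on the cutline side convention: a cutline nominally at cutline_pos + 0.5 puts a node on the RIGHT exactly when its low coordinate exceeds cutline_pos. A minimal sketch (this Side enum mirrors the one assumed from partition_tree.h):

    #include <cassert>

    enum class Side { LEFT = 0, RIGHT = 1 };

    Side which_side(int xlow, int cutline_pos) {
        return xlow > cutline_pos ? Side::RIGHT : Side::LEFT;
    }

    int main() {
        assert(which_side(5, 5) == Side::LEFT);  // 5 < 5.5
        assert(which_side(6, 5) == Side::RIGHT); // 6 > 5.5
    }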
*/ +inline std::vector sample_single_sink(ParentNetId net_id, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + RRNodeId rr_source = route_ctx.net_rr_terminals[net_id][0]; + Side source_side = which_side(rr_source, cutline_pos, cutline_axis); + for(int isink: isinks){ + if(is_isink_reached[isink]) + continue; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + if(which_side(rr_sink, cutline_pos, cutline_axis) != source_side){ + if(is_isink_reached[isink]) + return {}; + else + return {isink}; + } + } + + return {}; +} + +inline bool is_close_to_cutline2(RRNodeId inode, int cutline_pos, Axis cutline_axis, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + /* Cutlines are considered to be at x + 0.5, set a thickness of +1 here by checking for equality */ + if(cutline_axis == Axis::X){ + return rr_graph.node_xlow(inode) - thickness <= cutline_pos && rr_graph.node_xhigh(inode) + thickness >= cutline_pos; + } else { + return rr_graph.node_ylow(inode) - thickness <= cutline_pos && rr_graph.node_yhigh(inode) + thickness >= cutline_pos; + } +} + +/** Is \p inode too close to this bb? (Assuming it's inside) + * We assign some "thickness" to the node and check for collision */ +inline bool is_close_to_bb2(RRNodeId inode, const t_bb& bb, int thickness){ + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + int xlow = rr_graph.node_xlow(inode) - thickness; + int ylow = rr_graph.node_ylow(inode) - thickness; + int xhigh = rr_graph.node_xhigh(inode) + thickness; + int yhigh = rr_graph.node_yhigh(inode) + thickness; + + return (xlow <= bb.xmin && xhigh >= bb.xmin) + || (ylow <= bb.ymin && yhigh >= bb.ymin) + || (xlow <= bb.xmax && xhigh >= bb.xmax) + || (ylow <= bb.ymax && yhigh >= bb.ymax); +} + +/** Sample the most critical sinks on both sides. Omit reached sinks. 
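The proximity tests above inflate a node's extent by a thickness before checking for overlap; the cutline variant in isolation:

    #include <cassert>

    // Node spans [lo, hi]; inflate both ends by `thickness` and test the cutline.
    bool close_to_cutline(int lo, int hi, int cutline_pos, int thickness) {
        return lo - thickness <= cutline_pos && hi + thickness >= cutline_pos;
    }

    int main() {
        assert(close_to_cutline(7, 7, 5, 2));  // inflated span 5..9 touches 5
        assert(!close_to_cutline(9, 9, 5, 2)); // inflated span 7..11 misses 5
    }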
*/ +inline void sample_two_sinks(ParentNetId net_id, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + int left_isink = -1; + int right_isink = -1; + const t_bb& net_bb = route_ctx.route_bb[net_id]; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + + for(int isink: isinks){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + if(is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3)) + continue; + if(inside_bb(rr_sink, left_bb)){ + left_isink = isink; + }else if(inside_bb(rr_sink, right_bb)){ + right_isink = isink; + } + if(left_isink > -1 && right_isink > -1) + break; + } + + if(left_isink > -1 && !is_isink_reached[left_isink]) + out.insert(left_isink); + if(right_isink > -1 && !is_isink_reached[right_isink]) + out.insert(right_isink); +} + +/** Sample the most critical sinks on both sides. Omit reached sinks. */ +inline void sample_two_sinks_vnet(const VirtualNet& vnet, const std::vector& pin_criticality, int cutline_pos, Axis cutline_axis, std::set& out) { + const auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + auto& is_isink_reached = tree.get_is_isink_reached(); + + std::vector isinks(tree.num_sinks()); + std::iota(isinks.begin(), isinks.end(), 1); + std::sort(isinks.begin(), isinks.end(), [&](int i, int j){ + return pin_criticality[i] > pin_criticality[j]; + }); + + int left_isink = -1; + int right_isink = -1; + const t_bb& net_bb = vnet.clipped_bb; + t_bb left_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::LEFT); + t_bb right_bb = clip_to_side2(net_bb, cutline_axis, cutline_pos, Side::RIGHT); + + for(int isink: isinks){ + RRNodeId rr_sink = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if(inside_bb(rr_sink, left_bb) && !is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3) && !is_close_to_bb2(rr_sink, left_bb, 1)){ + left_isink = isink; + }else if(inside_bb(rr_sink, right_bb) && !is_close_to_cutline2(rr_sink, cutline_pos, cutline_axis, 3) && !is_close_to_bb2(rr_sink, right_bb, 1)){ + right_isink = isink; + } + if(left_isink > -1 && right_isink > -1) + break; + } + + if(left_isink > -1 && !is_isink_reached[left_isink]) + out.insert(left_isink); + if(right_isink > -1 && !is_isink_reached[right_isink]) + out.insert(right_isink); +} \ No newline at end of file diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp index 62930ad2555..059e80e69d4 100644 --- a/vpr/src/route/route_timing.cpp +++ b/vpr/src/route/route_timing.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -65,42 +66,6 @@ static int num_routing_failed = 0; /******************** Subroutines local to route_timing.cpp ********************/ -/** Attempt to route a single sink (target_pin) in a net. - * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree - * for this net. 
- * - * @param router The ConnectionRouter instance - * @param net_list Input netlist - * @param net_id - * @param itarget # of this connection in the net (only used for debug output) - * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) - * @param cost_params - * @param router_opts - * @param[in, out] tree RouteTree describing the current routing state - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes - * @param spatial_rt_lookup - * @param router_stats - * @param budgeting_inf - * @param routing_predictor - * @param choking_spots - * @param is_flat - * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ -template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat); - /** Return tuple of: * bool: Did we find a path for each sink in this net? * bool: Should the caller retry with a full-device bounding box? */ @@ -117,42 +82,6 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo bool is_flat, bool can_grow_bb); -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets); - -static void update_net_delays_from_route_tree(float* net_delay, - const Netlist<>& net_list, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator); - -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack); - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat); - -struct more_sinks_than { - const Netlist<>& net_list_; - more_sinks_than(const Netlist<>& net_list) - : net_list_(net_list) {} - inline bool operator()(const ParentNetId& net_index1, const ParentNetId& net_index2) { - return net_list_.net_sinks(net_index1).size() > net_list_.net_sinks(net_index2).size(); - } -}; - -static bool is_high_fanout(int fanout, int fanout_threshold); - // The reason that try_timing_driven_route_tmpl (and descendents) are being // templated over is because using a virtual interface instead fully templating // the router results in a 5% runtime increase. @@ -253,7 +182,10 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, //sort so net with most sinks is routed first. 
auto sorted_nets = std::vector(net_list.nets().begin(), net_list.nets().end()); - std::sort(sorted_nets.begin(), sorted_nets.end(), more_sinks_than(net_list)); + + std::sort(sorted_nets.begin(), sorted_nets.end(), [&](const ParentNetId id1, const ParentNetId id2) -> bool { + return net_list.net_sinks(id1).size() > net_list.net_sinks(id2).size(); + }); /* * Configure the routing predictor @@ -413,7 +345,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, RouterStats router_stats; init_router_stats(router_stats); - timing_driven_route_structs route_structs(net_list); float prev_iter_cumm_time = 0; vtr::Timer iteration_timer; int num_net_bounding_boxes_updated = 0; @@ -465,7 +396,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_iteration_stats, - route_structs.pin_criticality, net_delay, netlist_pin_lookup, route_timing_info, @@ -863,7 +793,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -901,7 +830,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, router_opts, connections_inf, router_stats, - pin_criticality, net_delay[net_id].data(), netlist_pin_lookup, timing_info, @@ -946,7 +874,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -993,18 +920,20 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, // remaining_targets from this point on are the **pin indices** that have yet to be routed std::vector remaining_targets(tree.get_remaining_isinks().begin(), tree.get_remaining_isinks().end()); + std::vector pin_criticality(num_sinks + 1); + // calculate criticality of remaining target pins for (int ipin : remaining_targets) { if (timing_info) { auto pin = net_list.net_pin(net_id, ipin); - pin_criticality[ipin] = get_net_pin_criticality(timing_info, - netlist_pin_lookup, - router_opts.max_criticality, - router_opts.criticality_exp, - net_id, - pin, - is_flat); - + pin_criticality[ipin] = get_net_pin_criticality( + timing_info, + netlist_pin_lookup, + router_opts.max_criticality, + router_opts.criticality_exp, + net_id, + pin, + is_flat); } else { //No timing info, implies we want a min delay routing, so use criticality of 1. 
pin_criticality[ipin] = 1.; @@ -1012,7 +941,7 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, } // compare the criticality of different sink nodes - sort(begin(remaining_targets), end(remaining_targets), [&](int a, int b) { + std::sort(remaining_targets.begin(), remaining_targets.end(), [&](int a, int b) { return pin_criticality[a] > pin_criticality[b]; }); @@ -1044,17 +973,18 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, * routers handle this in the same way */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); - std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root(router, - net_id, - net_list, - sink_node, - cost_params, - router_opts.high_fanout_threshold, - tree, - spatial_route_tree_lookup, - router_stats, - is_flat, - can_grow_bb); + std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root( + router, + net_id, + net_list, + sink_node, + cost_params, + router_opts.high_fanout_threshold, + tree, + spatial_route_tree_lookup, + router_stats, + is_flat, + can_grow_bb); return flags; } @@ -1084,20 +1014,22 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, profiling::conn_start(); // build a branch in the route tree to the target - auto sink_flags = timing_driven_route_sink(router, - net_list, - net_id, - itarget, - target_pin, - cost_params, - router_opts, - tree, - spatial_route_tree_lookup, - router_stats, - budgeting_inf, - routing_predictor, - choking_spots, - is_flat); + auto sink_flags = timing_driven_route_sink( + router, + net_list, + net_id, + itarget, + target_pin, + cost_params, + router_opts, + tree, + (high_fanout ? &spatial_route_tree_lookup : nullptr), + router_stats, + budgeting_inf, + routing_predictor, + choking_spots, + is_flat, + route_ctx.route_bb[net_id]); flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; @@ -1173,7 +1105,8 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo std::unordered_map()); std::tie(found_path, retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -1229,20 +1162,21 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo } template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup* spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box) { const auto& device_ctx = g_vpr_ctx.device(); auto& route_ctx = g_vpr_ctx.mutable_routing(); @@ -1257,14 +1191,13 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, bool found_path; t_heap cheapest; - t_bb bounding_box = route_ctx.route_bb[net_id]; /* Is the 
connection router allowed to grow the bounding box? That's not the case * when routing in parallel, so disallow it. */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); bool net_is_global = net_list.net_is_global(net_id); - bool high_fanout = is_high_fanout(net_list.net_sinks(net_id).size(), router_opts.high_fanout_threshold); + bool high_fanout = (spatial_rt_lookup != nullptr); constexpr float HIGH_FANOUT_CRITICALITY_THRESHOLD = 0.9; bool sink_critical = (cost_params.criticality > HIGH_FANOUT_CRITICALITY_THRESHOLD); bool net_is_clock = route_ctx.is_clock_net[net_id] != 0; @@ -1276,22 +1209,26 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, //However, if the current sink is 'critical' from a timing perspective, we put the entire route tree back onto //the heap to ensure it has more flexibility to find the best path. if (high_fanout && !sink_critical && !net_is_global && !net_is_clock && -routing_predictor.get_slope() > router_opts.high_fanout_max_slope) { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout(tree.root(), - sink_node, - cost_params, - bounding_box, - spatial_rt_lookup, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + *spatial_rt_lookup, + router_stats, + conn_params, + can_grow_bb); } else { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + can_grow_bb); } if (!found_path) { @@ -1305,6 +1242,8 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, if (f_router_debug) { update_screen(ScreenUpdatePriority::MAJOR, "Unable to route connection.", ROUTING, nullptr); } + /* Reset path costs since routing may go on after a failure */ + router.reset_path_costs(); flags.success = false; return flags; } @@ -1315,9 +1254,9 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, route_ctx.rr_node_route_inf[inode].target_flag--; /* Connected to this SINK. */ vtr::optional new_branch, new_sink; - std::tie(new_branch, new_sink) = tree.update_from_heap(&cheapest, target_pin, ((high_fanout) ? 
&spatial_rt_lookup : nullptr), is_flat); + std::tie(new_branch, new_sink) = tree.update_from_heap(&cheapest, target_pin, spatial_rt_lookup, is_flat); - VTR_ASSERT_DEBUG(!high_fanout || validate_route_tree_spatial_lookup(tree.root(), spatial_rt_lookup)); + VTR_ASSERT_DEBUG(!high_fanout || validate_route_tree_spatial_lookup(tree.root(), *spatial_rt_lookup)); if (f_router_debug) { std::string msg = vtr::string_fmt("Routed Net %zu connection %d to RR node %d successfully", size_t(net_id), itarget, sink_node); @@ -1343,18 +1282,14 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, return flags; } -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets) { - /* Build and return a partial route tree from the legal connections from last iteration. - * along the way do: - * update pathfinder costs to be accurate to the partial route tree - * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets) { auto& route_ctx = g_vpr_ctx.mutable_routing(); /* "tree" points to this net's spot in the global context here, so re-initializing it etc. changes the global state */ @@ -1367,12 +1302,12 @@ static void setup_routing_resources(int itry, /* rip up the whole net */ if (tree) - pathfinder_update_cost_from_route_tree(tree.value().root(), -1); + pathfinder_update_cost_from_route_tree(tree->root(), -1); tree = vtr::nullopt; /* re-initialize net */ tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); // since all connections will be rerouted for this net, clear all of net's forced reroute flags connections_inf.clear_force_reroute_for_net(net_id); @@ -1386,7 +1321,7 @@ static void setup_routing_resources(int itry, if (!tree) { tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); } /* copy the existing routing @@ -1415,31 +1350,33 @@ static void setup_routing_resources(int itry, // Initialize only to source tree = RouteTree(net_id); - pathfinder_update_cost_from_route_tree(tree.value().root(), 1); + pathfinder_update_cost_from_route_tree(tree->root(), 1); } + profiling::net_rebuild_end(num_sinks, tree->get_remaining_isinks().size()); profiling::net_rebuild_end(num_sinks, tree->get_remaining_isinks().size()); // still need to calculate the tree's time delay - tree.value().reload_timing(); + tree->reload_timing(); // check for R_upstream C_downstream and edge correctness - VTR_ASSERT_SAFE(tree.value().is_valid()); + VTR_ASSERT_SAFE(tree->is_valid()); // congestion should've been pruned away - VTR_ASSERT_SAFE(tree.value().is_uncongested()); + VTR_ASSERT_SAFE(tree->is_uncongested()); // mark remaining ends mark_remaining_ends(net_id); // mark the lookup (rr_node_route_inf) for existing tree elements as NO_PREVIOUS so add_to_path stops when it reaches one of them - update_rr_route_inf_from_tree(tree.value().root()); + update_rr_route_inf_from_tree(tree->root()); } // completed constructing the partial route tree and updated all other data structures to match } -/** Change the 
base costs of rr_nodes according to # of fanouts */ +/** Change the base costs of rr_nodes according to # of fanouts + * TODO: Doesn't seem very thread safe? */ void update_rr_base_costs(int fanout) { auto& device_ctx = g_vpr_ctx.mutable_device(); @@ -1511,11 +1448,11 @@ bool timing_driven_check_net_delays(const Netlist<>& net_list, NetPinsMatrix& net_list, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator) { +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator) { auto& route_ctx = g_vpr_ctx.routing(); const RouteTree& tree = route_ctx.route_trees[inet].value(); @@ -1524,7 +1461,7 @@ static void update_net_delays_from_route_tree(float* net_delay, } } -/* Detect if net should be routed or not */ +/** Detect if \p net_id should be routed or not. */ bool should_route_net(ParentNetId net_id, CBRR& connections_inf, bool if_force_reroute) { @@ -1576,25 +1513,13 @@ bool early_exit_heuristic(const t_router_opts& router_opts, const WirelengthInfo return false; } -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) { - /* When RCV is enabled, it's necessary to be able to completely ripup high fanout nets if there is still negative hold slack - * Normally the router will prune the illegal branches of high fanout nets, this will bypass this */ - - if (router_opts.routing_budgets_algorithm != YOYO) { - return false; - } else if (worst_neg_slack != 0) { - return true; - } - return false; -} - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat) { +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId pin_id, + bool is_flat) { float pin_criticality = 0.0; const auto& route_ctx = g_vpr_ctx.routing(); @@ -1646,6 +1571,23 @@ WirelengthInfo calculate_wirelength_info(const Netlist<>& net_list, size_t avail auto& route_ctx = g_vpr_ctx.routing(); +#ifdef VPR_USE_TBB + tbb::combinable thread_used_wirelength(0); + + tbb::parallel_for_each(net_list.nets().begin(), net_list.nets().end(), [&](ParentNetId net_id){ + if (!net_list.net_is_ignored(net_id) + && net_list.net_sinks(net_id).size() != 0 /* Globals don't count. */ + && route_ctx.route_trees[net_id]) { + int bends, wirelength, segments; + bool is_absorbed; + get_num_bends_and_length(net_id, &bends, &wirelength, &segments, &is_absorbed); + + thread_used_wirelength.local() += wirelength; + } + }); + + used_wirelength = thread_used_wirelength.combine(std::plus()); +#else for (auto net_id : net_list.nets()) { if (!net_list.net_is_ignored(net_id) && net_list.net_sinks(net_id).size() != 0 /* Globals don't count. 
*/ @@ -1657,6 +1599,7 @@ WirelengthInfo calculate_wirelength_info(const Netlist<>& net_list, size_t avail used_wirelength += wirelength; } } +#endif return WirelengthInfo(available_wirelength, used_wirelength); } @@ -1789,12 +1732,6 @@ void print_overused_nodes_status(const t_router_opts& router_opts, const Overuse VTR_LOG("\n"); } -//Returns true if the specified net fanout is classified as high fanout -static bool is_high_fanout(int fanout, int fanout_threshold) { - if (fanout_threshold < 0 || fanout < fanout_threshold) return false; - return true; -} - // In heavily congested designs a static bounding box (BB) can // become problematic for routability (it effectively enforces a // hard blockage restricting where a net can route). @@ -1850,7 +1787,9 @@ size_t dynamic_update_bounding_boxes(const std::vector& updated_net //use different bounding boxes based on the target location. // //This ensures that the delta values calculated below are always non-negative - if (is_high_fanout(net_list.net_sinks(net).size(), high_fanout_threshold)) continue; + //EXPERIMENT: Do it anyway -- we now clip BBs of HF nets + //if (is_high_fanout(net_list.net_sinks(net).size(), high_fanout_threshold)) + // continue; t_bb curr_bb = calc_current_bb(route_ctx.route_trees[net].value()); t_bb& router_bb = route_ctx.route_bb[net]; diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h index 38495bb806b..01d1228cf7a 100644 --- a/vpr/src/route/route_timing.h +++ b/vpr/src/route/route_timing.h @@ -21,7 +21,6 @@ extern bool f_router_debug; -/** TODO: remove timing_driven_route_structs together with this fn */ int get_max_pins_per_net(const Netlist<>& net_list); /** Types and defines common to timing_driven and parallel routers */ @@ -62,20 +61,6 @@ struct RoutingMetrics { tatum::TimingPathInfo critical_path; }; -/* Data while timing driven route is active */ -class timing_driven_route_structs { - public: - std::vector pin_criticality; /* [1..max_pins_per_net-1] */ - - timing_driven_route_structs(const Netlist<>& net_list) { - int max_sinks = std::max(get_max_pins_per_net(net_list) - 1, 0); - pin_criticality.resize(max_sinks + 1); - - /* Set element 0 to invalid values */ - pin_criticality[0] = std::numeric_limits::quiet_NaN(); - } -}; - /** Returns the bounding box of a net's used routing resources */ t_bb calc_current_bb(const RouteTree& tree); @@ -109,6 +94,12 @@ void generate_route_timing_reports(const t_router_opts& router_opts, const RoutingDelayCalculator& delay_calc, bool is_flat); +/** Returns true if the specified net fanout is classified as high fanout. */ +inline bool is_high_fanout(int fanout, int fanout_threshold) { + if (fanout_threshold < 0 || fanout < fanout_threshold) return false; + return true; +} + /** Initialize net_delay based on best-case delay estimates from the router lookahead. */ void init_net_delay_from_lookahead(const RouterLookahead& router_lookahead, const Netlist<>& net_list, @@ -196,6 +187,67 @@ bool try_timing_driven_route(const Netlist<>& net_list, ScreenUpdatePriority first_iteration_priority, bool is_flat); +/** Calculate pin criticality for \p pin_id of \p net_id. */ +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId pin_id, + bool is_flat); + +/** Build and return a partial route tree from the legal connections from last iteration. 
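The VPR_USE_TBB branch of calculate_wirelength_info() above follows the standard combinable/parallel_for_each reduction pattern: each thread accumulates into a private counter, and the partial sums are combined once at the end. A self-contained version with dummy per-net wirelengths:

    #include <cstdio>
    #include <functional>
    #include <vector>
    #include <tbb/combinable.h>
    #include <tbb/parallel_for_each.h>

    int main() {
        std::vector<int> wirelengths(1000, 2); // pretend per-net wirelengths
        tbb::combinable<size_t> local_sum([] { return size_t(0); });
        tbb::parallel_for_each(wirelengths.begin(), wirelengths.end(), [&](int wl) {
            local_sum.local() += wl; // one accumulator per thread, no locking
        });
        size_t total = local_sum.combine(std::plus<size_t>());
        std::printf("total wirelength: %zu\n", total); // 2000
    }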
+ * along the way do: + * update pathfinder costs to be accurate to the partial route tree + * find and store the pins that still need to be reached in connections_inf.remaining_targets + * find and store the rt nodes that have been reached in connections_inf.reached_rt_sinks + * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets); + +/** Attempt to route a single sink (target_pin) in a net. + * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree + * for this net. + * + * @param router The ConnectionRouter instance + * @param net_list Input netlist + * @param net_id + * @param itarget # of this connection in the net (only used for debug output) + * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) + * @param cost_params + * @param router_opts + * @param[in, out] tree RouteTree describing the current routing state + * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes + * @param spatial_rt_lookup + * @param router_stats + * @param budgeting_inf + * @param routing_predictor + * @param choking_spots + * @param is_flat + * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ +template +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup* spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box); + /** Attempt to route a single net. * * @param router The ConnectionRouter instance @@ -207,7 +259,6 @@ bool try_timing_driven_route(const Netlist<>& net_list, * @param connections_inf * @param router_stats * @param pin_criticality - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes * @param net_delay * @param netlist_pin_lookup * @param timing_info @@ -227,7 +278,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -247,7 +297,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -278,6 +327,15 @@ inline void update_net_delay_from_isink(float* net_delay, net_delay[isink] = new_delay; } +/* Goes through all the sinks of this net and copies their delay values from + * the route_tree to the net_delay array. */ +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator); + +/** Combine \p router_iteration_stats into \p router_stats. 
*/ void update_router_stats(RouterStats& router_stats, RouterStats& router_iteration_stats); #ifndef NO_GRAPHICS diff --git a/vpr/src/route/route_tree.cpp b/vpr/src/route/route_tree.cpp index 36f37461527..01667d8ace2 100644 --- a/vpr/src/route/route_tree.cpp +++ b/vpr/src/route/route_tree.cpp @@ -40,9 +40,13 @@ void RouteTreeNode::print_x(int depth) const { auto& device_ctx = g_vpr_ctx.device(); const auto& rr_graph = device_ctx.rr_graph; - VTR_LOG("%srt_node: %d (%s) \t ipin: %d \t R: %g \t C: %g \t delay: %g \t", + VTR_LOG("%srt_node: %d (%d, %d) -> (%d, %d) (%s) ipin: %d R: %g C: %g delay: %g ", indent.c_str(), inode, + rr_graph.node_xlow(inode), + rr_graph.node_ylow(inode), + rr_graph.node_xhigh(inode), + rr_graph.node_yhigh(inode), rr_graph.node_type_string(inode), net_pin_index, R_upstream, @@ -50,7 +54,7 @@ void RouteTreeNode::print_x(int depth) const { Tdel); if (_parent) { - VTR_LOG("parent: %d \t parent_switch: %d", _parent->inode, parent_switch); + VTR_LOG("parent: %d parent_switch: %d", _parent->inode, parent_switch); bool parent_edge_configurable = rr_graph.rr_switch_inf(parent_switch).configurable(); if (!parent_edge_configurable) { VTR_LOG("*"); @@ -288,7 +292,7 @@ RouteTree::update_unbuffered_ancestors_C_downstream(RouteTreeNode& from_node) { /* Having set the value of C_downstream_addition, we must check whether the parent switch * is a buffered or unbuffered switch with the if statement below. If the parent switch is - * a buffered switch, then the parent node's downsteam capacitance is increased by the + * a buffered switch, then the parent node's downstream capacitance is increased by the * value of the parent switch's internal capacitance in the if statement below. * Correspondingly, the ancestors' downstream capacitance will be updated by the same * value through the while loop. 
Otherwise, if the parent switch is unbuffered, then @@ -301,6 +305,8 @@ RouteTree::update_unbuffered_ancestors_C_downstream(RouteTreeNode& from_node) { if (rr_graph.rr_switch_inf(iswitch).buffered() == true) { C_downstream_addition = rr_graph.rr_switch_inf(iswitch).Cinternal; + if(C_downstream_addition == 0) /* This switch has Cinternal = 0, no need to update parent */ + return from_node; last_node = parent_rt_node; last_node->C_downstream += C_downstream_addition; parent_rt_node = last_node->_parent; diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index 4e2274c406f..51d5a21d972 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -72,7 +72,8 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_no false, std::unordered_map()); std::tie(found_path, std::ignore, cheapest) = router_.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -144,7 +145,7 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src is_flat); RouterStats router_stats; ConnectionParameters conn_params(ParentNetId::INVALID(), OPEN, false, std::unordered_map()); - vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree.root(), + vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree, cost_params, bounding_box, router_stats, diff --git a/vpr/src/route/spatial_route_tree_lookup.cpp b/vpr/src/route/spatial_route_tree_lookup.cpp index 3d3f7a25460..e03fe8f291e 100644 --- a/vpr/src/route/spatial_route_tree_lookup.cpp +++ b/vpr/src/route/spatial_route_tree_lookup.cpp @@ -48,7 +48,7 @@ void update_route_tree_spatial_lookup_recur(const RouteTreeNode& rt_node, Spatia // // TODO: Depending on bin size, long wires may end up being added only to bins at // their start/end and may pass through bins along their length to which they - // are not added. If this becomes an issues, reconsider how we add nodes to + // are not added. If this becomes an issue, reconsider how we add nodes to // bins if (bin_xhigh != bin_xlow || bin_yhigh != bin_ylow) { spatial_lookup[bin_xhigh][bin_yhigh].push_back(rt_node); diff --git a/vpr/src/route/virtual_net.h b/vpr/src/route/virtual_net.h new file mode 100644 index 00000000000..4c0cff5e4ba --- /dev/null +++ b/vpr/src/route/virtual_net.h @@ -0,0 +1,21 @@ +#pragma once + +#include "netlist_fwd.h" +#include "route_tree_fwd.h" +#include "vpr_types.h" + +/** A net decomposed by routing a connection through the partitioning + * cutline and dividing the bounding box into two. When routing, the connection + * router will receive a smaller-than-usual bounding box and will have to + * filter the existing routing spatially. */ +class VirtualNet { + public: + /** The net in question. */ + ParentNetId net_id; + /** Clipped bounding box. This is needed to enable decomposing a net multiple times. + * Otherwise we would need a history of side types and cutlines to compute the bbox. */ + t_bb clipped_bb; + /** Times decomposed -- don't decompose vnets too deeply or + * it disturbs net ordering when it's eventually disabled & creates a runtime bump. 
diff --git a/vpr/src/timing/NetPinTimingInvalidator.h b/vpr/src/timing/NetPinTimingInvalidator.h
index f452b95bd7a..ded51f11560 100644
--- a/vpr/src/timing/NetPinTimingInvalidator.h
+++ b/vpr/src/timing/NetPinTimingInvalidator.h
@@ -11,9 +11,8 @@
 # include <tbb/concurrent_unordered_set.h>
 #endif
 
-/** Make NetPinTimingInvalidator a virtual class since it does nothing for the general case of non-incremental
- * timing updates. It should really be templated to not pay the cost for vtable lookups, but this is the
- * best approach without putting a template on every function which uses this machine. */
+/** Adapter code to tell TimingInfo about invalidated connections. Can be no-op in
+ * the case of full timing updates. */
 class NetPinTimingInvalidator {
   public:
     typedef vtr::Range<const tatum::EdgeId*> tedge_range;
@@ -83,19 +82,13 @@ class IncrNetPinTimingInvalidator : public NetPinTimingInvalidator {
      * driving the specified pin.
      * Is concurrently safe. */
     void invalidate_connection(ParentPinId pin, TimingInfo* timing_info) {
-        if (invalidated_pins_.count(pin)) return; //Already invalidated
-
         for (tatum::EdgeId edge : pin_timing_edges(pin)) {
             timing_info->invalidate_delay(edge);
         }
-
-        invalidated_pins_.insert(pin);
     }
 
-    /** Resets invalidation state for this class
-     * Not concurrently safe! */
+    /** Resets invalidation state for this class (no-op) */
     void reset() {
-        invalidated_pins_.clear();
     }
 
   private:
@@ -129,14 +122,6 @@ class IncrNetPinTimingInvalidator : public NetPinTimingInvalidator {
   private:
     std::vector<int> pin_first_edge_; //Indices into timing_edges corresponding
    std::vector<tatum::EdgeId> timing_edges_;
-
-    /** Cache for invalidated pins. Use concurrent set when TBB is turned on, since the
-     * invalidator may be shared between threads */
-#ifdef VPR_USE_TBB
-    tbb::concurrent_unordered_set<ParentPinId> invalidated_pins_;
-#else
-    vtr::vec_id_set<ParentPinId> invalidated_pins_;
-#endif
 };
 
 /** NetPinTimingInvalidator is only a rube goldberg machine when incremental timing analysis
@@ -155,7 +140,8 @@ class NoopNetPinTimingInvalidator : public NetPinTimingInvalidator {
     }
 };
 
-/** Make a NetPinTimingInvalidator depending on update_type. Will return a NoopInvalidator if it's not INCREMENTAL. */
+/** Make a NetPinTimingInvalidator depending on update_type. Will return a NoopInvalidator
+ * if it's not INCREMENTAL or AUTO (adaptive). */
 inline std::unique_ptr<NetPinTimingInvalidator> make_net_pin_timing_invalidator(
     e_timing_update_type update_type,
     const Netlist<>& net_list,
@@ -164,10 +150,10 @@ inline std::unique_ptr<NetPinTimingInvalidator> make_net_pin_timing_invalidator(
     const AtomLookup& atom_lookup,
     const tatum::TimingGraph& timing_graph,
     bool is_flat) {
-    if (update_type == e_timing_update_type::FULL || update_type == e_timing_update_type::AUTO) {
+    if (update_type == e_timing_update_type::FULL) {
         return std::make_unique<NoopNetPinTimingInvalidator>();
     } else {
-        VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL);
+        VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL || update_type == e_timing_update_type::AUTO);
         return std::make_unique<IncrNetPinTimingInvalidator>(net_list, clb_atom_pin_lookup, atom_nlist, atom_lookup, timing_graph, is_flat);
     }
-}
\ No newline at end of file
+}
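For orientation, the factory above is the only entry point routing code needs. A hedged usage sketch follows; it is not from the patch, the caller is assumed to already own the netlist and lookup objects in the factory's signature, and only names visible in this header are used.

    #include "NetPinTimingInvalidator.h"

    /* Sketch: build an invalidator and flag one rerouted connection as stale.
     * With AUTO, this patch now hands back the incremental invalidator, so the
     * analyzer only reconsiders the affected timing edges. */
    static void invalidate_one_pin(const Netlist<>& net_list,
                                   const ClusteredPinAtomPinsLookup& clb_atom_pin_lookup,
                                   const AtomNetlist& atom_nlist,
                                   const AtomLookup& atom_lookup,
                                   const tatum::TimingGraph& timing_graph,
                                   bool is_flat,
                                   ParentPinId pin,
                                   TimingInfo* timing_info) {
        auto invalidator = make_net_pin_timing_invalidator(
            e_timing_update_type::AUTO, net_list, clb_atom_pin_lookup,
            atom_nlist, atom_lookup, timing_graph, is_flat);
        invalidator->invalidate_connection(pin, timing_info);
        invalidator->reset(); // a no-op after this patch, kept for the interface
    }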
diff --git a/vpr/src/timing/concrete_timing_info.h b/vpr/src/timing/concrete_timing_info.h
index 9aaae0d82ff..d733c6c03c4 100644
--- a/vpr/src/timing/concrete_timing_info.h
+++ b/vpr/src/timing/concrete_timing_info.h
@@ -1,6 +1,7 @@
 #ifndef VPR_CONCRETE_TIMING_INFO_H
 #define VPR_CONCRETE_TIMING_INFO_H
 
+#include "tatum/analyzer_factory.hpp"
 #include "vtr_log.h"
 #include "timing_info.h"
 #include "timing_util.h"
@@ -490,8 +491,10 @@ std::unique_ptr<SetupHoldTimingInfo> make_setup_hold_timing_info(std::shared_ptr
     auto& timing_ctx = g_vpr_ctx.timing();
 
     std::shared_ptr<tatum::SetupHoldTimingAnalyzer> analyzer;
-    if (update_type == e_timing_update_type::FULL || update_type == e_timing_update_type::AUTO) {
+    if (update_type == e_timing_update_type::FULL) {
         analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
+    } else if (update_type == e_timing_update_type::AUTO) { /* Create adaptive analyzer */
+        analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis, tatum::SerialWalker, tatum::SerialIncrWalker>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
     } else {
         VTR_ASSERT(update_type == e_timing_update_type::INCREMENTAL);
         analyzer = tatum::AnalyzerFactory<tatum::SetupHoldAnalysis, tatum::SerialIncrWalker>::make(*timing_ctx.graph, *timing_ctx.constraints, *delay_calculator);
diff --git a/vpr/src/timing/net_delay.cpp b/vpr/src/timing/net_delay.cpp
index 5420c197769..d5d1ce52152 100644
--- a/vpr/src/timing/net_delay.cpp
+++ b/vpr/src/timing/net_delay.cpp
@@ -45,13 +45,13 @@ static void load_one_constant_net_delay(const Netlist<>& net_list,
                                         float delay_value);
 
 /*************************** Subroutine definitions **************************/
 
-void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>& net_delay) {
-    /* This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry   *
-     * is the Elmore delay from the net source to the appropriate sink. Both       *
-     * the rr_graph and the routing traceback must be completely constructed       *
-     * before this routine is called, and the net_delay array must have been       *
-     * allocated.                                                                  */
+/** This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry
+ * is the Elmore delay from the net source to the appropriate sink. Both
+ * the rr_graph and the routing traceback must be completely constructed
+ * before this routine is called, and the net_delay array must have been
+ * allocated. */
+void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>& net_delay) {
     for (auto net_id : net_list.nets()) {
         if (net_list.net_is_ignored(net_id)) {
             load_one_constant_net_delay(net_list, net_delay, net_id, 0.);
@@ -61,18 +61,17 @@ void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix<float>
     }
 }
 
+/** This routine loads delay values for one net in
+ * net_delay[net_id][1..num_pins-1]. First, from the traceback, it
+ * constructs the route tree and computes its values for R, C, and Tdel.
+ * Next, it walks the route tree recursively, storing the time delays for
+ * each sink into the map ipin_to_Tdel. Then, while looping through the
+ * net_delay array we search for the pin index in the map, and
+ * correspondingly update the entry in net_delay. Finally, it frees the
+ * route tree and clears the ipin_to_Tdel_map associated with that net. */
 static void load_one_net_delay(const Netlist<>& net_list,
                                NetPinsMatrix<float>& net_delay,
                                ParentNetId net_id) {
-    /* This routine loads delay values for one net in                               *
-     * net_delay[net_id][1..num_pins-1]. First, from the traceback, it              *
-     * constructs the route tree and computes its values for R, C, and Tdel.        *
-     * Next, it walks the route tree recursively, storing the time delays for       *
-     * each sink into the map ipin_to_Tdel. Then, while looping through the         *
-     * net_delay array we search for the pin index in the map, and                  *
-     * correspondingly update the entry in net_delay. Finally, it frees the         *
-     * route tree and clears the ipin_to_Tdel_map associated with that net.         */
-
     auto& route_ctx = g_vpr_ctx.mutable_routing();
 
     if (!route_ctx.route_trees[net_id]) {
@@ -92,9 +91,9 @@ static void load_one_net_delay(const Netlist<>& net_list,
     ipin_to_Tdel_map.clear(); // clear the map
 }
 
+/** This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes
+ * into the map. */
 static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId net_id) {
-    /* This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes *
-     * into the map.                                                                                 */
     if (rt_node.net_pin_index != OPEN) { // value of OPEN indicates a non-SINK
         ipin_to_Tdel_map[rt_node.net_pin_index] = rt_node.Tdel; // add to the map, process current sink-type node
     }
@@ -104,12 +103,11 @@ static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId
     }
 }
 
+/** Sets each entry of the net_delay array for net inet to delay_value. */
 static void load_one_constant_net_delay(const Netlist<>& net_list,
                                         NetPinsMatrix<float>& net_delay,
                                         ParentNetId net_id,
                                         float delay_value) {
-    /* Sets each entry of the net_delay array for net inet to delay_value. */
-
     for (unsigned int ipin = 1; ipin < net_list.net_pins(net_id).size(); ipin++)
         net_delay[net_id][ipin] = delay_value;
 }
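As background for the delays loaded above: the Elmore delay to a sink in an RC tree is the sum, over every resistance on the source-to-sink path, of that resistance times the total capacitance downstream of it. The snippet below is a generic, self-contained illustration of that formula on a hand-built tree; RCNode and both helpers are invented for the example, and this is not VPR's route-tree code.

    #include <vector>

    struct RCNode {
        float R;                   // resistance of the edge feeding this node
        float C;                   // capacitance at this node
        std::vector<int> children; // indices into the tree vector
    };

    /* Total capacitance hanging at or below node i. */
    static float c_downstream(const std::vector<RCNode>& tree, int i) {
        float c = tree[i].C;
        for (int child : tree[i].children)
            c += c_downstream(tree, child);
        return c;
    }

    /* Elmore delay along `path` (node indices from the source, whose R is 0,
     * down to a sink): the sum of R_k * C_downstream(k). */
    static float elmore_delay(const std::vector<RCNode>& tree, const std::vector<int>& path) {
        float tdel = 0.0f;
        for (int k : path)
            tdel += tree[k].R * c_downstream(tree, k);
        return tdel;
    }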
diff --git a/vpr/test/test_connection_router.cpp b/vpr/test/test_connection_router.cpp
index 6c1b54734e3..c28fb66694d 100644
--- a/vpr/test/test_connection_router.cpp
+++ b/vpr/test/test_connection_router.cpp
@@ -68,7 +68,8 @@ static float do_one_route(RRNodeId source_node,
                                      -1,
                                      false,
                                      std::unordered_map<RRNodeId, int>());
-    std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(),
+    std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree,
+                                                                                                        source_node,
                                                                                                         sink_node,
                                                                                                         cost_params,
                                                                                                         bounding_box,
diff --git a/vpr/test/test_net_decomp.cpp b/vpr/test/test_net_decomp.cpp
new file mode 100644
index 00000000000..12a0a969f4d
--- /dev/null
+++ b/vpr/test/test_net_decomp.cpp
@@ -0,0 +1,30 @@
+#include "catch2/catch_test_macros.hpp"
+#include "catch2/matchers/catch_matchers_all.hpp"
+
+#include "route_samplers.h"
+
+namespace {
+
+TEST_CASE("test_convex_hull", "[vpr]") {
+    /* Smoke test for the convex hull algorithm in the sampler */
+    std::vector<SinkPoint> points1 {
+        {0, 0, 0}, {0, 1, 0}, {1, 1, 0}
+    };
+    std::vector<SinkPoint> expected_hull1(points1);
+    std::vector<SinkPoint> hull1 = quickhull(points1);
+    REQUIRE_THAT(hull1, Catch::Matchers::UnorderedEquals(expected_hull1));
+
+    std::vector<SinkPoint> points2 {
+        {113,148,0}, {113,143,0}, {113,145,0}, {114,146,0}, {111,138,0}, {110,139,0},
+        {112,138,0}, {108,146,0}, {111,145,0}, {103,142,0}, {103,148,0}, {116,142,0},
+        {116,141,0}, {110,148,0}, {106,146,0}
+    };
+    std::vector<SinkPoint> expected_hull2 {
+        {111,138,0}, {116,141,0}, {112,138,0}, {103,148,0}, {103,142,0}, {116,142,0},
+        {113,148,0}
+    };
+    std::vector<SinkPoint> hull2 = quickhull(points2);
+    REQUIRE_THAT(hull2, Catch::Matchers::UnorderedEquals(expected_hull2));
+}
+
+} // namespace
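The smoke test above exercises quickhull() from route_samplers.h, whose implementation is not shown in this excerpt. For reference, a self-contained 2D quickhull that matches the test's conventions (a strict hull, so collinear boundary points such as {110,148,0} are dropped) could look like the sketch below; Point and every name here are illustrative stand-ins, not the patch's actual sampler types.

    #include <algorithm>
    #include <tuple>
    #include <vector>

    struct Point { int x, y, isink; }; // stand-in for the sampler's point type

    /* Twice the signed area of (a, b, c): positive if c is left of a->b. */
    static long long cross(const Point& a, const Point& b, const Point& c) {
        return (long long)(b.x - a.x) * (c.y - a.y) - (long long)(b.y - a.y) * (c.x - a.x);
    }

    /* Emit hull vertices strictly left of a->b, ending with b. */
    static void hull_side(const std::vector<Point>& pts, const Point& a, const Point& b,
                          std::vector<Point>& out) {
        long long best_d = 0;
        int best = -1;
        for (size_t i = 0; i < pts.size(); ++i) {
            long long d = cross(a, b, pts[i]);
            if (d > best_d) { best_d = d; best = (int)i; }
        }
        if (best < 0) { // nothing outside this edge: b is the next hull vertex
            out.push_back(b);
            return;
        }
        hull_side(pts, a, pts[best], out);
        hull_side(pts, pts[best], b, out);
    }

    static std::vector<Point> quickhull_sketch(const std::vector<Point>& pts) {
        if (pts.size() < 3) return pts;
        auto cmp = [](const Point& p, const Point& q) { return std::tie(p.x, p.y) < std::tie(q.x, q.y); };
        const Point lo = *std::min_element(pts.begin(), pts.end(), cmp);
        const Point hi = *std::max_element(pts.begin(), pts.end(), cmp);
        std::vector<Point> hull;
        hull_side(pts, lo, hi, hull); // vertices above the lo->hi chord
        hull_side(pts, hi, lo, hull); // vertices below it
        return hull;
    }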
""" - tokens = [w.strip() for w in inp.split(",")] + tokens = [w.strip() for w in x.split(",")] tokens = [w for w in tokens if len(w)] out = [] for w in tokens: r = re.fullmatch(r"(\w+):(\w+)", w) if not r: - raise argparse.ArgumentTypeError("Invalid input to -use_previous: %s" % w) + raise argparse.ArgumentError("Invalid input to -use_previous: %s" % w) if not REUSABLE_FILES.get(r.group(2)): - raise argparse.ArgumentTypeError( + raise argparse.ArgumentError( "Unknown file type to use_previous: %s, available types: %s" % (r.group(2), ",".join(REUSABLE_FILES.keys())) )