From a53cdea71af62c7675f49bd1c63d893fb3f2c2c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fahrican=20Ko=C5=9Far?= Date: Thu, 24 Mar 2022 19:21:19 -0400 Subject: [PATCH] dump net decomposition code --- .github/scripts/hostsetup.sh | 3 +- dev/pylint_check.py | 3 + libs/librrgraph/src/base/rr_graph_storage.h | 5 - libs/librrgraph/src/io/rr_graph_reader.cpp | 4 +- libs/libvtrutil/src/vtr_math.h | 12 + libs/libvtrutil/src/vtr_util.cpp | 24 +- libs/libvtrutil/src/vtr_util.h | 3 +- utils/route_diag/src/main.cpp | 17 +- vpr/CMakeLists.txt | 5 + vpr/src/base/SetupVPR.cpp | 3 + vpr/src/base/vpr_api.cpp | 9 +- vpr/src/base/vpr_types.h | 3 + vpr/src/route/connection_based_routing.cpp | 10 +- vpr/src/route/connection_based_routing.h | 30 - vpr/src/route/connection_router.cpp | 69 +- vpr/src/route/connection_router.h | 16 +- vpr/src/route/connection_router_interface.h | 8 +- vpr/src/route/partition_tree.cpp | 159 +-- vpr/src/route/partition_tree.h | 67 +- vpr/src/route/route_common.cpp | 12 +- vpr/src/route/route_common.h | 29 +- vpr/src/route/route_parallel.cpp | 951 +++++++++++++++--- vpr/src/route/route_timing.cpp | 370 +++---- vpr/src/route/route_timing.h | 123 ++- vpr/src/route/route_tree.cpp | 72 +- vpr/src/route/route_tree.h | 121 ++- vpr/src/route/router_delay_profiling.cpp | 5 +- vpr/src/route/spatial_route_tree_lookup.cpp | 2 +- vpr/src/route/virtual_net.h | 18 + vpr/src/timing/net_delay.cpp | 36 +- vtr_flow/scripts/python_libs/vtr/__init__.py | 4 +- vtr_flow/scripts/python_libs/vtr/flow.py | 7 + vtr_flow/scripts/python_libs/vtr/task.py | 48 +- vtr_flow/scripts/python_libs/vtr/util.py | 56 +- vtr_flow/scripts/python_libs/vtr/vpr/vpr.py | 12 +- vtr_flow/scripts/run_vtr_flow.py | 13 +- vtr_flow/scripts/run_vtr_task.py | 68 +- .../vtr_reg_strong/koios/config/config.txt | 4 +- .../koios/config/golden_results.txt | 5 +- .../strong_flat_router/config/config.txt | 5 +- .../config/golden_results.txt | 5 +- .../strong_multiclock/config/config.txt | 5 +- 
.../config/golden_results.txt | 1 + .../strong_timing/config/config.txt | 5 +- .../strong_timing/config/golden_results.txt | 5 +- .../config/config.txt | 2 + .../config/golden_results.txt | 12 +- 47 files changed, 1754 insertions(+), 692 deletions(-) create mode 100644 vpr/src/route/virtual_net.h diff --git a/.github/scripts/hostsetup.sh b/.github/scripts/hostsetup.sh index 48f56a066a9..a136f61a43e 100755 --- a/.github/scripts/hostsetup.sh +++ b/.github/scripts/hostsetup.sh @@ -69,7 +69,8 @@ apt install -y \ default-jdk \ g++-9 \ gcc-9 \ - wget + wget \ + libtbb-dev # installing the latest version of cmake apt install -y apt-transport-https ca-certificates gnupg diff --git a/dev/pylint_check.py b/dev/pylint_check.py index 0231480746b..17872b1b82a 100755 --- a/dev/pylint_check.py +++ b/dev/pylint_check.py @@ -216,6 +216,9 @@ def main(): cmd = ["pylint", path, "-s", "n"] if ignore_list: cmd.append("--disable=" + ",".join(ignore_list)) + # Don't object to single-letter variable names (that's not in PEP8) + # see https://stackoverflow.com/q/21833872 + cmd.append('--variable-rgx=[a-z][a-z0-9_]{1,30}$') # Run pylint and check output process = subprocess.run(cmd, check=False, stdout=subprocess.PIPE) diff --git a/libs/librrgraph/src/base/rr_graph_storage.h b/libs/librrgraph/src/base/rr_graph_storage.h index 09d80264645..950c51f4ffa 100644 --- a/libs/librrgraph/src/base/rr_graph_storage.h +++ b/libs/librrgraph/src/base/rr_graph_storage.h @@ -631,11 +631,6 @@ class t_rr_graph_storage { static inline Direction get_node_direction( vtr::array_view_id node_storage, RRNodeId id) { - auto& node_data = node_storage[id]; - if (node_data.type_ != CHANX && node_data.type_ != CHANY) { - VTR_LOG_ERROR("Attempted to access RR node 'direction' for non-channel type '%s'", - rr_node_typename[node_data.type_]); - } return node_storage[id].dir_side_.direction; } diff --git a/libs/librrgraph/src/io/rr_graph_reader.cpp b/libs/librrgraph/src/io/rr_graph_reader.cpp index 16a340c08d5..a62f41d84d9 
100644 --- a/libs/librrgraph/src/io/rr_graph_reader.cpp +++ b/libs/librrgraph/src/io/rr_graph_reader.cpp @@ -1,4 +1,4 @@ -/*This function loads in a routing resource graph written in xml format +/* This function loads in a routing resource graph written in xml format * into vpr when the option --read_rr_graph is specified. * When it is not specified the build_rr_graph function is then called. * This is done using the libpugixml library. This is useful @@ -11,7 +11,7 @@ * to ensure it matches. An error will through if any feature does not match. * Other elements such as edges, nodes, and switches * are overwritten by the rr graph file if one is specified. If an optional - * identifier such as capacitance is not specified, it is set to 0*/ + * identifier such as capacitance is not specified, it is set to 0 */ #include "rr_graph_reader.h" diff --git a/libs/libvtrutil/src/vtr_math.h b/libs/libvtrutil/src/vtr_math.h index 74b4ccebf58..199b15ac71b 100644 --- a/libs/libvtrutil/src/vtr_math.h +++ b/libs/libvtrutil/src/vtr_math.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "vtr_assert.h" @@ -163,6 +164,17 @@ bool isclose(T a, T b) { return isclose(a, b, DEFAULT_REL_TOL, DEFAULT_ABS_TOL); } +/** Log2, round down. 
+ * From https://stackoverflow.com/a/51351885 */ +static inline uint64_t log2_floor(uint64_t x) { + return 63U - __builtin_clzl(x); +} + +/** Log2, round up */ +static inline uint64_t log2_ceil(uint64_t x) { + return log2_floor(x - 1) + 1; +} + } // namespace vtr #endif diff --git a/libs/libvtrutil/src/vtr_util.cpp b/libs/libvtrutil/src/vtr_util.cpp index 45ee3035883..2a7a247bde1 100644 --- a/libs/libvtrutil/src/vtr_util.cpp +++ b/libs/libvtrutil/src/vtr_util.cpp @@ -2,6 +2,7 @@ #include #include //For errno #include +#include #include #include @@ -455,28 +456,15 @@ bool file_exists(const char* filename) { return false; } -/* Date:July 17th, 2013 - * Author: Daniel Chen */ /** * @brief Checks the file extension of an file to ensure correct file format. * - * Returns true if format is correct, and false otherwise. - * @note This is probably a fragile check, but at least should - * prevent common problems such as swapping architecture file - * and blif file on the VPR command line. + * Returns true if the extension is correct, and false otherwise. 
*/ -bool check_file_name_extension(const char* file_name, - const char* file_extension) { - const char* str; - int len_extension; - - len_extension = std::strlen(file_extension); - str = std::strstr(file_name, file_extension); - if (str == nullptr || (*(str + len_extension) != '\0')) { - return false; - } - - return true; +bool check_file_name_extension(std::string file_name, + std::string file_extension) { + auto ext = std::filesystem::path(file_name).extension(); + return ext == file_extension; } /** diff --git a/libs/libvtrutil/src/vtr_util.h b/libs/libvtrutil/src/vtr_util.h index 08562d3d092..edcb7ba8598 100644 --- a/libs/libvtrutil/src/vtr_util.h +++ b/libs/libvtrutil/src/vtr_util.h @@ -69,8 +69,7 @@ double atod(const std::string& value); */ int get_file_line_number_of_last_opened_file(); bool file_exists(const char* filename); -bool check_file_name_extension(const char* file_name, - const char* file_extension); +bool check_file_name_extension(std::string file_name, std::string file_extension); extern std::string out_file_prefix; diff --git a/utils/route_diag/src/main.cpp b/utils/route_diag/src/main.cpp index 571c17c30e6..7f4d50eef28 100644 --- a/utils/route_diag/src/main.cpp +++ b/utils/route_diag/src/main.cpp @@ -117,13 +117,16 @@ static void do_one_route(const Netlist<>& net_list, -1, false, std::unordered_map()); - std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - true); + std::tie(found_path, std::ignore, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree.root(), + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + true + ); if (found_path) { VTR_ASSERT(cheapest.index == sink_node); diff --git a/vpr/CMakeLists.txt b/vpr/CMakeLists.txt index 1568ff0547f..1ab5e2861e9 100644 --- a/vpr/CMakeLists.txt +++ b/vpr/CMakeLists.txt @@ -47,6 +47,11 @@ 
if(${VTR_ENABLE_CAPNPROTO}) add_definitions("-DVTR_ENABLE_CAPNPROTO") endif() +if(${VPR_DEBUG_PARTITION_TREE}) + message(STATUS "VPR: Partition tree debug logs: enabled") + add_definitions("-DVPR_DEBUG_PARTITION_TREE") +endif() + #Create the library add_library(libvpr STATIC ${LIB_HEADERS} diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index 18306db87a7..6322c1feb9b 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -281,6 +281,9 @@ void SetupVPR(const t_options* Options, /* Set seed for pseudo-random placement, default seed to 1 */ vtr::srandom(PlacerOpts->seed); + /* Make num_workers available to the router */ + RouterOpts->num_workers = vpr_setup->num_workers; + { vtr::ScopedStartFinishTimer t("Building complex block graph"); alloc_and_load_all_pb_graphs(PowerOpts->do_power, RouterOpts->flat_routing); diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index c87d2bec7fc..d058f5352a0 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -217,12 +217,11 @@ void vpr_init_with_options(const t_options* options, t_vpr_setup* vpr_setup, t_a #ifdef VPR_USE_TBB //Using Thread Building Blocks if (num_workers == 0) { - //Use default concurrency (i.e. maximum conccurency) + //Use default concurrency (i.e. 
maximum concurrency) num_workers = tbb::this_task_arena::max_concurrency(); } VTR_LOG("Using up to %zu parallel worker(s)\n", num_workers); - tbb::global_control c(tbb::global_control::max_allowed_parallelism, num_workers); #else //No parallel execution support if (num_workers != 1) { @@ -237,6 +236,7 @@ void vpr_init_with_options(const t_options* options, t_vpr_setup* vpr_setup, t_a vpr_setup->clock_modeling = options->clock_modeling; vpr_setup->two_stage_clock_routing = options->two_stage_clock_routing; vpr_setup->exit_before_pack = options->exit_before_pack; + vpr_setup->num_workers = num_workers; VTR_LOG("\n"); VTR_LOG("Architecture file: %s\n", options->ArchFile.value().c_str()); @@ -366,6 +366,10 @@ bool vpr_flow(t_vpr_setup& vpr_setup, t_arch& arch) { return true; } + /* Set this here, because tbb::global_control doesn't control anything once it's out of scope + * (contrary to the name). */ + tbb::global_control c(tbb::global_control::max_allowed_parallelism, vpr_setup.num_workers); + { //Pack bool pack_success = vpr_pack_flow(vpr_setup, arch); @@ -912,6 +916,7 @@ RouteStatus vpr_route_fixed_W(const Netlist<>& net_list, if (NO_FIXED_CHANNEL_WIDTH == fixed_channel_width || fixed_channel_width <= 0) { VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Fixed channel width must be specified when routing at fixed channel width (was %d)", fixed_channel_width); } + bool status = false; status = try_route(net_list, fixed_channel_width, diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 5d03e194f8a..d145908637e 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1390,6 +1390,8 @@ struct t_router_opts { bool flat_routing; bool has_choking_spot; + size_t num_workers; + // Options related to rr_node reordering, for testing and possible cache optimization e_rr_node_reorder_algorithm reorder_rr_graph_nodes_algorithm = DONT_REORDER; int reorder_rr_graph_nodes_threshold = 0; @@ -1790,6 +1792,7 @@ struct t_vpr_setup { e_clock_modeling clock_modeling; 
///> remaining_targets; - - /** Holds RRNodeIds of legally reached sinks. Used to build the external rt_node_to_sink - * lookup. (should be moved into RouteTree)*/ - vtr::vector> reached_rt_sinks; - public: Connection_based_routing_resources(const Netlist<>& net_list, const vtr::vector>& net_terminals, bool is_flat); - // adding to the resources when they are reached during pruning - // mark rr sink node as something that still needs to be reached - void toreach_rr_sink(ParentNetId net_id, int rr_sink_node) { - remaining_targets[net_id].push_back(rr_sink_node); - } - // mark rt sink node as something that has been legally reached - void reached_rt_sink(ParentNetId net_id, RRNodeId rt_sink) { - reached_rt_sinks[net_id].push_back(rt_sink); - } - - // get a handle on the resources - std::vector& get_remaining_targets(ParentNetId net_id) { - return remaining_targets[net_id]; - } - std::vector& get_reached_rt_sinks(ParentNetId net_id) { - return reached_rt_sinks[net_id]; - } bool sanity_check_lookup() const; @@ -87,11 +62,6 @@ class Connection_based_routing_resources { //Updates the connection delay lower bound (if less than current best found) void update_lower_bound_connection_delay(ParentNetId net, int ipin, float delay); - void prepare_routing_for_net(ParentNetId net_id) { - remaining_targets[net_id].clear(); - reached_rt_sinks[net_id].clear(); - } - // get a handle on the resources float get_stable_critical_path_delay() const { return last_stable_critical_path_delay; } diff --git a/vpr/src/route/connection_router.cpp b/vpr/src/route/connection_router.cpp index 62db70ed31f..758271cc6ef 100644 --- a/vpr/src/route/connection_router.cpp +++ b/vpr/src/route/connection_router.cpp @@ -1,4 +1,5 @@ #include "connection_router.h" +#include "route_common.h" #include "rr_graph.h" #include "binary_heap.h" @@ -61,7 +62,8 @@ inline void update_router_stats(const DeviceContext& device_ctx, /** return tuple */ template std::tuple 
ConnectionRouter::timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -73,7 +75,7 @@ std::tuple ConnectionRouter::timing_driven_route_conne bool retry = false; t_heap* cheapest; - std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(rt_root, sink_node, cost_params, bounding_box, can_grow_bb); + std::tie(retry, cheapest) = timing_driven_route_connection_common_setup(tree, source_node, sink_node, cost_params, bounding_box, can_grow_bb); if (cheapest != nullptr) { rcv_path_manager.update_route_tree_set(cheapest->path_data); @@ -94,7 +96,8 @@ std::tuple ConnectionRouter::timing_driven_route_conne /** Return */ template std::tuple ConnectionRouter::timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -102,13 +105,12 @@ std::tuple ConnectionRouter::timing_driven_route_connection //Re-add route nodes from the existing route tree to the heap. //They need to be repushed onto the heap since each node's cost is target specific. 
- add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); + VTR_LOG("Bounding box: %d,%dx%d,%d\n", bounding_box.xmin, bounding_box.ymin, bounding_box.xmax, bounding_box.ymax); return std::make_tuple(false, nullptr); } @@ -172,7 +174,7 @@ std::tuple ConnectionRouter::timing_driven_route_connection //Re-initialize the heap since it was emptied by the previous call to //timing_driven_route_connection_from_heap() - add_route_tree_to_heap(rt_root, sink_node, cost_params, false); + add_route_tree_to_heap(tree.root(), sink_node, full_device_bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything //Try finding the path again with the relaxed bounding box @@ -196,7 +198,8 @@ std::tuple ConnectionRouter::timing_driven_route_connection // Returns a tuple of */ template std::tuple ConnectionRouter::timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb net_bounding_box, @@ -210,25 +213,24 @@ std::tuple ConnectionRouter::timing_driven_route_conne // re-explore route tree from root to add any new nodes (buildheap afterwards) // route tree needs to be repushed onto the heap since each node's cost is target specific router_stats_->add_high_fanout_rt++; - t_bb high_fanout_bb = add_high_fanout_route_tree_to_heap(rt_root, sink_node, cost_params, spatial_rt_lookup, net_bounding_box); + add_high_fanout_route_tree_to_heap(tree.root(), sink_node, cost_params, spatial_rt_lookup, net_bounding_box); heap_.build_heap(); - RRNodeId source_node = rt_root.inode; - if (heap_.is_empty_heap()) { 
VTR_LOG("No source in route tree: %s\n", describe_unrouteable_connection(source_node, sink_node, is_flat_).c_str()); return std::make_tuple(false, false, t_heap()); } VTR_LOGV_DEBUG(router_debug_, " Routing to %d as high fanout net (BB: %d,%d x %d,%d)\n", sink_node, - high_fanout_bb.xmin, high_fanout_bb.ymin, - high_fanout_bb.xmax, high_fanout_bb.ymax); + net_bounding_box.xmin, net_bounding_box.ymin, + net_bounding_box.xmax, net_bounding_box.ymax); + /* TODO: Check if this change (high fanout bb -> bb) is OK */ bool retry_with_full_bb = false; t_heap* cheapest; cheapest = timing_driven_route_connection_from_heap(sink_node, cost_params, - high_fanout_bb); + net_bounding_box); if (cheapest == nullptr) { //Found no path, that may be due to an unlucky choice of existing route tree sub-set, @@ -240,11 +242,13 @@ std::tuple ConnectionRouter::timing_driven_route_conne reset_path_costs(); modified_rr_node_inf_.clear(); - std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup(rt_root, - sink_node, - cost_params, - net_bounding_box, - can_grow_bb); + std::tie(retry_with_full_bb, cheapest) = timing_driven_route_connection_common_setup( + tree, + source_node, + sink_node, + cost_params, + net_bounding_box, + can_grow_bb); } if (cheapest == nullptr) { @@ -276,8 +280,6 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI const t_conn_cost_params cost_params, t_bb bounding_box) { VTR_ASSERT_SAFE(heap_.is_valid()); - //std::cout << "using this: " << (void *)this << "\n"; - //std::cout << "using heap: " << heap_.get_ptr() << "\n"; if (heap_.is_empty_heap()) { //No source VTR_LOGV_DEBUG(router_debug_, " Initial heap empty (no source)\n"); @@ -339,7 +341,7 @@ t_heap* ConnectionRouter::timing_driven_route_connection_from_heap(RRNodeI // Find shortest paths from specified route tree to all nodes in the RR graph template vtr::vector ConnectionRouter::timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& 
rt_root, + const RouteTree& tree, const t_conn_cost_params cost_params, t_bb bounding_box, RouterStats& router_stats, @@ -349,7 +351,7 @@ vtr::vector ConnectionRouter::timing_driven_find_all_sho // Add the route tree to the heap with no specific target node RRNodeId target_node = RRNodeId::INVALID(); - add_route_tree_to_heap(rt_root, target_node, cost_params, false); + add_route_tree_to_heap(tree.root(), target_node, bounding_box, cost_params, false); heap_.build_heap(); // via sifting down everything auto res = timing_driven_find_all_shortest_paths_from_heap(cost_params, bounding_box); @@ -910,16 +912,16 @@ void ConnectionRouter::empty_heap_annotating_node_route_inf() { //Adds the route tree rooted at rt_node to the heap, preparing it to be //used as branch-points for further routing. +/* Puts the entire partial routing below and including rt_node onto the heap + * (except for those parts marked as not to be expanded) by calling itself + * recursively. */ template void ConnectionRouter::add_route_tree_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, + const t_bb& bounding_box, const t_conn_cost_params cost_params, bool from_high_fanout) { - /* Puts the entire partial routing below and including rt_node onto the heap * - * (except for those parts marked as not to be expanded) by calling itself * - * recursively. 
*/ - if (from_high_fanout) { router_stats_->add_all_rt_from_high_fanout++; } else { @@ -931,6 +933,7 @@ void ConnectionRouter::add_route_tree_to_heap( if (rt_node.re_expand) { add_route_tree_node_to_heap(rt_node, target_node, + bounding_box, cost_params, false); } @@ -942,12 +945,14 @@ void ConnectionRouter::add_route_tree_to_heap( target_node)) { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } } else { add_route_tree_to_heap(child_node, target_node, + bounding_box, cost_params, from_high_fanout); } @@ -962,6 +967,7 @@ template void ConnectionRouter::add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, + const t_bb& bounding_box, const t_conn_cost_params cost_params, bool is_high_fanout) { const auto& device_ctx = g_vpr_ctx.device(); @@ -969,6 +975,10 @@ void ConnectionRouter::add_route_tree_node_to_heap( float backward_path_cost = cost_params.criticality * rt_node.Tdel; float R_upstream = rt_node.R_upstream; + /* don't include if not in BB */ + if (!inside_bb(rt_node.inode, bounding_box)) + return; + // after budgets are loaded, calculate delay cost as described by RCV paper /* R. Fung, V. Betz and W. 
Chow, "Slack Allocation and Routing to Improve FPGA Timing While * Repairing Short-Path Violations," in IEEE Transactions on Computer-Aided Design of @@ -1070,6 +1080,7 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( for (const RouteTreeNode& rt_node : spatial_rt_lookup[bin_x][bin_y]) { if (!rt_node.re_expand) continue; //Some nodes (like IPINs) shouldn't be re-expanded + RRNodeId rr_node_to_add = rt_node.inode; if (is_flat_) { @@ -1078,7 +1089,7 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( } // Put the node onto the heap - add_route_tree_node_to_heap(rt_node, target_node, cost_params, true); + add_route_tree_node_to_heap(rt_node, target_node, net_bounding_box, cost_params, true); // Update Bounding Box highfanout_bb.xmin = std::min(highfanout_bb.xmin, rr_graph_->node_xlow(rr_node_to_add)); @@ -1112,7 +1123,7 @@ t_bb ConnectionRouter::add_high_fanout_route_tree_to_heap( t_bb bounding_box = net_bounding_box; if (nodes_added == 0) { //If the target bin, and it's surrounding bins were empty, just add the full route tree - add_route_tree_to_heap(rt_root, target_node, cost_params, true); + add_route_tree_to_heap(rt_root, target_node, bounding_box, cost_params, true); } else { //We found nearby routing, replace original bounding box to be localized around that routing bounding_box = adjust_highfanout_bounding_box(highfanout_bb); diff --git a/vpr/src/route/connection_router.h b/vpr/src/route/connection_router.h index 5834e852409..32c9e327ec4 100644 --- a/vpr/src/route/connection_router.h +++ b/vpr/src/route/connection_router.h @@ -69,7 +69,8 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? 
(only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -88,7 +89,8 @@ class ConnectionRouter : public ConnectionRouterInterface { * bool: should retry with full bounding box? (only used in parallel routing) * t_heap: heap element of cheapest path */ std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb net_bounding_box, @@ -107,7 +109,7 @@ class ConnectionRouter : public ConnectionRouterInterface { // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, const t_conn_cost_params cost_params, t_bb bounding_box, RouterStats& router_stats, @@ -156,14 +158,16 @@ class ConnectionRouter : public ConnectionRouterInterface { * timing_driven_route_connection_from_route_tree_high_fanout for running * the connection router. * @param[in] rt_root RouteTreeNode describing the current routing state + * @param[in] source_node Source node ID to route from * @param[in] sink_node Sink node ID to route to * @param[in] cost_params * @param[in] bounding_box Keep search confined to this bounding box * @param[in] can_grow_bb Can this fn grow the given bounding box? - * @return bool Signal to retry this connection with a full-device bounding box, + * @return bool Signal to retry this connection with a full-device bounding box. * @return t_heap* Heap element describing the path found. 
*/ std::tuple timing_driven_route_connection_common_setup( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -242,6 +246,7 @@ class ConnectionRouter : public ConnectionRouterInterface { //used as branch-points for further routing. void add_route_tree_to_heap(const RouteTreeNode& rt_node, RRNodeId target_node, + const t_bb& bounding_box, const t_conn_cost_params cost_params, bool from_high_fanout); @@ -260,6 +265,7 @@ class ConnectionRouter : public ConnectionRouterInterface { void add_route_tree_node_to_heap( const RouteTreeNode& rt_node, RRNodeId target_node, + const t_bb& bounding_box, const t_conn_cost_params cost_params, bool is_high_fanout); diff --git a/vpr/src/route/connection_router_interface.h b/vpr/src/route/connection_router_interface.h index 2180dbe76f3..b58a037ec4f 100644 --- a/vpr/src/route/connection_router_interface.h +++ b/vpr/src/route/connection_router_interface.h @@ -53,7 +53,8 @@ class ConnectionRouterInterface { * bool: should retry with full bounding box? (only used in parallel routing) * t_heap: heap element of cheapest path */ virtual std::tuple timing_driven_route_connection_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -73,7 +74,8 @@ class ConnectionRouterInterface { * bool: should retry with full bounding box? (only used in parallel routing) * t_heap: heap element of cheapest path */ virtual std::tuple timing_driven_route_connection_from_route_tree_high_fanout( - const RouteTreeNode& rt_root, + const RouteTree& tree, + RRNodeId source_node, RRNodeId sink_node, const t_conn_cost_params cost_params, t_bb bounding_box, @@ -93,7 +95,7 @@ class ConnectionRouterInterface { // empty). When using cost_params.astar_fac = 0, for efficiency the // RouterLookahead used should be the NoOpLookahead. 
virtual vtr::vector timing_driven_find_all_shortest_paths_from_route_tree( - const RouteTreeNode& rt_root, + const RouteTree& tree, const t_conn_cost_params cost_params, t_bb bounding_box, RouterStats& router_stats, diff --git a/vpr/src/route/partition_tree.cpp b/vpr/src/route/partition_tree.cpp index f896d93bc94..d3d895493b5 100644 --- a/vpr/src/route/partition_tree.cpp +++ b/vpr/src/route/partition_tree.cpp @@ -1,11 +1,12 @@ #include "partition_tree.h" +#include #include PartitionTree::PartitionTree(const Netlist<>& netlist) { const auto& device_ctx = g_vpr_ctx.device(); auto all_nets = std::vector(netlist.nets().begin(), netlist.nets().end()); - _root = build_helper(netlist, all_nets, 0, 0, device_ctx.grid.width(), device_ctx.grid.height()); + _root = build_helper(netlist, all_nets, 0, 0, device_ctx.grid.width() - 1, device_ctx.grid.height() - 1); } std::unique_ptr PartitionTree::build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2) { @@ -15,116 +16,122 @@ std::unique_ptr PartitionTree::build_helper(const Netlist<>& const auto& route_ctx = g_vpr_ctx.routing(); auto out = std::make_unique(); - /* Find best cutline. In ParaDRo this is done using prefix sums, but - * life is too short to implement them, therefore I'm just doing a linear search, - * and the complexity is O((fpga width + height) * #nets * log2(w+h * #nets)). - * What we are searching for is the cutline with the most balanced workload (# of fanouts) - * on the sides. */ - int left, right, mine; - int score; - /* TODO: maybe put all of this into a tuple or struct? */ - int best_score = std::numeric_limits::max(); - int best_pos = -1, best_left = -1, best_right = -1; - enum { X, - Y } best_axis - = X; + /* Build ParaDRo-ish prefix sum lookup for each bin (coordinate) in the device. + * Do this for every step with only given nets, because each cutline takes some nets out + * of the game, so if we just built a global lookup it wouldn't yield accurate results. 
+ * + * VPR's bounding boxes include the borders (see ConnectionRouter::timing_driven_expand_neighbour()) + * so try to include x=bb.xmax, y=bb.ymax etc. when calculating things. */ + int W = x2 - x1 + 1; + int H = y2 - y1 + 1; - for (int x = x1 + 1; x < x2; x++) { - left = right = mine = 0; - for (auto net_id : nets) { - t_bb bb = route_ctx.route_bb[net_id]; - size_t fanout = netlist.net_sinks(net_id).size(); - if (bb.xmin < x && bb.xmax < x) { - left += fanout; - } else if (bb.xmin > x && bb.xmax > x) { - right += fanout; - } else if (bb.xmin <= x && bb.xmax >= x) { - mine += fanout; - } else { - VTR_ASSERT(false); /* unreachable */ - } + VTR_ASSERT(W > 1 && H > 1); + /* Cutlines are placed between integral coordinates. + * For instance, x_total_before[0] assumes a cutline at x=0.5, so fanouts at x=0 are included but not + * x=1. It's similar for x_total_after[0], which excludes fanouts at x=0 and includes x=1. + * Note that we have W-1 possible cutlines for a W-wide box. */ + std::vector x_total_before(W - 1, 0), x_total_after(W - 1, 0); + std::vector y_total_before(H - 1, 0), y_total_after(H - 1, 0); + + for (auto net_id : nets) { + t_bb bb = route_ctx.route_bb[net_id]; + size_t fanouts = netlist.net_sinks(net_id).size(); + + /* Inclusive start and end coords of the bbox relative to x1. Clamp to [x1, x2]. */ + int x_start = std::max(x1, bb.xmin) - x1; + int x_end = std::min(bb.xmax, x2) - x1; + /* Fill in the lookups assuming a cutline at x + 0.5. 
*/ + for (int x = x_start; x < W - 1; x++) { + x_total_before[x] += fanouts; + } + for (int x = 0; x < x_end; x++) { + x_total_after[x] += fanouts; + } + int y_start = std::max(y1, bb.ymin) - y1; + int y_end = std::min(bb.ymax, y2) - y1; + for (int y = y_start; y < H - 1; y++) { + y_total_before[y] += fanouts; } - score = abs(left - right); + for (int y = 0; y < y_end; y++) { + y_total_after[y] += fanouts; + } + } + + int best_score = std::numeric_limits::max(); + float best_pos = std::numeric_limits::quiet_NaN(); + Axis best_axis = Axis::X; + + int max_x_before = x_total_before[W - 2]; + int max_x_after = x_total_after[0]; + for (int x = 0; x < W - 1; x++) { + int before = x_total_before[x]; + int after = x_total_after[x]; + if (before == max_x_before || after == max_x_after) /* Cutting here would leave no nets to the left or right */ + continue; + int score = abs(x_total_before[x] - x_total_after[x]); if (score < best_score) { best_score = score; - best_left = left; - best_right = right; - best_pos = x; - best_axis = X; + best_pos = x1 + x + 0.5; /* Lookups are relative to (x1, y1) */ + best_axis = Axis::X; } } - for (int y = y1 + 1; y < y2; y++) { - left = right = mine = 0; - for (auto net_id : nets) { - t_bb bb = route_ctx.route_bb[net_id]; - size_t fanout = netlist.net_sinks(net_id).size(); - if (bb.ymin < y && bb.ymax < y) { - left += fanout; - } else if (bb.ymin > y && bb.ymax > y) { - right += fanout; - } else if (bb.ymin <= y && bb.ymax >= y) { - mine += fanout; - } else { - VTR_ASSERT(false); /* unreachable */ - } - } - score = abs(left - right); + + int max_y_before = y_total_before[H - 2]; + int max_y_after = y_total_after[0]; + for (int y = 0; y < H - 1; y++) { + int before = y_total_before[y]; + int after = y_total_after[y]; + if (before == max_y_before || after == max_y_after) /* Cutting here would leave no nets to the left or right (sideways) */ + continue; + int score = abs(y_total_before[y] - y_total_after[y]); if (score < best_score) { best_score 
= score; - best_left = left; - best_right = right; - best_pos = y; - best_axis = Y; + best_pos = y1 + y + 0.5; /* Lookups are relative to (x1, y1) */ + best_axis = Axis::Y; } } - /* If one of the sides has 0 nets in the best arrangement, - * there's no use in partitioning this: no parallelism comes out of it. */ - if (best_left == 0 || best_right == 0) { - out->nets = std::move(nets); + /* Couldn't find a cutline: all cutlines result in a one-way cut */ + if (std::isnan(best_pos)) { + out->nets = nets; /* We hope copy elision is smart enough to optimize this stuff out */ return out; } - /* Populate net IDs on each side - * and call next level of build_partition_trees. */ + /* Populate net IDs on each side and call next level of build_x */ std::vector left_nets, right_nets, my_nets; - if (best_axis == X) { + if (best_axis == Axis::X) { for (auto net_id : nets) { t_bb bb = route_ctx.route_bb[net_id]; - if (bb.xmin < best_pos && bb.xmax < best_pos) { + if (bb.xmax < best_pos) { left_nets.push_back(net_id); - } else if (bb.xmin > best_pos && bb.xmax > best_pos) { + } else if (bb.xmin > best_pos) { right_nets.push_back(net_id); - } else if (bb.xmin <= best_pos && bb.xmax >= best_pos) { - my_nets.push_back(net_id); } else { - VTR_ASSERT(false); /* unreachable */ + my_nets.push_back(net_id); } } - out->left = build_helper(netlist, left_nets, x1, y1, best_pos, y2); - out->right = build_helper(netlist, right_nets, best_pos, y2, x2, y2); + out->left = build_helper(netlist, left_nets, x1, y1, std::floor(best_pos), y2); + out->right = build_helper(netlist, right_nets, std::floor(best_pos + 1), y1, x2, y2); } else { - VTR_ASSERT(best_axis == Y); + VTR_ASSERT(best_axis == Axis::Y); for (auto net_id : nets) { t_bb bb = route_ctx.route_bb[net_id]; - if (bb.ymin < best_pos && bb.ymax < best_pos) { + if (bb.ymax < best_pos) { left_nets.push_back(net_id); - } else if (bb.ymin > best_pos && bb.ymax > best_pos) { + } else if (bb.ymin > best_pos) { right_nets.push_back(net_id); - } else 
if (bb.ymin <= best_pos && bb.ymax >= best_pos) { - my_nets.push_back(net_id); } else { - VTR_ASSERT(false); /* unreachable */ + my_nets.push_back(net_id); } } - out->left = build_helper(netlist, left_nets, x1, best_pos, x2, y2); - out->right = build_helper(netlist, right_nets, x1, y1, x2, best_pos); + out->left = build_helper(netlist, left_nets, x1, y1, x2, std::floor(best_pos)); + out->right = build_helper(netlist, right_nets, x1, std::floor(best_pos + 1), x2, y2); } - out->nets = std::move(my_nets); + out->nets = my_nets; out->cutline_axis = best_axis; out->cutline_pos = best_pos; return out; diff --git a/vpr/src/route/partition_tree.h b/vpr/src/route/partition_tree.h index 97988d5fdbb..b7e2526f716 100644 --- a/vpr/src/route/partition_tree.h +++ b/vpr/src/route/partition_tree.h @@ -2,6 +2,29 @@ #include "connection_router.h" #include "router_stats.h" +#include "virtual_net.h" + +#include +#include +#include +#include + +#ifdef VPR_USE_TBB +# include +#endif + +/** Self-descriptive */ +enum class Axis { X, + Y }; + +/** Which side of a line? */ +enum class Side { LEFT = 0, + RIGHT = 1 }; + +/** Invert side */ +inline Side operator!(const Side& rhs) { + return Side(!size_t(rhs)); +} /** Routing iteration results per thread. (for a subset of the input netlist) */ struct RouteIterResults { @@ -30,6 +53,8 @@ class PartitionTreeNode { public: /** Nets claimed by this node (intersected by cutline if branch, nets in final region if leaf) */ std::vector nets; + /** Virtual nets delegated to this node by the parent */ + std::vector virtual_nets; /** Left subtree. */ std::unique_ptr left = nullptr; /** Right subtree. */ @@ -38,11 +63,12 @@ class PartitionTreeNode { bool is_routable = false; /** Net IDs for which timing_driven_route_net() actually got called */ std::vector rerouted_nets; - - /* debug stuff */ - int cutline_axis = -1; - int cutline_pos = -1; - std::vector exec_times; + /* Axis of the cutline. */ + Axis cutline_axis = Axis::X; + /* Position of the cutline. 
It's a float, because cutlines are considered to be "between" integral coordinates. */ + float cutline_pos = std::numeric_limits::quiet_NaN(); + /* Bounding box of *this* node. (The cutline cuts this box) */ + t_bb bb; }; /** Holds the root PartitionTreeNode and exposes top level operations. */ @@ -64,3 +90,34 @@ class PartitionTree { std::unique_ptr _root; std::unique_ptr build_helper(const Netlist<>& netlist, const std::vector& nets, int x1, int y1, int x2, int y2); }; + +#ifdef VPR_DEBUG_PARTITION_TREE +/** Log PartitionTree-related messages. Can handle multiple threads. */ +class PartitionTreeDebug { + public: +# ifdef VPR_USE_TBB + static inline tbb::concurrent_vector lines; +# else + static inline std::vector lines; +# endif + /** Add msg to the log buffer (with a thread ID header) */ + static inline void log(std::string msg) { + auto thread_id = std::hash()(std::this_thread::get_id()); + lines.push_back("[thread " + std::to_string(thread_id) + "] " + msg); + } + /** Write out the log buffer into a file */ + static inline void write(std::string filename) { + std::ofstream f(filename); + for (auto& line : lines) { + f << line << std::endl; + } + f.close(); + } +}; +#else +class PartitionTreeDebug { + public: + static inline void log(std::string msg) { std::cout << msg << "\n"; } + static inline void write(std::string /* filename */) {} +}; +#endif diff --git a/vpr/src/route/route_common.cpp b/vpr/src/route/route_common.cpp index 881dbfd46aa..d0d34dcdb40 100644 --- a/vpr/src/route/route_common.cpp +++ b/vpr/src/route/route_common.cpp @@ -309,11 +309,10 @@ bool try_route(const Netlist<>& net_list, return (success); } +/** This routine checks to see if this is a resource-feasible routing. + * That is, are all rr_node capacity limitations respected? It assumes + * that the occupancy arrays are up to date when it is called. */ bool feasible_routing() { - /* This routine checks to see if this is a resource-feasible routing. 
* - * That is, are all rr_node capacity limitations respected? It assumes * - * that the occupancy arrays are up to date when it is called. */ - auto& device_ctx = g_vpr_ctx.device(); const auto& rr_graph = device_ctx.rr_graph; auto& route_ctx = g_vpr_ctx.routing(); @@ -520,13 +519,14 @@ void mark_ends(const Netlist<>& net_list, ParentNetId net_id) { } } -void mark_remaining_ends(ParentNetId net_id, const std::vector& remaining_sinks) { +void mark_remaining_ends(ParentNetId net_id) { // like mark_ends, but only performs it for the remaining sinks of a net RRNodeId inode; auto& route_ctx = g_vpr_ctx.mutable_routing(); + const auto& tree = route_ctx.route_trees[net_id].value(); - for (int sink_pin : remaining_sinks) { + for (int sink_pin : tree.get_remaining_isinks()) { inode = route_ctx.net_rr_terminals[net_id][sink_pin]; ++route_ctx.rr_node_route_inf[inode].target_flag; } diff --git a/vpr/src/route/route_common.h b/vpr/src/route/route_common.h index 4a7d1a2cf76..4aff8588673 100644 --- a/vpr/src/route/route_common.h +++ b/vpr/src/route/route_common.h @@ -2,6 +2,7 @@ #pragma once #include #include "clustered_netlist.h" +#include "rr_node_types.h" #include "vtr_vector.h" #include "heap_type.h" #include "rr_node_fwd.h" @@ -89,7 +90,7 @@ inline float get_single_rr_cong_cost(RRNodeId inode, float pres_fac) { void mark_ends(const Netlist<>& net_list, ParentNetId net_id); -void mark_remaining_ends(ParentNetId net_id, const std::vector& remaining_sinks); +void mark_remaining_ends(ParentNetId net_id); void add_to_mod_list(RRNodeId inode, std::vector& modified_rr_node_inf); @@ -221,3 +222,29 @@ void push_back_node_with_info( heap->push_back(hptr); } + +/** Is \p inode inside this bounding box? + * In the context of the parallel router, an inode is inside a bounding box + * if its driving side is in the bounding box. 
If it's not directional, + * we take the top left corner as reference */ +inline bool inside_bb(RRNodeId inode, const t_bb& bb) { + auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + int x = rr_graph.node_xlow(inode); + int y = rr_graph.node_ylow(inode); + + return x >= bb.xmin && x <= bb.xmax && y >= bb.ymin && y <= bb.ymax; +} + +/** When RCV is enabled, it's necessary to be able to completely ripup high fanout nets + * if there is still negative hold slack. Normally the router will prune the illegal + * branches of high fanout nets, this will bypass this */ +inline bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) { + if (router_opts.routing_budgets_algorithm != YOYO) { + return false; + } else if (worst_neg_slack != 0) { + return true; + } + return false; +} diff --git a/vpr/src/route/route_parallel.cpp b/vpr/src/route/route_parallel.cpp index 1e50f758b31..e6c394bd7dd 100644 --- a/vpr/src/route/route_parallel.cpp +++ b/vpr/src/route/route_parallel.cpp @@ -24,16 +24,21 @@ #include "route_parallel.h" // all functions in profiling:: namespace, which are only activated if PROFILE is defined #include "route_profiling.h" +#include "rr_graph_fwd.h" +#include "rr_node_types.h" #include "timing_util.h" +#include "vpr_error.h" +#include "vtr_assert.h" +#include "vtr_math.h" #include "vtr_time.h" #include "NetPinTimingInvalidator.h" #ifdef VPR_USE_TBB +# include "tbb/concurrent_vector.h" # include "tbb/enumerable_thread_specific.h" # include "tbb/task_group.h" -# include "tbb/global_control.h" /** route_net and similar functions need many bits of state collected from various * parts of VPR, collect them here for ease of use */ @@ -47,7 +52,6 @@ class RouteIterCtx { const t_router_opts& router_opts; CBRR& connections_inf; tbb::enumerable_thread_specific router_stats; - tbb::enumerable_thread_specific route_structs; NetPinsMatrix& net_delay; const ClusteredPinAtomPinsLookup& netlist_pin_lookup; std::shared_ptr 
timing_info; @@ -56,9 +60,24 @@ class RouteIterCtx { float worst_negative_slack; const RoutingPredictor& routing_predictor; const vtr::vector>>& choking_spots; + tbb::concurrent_vector& nets_to_retry; + vtr::vector& decomp_retries; bool is_flat; }; +/** Minimum bin size when spatially sampling decomposition sinks. (I know, doesn't make much sense.) + * The parallel router tries to decompose nets by building a "skeleton routing" from the main task + * and then delegating the remaining work to its child tasks. This minimum bin size determines how much + * time the main thread spends building the skeleton. + * Less is more effort -> less speedup, better quality. + * See get_decomposition_isinks() for more info. */ +constexpr size_t MIN_DECOMP_BIN_WIDTH = 5; + +/** Sometimes nets just don't play well with decomposition. If we rerouted a net more than this many + * times, quit trying to parallelize it, rip up its current routing and do it serially. This is a safeguard + * ensuring routability and should be hit at most ~10 times per circuit. */ +constexpr size_t MAX_DECOMP_REROUTE = 5; + /** Helper for reduce_partition_tree. 
Traverse \p node's subtree and collect results into \p results */ static void reduce_partition_tree_helper(const PartitionTreeNode& node, RouteIterResults& results); @@ -102,6 +121,12 @@ static bool try_parallel_route_tmpl(const Netlist<>& netlist, ScreenUpdatePriority first_iteration_priority, bool is_flat); +template +static RouteIterResults route_with_partition_tree(tbb::task_group& g, RouteIterCtx& ctx); + +template +static RouteIterResults route_without_partition_tree(std::vector& nets_to_route, RouteIterCtx& ctx); + /************************ Subroutine definitions *****************************/ bool try_parallel_route(const Netlist<>& net_list, @@ -326,12 +351,6 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, is_flat); } - /* Build partition tree for parallel routing */ - vtr::Timer t; - PartitionTree partition_tree(net_list); - float total_prep_time = t.elapsed_sec(); - VTR_LOG("# Built partition tree in %f seconds\n", total_prep_time); - tbb::task_group tbb_task_group; /* Set up thread local storage. @@ -347,7 +366,10 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, route_ctx.rr_node_route_inf, is_flat)); /* Here we provide an "exemplar" to copy for each thread */ auto router_stats_thread = tbb::enumerable_thread_specific(); - auto route_structs = tbb::enumerable_thread_specific(net_list); + tbb::concurrent_vector nets_to_retry; + + /** Count decomposition reroutes for each net. 
*/ + vtr::vector decomp_retries(net_list.nets().size()); RouterStats router_stats; float prev_iter_cumm_time = 0; @@ -403,7 +425,6 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_stats_thread, - route_structs, net_delay, netlist_pin_lookup, route_timing_info, @@ -412,9 +433,13 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, worst_negative_slack, routing_predictor, choking_spots, + nets_to_retry, + decomp_retries, is_flat}; - RouteIterResults iter_results = route_partition_tree(tbb_task_group, partition_tree, iter_ctx); + vtr::Timer net_routing_timer; + RouteIterResults iter_results = decompose_route_with_partition_tree(tbb_task_group, iter_ctx); + PartitionTreeDebug::log("Routing all nets took " + std::to_string(net_routing_timer.elapsed_sec()) + " s"); if (!iter_results.is_routable) { return false; // Impossible to route @@ -478,6 +503,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, //Output progress print_route_status(itry, iter_elapsed_time, pres_fac, num_net_bounding_boxes_updated, iter_results.stats, overuse_info, wirelength_info, timing_info, est_success_iteration); + PartitionTreeDebug::log("Iteration " + std::to_string(itry) + " took " + std::to_string(iter_elapsed_time) + " s"); prev_iter_cumm_time = iter_cumm_time; @@ -499,7 +525,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, /* * Are we finished? 
*/ - if (is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) { + if (iter_ctx.nets_to_retry.empty() && is_iteration_complete(routing_is_feasible, router_opts, itry, timing_info, rcv_finished_count == 0)) { auto& router_ctx = g_vpr_ctx.routing(); if (is_better_quality_routing(best_routing, best_routing_metrics, wirelength_info, timing_info)) { @@ -593,8 +619,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, */ if (router_opts.route_bb_update == e_route_bb_update::DYNAMIC) { - /** TODO: Disabled BB scaling for the baseline parallel router. Should re-enable it by building/updating partition tree on every iteration */ - // num_net_bounding_boxes_updated = dynamic_update_bounding_boxes(iter_results.rerouted_nets, net_list, router_opts.high_fanout_threshold); + num_net_bounding_boxes_updated = dynamic_update_bounding_boxes(iter_results.rerouted_nets, net_list, router_opts.high_fanout_threshold); } if (itry >= high_effort_congestion_mode_iteration_threshold) { @@ -638,19 +663,20 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, // the router to route around otherwise congested regions // (at the cost of high run-time). - //Increase the size of the net bounding boxes to give the router more - //freedom to find alternate paths. - // - //In the case of routing conflicts there are multiple connections competing - //for the same resources which can not resolve the congestion themselves. - //In normal routing mode we try to keep the bounding boxes small to minimize - //run-time, but this can limits how far signals can detour (i.e. they can't - //route outside the bounding box), which can cause conflicts to oscillate back - //and forth without resolving. - // - //By scaling the bounding boxes here, we slowly increase the router's search - //space in hopes of it allowing signals to move further out of the way to - //alleviate the conflicts. 
+ /* Increase the size of the net bounding boxes to give the router more + * freedom to find alternate paths. + * + * In the case of routing conflicts there are multiple connections competing + * for the same resources which can not resolve the congestion themselves. + * In normal routing mode we try to keep the bounding boxes small to minimize + * run-time, but this can limits how far signals can detour (i.e. they can't + * route outside the bounding box), which can cause conflicts to oscillate back + * and forth without resolving. + * + * By scaling the bounding boxes here, we slowly increase the router's search + * space in hopes of it allowing signals to move further out of the way to + * alleviate the conflicts. */ + if (itry_conflicted_mode % BB_SCALE_ITER_COUNT == 0) { //We scale the bounding boxes by BB_SCALE_FACTOR, //every BB_SCALE_ITER_COUNT iterations. This ensures @@ -665,8 +691,7 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, //Scale by BB_SCALE_FACTOR but clip to grid size to avoid overflow bb_fac = std::min(max_grid_dim, bb_fac * BB_SCALE_FACTOR); - /** TODO: Disabled BB scaling for the baseline parallel router. Should re-enable it by building/updating partition tree on every iteration */ - // route_ctx.route_bb = load_route_bb(net_list, bb_fac); + route_ctx.route_bb = load_route_bb(net_list, bb_fac); } ++itry_conflicted_mode; @@ -795,9 +820,28 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, VTR_LOG("total_number_of_adding_all_rt_from_calling_high_fanout_rt: %zu ", router_stats.add_all_rt_from_high_fanout); VTR_LOG("\n"); + PartitionTreeDebug::write("partition_tree.log"); return routing_is_successful; } +/** Apparently we need a few more checks around should_route_net. 
TODO: smush this function into should_route_net */ +static bool should_really_route_net(const Netlist<>& net_list, ParentNetId net_id, route_budgets& budgeting_inf, CBRR& connections_inf, float worst_negative_slack) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + bool reroute_for_hold = false; + if (budgeting_inf.if_set()) { + reroute_for_hold = budgeting_inf.get_should_reroute(net_id); + reroute_for_hold &= (worst_negative_slack != 0); + } + if (route_ctx.net_status.is_fixed(net_id)) /* Skip pre-routed nets. */ + return false; + else if (net_list.net_is_ignored(net_id)) /* Skip ignored nets. */ + return false; + else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true)) + return false; + return true; +} + /** Try routing a net. This calls timing_driven_route_net. * The only difference is that it returns a "retry_net" flag, which means that the net * couldn't be routed with the default bounding box and needs a full-device BB. @@ -805,79 +849,49 @@ bool try_parallel_route_tmpl(const Netlist<>& net_list, * The single-thread router just retries with a full-device BB and does not need to notify the caller. 
* TODO: make the serial router follow this execution path to decrease code duplication */ template -NetResultFlags try_parallel_route_net(ConnectionRouter& router, - const Netlist<>& net_list, - const ParentNetId& net_id, - int itry, - float pres_fac, - const t_router_opts& router_opts, - CBRR& connections_inf, - RouterStats& router_stats, - std::vector& pin_criticality, - std::vector>& rt_node_of_sink, - NetPinsMatrix& net_delay, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - std::shared_ptr timing_info, - NetPinTimingInvalidator* pin_timing_invalidator, - route_budgets& budgeting_inf, - float worst_negative_slack, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags try_parallel_route_net(ParentNetId net_id, RouteIterCtx& ctx, int level) { auto& route_ctx = g_vpr_ctx.mutable_routing(); NetResultFlags flags; - connections_inf.prepare_routing_for_net(net_id); - - bool reroute_for_hold = false; - if (budgeting_inf.if_set()) { - reroute_for_hold = (budgeting_inf.get_should_reroute(net_id)); - reroute_for_hold &= worst_negative_slack != 0; + /* Just return success if we don't need to route this one */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) { + flags.success = true; + return flags; } - if (route_ctx.net_status.is_fixed(net_id)) { /* Skip pre-routed nets. */ - flags.success = true; - } else if (net_list.net_is_ignored(net_id)) { /* Skip ignored nets. 
*/ - flags.success = true; - } else if (!(reroute_for_hold) && !should_route_net(net_id, connections_inf, true)) { - flags.success = true; + // track time spent vs fanout + profiling::net_fanout_start(); + + vtr::Timer routing_timer; + flags = timing_driven_route_net(ctx.routers.local(), + ctx.net_list, + net_id, + ctx.itry, + ctx.pres_fac, + ctx.router_opts, + ctx.connections_inf, + ctx.router_stats.local(), + ctx.net_delay[net_id].data(), + ctx.netlist_pin_lookup, + ctx.timing_info, + ctx.pin_timing_invalidator, + ctx.budgeting_inf, + ctx.worst_negative_slack, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat); + + profiling::net_fanout_end(ctx.net_list.net_sinks(net_id).size()); + + /* Impossible to route? (disconnected rr_graph) */ + if (flags.success) { + route_ctx.net_status.set_is_routed(net_id, true); } else { - // track time spent vs fanout - profiling::net_fanout_start(); - - vtr::Timer routing_timer; - flags = timing_driven_route_net(router, - net_list, - net_id, - itry, - pres_fac, - router_opts, - connections_inf, - router_stats, - pin_criticality, - rt_node_of_sink, - net_delay[net_id].data(), - netlist_pin_lookup, - timing_info, - pin_timing_invalidator, - budgeting_inf, - worst_negative_slack, - routing_predictor, - choking_spots, - is_flat); - - profiling::net_fanout_end(net_list.net_sinks(net_id).size()); - - /* Impossible to route? 
(disconnected rr_graph) */ - if (flags.success) { - route_ctx.net_status.set_is_routed(net_id, true); - } else { - VTR_LOG("Routing failed for net %d\n", net_id); - } - - flags.was_rerouted = true; //Flag to record whether routing was actually changed + VTR_LOG("Routing failed for net %d\n", net_id); } + + flags.was_rerouted = true; //Flag to record whether routing was actually changed return flags; } @@ -895,30 +909,9 @@ void route_partition_tree_helper(tbb::task_group& g, node.is_routable = true; node.rerouted_nets.clear(); - std::cout << "routing node with " << node.nets.size() << " nets\n"; - vtr::Timer t; for (auto net_id : node.nets) { - auto flags = try_parallel_route_net( - ctx.routers.local(), - ctx.net_list, - net_id, - ctx.itry, - ctx.pres_fac, - ctx.router_opts, - ctx.connections_inf, - ctx.router_stats.local(), - ctx.route_structs.local().pin_criticality, - ctx.route_structs.local().rt_node_of_sink, - ctx.net_delay, - ctx.netlist_pin_lookup, - ctx.timing_info, - ctx.pin_timing_invalidator, - ctx.budgeting_inf, - ctx.worst_negative_slack, - ctx.routing_predictor, - ctx.choking_spots[net_id], - ctx.is_flat); + auto flags = try_parallel_route_net(net_id, ctx); if (!flags.success && !flags.retry_with_full_bb) { node.is_routable = false; @@ -933,7 +926,8 @@ void route_partition_tree_helper(tbb::task_group& g, nets_to_retry[net_id] = true; } } - node.exec_times.push_back(t.elapsed_sec()); + + PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size()) + " nets routed in " + std::to_string(t.elapsed_sec()) + " s"); /* add left and right trees to task queue */ if (node.left && node.right) { @@ -944,7 +938,7 @@ void route_partition_tree_helper(tbb::task_group& g, route_partition_tree_helper(g, *node.right, ctx, nets_to_retry); }); } else { - VTR_ASSERT(!node.left && !node.right); // tree should have been built perfectly balanced + VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch } } @@ -1003,4 +997,725 @@ 
RouteIterResults route_partition_tree(tbb::task_group& g, return out; } +/* Build a partition tree and route with it */ +template +static RouteIterResults route_with_partition_tree(tbb::task_group& g, RouteIterCtx& ctx) { + vtr::Timer t2; + PartitionTree partition_tree(ctx.net_list); + float total_prep_time = t2.elapsed_sec(); + VTR_LOG("# Built partition tree in %f seconds\n", total_prep_time); + + return route_partition_tree(g, partition_tree, ctx); +} + +/* Route serially */ +template +static RouteIterResults route_without_partition_tree(std::vector& nets_to_route, RouteIterCtx& ctx) { + RouteIterResults out; + + /* Sort so net with most sinks is routed first. */ + std::sort(nets_to_route.begin(), nets_to_route.end(), [&](const ParentNetId id1, const ParentNetId id2) -> bool { + return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); + }); + + for (auto net_id : nets_to_route) { + auto flags = try_timing_driven_route_net( + ctx.routers.local(), + ctx.net_list, + net_id, + ctx.itry, + ctx.pres_fac, + ctx.router_opts, + ctx.connections_inf, + ctx.router_stats.local(), + ctx.net_delay, + ctx.netlist_pin_lookup, + ctx.timing_info, + ctx.pin_timing_invalidator, + ctx.budgeting_inf, + ctx.worst_negative_slack, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat); + + if (!flags.success) { + out.is_routable = false; + } + if (flags.was_rerouted) { + out.rerouted_nets.push_back(net_id); + } + } + + update_router_stats(out.stats, ctx.router_stats.local()); + + return out; +} + +/** Which side of the cutline is this RRNode? 0 is left/up and anything else is right/down. + * Cutlines are always assumed to be at cutline_axis = (cutline_pos + 0.5). + * In the context of the parallel router, a RR node is considered to be inside a bounding + * box if its top left corner (xlow, ylow) is inside it. 
*/
+inline Side which_side(RRNodeId inode, int cutline_pos, Axis axis) {
+    auto& device_ctx = g_vpr_ctx.device();
+    const auto& rr_graph = device_ctx.rr_graph;
+
+    if (axis == Axis::X) {
+        return Side(rr_graph.node_xlow(inode) > cutline_pos); /* 1 is RIGHT */
+    } else {
+        return Side(rr_graph.node_ylow(inode) > cutline_pos);
+    }
+}
+
+/** Would decomposing this net yield any parallelism?
+ * Returns false when:
+ * - the net's bounding box holds fewer than two sampling bins along either axis,
+ * - the cutline leaves a strip thinner than one sampling bin on either of its sides, or
+ * - the net doesn't have enough terminals to fill the perimeter bins plus one extra per side.
+ * @param net_id: The net in question. Its box is read from route_ctx.route_bb.
+ * @param cutline_pos: Integer position of the cutline. The cut itself lies at cutline_pos + 0.5.
+ * @param axis: Axis of the cutline.
+ * @return true if all of the checks above pass. */
+bool is_worth_decomposing(ParentNetId net_id, int cutline_pos, Axis axis) {
+    auto& route_ctx = g_vpr_ctx.routing();
+    const t_bb& bb = route_ctx.route_bb[net_id];
+    size_t W = bb.xmax - bb.xmin + 1;
+    size_t H = bb.ymax - bb.ymin + 1;
+    int bins_x = W / MIN_DECOMP_BIN_WIDTH;
+    int bins_y = H / MIN_DECOMP_BIN_WIDTH;
+
+    /* Is this net itself a thin strip? (smaller than sampling bin)
+     * With a single bin the bin width would be W + 1 (or H + 1), i.e. wider than the box,
+     * and with zero bins the divisions below would divide by zero. So bail out first. */
+    if (bins_x < 2 || bins_y < 2)
+        return false;
+
+    size_t bin_width_x = W / bins_x + 1;
+    size_t bin_width_y = H / bins_y + 1;
+
+    /* Does this cutline leave a thin strip on either side of it?
+     * A cut on the X axis is measured against the bin width in x, a cut on Y against the one in y. */
+    if (axis == Axis::X) {
+        if (bb.xmax - cutline_pos < int(bin_width_x)) /* Columns right of the cutline */
+            return false;
+        if (cutline_pos - bb.xmin + 1 < int(bin_width_x)) /* Columns left of the cutline */
+            return false;
+    } else {
+        if (bb.ymax - cutline_pos < int(bin_width_y))
+            return false;
+        if (cutline_pos - bb.ymin + 1 < int(bin_width_y))
+            return false;
+    }
+
+    /* Do we have enough sinks to at least fill up the perimeter bins?
+     * Min n_samples is 4 to cover for bins_x or bins_y <= 2 case */
+    size_t n_samples = std::max(2 * (bins_x + bins_y) - 4, 4);
+    if (route_ctx.net_rr_terminals[net_id].size() <= n_samples + 2) /* Need at least 1 extra on each side */
+        return false;
+
+    return true;
+}
+
+/** Should we decompose this net? 
We should probably leave it alone if:
+ * - it's a clock net
+ * - we decomposed nets for enough levels and should have good thread utilization by now
+ * - decomposing this net doesn't result in any parallelism
+ * - we already retried decomposing it MAX_DECOMP_REROUTE times
+ * - TODO: Don't decompose nets with full-device bounding box (don't want to clip their BB)
+ * @param net_id: The net in question.
+ * @param level: Depth in the partition tree at which the net would be decomposed.
+ * @param cutline_pos: Integer position of the cutline, forwarded to is_worth_decomposing().
+ * @param axis: Axis of the cutline, forwarded to is_worth_decomposing().
+ * @param ctx: Routing iteration context (netlist, router options, per-net retry counts).
+ * @return true if none of the conditions above hold. */
+template
+bool should_decompose_net(ParentNetId net_id, size_t level, int cutline_pos, Axis axis, const RouteIterCtx& ctx) {
+    /* We are too deep down the tree.
+     * i.e. if num_workers=4, stop at level=2 (we decomposed for two levels and it should be enough for 4 threads)
+     * Written as "level >= log2_ceil(...)" rather than "level > log2_ceil(...) - 1": the subtraction goes
+     * below zero when log2_ceil(...) is 0 (presumably for num_workers == 1), and comparing that against
+     * the unsigned level would wrap around and disable this check.
+     * TODO: Move this check into the caller */
+    if (int(level) >= vtr::log2_ceil(ctx.router_opts.num_workers))
+        return false;
+    /* Clock net */
+    if (ctx.net_list.net_is_global(net_id) && ctx.router_opts.two_stage_clock_routing)
+        return false;
+    /* We tried too many times to decompose this net. Route it serially from now on.
+     * TODO: Rip up before going down this route? */
+    if (ctx.decomp_retries[net_id] >= MAX_DECOMP_REROUTE)
+        return false;
+    /* Dividing this net wouldn't yield us any extra parallelism */
+    if (!is_worth_decomposing(net_id, cutline_pos, axis))
+        return false;
+
+    return true;
+}
+
+/** Clip bb to one side of the cutline given the axis and position of the cutline.
+ * Note that cutlines are assumed to be at axis = cutline_pos + 0.5.
+ * LEFT keeps coordinates <= cutline_pos and RIGHT keeps coordinates >= cutline_pos + 1 along \p axis.
+ * The other axis is left untouched.
+ * @return A clipped copy. \p bb itself is not modified. */
+t_bb clip_to_side(const t_bb& bb, Axis axis, int cutline_pos, Side side) {
+    t_bb out = bb;
+    if (axis == Axis::X && side == Side::LEFT)
+        out.xmax = cutline_pos;
+    else if (axis == Axis::X && side == Side::RIGHT)
+        out.xmin = cutline_pos + 1;
+    else if (axis == Axis::Y && side == Side::LEFT)
+        out.ymax = cutline_pos;
+    else if (axis == Axis::Y && side == Side::RIGHT)
+        out.ymin = cutline_pos + 1;
+    else
+        VTR_ASSERT_MSG(false, "Unreachable");
+    return out;
+}
+
+/** Break a net into two halves along a cutline, given the position and axis of the cutline.
+ * @param net_id: The net in question.
+ * @param cutline_pos: Integer position of the cutline. which_side() puts nodes with xlow/ylow > cutline_pos on the RIGHT.
+ * @param cutline_axis: Axis of the cutline.
+ * Both halves keep the same net_id and get the net's route_bb clipped to their own side of the cutline.
+ * @return Left and right halves of the net as VirtualNets. */
+std::tuple make_decomposed_pair(ParentNetId net_id, int cutline_pos, Axis cutline_axis) {
+    auto& route_ctx = g_vpr_ctx.routing();
+
+    /* The source side is the side holding the root of the net's current route tree.
+     * NOTE(review): route_trees[net_id] is dereferenced without a check here. Presumably callers
+     * guarantee that a route tree exists; confirm. */
+    Side source_side = which_side(route_ctx.route_trees[net_id]->root().inode, cutline_pos, cutline_axis);
+    VirtualNet source_half, sink_half;
+    t_bb bb = route_ctx.route_bb[net_id];
+    source_half.net_id = net_id;
+    source_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, source_side);
+    sink_half.net_id = net_id;
+    sink_half.clipped_bb = clip_to_side(bb, cutline_axis, cutline_pos, !source_side);
+    /* Order the tuple as (left, right) regardless of which half holds the source */
+    if (source_side == Side::RIGHT)
+        return std::make_tuple(sink_half, source_half);
+    else
+        return std::make_tuple(source_half, sink_half);
+}
+
+/** Does the current routing of \p net_id cross the cutline at cutline_axis = cutline_pos?
+ * Returns true as soon as one already reached sink lies on the other side of the cutline than
+ * the root of the route tree. Returns false if there is no such sink. */
+bool is_routing_over_cutline(ParentNetId net_id, int cutline_pos, Axis cutline_axis) {
+    auto& route_ctx = g_vpr_ctx.mutable_routing();
+    const RouteTree& tree = route_ctx.route_trees[net_id].value();
+    RRNodeId rr_source = tree.root().inode;
+    Side source_side = which_side(rr_source, cutline_pos, cutline_axis);
+
+    for (auto isink : tree.get_reached_isinks()) {
+        RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink];
+        Side sink_side = which_side(rr_sink, cutline_pos, cutline_axis);
+        if (source_side != sink_side)
+            return true;
+    }
+
+    return false;
+}
+
+/** Find isinks to route before decomposition. Assumes remaining_targets is sorted by criticality.
+ * To do this, we spatially sample sinks from the net bbox so that the resulting skeleton routing will
+ * have an idea about how to go to different parts of it. 
We try to pick up the most critical sink + * from each sample region ("bin"). If the existing route tree already has a sink in a bin, we can + * skip it. + * @param net_id: Net whose route tree and route_bb are consulted. + * @param remaining_targets: Candidate isinks, visited in the given order. + * @param node: Not referenced in this function body. + * @return The isinks picked, at most one per bin, in the order they were picked. */ +std::vector get_decomposition_isinks(ParentNetId net_id, const std::vector& remaining_targets, const PartitionTreeNode& node) { + const auto& route_ctx = g_vpr_ctx.routing(); + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + std::vector out; + + /* Set up sampling bins. If we are sampling from W = 22 with minimum width 6, then we have + * 3 bins and real width is 22/3 + 1 = 8. Then x=0 goes to bin 0, x=8 goes to bin 1 etc. */ + const t_bb& net_bb = route_ctx.route_bb[net_id]; + size_t width = net_bb.xmax - net_bb.xmin + 1; + size_t height = net_bb.ymax - net_bb.ymin + 1; + size_t bins_x = width / MIN_DECOMP_BIN_WIDTH; + size_t bins_y = height / MIN_DECOMP_BIN_WIDTH; + size_t samples_to_find = bins_x * bins_y; + /* NOTE(review): bins_x / bins_y are 0 whenever width / height is below MIN_DECOMP_BIN_WIDTH, and the two + * divisions below would then divide by zero. Presumably callers only get here for nets with a large + * enough bbox -- confirm. */ + size_t bin_width_x = width / bins_x + 1; + size_t bin_width_y = height / bins_y + 1; + + /* The sample for each bin, indexed by [x][y]. Set to -1 if reached by existing routing, + * 0 if not found yet. Otherwise holds the isink picked for the bin. */ + std::vector> samples(bins_x, std::vector(bins_y)); + constexpr int REACHED = -1; + constexpr int NONE = 0; + + /* Mark bins with already reached sinks. */ + for (int isink : tree.get_reached_isinks()) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] != REACHED) { + samples[x][y] = REACHED; + samples_to_find--; + } + } + + /* Spatially sample remaining targets. This should be already sorted by pin criticality, + * so we sample the most critical sink in the bin right away.
*/ + for (int isink : remaining_targets) { + if (samples_to_find == 0) + return out; + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + size_t x = (rr_graph.node_xlow(rr_sink) - net_bb.xmin) / bin_width_x; + size_t y = (rr_graph.node_ylow(rr_sink) - net_bb.ymin) / bin_width_y; + if (samples[x][y] == NONE) { + samples[x][y] = isink; + out.push_back(isink); + samples_to_find--; + } + } + + return out; +} + +/** Decompose a net into a pair of nets after routing $decomposition_effort connections and + * making sure that the current routing intersects the cutline. + * The connections routed here are the ones returned by get_decomposition_isinks(). + * + * Will crash if the net is not suitable for decomposition. Be sure to check with should_decompose_net + * before. + * + * @param net_id: Net to route partially and split. + * @param node: Partition tree node; its cutline_pos and cutline_axis define the split. + * @param ctx: Routing iteration state (router options, per-thread routers and stats, timing data). + * @return vtr::nullopt if should_really_route_net() says the net needs no routing or if routing one of the + * sampled sinks fails; otherwise the pair produced by make_decomposed_pair(). */ +template +vtr::optional> route_and_decompose(ParentNetId net_id, const PartitionTreeNode& node, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + unsigned int num_sinks = ctx.net_list.net_sinks(net_id).size(); + + /* We don't have to route this net, so why bother decomposing it? */ + if (!should_really_route_net(ctx.net_list, net_id, ctx.budgeting_inf, ctx.connections_inf, ctx.worst_negative_slack)) + return vtr::nullopt; + + setup_routing_resources( + ctx.itry, + net_id, + ctx.net_list, + num_sinks, + ctx.router_opts.min_incremental_reroute_fanout, + ctx.connections_inf, + ctx.router_opts, + check_hold(ctx.router_opts, ctx.worst_negative_slack)); + + VTR_ASSERT(route_ctx.route_trees[net_id]); + RouteTree& tree = route_ctx.route_trees[net_id].value(); + + bool high_fanout = is_high_fanout(num_sinks, ctx.router_opts.high_fanout_threshold); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice.
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + net_id, + tree.root()); + } + + /* Pick and route some connections. The resulting tree should intersect the cutline */ + std::vector remaining_targets(tree.get_remaining_isinks().begin(), tree.get_remaining_isinks().end()); + + /* Get pin criticalities. The array is indexed by isink and sized num_sinks + 1 so that index num_sinks is valid */ + std::vector pin_criticality(num_sinks + 1); + + for (int isink : remaining_targets) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(net_id, isink); + pin_criticality[isink] = get_net_pin_criticality(ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + net_id, + pin, + ctx.is_flat); + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + /* Sort pins by decreasing criticality, the order get_decomposition_isinks() assumes */ + sort(begin(remaining_targets), end(remaining_targets), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules + * TODO: Not sure what this does and if it's safe to call in parallel */ + update_rr_base_costs(num_sinks); + + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ?
&conn_delay_budget : nullptr); + + /* Get the isinks to actually route to */ + std::vector isinks_to_route = get_decomposition_isinks(net_id, remaining_targets, node); + + const auto& device_ctx = g_vpr_ctx.device(); + const auto& rr_graph = device_ctx.rr_graph; + const t_bb& bb = route_ctx.route_bb[net_id]; + /* NOTE(review): the two std::cout statements below print unconditionally, unlike the VTR_LOGV_DEBUG call + * further down which is gated on f_router_debug. Looks like leftover debug output -- confirm. */ + std::cout << "route_and_decompose " << net_id << ", bbox: " << bb.xmin << "," << bb.ymin << "x" << bb.xmax << "," << bb.ymax << "\n"; + + for (int isink : isinks_to_route) { + /* Fill the necessary forms to route to this sink. */ + RRNodeId rr_sink = route_ctx.net_rr_terminals[net_id][isink]; + std::cout << "routing to " << rr_sink << ": " << rr_graph.node_xlow(rr_sink) << "," << rr_graph.node_ylow(rr_sink) << "\n"; + cost_params.criticality = pin_criticality[isink]; + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + enable_router_debug(ctx.router_opts, net_id, rr_sink, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of net %zu for decomposition\n", size_t(rr_sink), size_t(net_id)); + + /* Route to this sink. */ + NetResultFlags sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + net_id, + 0, /* itarget: only used for debug, so we can lie here */ + isink, + cost_params, + ctx.router_opts, + tree, + spatial_route_tree_lookup, + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[net_id], + ctx.is_flat, + route_ctx.route_bb[net_id], + num_sinks); + + if (!sink_flags.success) /* Couldn't route.
It's too much work to backtrack from here, just fail. Nothing routed so far is undone here. */ + return vtr::nullopt; + + /* Fill the required forms after routing a connection. */ + ++ctx.router_stats.local().connections_routed; + + /* Update the net delay for the sink we just routed */ + update_net_delay_from_isink(ctx.net_delay[net_id].data(), + tree, + isink, + ctx.net_list, + net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } + + if (ctx.router_opts.update_lower_bound_delays) { + for (int ipin : isinks_to_route) { + ctx.connections_inf.update_lower_bound_connection_delay(net_id, ipin, ctx.net_delay[net_id][ipin]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + /* Count reroutes with decomposition */ + ctx.decomp_retries[net_id]++; + + return make_decomposed_pair(net_id, node.cutline_pos, node.cutline_axis); +} + +/* Goes through all the sinks of this virtual net and copies their delay values from + * the route_tree to the net_delay array. The sinks are the ones returned by get_vnet_isinks(), + * i.e. every sink of the net inside the clipped bbox, not only the remaining ones. */ +template +static void update_net_delays_from_vnet(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.routing(); + std::vector sinks = get_vnet_isinks(vnet, ctx); + + for (int isink : sinks) { + update_net_delay_from_isink( + ctx.net_delay[vnet.net_id].data(), + *route_ctx.route_trees[vnet.net_id], + isink, + ctx.net_list, + vnet.net_id, + ctx.timing_info.get(), + ctx.pin_timing_invalidator); + } +} + +/** Get all "sink pin indices" for a given VirtualNet. We often work with that + * index, because it is used in a lot of lookups and is impossible to get back once + * converted to a ParentPinId or RRNodeId. + * @return The isinks in 1..num_sinks of vnet.net_id whose terminal RR node is inside vnet.clipped_bb. */ +template +std::vector get_vnet_isinks(const VirtualNet& vnet, RouteIterCtx& ctx) { + auto& route_ctx = g_vpr_ctx.routing(); + size_t num_sinks = ctx.net_list.net_sinks(vnet.net_id).size(); + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + /* 1-indexed. Yes, I know...
*/ + for (size_t isink = 1; isink <= num_sinks; ++isink) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + +/** Get all "remaining sink pin indices" for a given VirtualNet. For regular nets + * you can get it from the route tree, but we need to spatially filter it here. + * @return The route tree's remaining isinks whose terminal RR node is inside vnet.clipped_bb. */ +std::vector get_vnet_remaining_isinks(const VirtualNet& vnet) { + auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + std::vector out; /* The compiler should be smart enough to not copy this when returning */ + for (size_t isink : tree.get_remaining_isinks()) { + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + if (inside_bb(sink_rr, vnet.clipped_bb)) + out.push_back(isink); + } + return out; +} + +/** Route a VirtualNet, which is a portion of a net with a clipped bounding box + * and maybe a virtual source. + * Routes the remaining sinks inside the clipped bbox in decreasing order of criticality. + * @param vnet: Net id plus the clipped bounding box to route in. + * @param ctx: Routing iteration state (router options, per-thread routers and stats, timing data). + * @param level: Not referenced in this function body. + * @return Flags with success = false as soon as one sink fails to route (the function returns right away), + * success = true otherwise; retry_with_full_bb accumulates the per-sink flags. */ +template +NetResultFlags route_virtual_net(const VirtualNet& vnet, RouteIterCtx& ctx, int level) { + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + std::vector sinks = get_vnet_isinks(vnet, ctx); + NetResultFlags flags; + + VTR_ASSERT(route_ctx.route_trees[vnet.net_id]); + RouteTree& tree = route_ctx.route_trees[vnet.net_id].value(); + + /* Use num_sinks from the original tree to trigger high fanout code */ + bool high_fanout = is_high_fanout(tree.num_sinks(), ctx.router_opts.high_fanout_threshold); + + /* NOTE(review): thread_id is not referenced again in this function */ + auto thread_id = std::to_string(std::hash()(std::this_thread::get_id())); + + /* I think it's OK to build the full high fanout lookup for both sides of the net. + * The work required to get the right bounding box and nodes into the lookup may + * be more than to just build it twice.
*/ + SpatialRouteTreeLookup spatial_route_tree_lookup; + if (high_fanout) { + spatial_route_tree_lookup = build_route_tree_spatial_lookup(ctx.net_list, + route_ctx.route_bb, + vnet.net_id, + tree.root()); + } + + std::vector remaining_isinks = get_vnet_remaining_isinks(vnet); + + std::vector pin_criticality(tree.num_sinks() + 1); + + /* Compute the criticality of every remaining sink (the sort by decreasing criticality follows below) */ + for (int isink : remaining_isinks) { + if (ctx.timing_info) { + auto pin = ctx.net_list.net_pin(vnet.net_id, isink); + pin_criticality[isink] = get_net_pin_criticality( + ctx.timing_info, + ctx.netlist_pin_lookup, + ctx.router_opts.max_criticality, + ctx.router_opts.criticality_exp, + vnet.net_id, + pin, + ctx.is_flat); + + } else { + //No timing info, implies we want a min delay routing, so use criticality of 1. + pin_criticality[isink] = 1.; + } + } + + // compare the criticality of different sink nodes + sort(begin(remaining_isinks), end(remaining_isinks), [&](int a, int b) { + return pin_criticality[a] > pin_criticality[b]; + }); + + /* Update base costs according to fanout and criticality rules (TODO: I'm super sure this is not thread safe) */ + update_rr_base_costs(sinks.size()); + + /* Set up the tax forms for routing nets */ + t_conn_delay_budget conn_delay_budget; + t_conn_cost_params cost_params; + cost_params.astar_fac = ctx.router_opts.astar_fac; + cost_params.bend_cost = ctx.router_opts.bend_cost; + cost_params.pres_fac = ctx.pres_fac; + cost_params.delay_budget = ((ctx.budgeting_inf.if_set()) ?
&conn_delay_budget : nullptr); + + /* This isn't exactly thread safe, but here both threads routing this net would be setting this to the same value */ + if (ctx.budgeting_inf.if_set()) { + ctx.budgeting_inf.set_should_reroute(vnet.net_id, false); + } + + /* Route sinks in decreasing order of criticality */ + for (unsigned itarget = 0; itarget < remaining_isinks.size(); ++itarget) { + int isink = remaining_isinks[itarget]; + RRNodeId sink_rr = route_ctx.net_rr_terminals[vnet.net_id][isink]; + cost_params.criticality = pin_criticality[isink]; + + enable_router_debug(ctx.router_opts, vnet.net_id, sink_rr, ctx.itry, &ctx.routers.local()); + VTR_LOGV_DEBUG(f_router_debug, "Routing to sink %zu of decomposed net %zu, clipped bbox = %d,%d - %d,%d\n", + size_t(sink_rr), size_t(vnet.net_id), vnet.clipped_bb.xmin, vnet.clipped_bb.ymin, vnet.clipped_bb.xmax, vnet.clipped_bb.ymax); + + if (ctx.budgeting_inf.if_set()) { + conn_delay_budget.max_delay = ctx.budgeting_inf.get_max_delay_budget(vnet.net_id, isink); + conn_delay_budget.target_delay = ctx.budgeting_inf.get_delay_target(vnet.net_id, isink); + conn_delay_budget.min_delay = ctx.budgeting_inf.get_min_delay_budget(vnet.net_id, isink); + conn_delay_budget.short_path_criticality = ctx.budgeting_inf.get_crit_short_path(vnet.net_id, isink); + conn_delay_budget.routing_budgets_algorithm = ctx.router_opts.routing_budgets_algorithm; + } + + profiling::conn_start(); + + /* Unlike in route_and_decompose(), the bounding box passed is the clipped one and the sink count + * passed is the number of sinks of this virtual net */ + auto sink_flags = timing_driven_route_sink( + ctx.routers.local(), + ctx.net_list, + vnet.net_id, + itarget, + isink, + cost_params, + ctx.router_opts, + tree, + spatial_route_tree_lookup, + ctx.router_stats.local(), + ctx.budgeting_inf, + ctx.routing_predictor, + ctx.choking_spots[vnet.net_id], + ctx.is_flat, + vnet.clipped_bb, + sinks.size()); + + flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; + + if (!sink_flags.success) { + flags.success = false; + return flags; + } + + profiling::conn_finish(size_t(route_ctx.net_rr_terminals[vnet.net_id][0]), +
size_t(sink_rr), + pin_criticality[isink]); + + ++ctx.router_stats.local().connections_routed; + } // finished all sinks + + ++ctx.router_stats.local().nets_routed; + profiling::net_finish(); + + /* For later timing analysis. */ + update_net_delays_from_vnet(vnet, ctx); + + if (ctx.router_opts.update_lower_bound_delays) { + for (int isink : remaining_isinks) { + ctx.connections_inf.update_lower_bound_connection_delay(vnet.net_id, isink, ctx.net_delay[vnet.net_id][isink]); + } + } + + ctx.routers.local().empty_rcv_route_tree_set(); // ? + + flags.success = true; + return flags; +} + +/* Helper for decompose_route_partition_tree(). Routes the nets of this node (handing the nets that get + * decomposed down to both children as virtual nets), then the virtual nets stored on this node, and + * finally runs itself on both children as tasks of \p g. */ +template +void decompose_route_partition_tree_helper(tbb::task_group& g, + PartitionTreeNode& node, + RouteIterCtx& ctx, + int level) { + /* Sort so net with most sinks is routed first. */ + std::sort(node.nets.begin(), node.nets.end(), [&](const ParentNetId id1, const ParentNetId id2) -> bool { + return ctx.net_list.net_sinks(id1).size() > ctx.net_list.net_sinks(id2).size(); + }); + + node.is_routable = true; + node.rerouted_nets.clear(); + + vtr::Timer t; + + for (auto net_id : node.nets) { + /* Should I decompose this net?
*/ + if (should_decompose_net(net_id, level, node.cutline_pos, node.cutline_axis, ctx)) { + auto decomposed_nets = route_and_decompose(net_id, node, ctx); + if (decomposed_nets) { + /* NOTE(review): node.left / node.right are dereferenced without a null check in this function; + * presumably should_decompose_net() only accepts nodes that have children -- confirm. */ + auto& [left, right] = decomposed_nets.value(); + node.left->virtual_nets.push_back(left); + node.right->virtual_nets.push_back(right); + /* We changed the routing */ + node.rerouted_nets.push_back(net_id); + continue; /* We are done with this net */ + } + } + /* If not (or if route_and_decompose() returned nothing), route the whole net here */ + auto flags = try_parallel_route_net(net_id, ctx, level); + + if (!flags.success && !flags.retry_with_full_bb) { + node.is_routable = false; + } + if (flags.was_rerouted) { + node.rerouted_nets.push_back(net_id); + } + if (flags.retry_with_full_bb) { + ctx.nets_to_retry.push_back(net_id); + } + } + + /* Route the virtual nets stored on this node: they are probably parts of high fanout nets. + * Note that this loop runs after the node's own nets have been routed in the loop above. */ + for (const auto& vnet : node.virtual_nets) { + auto flags = route_virtual_net(vnet, ctx, level); + + /* Here, !flags.success probably means our cutline was too close to the sink side + * and no useful routing resources were added to the heap. Disable decomposition for this + * net.
(next time it won't become a virtual net, so no need to handle that case) */ + if (!flags.success && !flags.retry_with_full_bb) { + ctx.decomp_retries[vnet.net_id] = MAX_DECOMP_REROUTE; + } else if (flags.retry_with_full_bb) { + ctx.nets_to_retry.push_back(vnet.net_id); + } + } + + PartitionTreeDebug::log("Node with " + std::to_string(node.nets.size()) + + " nets and " + std::to_string(node.virtual_nets.size()) + + " virtual nets routed in " + std::to_string(t.elapsed_sec()) + + " s (level=" + std::to_string(level) + ")"); + + /* add left and right trees to task queue */ + if (node.left && node.right) { + /* Capture level by value. Otherwise both try to change the same "level" and garble it */ + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.left, ctx, level + 1); + }); + g.run([&, level]() { + decompose_route_partition_tree_helper(g, *node.right, ctx, level + 1); + }); + } else { + VTR_ASSERT(!node.left && !node.right); // there shouldn't be a node with a single branch + } +} + +/** Route all nets in parallel using the partitioning information in the PartitionTree. + * + * @param[in, out] g TBB task group to dispatch tasks. + * @param[in, out] tree The partition tree. Non-const reference because iteration results get written on the nodes. + * @param[in, out] ctx RouteIterCtx containing all the necessary bits of state for routing. + * @return RouteIterResults combined from all threads. + * + * See comments in PartitionTreeNode for how parallel routing works.
*/ +template +RouteIterResults decompose_route_partition_tree(tbb::task_group& g, + PartitionTree& tree, + RouteIterCtx& ctx) { + auto& device_ctx = g_vpr_ctx.device(); + auto& route_ctx = g_vpr_ctx.mutable_routing(); + + ctx.nets_to_retry.clear(); + + /* Route all nets: the helper handles the root node and dispatches the subtrees as tasks on g, + * so wait on g before reading any results */ + decompose_route_partition_tree_helper(g, tree.root(), ctx, 0); + g.wait(); + + /* Grow the bounding box to the whole device grid and set to not decompose if a net is set to retry */ + for (ParentNetId net_id : ctx.nets_to_retry) { + route_ctx.route_bb[net_id] = { + 0, + (int)(device_ctx.grid.width() - 1), + 0, + (int)(device_ctx.grid.height() - 1)}; + ctx.decomp_retries[net_id] = MAX_DECOMP_REROUTE; + } + + /* Collect results: reduce_partition_tree_helper() presumably gathers what was written on the tree nodes; + * the loop then merges the router stats of every thread into out.stats */ + RouteIterResults out; + reduce_partition_tree_helper(tree.root(), out); + for (auto& thread_stats : ctx.router_stats) { + update_router_stats(out.stats, thread_stats); + } + return out; +} + +/* Build a partition tree and do a net-decomposing route with it. + * The tree is built from ctx.net_list and the time spent building it is logged. */ +template +static RouteIterResults decompose_route_with_partition_tree(tbb::task_group& g, RouteIterCtx& ctx) { + vtr::Timer t2; + PartitionTree partition_tree(ctx.net_list); + float total_prep_time = t2.elapsed_sec(); + VTR_LOG("# Built partition tree in %f seconds\n", total_prep_time); + + return decompose_route_partition_tree(g, partition_tree, ctx); +} + #endif // VPR_USE_TBB diff --git a/vpr/src/route/route_timing.cpp b/vpr/src/route/route_timing.cpp index 9a2197ed3ca..bbda3361f64 100644 --- a/vpr/src/route/route_timing.cpp +++ b/vpr/src/route/route_timing.cpp @@ -65,43 +65,6 @@ static int num_routing_failed = 0; /******************** Subroutines local to route_timing.cpp ********************/ -/** Attempt to route a single sink (target_pin) in a net. - * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree - * for this net.
- * - * @param router The ConnectionRouter instance - * @param net_list Input netlist - * @param net_id - * @param itarget # of this connection in the net (only used for debug output) - * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) - * @param cost_params - * @param router_opts - * @param[in, out] tree RouteTree describing the current routing state - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes - * @param spatial_rt_lookup - * @param router_stats - * @param budgeting_inf - * @param routing_predictor - * @param choking_spots - * @param is_flat - * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ -template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - std::vector>& rt_node_of_sink, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat); - /** Return tuple of: * bool: Did we find a path for each sink in this net? * bool: Should the caller retry with a full-device bounding box? 
*/ @@ -118,44 +81,6 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo bool is_flat, bool can_grow_bb); -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - std::vector>& rt_node_of_sink, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets); - -static void update_net_delays_from_route_tree(float* net_delay, - const Netlist<>& net_list, - std::vector>& rt_node_of_sink, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator); - -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack); - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat); - -struct more_sinks_than { - const Netlist<>& net_list_; - more_sinks_than(const Netlist<>& net_list) - : net_list_(net_list) {} - inline bool operator()(const ParentNetId& net_index1, const ParentNetId& net_index2) { - return net_list_.net_sinks(net_index1).size() > net_list_.net_sinks(net_index2).size(); - } -}; - -static bool is_high_fanout(int fanout, int fanout_threshold); - // The reason that try_timing_driven_route_tmpl (and descendents) are being // templated over is because using a virtual interface instead fully templating // the router results in a 5% runtime increase. @@ -256,7 +181,10 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, //sort so net with most sinks is routed first. 
auto sorted_nets = std::vector(net_list.nets().begin(), net_list.nets().end()); - std::sort(sorted_nets.begin(), sorted_nets.end(), more_sinks_than(net_list)); + + std::sort(sorted_nets.begin(), sorted_nets.end(), [&](const ParentNetId id1, const ParentNetId id2) -> bool { + return net_list.net_sinks(id1).size() > net_list.net_sinks(id2).size(); + }); /* * Configure the routing predictor @@ -416,7 +344,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, RouterStats router_stats; init_router_stats(router_stats); - timing_driven_route_structs route_structs(net_list); float prev_iter_cumm_time = 0; vtr::Timer iteration_timer; int num_net_bounding_boxes_updated = 0; @@ -468,8 +395,6 @@ bool try_timing_driven_route_tmpl(const Netlist<>& net_list, router_opts, connections_inf, router_iteration_stats, - route_structs.pin_criticality, - route_structs.rt_node_of_sink, net_delay, netlist_pin_lookup, route_timing_info, @@ -867,8 +792,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, - std::vector>& rt_node_of_sink, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -882,8 +805,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, NetResultFlags flags; - connections_inf.prepare_routing_for_net(net_id); - bool reroute_for_hold = false; if (budgeting_inf.if_set()) { reroute_for_hold = (budgeting_inf.get_should_reroute(net_id)); @@ -908,8 +829,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, router_opts, connections_inf, router_stats, - pin_criticality, - rt_node_of_sink, net_delay[net_id].data(), netlist_pin_lookup, timing_info, @@ -954,8 +873,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& 
pin_criticality, - std::vector>& rt_node_of_sink, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -982,7 +899,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, num_sinks, router_opts.min_incremental_reroute_fanout, connections_inf, - rt_node_of_sink, router_opts, check_hold(router_opts, worst_neg_slack)); @@ -1001,20 +917,22 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, // after this point the route tree is correct // remaining_targets from this point on are the **pin indices** that have yet to be routed - auto& remaining_targets = connections_inf.get_remaining_targets(net_id); + std::vector remaining_targets(tree.get_remaining_isinks().begin(), tree.get_remaining_isinks().end()); + + std::vector pin_criticality(num_sinks + 1); // calculate criticality of remaining target pins for (int ipin : remaining_targets) { if (timing_info) { auto pin = net_list.net_pin(net_id, ipin); - pin_criticality[ipin] = get_net_pin_criticality(timing_info, - netlist_pin_lookup, - router_opts.max_criticality, - router_opts.criticality_exp, - net_id, - pin, - is_flat); - + pin_criticality[ipin] = get_net_pin_criticality( + timing_info, + netlist_pin_lookup, + router_opts.max_criticality, + router_opts.criticality_exp, + net_id, + pin, + is_flat); } else { //No timing info, implies we want a min delay routing, so use criticality of 1. 
pin_criticality[ipin] = 1.; @@ -1022,7 +940,7 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, } // compare the criticality of different sink nodes - sort(begin(remaining_targets), end(remaining_targets), [&](int a, int b) { + std::sort(remaining_targets.begin(), remaining_targets.end(), [&](int a, int b) { return pin_criticality[a] > pin_criticality[b]; }); @@ -1054,17 +972,18 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, * routers handle this in the same way */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); - std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root(router, - net_id, - net_list, - sink_node, - cost_params, - router_opts.high_fanout_threshold, - tree, - spatial_route_tree_lookup, - router_stats, - is_flat, - can_grow_bb); + std::tie(flags.success, flags.retry_with_full_bb) = timing_driven_pre_route_to_clock_root( + router, + net_id, + net_list, + sink_node, + cost_params, + router_opts.high_fanout_threshold, + tree, + spatial_route_tree_lookup, + router_stats, + is_flat, + can_grow_bb); return flags; } @@ -1094,21 +1013,23 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, profiling::conn_start(); // build a branch in the route tree to the target - auto sink_flags = timing_driven_route_sink(router, - net_list, - net_id, - itarget, - target_pin, - cost_params, - router_opts, - tree, - rt_node_of_sink, - spatial_route_tree_lookup, - router_stats, - budgeting_inf, - routing_predictor, - choking_spots, - is_flat); + auto sink_flags = timing_driven_route_sink( + router, + net_list, + net_id, + itarget, + target_pin, + cost_params, + router_opts, + tree, + spatial_route_tree_lookup, + router_stats, + budgeting_inf, + routing_predictor, + choking_spots, + is_flat, + route_ctx.route_bb[net_id], + num_sinks); flags.retry_with_full_bb |= sink_flags.retry_with_full_bb; @@ -1132,7 +1053,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, 
// may have to update timing delay of the previously legally reached sinks since downstream capacitance could be changed update_net_delays_from_route_tree(net_delay, net_list, - rt_node_of_sink, net_id, timing_info.get(), pin_timing_invalidator); @@ -1185,7 +1105,8 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo std::unordered_map()); std::tie(found_path, retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -1241,21 +1162,22 @@ static std::tuple timing_driven_pre_route_to_clock_root(ConnectionRo } template -static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, - const Netlist<>& net_list, - ParentNetId net_id, - unsigned itarget, - int target_pin, - const t_conn_cost_params cost_params, - const t_router_opts& router_opts, - RouteTree& tree, - std::vector>& rt_node_of_sink, - SpatialRouteTreeLookup& spatial_rt_lookup, - RouterStats& router_stats, - route_budgets& budgeting_inf, - const RoutingPredictor& routing_predictor, - const std::vector>& choking_spots, - bool is_flat) { +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup& spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box, + size_t num_sinks) { const auto& device_ctx = g_vpr_ctx.device(); auto& route_ctx = g_vpr_ctx.mutable_routing(); @@ -1270,14 +1192,13 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, bool found_path; t_heap cheapest; - t_bb bounding_box = route_ctx.route_bb[net_id]; /* Is the connection router allowed to grow the bounding box? 
That's not the case * when routing in parallel, so disallow it. */ bool can_grow_bb = (router_opts.router_algorithm != PARALLEL); bool net_is_global = net_list.net_is_global(net_id); - bool high_fanout = is_high_fanout(net_list.net_sinks(net_id).size(), router_opts.high_fanout_threshold); + bool high_fanout = is_high_fanout(num_sinks, router_opts.high_fanout_threshold); constexpr float HIGH_FANOUT_CRITICALITY_THRESHOLD = 0.9; bool sink_critical = (cost_params.criticality > HIGH_FANOUT_CRITICALITY_THRESHOLD); bool net_is_clock = route_ctx.is_clock_net[net_id] != 0; @@ -1289,22 +1210,26 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, //However, if the current sink is 'critical' from a timing perspective, we put the entire route tree back onto //the heap to ensure it has more flexibility to find the best path. if (high_fanout && !sink_critical && !net_is_global && !net_is_clock && -routing_predictor.get_slope() > router_opts.high_fanout_max_slope) { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout(tree.root(), - sink_node, - cost_params, - bounding_box, - spatial_rt_lookup, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree_high_fanout( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + spatial_rt_lookup, + router_stats, + conn_params, + can_grow_bb); } else { - std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree(tree.root(), - sink_node, - cost_params, - bounding_box, - router_stats, - conn_params, - can_grow_bb); + std::tie(found_path, flags.retry_with_full_bb, cheapest) = router.timing_driven_route_connection_from_route_tree( + tree, + tree.root().inode, + sink_node, + cost_params, + bounding_box, + router_stats, + conn_params, + can_grow_bb); } if (!found_path) { @@ 
-1343,8 +1268,6 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, } } - rt_node_of_sink[target_pin] = new_sink; - /* update global occupancy from the new branch */ if (new_branch) pathfinder_update_cost_from_route_tree(new_branch.value(), 1); @@ -1358,21 +1281,14 @@ static NetResultFlags timing_driven_route_sink(ConnectionRouter& router, return flags; } -static void setup_routing_resources(int itry, - ParentNetId net_id, - const Netlist<>& net_list, - unsigned num_sinks, - int min_incremental_reroute_fanout, - CBRR& connections_inf, - std::vector>& rt_node_of_sink, - const t_router_opts& router_opts, - bool ripup_high_fanout_nets) { - /* Build and return a partial route tree from the legal connections from last iteration. - * along the way do: - * update pathfinder costs to be accurate to the partial route tree - * find and store the pins that still need to be reached in incremental_rerouting_resources.remaining_targets - * find and store the rt nodes that have been reached in incremental_rerouting_resources.reached_rt_sinks - * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets) { auto& route_ctx = g_vpr_ctx.mutable_routing(); /* "tree" points to this net's spot in the global context here, so re-initializing it etc. 
changes the global state */ @@ -1385,14 +1301,13 @@ static void setup_routing_resources(int itry, /* rip up the whole net */ if (tree) - pathfinder_update_cost_from_route_tree(tree.value().root(), -1); + pathfinder_update_cost_from_route_tree(tree->root(), -1); tree = vtr::nullopt; /* re-initialize net */ tree = RouteTree(net_id); + pathfinder_update_cost_from_route_tree(tree->root(), 1); - for (unsigned int sink_pin = 1; sink_pin <= num_sinks; ++sink_pin) - connections_inf.toreach_rr_sink(net_id, sink_pin); // since all connections will be rerouted for this net, clear all of net's forced reroute flags connections_inf.clear_force_reroute_for_net(net_id); @@ -1401,16 +1316,15 @@ static void setup_routing_resources(int itry, // of their versions that act on node indices directly like mark_remaining_ends mark_ends(net_list, net_id); } else { - auto& reached_sinks = connections_inf.get_reached_rt_sinks(net_id); - auto& remaining_targets = connections_inf.get_remaining_targets(net_id); - profiling::net_rebuild_start(); - if (!tree) + if (!tree) { tree = RouteTree(net_id); + pathfinder_update_cost_from_route_tree(tree->root(), 1); + } /* copy the existing routing - * prune_route_tree depends on global occ, so we can't subtract before pruning + * prune() depends on global occ, so we can't subtract before pruning * OPT: to skip this copy, return a "diff" from RouteTree::prune */ RouteTree tree2 = tree.value(); @@ -1435,19 +1349,12 @@ static void setup_routing_resources(int itry, // Initialize only to source tree = RouteTree(net_id); + pathfinder_update_cost_from_route_tree(tree->root(), 1); } - VTR_ASSERT(reached_sinks.size() + remaining_targets.size() == num_sinks); - - // give lookup on the reached sinks - for (RRNodeId sink_rr_node : reached_sinks) { - auto& sink_node = tree.value().find_by_rr_id(sink_rr_node).value(); - rt_node_of_sink[sink_node.net_pin_index] = sink_node; - } + profiling::net_rebuild_end(num_sinks, tree->get_remaining_isinks().size()); - 
profiling::net_rebuild_end(num_sinks, remaining_targets.size()); - - // still need to calculate the tree's time delay (0 Tarrival means from SOURCE) + // still need to calculate the tree's time delay tree.value().reload_timing(); // check for R_upstream C_downstream and edge correctness @@ -1457,7 +1364,7 @@ static void setup_routing_resources(int itry, VTR_ASSERT_SAFE(tree.value().is_uncongested()); // mark remaining ends - mark_remaining_ends(net_id, remaining_targets); + mark_remaining_ends(net_id); // mark the lookup (rr_node_route_inf) for existing tree elements as NO_PREVIOUS so add_to_path stops when it reaches one of them update_rr_route_inf_from_tree(tree.value().root()); @@ -1466,7 +1373,8 @@ static void setup_routing_resources(int itry, // completed constructing the partial route tree and updated all other data structures to match } -/** Change the base costs of rr_nodes according to # of fanouts */ +/** Change the base costs of rr_nodes according to # of fanouts + * TODO: Doesn't seem very thread safe? 
*/ void update_rr_base_costs(int fanout) { auto& device_ctx = g_vpr_ctx.mutable_device(); @@ -1538,27 +1446,20 @@ bool timing_driven_check_net_delays(const Netlist<>& net_list, NetPinsMatrix& net_list, - std::vector>& rt_node_of_sink, - ParentNetId inet, - TimingInfo* timing_info, - NetPinTimingInvalidator* pin_timing_invalidator) { - for (unsigned int isink = 1; isink < net_list.net_pins(inet).size(); isink++) { - float new_delay = rt_node_of_sink[isink]->Tdel; - - if (pin_timing_invalidator && new_delay != net_delay[isink]) { - //Delay changed, invalidate for incremental timing update - VTR_ASSERT_SAFE(timing_info); - ParentPinId pin = net_list.net_pin(inet, isink); - pin_timing_invalidator->invalidate_connection(pin, timing_info); - } +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator) { + auto& route_ctx = g_vpr_ctx.routing(); + const RouteTree& tree = route_ctx.route_trees[inet].value(); - net_delay[isink] = new_delay; + for (unsigned int isink = 1; isink < net_list.net_pins(inet).size(); isink++) { + update_net_delay_from_isink(net_delay, tree, isink, net_list, inet, timing_info, pin_timing_invalidator); } } -/* Detect if net should be routed or not */ +/** Detect if \p net_id should be routed or not. 
*/ bool should_route_net(ParentNetId net_id, CBRR& connections_inf, bool if_force_reroute) { @@ -1571,8 +1472,10 @@ bool should_route_net(ParentNetId net_id, return true; } + const RouteTree& tree = route_ctx.route_trees[net_id].value(); + /* Walk over all rt_nodes in the net */ - for (auto& rt_node : route_ctx.route_trees[net_id]->all_nodes()) { + for (auto& rt_node : tree.all_nodes()) { RRNodeId inode = rt_node.inode; int occ = route_ctx.rr_node_route_inf[inode].occ(); int capacity = rr_graph.node_capacity(inode); @@ -1591,9 +1494,11 @@ bool should_route_net(ParentNetId net_id, } } - VTR_ASSERT(connections_inf.get_remaining_targets(net_id).empty()); + /* If all sinks have been routed to without overuse, no need to route this */ + if (tree.get_remaining_isinks().empty()) + return false; - return false; /* Current route has no overuse */ + return true; } bool early_exit_heuristic(const t_router_opts& router_opts, const WirelengthInfo& wirelength_info) { @@ -1606,25 +1511,13 @@ bool early_exit_heuristic(const t_router_opts& router_opts, const WirelengthInfo return false; } -static bool check_hold(const t_router_opts& router_opts, float worst_neg_slack) { - /* When RCV is enabled, it's necessary to be able to completely ripup high fanout nets if there is still negative hold slack - * Normally the router will prune the illegal branches of high fanout nets, this will bypass this */ - - if (router_opts.routing_budgets_algorithm != YOYO) { - return false; - } else if (worst_neg_slack != 0) { - return true; - } - return false; -} - -static float get_net_pin_criticality(const std::shared_ptr timing_info, - const ClusteredPinAtomPinsLookup& netlist_pin_lookup, - float max_criticality, - float criticality_exp, - ParentNetId net_id, - ParentPinId pin_id, - bool is_flat) { +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId 
pin_id, + bool is_flat) { float pin_criticality = 0.0; const auto& route_ctx = g_vpr_ctx.routing(); @@ -1819,12 +1712,6 @@ void print_overused_nodes_status(const t_router_opts& router_opts, const Overuse VTR_LOG("\n"); } -//Returns true if the specified net fanout is classified as high fanout -static bool is_high_fanout(int fanout, int fanout_threshold) { - if (fanout_threshold < 0 || fanout < fanout_threshold) return false; - return true; -} - // In heavily congested designs a static bounding box (BB) can // become problematic for routability (it effectively enforces a // hard blockage restricting where a net can route). @@ -1873,6 +1760,8 @@ size_t dynamic_update_bounding_boxes(const std::vector& updated_net for (ParentNetId net : updated_nets) { if (!route_ctx.route_trees[net]) continue; // Skip if no routing + if (!route_ctx.net_status.is_routed(net)) + continue; //We do not adjust the bounding boxes of high fanout nets, since they //use different bounding boxes based on the target location. 
@@ -2100,7 +1989,6 @@ void prune_unused_non_configurable_nets(CBRR& connections_inf, continue; RouteTree& tree = route_ctx.route_trees[net_id].value(); - connections_inf.prepare_routing_for_net(net_id); connections_inf.clear_force_reroute_for_net(net_id); std::vector usage = tree.get_non_config_node_set_usage(); diff --git a/vpr/src/route/route_timing.h b/vpr/src/route/route_timing.h index bccf9ba2c84..2595bf28074 100644 --- a/vpr/src/route/route_timing.h +++ b/vpr/src/route/route_timing.h @@ -21,7 +21,6 @@ extern bool f_router_debug; -/** TODO: remove timing_driven_route_structs together with this fn */ int get_max_pins_per_net(const Netlist<>& net_list); /** Types and defines common to timing_driven and parallel routers */ @@ -62,26 +61,6 @@ struct RoutingMetrics { tatum::TimingPathInfo critical_path; }; -/* Data while timing driven route is active */ -class timing_driven_route_structs { - public: - std::vector pin_criticality; /* [1..max_pins_per_net-1] */ - std::vector sink_order; /* [1..max_pins_per_net-1] */ - std::vector> rt_node_of_sink; /* [1..max_pins_per_net-1] */ - - timing_driven_route_structs(const Netlist<>& net_list) { - int max_sinks = std::max(get_max_pins_per_net(net_list) - 1, 0); - pin_criticality.resize(max_sinks + 1); - sink_order.resize(max_sinks + 1); - rt_node_of_sink.resize(max_sinks + 1); - - /* Set element 0 to invalid values */ - pin_criticality[0] = std::numeric_limits::quiet_NaN(); - sink_order[0] = -1; - rt_node_of_sink[0] = vtr::nullopt; - } -}; - /** Returns the bounding box of a net's used routing resources */ t_bb calc_current_bb(const RouteTree& tree); @@ -115,6 +94,12 @@ void generate_route_timing_reports(const t_router_opts& router_opts, const RoutingDelayCalculator& delay_calc, bool is_flat); +/** Returns true if the specified net fanout is classified as high fanout. 
*/ +inline bool is_high_fanout(int fanout, int fanout_threshold) { + if (fanout_threshold < 0 || fanout < fanout_threshold) return false; + return true; +} + /** Initialize net_delay based on best-case delay estimates from the router lookahead. */ void init_net_delay_from_lookahead(const RouterLookahead& router_lookahead, const Netlist<>& net_list, @@ -202,6 +187,68 @@ bool try_timing_driven_route(const Netlist<>& net_list, ScreenUpdatePriority first_iteration_priority, bool is_flat); +/** Calculate pin criticality for \p pin_id of \p net_id. */ +float get_net_pin_criticality(const std::shared_ptr timing_info, + const ClusteredPinAtomPinsLookup& netlist_pin_lookup, + float max_criticality, + float criticality_exp, + ParentNetId net_id, + ParentPinId pin_id, + bool is_flat); + +/** Build and return a partial route tree from the legal connections from last iteration. + * along the way do: + * update pathfinder costs to be accurate to the partial route tree + * find and store the pins that still need to be reached in connections_inf.remaining_targets + * find and store the rt nodes that have been reached in connections_inf.reached_rt_sinks + * mark the rr_node sinks as targets to be reached. */ +void setup_routing_resources(int itry, + ParentNetId net_id, + const Netlist<>& net_list, + unsigned num_sinks, + int min_incremental_reroute_fanout, + CBRR& connections_inf, + const t_router_opts& router_opts, + bool ripup_high_fanout_nets); + +/** Attempt to route a single sink (target_pin) in a net. + * In the process, update global pathfinder costs, rr_node_route_inf and extend the global RouteTree + * for this net. + * + * @param router The ConnectionRouter instance + * @param net_list Input netlist + * @param net_id + * @param itarget # of this connection in the net (only used for debug output) + * @param target_pin # of this sink in the net (TODO: is it the same thing as itarget?) 
+ * @param cost_params + * @param router_opts + * @param[in, out] tree RouteTree describing the current routing state + * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes + * @param spatial_rt_lookup + * @param router_stats + * @param budgeting_inf + * @param routing_predictor + * @param choking_spots + * @param is_flat + * @return NetResultFlags for this sink to be bubbled up through timing_driven_route_net */ +template +NetResultFlags timing_driven_route_sink(ConnectionRouter& router, + const Netlist<>& net_list, + ParentNetId net_id, + unsigned itarget, + int target_pin, + const t_conn_cost_params cost_params, + const t_router_opts& router_opts, + RouteTree& tree, + SpatialRouteTreeLookup& spatial_rt_lookup, + RouterStats& router_stats, + route_budgets& budgeting_inf, + const RoutingPredictor& routing_predictor, + const std::vector>& choking_spots, + bool is_flat, + const t_bb& bounding_box, + size_t num_sinks); + /** Attempt to route a single net. 
* * @param router The ConnectionRouter instance @@ -213,7 +260,6 @@ bool try_timing_driven_route(const Netlist<>& net_list, * @param connections_inf * @param router_stats * @param pin_criticality - * @param rt_node_of_sink Lookup from target_pin-like indices (indicating SINK nodes) to RouteTreeNodes * @param net_delay * @param netlist_pin_lookup * @param timing_info @@ -233,8 +279,6 @@ NetResultFlags timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, - std::vector>& rt_node_of_sink, float* net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -254,8 +298,6 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const t_router_opts& router_opts, CBRR& connections_inf, RouterStats& router_stats, - std::vector& pin_criticality, - std::vector>& rt_node_of_sink, NetPinsMatrix& net_delay, const ClusteredPinAtomPinsLookup& netlist_pin_lookup, std::shared_ptr timing_info, @@ -266,6 +308,35 @@ NetResultFlags try_timing_driven_route_net(ConnectionRouter& router, const std::vector>& choking_spots, bool is_flat); +/** Update net_delay value for a single sink in a RouteTree. */ +inline void update_net_delay_from_isink(float* net_delay, + const RouteTree& tree, + int isink, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator) { + float new_delay = tree.find_by_isink(isink)->Tdel; + + if (pin_timing_invalidator && new_delay != net_delay[isink]) { + //Delay changed, invalidate for incremental timing update + VTR_ASSERT_SAFE(timing_info); + ParentPinId pin = net_list.net_pin(inet, isink); + pin_timing_invalidator->invalidate_connection(pin, timing_info); + } + + net_delay[isink] = new_delay; +} + +/* Goes through all the sinks of this net and copies their delay values from + * the route_tree to the net_delay array. 
*/ +void update_net_delays_from_route_tree(float* net_delay, + const Netlist<>& net_list, + ParentNetId inet, + TimingInfo* timing_info, + NetPinTimingInvalidator* pin_timing_invalidator); + +/** Combine \p router_iteration_stats into \p router_stats. */ void update_router_stats(RouterStats& router_stats, RouterStats& router_iteration_stats); #ifndef NO_GRAPHICS diff --git a/vpr/src/route/route_tree.cpp b/vpr/src/route/route_tree.cpp index 82e929e3ebc..36f37461527 100644 --- a/vpr/src/route/route_tree.cpp +++ b/vpr/src/route/route_tree.cpp @@ -78,10 +78,15 @@ RouteTree::RouteTree(RRNodeId _inode) { RouteTree::RouteTree(ParentNetId _inet) { auto& route_ctx = g_vpr_ctx.routing(); + RRNodeId inode = RRNodeId(route_ctx.net_rr_terminals[_inet][0]); _root = new RouteTreeNode(inode, RRSwitchId::INVALID(), nullptr); _net_id = _inet; _rr_node_to_rt_node[inode] = _root; + + _num_sinks = route_ctx.net_rr_terminals[_inet].size() - 1; + _isink_to_rt_node.resize(_num_sinks); /* 0-indexed */ + _is_isink_reached.resize(_num_sinks + 1); /* 1-indexed */ } /** Make a copy of rhs and return it. @@ -105,43 +110,66 @@ void RouteTree::copy_tree_x(RouteTreeNode* lhs, const RouteTreeNode& rhs) { /* Copy constructor */ RouteTree::RouteTree(const RouteTree& rhs) { - _root = copy_tree(rhs._root); + _isink_to_rt_node.resize(rhs._isink_to_rt_node.size()); _net_id = rhs._net_id; + _root = copy_tree(rhs._root); + _is_isink_reached = rhs._is_isink_reached; + _num_sinks = rhs._num_sinks; } /* Move constructor: * Take over rhs' linked list & set it to null so it doesn't get freed. - * Refs should stay valid after this? */ + * Refs should stay valid after this? 
+ * I don't think there's a user crazy enough to move around route trees + * from multiple threads, but better safe than sorry */ RouteTree::RouteTree(RouteTree&& rhs) { + std::unique_lock rhs_write_lock(rhs._write_mutex); _root = rhs._root; _net_id = rhs._net_id; rhs._root = nullptr; _rr_node_to_rt_node = std::move(rhs._rr_node_to_rt_node); + _isink_to_rt_node = std::move(rhs._isink_to_rt_node); + _is_isink_reached = std::move(rhs._is_isink_reached); + _num_sinks = rhs._num_sinks; } /* Copy assignment: free list, clear lookup, reload list. */ RouteTree& RouteTree::operator=(const RouteTree& rhs) { if (this == &rhs) return *this; + std::unique_lock write_lock(_write_mutex); free_list(_root); _rr_node_to_rt_node.clear(); - _root = copy_tree(rhs._root); + _isink_to_rt_node.clear(); + _isink_to_rt_node.resize(rhs._isink_to_rt_node.size()); _net_id = rhs._net_id; + _root = copy_tree(rhs._root); + _is_isink_reached = rhs._is_isink_reached; + _num_sinks = rhs._num_sinks; return *this; } /* Move assignment: * Free my list, take over rhs' linked list & set it to null so it doesn't get freed. * Also ~steal~ acquire ownership of node lookup from rhs. - * Refs should stay valid after this? */ + * Refs should stay valid after this? 
+ * I don't think there's a user crazy enough to move around route trees + * from multiple threads, but better safe than sorry */ RouteTree& RouteTree::operator=(RouteTree&& rhs) { if (this == &rhs) return *this; + /* See https://stackoverflow.com/a/29988626 */ + std::unique_lock write_lock(_write_mutex, std::defer_lock); + std::unique_lock rhs_write_lock(rhs._write_mutex, std::defer_lock); + std::lock(write_lock, rhs_write_lock); free_list(_root); _root = rhs._root; _net_id = rhs._net_id; rhs._root = nullptr; _rr_node_to_rt_node = std::move(rhs._rr_node_to_rt_node); + _isink_to_rt_node = std::move(rhs._isink_to_rt_node); + _is_isink_reached = std::move(rhs._is_isink_reached); + _num_sinks = rhs._num_sinks; return *this; } @@ -149,6 +177,11 @@ RouteTree& RouteTree::operator=(RouteTree&& rhs) { * Can take a RouteTreeNode& to do an incremental update. * Note that update_from_heap already calls this. */ void RouteTree::reload_timing(vtr::optional from_node) { + std::unique_lock write_lock(_write_mutex); + reload_timing_unlocked(from_node); +} + +void RouteTree::reload_timing_unlocked(vtr::optional from_node) { auto& device_ctx = g_vpr_ctx.device(); const auto& rr_graph = device_ctx.rr_graph; @@ -449,8 +482,8 @@ void RouteTree::print(void) const { * RouteTreeNode of the SINK it adds to the routing. */ std::tuple, vtr::optional> RouteTree::update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat) { - auto& device_ctx = g_vpr_ctx.device(); - const auto& rr_graph = device_ctx.rr_graph; + /* Lock the route tree for writing. 
At least on Linux this shouldn't have an impact on single-threaded code */ + std::unique_lock write_lock(_write_mutex); //Create a new subtree from the target in hptr to existing routing vtr::optional start_of_new_subtree_rt_node, sink_rt_node; @@ -460,19 +493,14 @@ RouteTree::update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRoute return {vtr::nullopt, *sink_rt_node}; /* Reload timing values */ - reload_timing(start_of_new_subtree_rt_node); + reload_timing_unlocked(start_of_new_subtree_rt_node); if (spatial_rt_lookup) { update_route_tree_spatial_lookup_recur(*start_of_new_subtree_rt_node, *spatial_rt_lookup); } - /* if the new branch is the only child of its parent and the parent is a SOURCE, - * it is the first time we are creating this tree, so include the parent in the new branch return - * so that it can be included in occupancy calculation. - * TODO: probably this should be cleaner */ - RouteTreeNode* parent = start_of_new_subtree_rt_node->_parent; - if (start_of_new_subtree_rt_node->_next_sibling == parent->_subtree_end && rr_graph.node_type(parent->inode) == SOURCE) - return {*parent, *sink_rt_node}; + if (_net_id.is_valid()) /* We don't have this lookup if the tree isn't associated with a net */ + _is_isink_reached[target_net_pin_index] = true; return {*start_of_new_subtree_rt_node, *sink_rt_node}; } @@ -527,7 +555,6 @@ RouteTree::add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is * Walk through new_branch_iswitches and corresponding new_branch_inodes. 
*/ for (int i = new_branch_inodes.size() - 1; i >= 0; i--) { RouteTreeNode* new_node = new RouteTreeNode(new_branch_inodes[i], new_branch_iswitches[i], last_node); - add_node(last_node, new_node); e_rr_type node_type = rr_graph.node_type(new_branch_inodes[i]); // If is_flat is enabled, IPINs should be added, since they are used for intra-cluster routing @@ -540,6 +567,8 @@ RouteTree::add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is new_node->re_expand = true; } + add_node(last_node, new_node); + last_node = new_node; main_branch_visited.insert(new_branch_inodes[i]); @@ -604,6 +633,8 @@ void RouteTree::add_non_configurable_nodes(RouteTreeNode* rt_node, /** Prune a route tree of illegal branches - when there is at least 1 congested node on the path to a sink * Returns nullopt if the entire tree has been pruned. + * Updates "is_isink_reached" lookup! After prune(), if a sink is marked as reached in the lookup, it is reached + * legally. * * Note: does not update R_upstream/C_downstream */ vtr::optional @@ -612,6 +643,8 @@ RouteTree::prune(CBRR& connections_inf, std::vector* non_config_node_set_us const auto& rr_graph = device_ctx.rr_graph; auto& route_ctx = g_vpr_ctx.routing(); + std::unique_lock write_lock(_write_mutex); + VTR_ASSERT_MSG(rr_graph.node_type(root().inode) == SOURCE, "Root of route tree must be SOURCE"); VTR_ASSERT_MSG(_net_id, "RouteTree must be constructed using a ParentNetId"); @@ -675,14 +708,12 @@ RouteTree::prune_x(RouteTreeNode& rt_node, CBRR& connections_inf, bool force_pru if (!force_prune) { //Valid path to sink - //Record sink as reachable - connections_inf.reached_rt_sink(_net_id, rt_node.inode); - + //Record sink as reached + _is_isink_reached[rt_node.net_pin_index] = true; return rt_node; // Not pruned } else { //Record as not reached - connections_inf.toreach_rr_sink(_net_id, rt_node.net_pin_index); - + _is_isink_reached[rt_node.net_pin_index] = false; return vtr::nullopt; // Pruned } } else if (all_children_pruned) { 
@@ -789,6 +820,7 @@ RouteTree::prune_x(RouteTreeNode& rt_node, CBRR& connections_inf, bool force_pru * This is used after routing a clock net. * TODO: is this function doing anything? Try running without it */ void RouteTree::freeze(void) { + std::unique_lock write_lock(_write_mutex); return freeze_x(*_root); } diff --git a/vpr/src/route/route_tree.h b/vpr/src/route/route_tree.h index 9d2200d2696..63eebf555ea 100644 --- a/vpr/src/route/route_tree.h +++ b/vpr/src/route/route_tree.h @@ -83,6 +83,7 @@ #include #include #include +#include #include #include "connection_based_routing_fwd.h" @@ -90,6 +91,8 @@ #include "vtr_assert.h" #include "spatial_route_tree_lookup.h" #include "vtr_optional.h" +#include "vtr_range.h" +#include "vtr_vec_id_set.h" /** * @brief A single route tree node @@ -341,6 +344,7 @@ class RouteTree { RouteTree(ParentNetId inet); ~RouteTree() { + std::unique_lock write_lock(_write_mutex); free_list(_root); } @@ -349,19 +353,34 @@ class RouteTree { * is the heap pointer of the SINK that was reached, and target_net_pin_index * is the net pin index corresponding to the SINK that was reached. This routine * returns a tuple: RouteTreeNode of the branch it adds to the route tree and - * RouteTreeNode of the SINK it adds to the routing. */ + * RouteTreeNode of the SINK it adds to the routing. + * Locking operation: only one thread can update_from_heap() a RouteTree at a time. */ std::tuple, vtr::optional> update_from_heap(t_heap* hptr, int target_net_pin_index, SpatialRouteTreeLookup* spatial_rt_lookup, bool is_flat); /** Reload timing values (R_upstream, C_downstream, Tdel). * Can take a RouteTreeNode& to do an incremental update. - * Note that update_from_heap already does this, but prune() doesn't */ + * Note that update_from_heap already does this, but prune() doesn't. + * Locking operation: only one thread can reload_timing() for a RouteTree at a time. 
*/ void reload_timing(vtr::optional from_node = vtr::nullopt); /** Get the RouteTreeNode corresponding to the RRNodeId. Returns nullopt if not found. - * SINK nodes may be added to the tree multiple times. In that case, this will return the last one added. */ + * SINK nodes may be added to the tree multiple times. In that case, this will return the last one added. + * Use find_by_isink for a more accurate lookup. */ vtr::optional find_by_rr_id(RRNodeId rr_node) const; + /** Get the sink RouteTreeNode associated with the isink. + * Will probably segfault if the tree is not constructed with a ParentNetId. */ + inline vtr::optional find_by_isink(int isink) const { + RouteTreeNode* x = _isink_to_rt_node[isink - 1]; + return x ? vtr::optional(*x) : vtr::nullopt; + } + + /** Get the number of sinks in associated net. */ + constexpr size_t num_sinks(void) const { + return _num_sinks; + } + /** Check the consistency of this route tree. Looks for: * - invalid parent-child links * - invalid timing values @@ -378,12 +397,14 @@ class RouteTree { /** Prune overused nodes from the tree. * Also prune unused non-configurable nodes if non_config_node_set_usage is provided (see get_non_config_node_set_usage) - * Returns nullopt if the entire tree is pruned. */ + * Returns nullopt if the entire tree is pruned. + * Locking operation: only one thread can prune() a RouteTree at a time. */ vtr::optional prune(CBRR& connections_inf, std::vector* non_config_node_set_usage = nullptr); /** Remove all sinks and mark the remaining nodes as un-expandable. * This is used after routing a clock net. - * TODO: is this function doing anything? Try running without it */ + * TODO: is this function doing anything? Try running without it + * Locking operation: only one thread can freeze() a RouteTree at a time. */ void freeze(void); /** Count configurable edges to non-configurable node sets. 
(rr_nonconf_node_sets index -> int) @@ -400,6 +421,71 @@ class RouteTree { /** Get a reference to the root RouteTreeNode. */ constexpr const RouteTreeNode& root(void) const { return *_root; } /* this file is 90% const and 10% code */ + /** Iterator implementation for remaining or reached isinks. Goes over [1..num_sinks] + * and only returns a value when the sink state is right */ + template + class IsinkIterator { + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = int; + using pointer = int*; + using reference = int&; + + constexpr IsinkIterator(const std::vector& bitset, size_t x) + : _bitset(bitset) + , _x(x) { + if (_x < _bitset.size() && _bitset[_x] != sink_state) /* Iterate forward to a valid state */ + ++(*this); + } + constexpr value_type operator*() const { + return _x; + } + inline IsinkIterator& operator++() { + _x++; + for (; _x < _bitset.size() && _bitset[_x] != sink_state; _x++) + ; + return *this; + } + inline IsinkIterator operator++(int) { + IsinkIterator tmp = *this; + ++(*this); + return tmp; + } + constexpr bool operator==(const IsinkIterator& rhs) { return _x == rhs._x; } + constexpr bool operator!=(const IsinkIterator& rhs) { return _x != rhs._x; } + + private: + /** Ref to the bitset */ + const std::vector& _bitset; + /** Current position */ + size_t _x; + }; + + typedef vtr::Range> reached_isink_range; + typedef vtr::Range> remaining_isink_range; + + /** Get a lookup which contains the "isink reached state". + * It's a 1-indexed! bitset of "pin indices". True if the nth sink has been reached, false otherwise. + * If you call it before prune() and after routing, there's no guarantee on whether the reached sinks + * are reached legally. */ + constexpr const std::vector& get_is_isink_reached(void) const { return _is_isink_reached; } + + /** Get reached isinks: 1-indexed pin indices enumerating the sinks in this net. 
+ * "Reached" means "reached legally" if you call this after prune() and not before any routing. + * Otherwise it doesn't guarantee legality. + * Builds and returns a value: use get_is_isink_reached directly if you want speed. */ + constexpr reached_isink_range get_reached_isinks(void) const { + return vtr::make_range(IsinkIterator(_is_isink_reached, 1), IsinkIterator(_is_isink_reached, _num_sinks + 1)); + } + + /** Get remaining (not routed (legally?)) isinks: + * 1-indexed pin indices enumerating the sinks in this net. + * Caveats in get_reached_isinks() apply. */ + constexpr remaining_isink_range get_remaining_isinks(void) const { + return vtr::make_range(IsinkIterator(_is_isink_reached, 1), IsinkIterator(_is_isink_reached, _num_sinks + 1)); + } + private: std::tuple, vtr::optional> add_subtree_from_heap(t_heap* hptr, int target_net_pin_index, bool is_flat); @@ -409,6 +495,7 @@ class RouteTree { std::unordered_set& visited, bool is_flat); + void reload_timing_unlocked(vtr::optional from_node = vtr::nullopt); void load_new_subtree_R_upstream(RouteTreeNode& from_node); float load_new_subtree_C_downstream(RouteTreeNode& from_node); RouteTreeNode& update_unbuffered_ancestors_C_downstream(RouteTreeNode& from_node); @@ -442,7 +529,12 @@ class RouteTree { node->_next_sibling = parent->_next; } parent->_next = node; + + /** Add node to RR to RT lookup */ _rr_node_to_rt_node[node->inode] = node; + /** If node is a SINK (net_pin_index > 0), also add it to sink RT lookup */ + if (node->net_pin_index > 0 && _net_id.is_valid()) + _isink_to_rt_node[node->net_pin_index - 1] = node; /* Now it's a branch */ parent->_is_leaf = false; @@ -526,4 +618,23 @@ class RouteTree { * therefore store the last rt_node created of all the SINK nodes with the same * index "inode". */ std::unordered_map _rr_node_to_rt_node; + + /** RRNodeId is not a unique lookup for sink RouteTreeNodes, but net_pin_index + * is. 
Store a 0-indexed lookup here for users who need to look up a sink from + * a net_pin_index, ipin, isink, etc. */ + std::vector _isink_to_rt_node; + + /** Is Nth sink in this net reached? + * Bitset of [1..num_sinks]. (1-indexed!) + * We work with these indices, because they are used in a bunch of lookups in + * the router. Looking these back up from sink RR nodes would require looking + * up its RouteTreeNode and then the net_pin_index from that. */ + std::vector _is_isink_reached; + + /** Number of sinks in this tree's net. Useful for iteration. */ + size_t _num_sinks; + + /** Write mutex on this RouteTree. Acquired by the write operations automatically: + * the caller does not need to know about a lock. */ + std::mutex _write_mutex; }; diff --git a/vpr/src/route/router_delay_profiling.cpp b/vpr/src/route/router_delay_profiling.cpp index 4e2274c406f..51d5a21d972 100644 --- a/vpr/src/route/router_delay_profiling.cpp +++ b/vpr/src/route/router_delay_profiling.cpp @@ -72,7 +72,8 @@ bool RouterDelayProfiler::calculate_delay(RRNodeId source_node, RRNodeId sink_no false, std::unordered_map()); std::tie(found_path, std::ignore, cheapest) = router_.timing_driven_route_connection_from_route_tree( - tree.root(), + tree, + tree.root().inode, sink_node, cost_params, bounding_box, @@ -144,7 +145,7 @@ vtr::vector calculate_all_path_delays_from_rr_node(RRNodeId src is_flat); RouterStats router_stats; ConnectionParameters conn_params(ParentNetId::INVALID(), OPEN, false, std::unordered_map()); - vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree.root(), + vtr::vector shortest_paths = router.timing_driven_find_all_shortest_paths_from_route_tree(tree, cost_params, bounding_box, router_stats, diff --git a/vpr/src/route/spatial_route_tree_lookup.cpp b/vpr/src/route/spatial_route_tree_lookup.cpp index 3d3f7a25460..e03fe8f291e 100644 --- a/vpr/src/route/spatial_route_tree_lookup.cpp +++ b/vpr/src/route/spatial_route_tree_lookup.cpp @@ -48,7 
+48,7 @@ void update_route_tree_spatial_lookup_recur(const RouteTreeNode& rt_node, Spatia // // TODO: Depending on bin size, long wires may end up being added only to bins at // their start/end and may pass through bins along their length to which they - // are not added. If this becomes an issues, reconsider how we add nodes to + // are not added. If this becomes an issue, reconsider how we add nodes to // bins if (bin_xhigh != bin_xlow || bin_yhigh != bin_ylow) { spatial_lookup[bin_xhigh][bin_yhigh].push_back(rt_node); diff --git a/vpr/src/route/virtual_net.h b/vpr/src/route/virtual_net.h new file mode 100644 index 00000000000..453986ff531 --- /dev/null +++ b/vpr/src/route/virtual_net.h @@ -0,0 +1,18 @@ +#pragma once + +#include "netlist_fwd.h" +#include "route_tree_fwd.h" +#include "vpr_types.h" + +/** A net decomposed by routing a connection through the partitioning + * cutline and dividing the bounding box into two. When routing, the connection + * router will receive a smaller-than-usual bounding box and will have to + * filter the existing routing spatially. */ +class VirtualNet { + public: + /** The net in question. */ + ParentNetId net_id; + /** Clipped bounding box. This is needed to enable decomposing a net multiple times. + * Otherwise we would need a history of side types and cutlines to compute the bbox. */ + t_bb clipped_bb; +}; diff --git a/vpr/src/timing/net_delay.cpp b/vpr/src/timing/net_delay.cpp index 5420c197769..d5d1ce52152 100644 --- a/vpr/src/timing/net_delay.cpp +++ b/vpr/src/timing/net_delay.cpp @@ -45,13 +45,13 @@ static void load_one_constant_net_delay(const Netlist<>& net_list, float delay_value); /*************************** Subroutine definitions **************************/ -void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix& net_delay) { - /* This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry * - * is the Elmore delay from the net source to the appropriate sink. 
Both * - * the rr_graph and the routing traceback must be completely constructed * - * before this routine is called, and the net_delay array must have been * - * allocated. */ +/** This routine loads net_delay[0..nets.size()-1][1..num_pins-1]. Each entry + * is the Elmore delay from the net source to the appropriate sink. Both + * the rr_graph and the routing traceback must be completely constructed + * before this routine is called, and the net_delay array must have been + * allocated. */ +void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix& net_delay) { for (auto net_id : net_list.nets()) { if (net_list.net_is_ignored(net_id)) { load_one_constant_net_delay(net_list, net_delay, net_id, 0.); @@ -61,18 +61,17 @@ void load_net_delay_from_routing(const Netlist<>& net_list, NetPinsMatrix } } +/** This routine loads delay values for one net in + * net_delay[net_id][1..num_pins-1]. First, from the traceback, it + * constructs the route tree and computes its values for R, C, and Tdel. + * Next, it walks the route tree recursively, storing the time delays for + * each sink into the map ipin_to_Tdel. Then, while looping through the + * net_delay array we search for the pin index in the map, and + * correspondingly update the entry in net_delay. Finally, it frees the + * route tree and clears the ipin_to_Tdel_map associated with that net. */ static void load_one_net_delay(const Netlist<>& net_list, NetPinsMatrix& net_delay, ParentNetId net_id) { - /* This routine loads delay values for one net in * - * net_delay[net_id][1..num_pins-1]. First, from the traceback, it * - * constructs the route tree and computes its values for R, C, and Tdel. * - * Next, it walks the route tree recursively, storing the time delays for * - * each sink into the map ipin_to_Tdel. Then, while looping through the * - * net_delay array we search for the pin index in the map, and * - * correspondingly update the entry in net_delay. 
Finally, it frees the * - * route tree and clears the ipin_to_Tdel_map associated with that net. */ - auto& route_ctx = g_vpr_ctx.mutable_routing(); if (!route_ctx.route_trees[net_id]) { @@ -92,9 +91,9 @@ static void load_one_net_delay(const Netlist<>& net_list, ipin_to_Tdel_map.clear(); // clear the map } +/** This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes + * into the map. */ static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId net_id) { - /* This routine recursively traverses the route tree, and copies the Tdel of the sink_type nodes * - * into the map. */ if (rt_node.net_pin_index != OPEN) { // value of OPEN indicates a non-SINK ipin_to_Tdel_map[rt_node.net_pin_index] = rt_node.Tdel; // add to the map, process current sink-type node } @@ -104,12 +103,11 @@ static void load_one_net_delay_recurr(const RouteTreeNode& rt_node, ParentNetId } } +/** Sets each entry of the net_delay array for net inet to delay_value. */ static void load_one_constant_net_delay(const Netlist<>& net_list, NetPinsMatrix& net_delay, ParentNetId net_id, float delay_value) { - /* Sets each entry of the net_delay array for net inet to delay_value. 
*/ - for (unsigned int ipin = 1; ipin < net_list.net_pins(net_id).size(); ipin++) net_delay[net_id][ipin] = delay_value; } diff --git a/vtr_flow/scripts/python_libs/vtr/__init__.py b/vtr_flow/scripts/python_libs/vtr/__init__.py index 49211fe1456..6a0b38d7639 100644 --- a/vtr_flow/scripts/python_libs/vtr/__init__.py +++ b/vtr_flow/scripts/python_libs/vtr/__init__.py @@ -11,10 +11,12 @@ format_elapsed_time, write_tab_delimitted_csv, load_list_file, + argparse_use_previous, argparse_str2bool, - get_next_run_dir, + get_existing_run_dir, get_latest_run_dir, get_latest_run_number, + get_next_run_dir, verify_file, pretty_print_table, find_task_dir, diff --git a/vtr_flow/scripts/python_libs/vtr/flow.py b/vtr_flow/scripts/python_libs/vtr/flow.py index 0aab0f8f3a3..a1f14e2816e 100644 --- a/vtr_flow/scripts/python_libs/vtr/flow.py +++ b/vtr_flow/scripts/python_libs/vtr/flow.py @@ -56,6 +56,7 @@ def run( relax_w_factor=1.3, check_route=False, check_place=False, + no_second_run=False, ): """ Runs the VTR CAD flow to map the specified circuit_file onto the target architecture_file @@ -130,6 +131,9 @@ def run( check_place: Route existing placement by enabling VPR routing. + + no_second_run: + Don't run VPR again even if it's writing out some intermediate files. 
""" # @@ -300,6 +304,9 @@ def run( ): do_second_run = True + if no_second_run: + do_second_run = False + vtr.vpr.run( architecture_copy, pre_vpr_netlist, diff --git a/vtr_flow/scripts/python_libs/vtr/task.py b/vtr_flow/scripts/python_libs/vtr/task.py index cd51bd0b403..6bf898a5d22 100644 --- a/vtr_flow/scripts/python_libs/vtr/task.py +++ b/vtr_flow/scripts/python_libs/vtr/task.py @@ -1,20 +1,24 @@ """ Module that contains the task functions """ +import itertools + from pathlib import Path from pathlib import PurePath from shlex import split -import itertools + +from typing import List, Tuple from vtr import ( VtrError, InspectError, load_list_file, load_parse_results, + get_existing_run_dir, + get_latest_run_dir, get_next_run_dir, find_task_dir, load_script_param, - get_latest_run_dir, paths, ) @@ -82,7 +86,7 @@ def __init__( class Job: """ - A class to store the nessesary information for a job that needs to be run. + A class to store the necessary information for a job that needs to be run. 
""" def __init__( @@ -169,7 +173,7 @@ def qor_parse_command(self): """ return self._qor_parse_command - def work_dir(self, run_dir): + def work_dir(self, run_dir: str) -> str: """ return the work directory of the job """ @@ -179,7 +183,7 @@ def work_dir(self, run_dir): # pylint: enable=too-many-instance-attributes -def load_task_config(config_file): +def load_task_config(config_file) -> TaskConfig: """ Load task config information """ @@ -245,7 +249,7 @@ def load_task_config(config_file): else: # All valid keys should have been collected by now raise VtrError( - "Unrecognzied key '{key}' in config file {file}".format(key=key, file=config_file) + "Unrecognized key '{key}' in config file {file}".format(key=key, file=config_file) ) # We split the script params into a list @@ -351,7 +355,10 @@ def create_second_parse_cmd(config): return second_parse_cmd -def create_cmd(abs_circuit_filepath, abs_arch_filepath, config, args, circuit, noc_traffic): +# pylint: disable=too-many-branches +def create_cmd( + abs_circuit_filepath, abs_arch_filepath, config, args, circuit, noc_traffic +) -> Tuple: """ Create the command to run the task """ # Collect any extra script params from the config file cmd = [abs_circuit_filepath, abs_arch_filepath] @@ -410,6 +417,20 @@ def create_cmd(abs_circuit_filepath, abs_arch_filepath, config, args, circuit, n cmd += ["--fix_clusters", "{}".format(place_constr_file)] + # parse_vtr_task doesn't have these in args, so use getattr here + if getattr(args, "write_rr_graphs", None): + cmd += [ + "--write_rr_graph", + "{}.rr_graph.xml".format(Path(circuit).stem), + ] # Use XML format instead of capnp (see #2352) + + if getattr(args, "write_lookaheads", None): + cmd += ["--write_router_lookahead", "{}.lookahead.bin".format(Path(circuit).stem)] + + if getattr(args, "write_rr_graphs", None) or getattr(args, "write_lookaheads", None): + # Don't trigger a second run, we just want the files + cmd += ["-no_second_run"] + parse_cmd = None qor_parse_command = None 
if config.parse_file: @@ -446,7 +467,7 @@ def create_cmd(abs_circuit_filepath, abs_arch_filepath, config, args, circuit, n # pylint: disable=too-many-branches -def create_jobs(args, configs, after_run=False): +def create_jobs(args, configs, after_run=False) -> List[Job]: """ Create the jobs to be executed depending on the configs. """ @@ -539,7 +560,7 @@ def create_job( work_dir, run_dir, golden_results, -): +) -> Job: """ Create an individual job with the specified parameters """ @@ -607,6 +628,15 @@ def create_job( current_cmd = cmd.copy() current_cmd += ["-temp_dir", run_dir + "/{}".format(param_string)] + if getattr(args, "use_previous", None): + for prev_run, [extension, option] in args.use_previous: + prev_run_dir = get_existing_run_dir(find_task_dir(config, args.alt_tasks_dir), prev_run) + prev_work_path = Path(prev_run_dir) / work_dir / param_string + prev_file = prev_work_path / "{}.{}".format(Path(circuit).stem, extension) + if not prev_file.exists(): + raise FileNotFoundError("use_previous: file %s not found" % str(prev_file)) + current_cmd += [option, str(prev_file)] + if param_string != "common": current_cmd += param.split(" ") diff --git a/vtr_flow/scripts/python_libs/vtr/util.py b/vtr_flow/scripts/python_libs/vtr/util.py index 6243063c6ea..fac9886947c 100644 --- a/vtr_flow/scripts/python_libs/vtr/util.py +++ b/vtr_flow/scripts/python_libs/vtr/util.py @@ -1,16 +1,21 @@ """ Module to utilize many of the tools needed for VTR. 
""" -from pathlib import PurePath -from pathlib import Path + import sys import re import time import subprocess import argparse import csv + from collections import OrderedDict +from pathlib import PurePath +from pathlib import Path +from typing import List, Tuple + from prettytable import PrettyTable + import vtr.error from vtr.error import CommandError from vtr import paths @@ -335,7 +340,7 @@ def relax_w(min_w, relax_factor, base=2): return relaxed_w -def load_list_file(list_file): +def load_list_file(list_file: str) -> List[str]: """ Loads a file containing a single value-per-line, potentially with '#' comments @@ -429,6 +434,39 @@ def format_elapsed_time(time_delta): return "%.2f seconds" % time_delta.total_seconds() +# Files that can be read back by VPR with their conventional extensions +# and the command line option to read them. +REUSABLE_FILES = { + "net": ["net", "--net_file"], + "place": ["place", "--place_file"], + "route": ["route", "--route_file"], + "rr_graph": ["rr_graph.xml", "--read_rr_graph"], + "lookahead": ["lookahead.bin", "--read_router_lookahead"], +} + + +def argparse_use_previous(x: str) -> List[Tuple[str, List]]: + """ + Parse a -use_previous parameter. Throw if not valid. + Returns a list with (run dir name, [extension, cmdline option]) elements. 
+ """ + tokens = [w.strip() for w in x.split(",")] + tokens = [w for w in tokens if len(w)] + out = [] + for w in tokens: + r = re.fullmatch(r"(\w+):(\w+)", w) + if not r: + raise argparse.ArgumentTypeError("Invalid input to -use_previous: %s" % w) + if not REUSABLE_FILES.get(r.group(2)): + raise argparse.ArgumentTypeError( + "Unknown file type to use_previous: %s, available types: %s" + % (r.group(2), ",".join(REUSABLE_FILES.keys())) + ) + out.append((r.group(1), REUSABLE_FILES[r.group(2)])) + + return out + + def argparse_str2bool(str_val): """ parses a string boolean to a boolean @@ -481,6 +519,18 @@ def get_latest_run_dir(base_dir): return str(PurePath(base_dir) / run_dir_name(latest_run_number)) +def get_existing_run_dir(base_dir: str, run_dir: str) -> str: + """ + Get an existing run directory (from a previous run). Throw if it doesn't exist + """ + path = Path(base_dir) / run_dir + if not path.exists(): + raise FileNotFoundError( + "Couldn't find previous run directory %s in %s" % (run_dir, base_dir) + ) + return str(path) + + def get_next_run_number(base_dir): """ Returns the next available (i.e.
non-existing) run number in base_dir diff --git a/vtr_flow/scripts/python_libs/vtr/vpr/vpr.py b/vtr_flow/scripts/python_libs/vtr/vpr/vpr.py index d0e5953fbe0..003adb9f8cb 100644 --- a/vtr_flow/scripts/python_libs/vtr/vpr/vpr.py +++ b/vtr_flow/scripts/python_libs/vtr/vpr/vpr.py @@ -7,7 +7,7 @@ from vtr import CommandRunner, relax_w, determine_min_w, verify_file, paths from vtr.error import InspectError -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments,too-many-locals def run_relax_w( architecture, circuit, @@ -70,13 +70,15 @@ def run_relax_w( vpr_min_w_log = ".".join([logfile_base, "out"]) vpr_relaxed_w_log = ".".join([logfile_base, "crit_path", "out"]) - crit_path_router_iterations = None + crit_path_router_iterations = None if "crit_path_router_iterations" in vpr_args: crit_path_router_iterations = vpr_args["crit_path_router_iterations"] del vpr_args["crit_path_router_iterations"] - if "write_rr_graph" in vpr_args: + write_rr_graph = None + if "write_rr_graph" in vpr_args: # Don't write out rr_graph on the first run + write_rr_graph = vpr_args["write_rr_graph"] del vpr_args["write_rr_graph"] if vpr_exec is None: @@ -105,9 +107,11 @@ def run_relax_w( vpr_args["route"] = True # Re-route only vpr_args["route_chan_width"] = relaxed_w # At a fixed channel width + if write_rr_graph: # Write out rr_graph with known W + vpr_args["write_rr_graph"] = write_rr_graph + # VPR does not support performing routing when fixed pins # are specified, and placement is not run; so remove the option - run( architecture, circuit, diff --git a/vtr_flow/scripts/run_vtr_flow.py b/vtr_flow/scripts/run_vtr_flow.py index 7a03918e80b..8f4fd2f58b8 100755 --- a/vtr_flow/scripts/run_vtr_flow.py +++ b/vtr_flow/scripts/run_vtr_flow.py @@ -187,7 +187,7 @@ def vtr_command_argparser(prog=None): house_keeping.add_argument( "-temp_dir", default=None, - help="Directory to run the flow in (will be created if non-existant).", + help="Directory to run the flow in (will be created if 
non-existent).", ) house_keeping.add_argument("-name", default=None, help="Name for this run to be output.") @@ -398,11 +398,17 @@ def vtr_command_argparser(prog=None): action="store_true", help="Tells VPR to verify the routing resource graph.", ) + vpr.add_argument( + "-no_second_run", + default=False, + action="store_true", + help="Don't run VPR a second time to check if it can read intermediate files.", + ) vpr.add_argument( "-rr_graph_ext", default=".xml", type=str, - help="Determines the output rr_graph files' extention.", + help="Determines the output rr_graph files' extension.", ) vpr.add_argument( "-check_route", @@ -575,6 +581,7 @@ def vtr_command_main(arg_list, prog=None): relax_w_factor=args.relax_w_factor, check_route=args.check_route, check_place=args.check_place, + no_second_run=args.no_second_run, ) error_status = "OK" except vtr.VtrError as error: @@ -583,7 +590,7 @@ def vtr_command_main(arg_list, prog=None): ) except KeyboardInterrupt as error: - print("{} recieved keyboard interrupt".format(prog)) + print("{} keyboard interrupt".format(prog)) exit_status = 4 return_status = exit_status diff --git a/vtr_flow/scripts/run_vtr_task.py b/vtr_flow/scripts/run_vtr_task.py index 51a1d4bf9f4..0d9c5013181 100755 --- a/vtr_flow/scripts/run_vtr_task.py +++ b/vtr_flow/scripts/run_vtr_task.py @@ -3,18 +3,18 @@ """ This module is a wrapper around the scripts/python_libs/vtr, allowing the user to run one or more VTR tasks. 
""" - -from pathlib import Path -from pathlib import PurePath -import sys -import os import argparse -import textwrap +import os import subprocess -from datetime import datetime +import sys +import textwrap + from contextlib import redirect_stdout -from multiprocessing import Pool, Manager +from datetime import datetime from difflib import SequenceMatcher +from multiprocessing import Pool, Manager +from pathlib import Path +from pathlib import PurePath from run_vtr_flow import vtr_command_main as run_vtr_flow @@ -26,6 +26,7 @@ format_elapsed_time, RawDefaultHelpFormatter, argparse_str2bool, + argparse_use_previous, get_next_run_dir, load_task_config, find_task_config_file, @@ -202,6 +203,34 @@ def vtr_command_argparser(prog=None): help="Print meta-data like command-line arguments and run-time", ) + parser.add_argument( + "-write_rr_graphs", + default=False, + action="store_true", + help="Write out rr_graph files from VPR. These are normally computed on the fly " + "and can become very large. Typically used with -use_previous [...] to save time " + "on later executions for large tasks.", + ) + + parser.add_argument( + "-write_lookaheads", + default=False, + action="store_true", + help="Write out router lookahead files from VPR. These are normally computed on the fly " + "and can become very large. Typically used with -use_previous [...] to save time on " + "later executions for large tasks.", + ) + + parser.add_argument( + "-use_previous", + default=None, + type=argparse_use_previous, + help="Reuse intermediate [file]s from previous [run]s of the tasks. Accepts a comma-" + 'separated list of [run]:[file] such as "-use_previous run001:place,run001:net". ' + 'Works throughout different config parameters: "common" will reuse "common"\'s files etc. '
+ "Use with caution and try to validate your results with a clean run.", + ) + parser.add_argument( "-s", nargs=argparse.REMAINDER, @@ -214,7 +243,7 @@ def vtr_command_argparser(prog=None): return parser -def vtr_command_main(arg_list, prog=None): +def vtr_command_main(arg_list, prog=None) -> int: """Run the vtr tasks given and the tasks in the lists given""" # Load the arguments args = vtr_command_argparser(prog).parse_args(arg_list) @@ -266,10 +295,7 @@ def vtr_command_main(arg_list, prog=None): return num_failed -def run_tasks( - args, - configs, -): +def run_tasks(args, configs) -> int: """ Runs the specified set of tasks (configs) """ @@ -278,6 +304,7 @@ def run_tasks( jobs = create_jobs(args, configs) + # Determine the run dir for each config run_dirs = {} for config in configs: task_dir = find_task_dir(config, args.alt_tasks_dir) @@ -324,24 +351,22 @@ def run_tasks( return num_failed -def run_parallel(args, queued_jobs, run_dirs): +def run_parallel(args, queued_jobs, run_dirs: dict) -> int: """ Run each external command in commands with at most args.j commands running in parllel """ - # Determine the run dir for each config # We pop off the jobs of queued_jobs, which python does from the end, # so reverse the list now so we get the expected order. 
This also ensures # we are working with a copy of the jobs queued_jobs = list(reversed(queued_jobs)) - # Find the max taskname length for pretty printing queued_procs = [] queue = Manager().Queue() for job in queued_jobs: - queued_procs += [(queue, run_dirs, job, args.script)] - # Queue of currently running subprocesses + queued_procs.append((queue, run_dirs, job, args.script)) + # Queue of currently running subprocesses num_failed = 0 with Pool(processes=args.j) as pool: for proc in queued_procs: @@ -451,15 +476,16 @@ def format_human_readable_memory(num_bytes): return "%.2f GiB" % (num_bytes / (1024 ** 3)) -def run_vtr_flow_process(queue, run_dirs, job, script): +def run_vtr_flow_process(queue, run_dirs, job, script) -> None: """ - This is the function that the multiprocessing calls. - It runs the vtr flow and allerts the multiprocessor through a queue if the flow failed. + This is the function called by multiprocessing.Pool. + It runs the VTR flow and alerts the caller through the queue if the flow failed. 
""" work_dir = job.work_dir(run_dirs[job.task_name()]) Path(work_dir).mkdir(parents=True, exist_ok=True) out = None vtr_flow_out = str(PurePath(work_dir) / "vtr_flow.out") + with open(vtr_flow_out, "w+") as out_file: with redirect_stdout(out_file): if script == "run_vtr_flow.py": diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt index 7bb68bed48e..1ccd16490d7 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/config.txt @@ -35,4 +35,6 @@ qor_parse_file=qor_standard.txt pass_requirements_file=pass_requirements.txt #Script parameters -script_params=-track_memory_usage +script_params_common=-track_memory_usage +script_params_list_add = +script_params_list_add = --router_algorithm parallel diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/golden_results.txt index 749f23089f4..8c6c0b532f1 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/koios/config/golden_results.txt @@ -1,2 +1,3 @@ - arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est 
placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time - k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml test.v common 6.55 vpr 74.02 MiB -1 -1 0.18 21664 1 0.04 -1 -1 35456 -1 -1 12 130 0 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 75792 130 40 596 562 1 356 185 14 14 196 dsp_top auto 35.5 MiB 0.12 1674 74.0 MiB 0.12 0.00 5.12303 -567.54 -5.12303 5.12303 0.49 0.000709907 0.000632382 0.0521422 0.0466692 82 3380 8 4.93594e+06 1.0962e+06 1.24853e+06 6370.04 3.85 0.328078 0.303484 3282 8 751 823 207761 68347 4.57723 4.57723 -668.524 -4.57723 0 0 1.53695e+06 7841.58 0.29 0.05 0.025136 0.0240614 +arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info 
vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops crit_path_total_internal_heap_pushes crit_path_total_internal_heap_pops crit_path_total_external_heap_pushes crit_path_total_external_heap_pops crit_path_total_external_SOURCE_pushes crit_path_total_external_SOURCE_pops crit_path_total_internal_SOURCE_pushes crit_path_total_internal_SOURCE_pops crit_path_total_external_SINK_pushes crit_path_total_external_SINK_pops crit_path_total_internal_SINK_pushes crit_path_total_internal_SINK_pops crit_path_total_external_IPIN_pushes crit_path_total_external_IPIN_pops crit_path_total_internal_IPIN_pushes crit_path_total_internal_IPIN_pops crit_path_total_external_OPIN_pushes crit_path_total_external_OPIN_pops crit_path_total_internal_OPIN_pushes crit_path_total_internal_OPIN_pops crit_path_total_external_CHANX_pushes crit_path_total_external_CHANX_pops 
crit_path_total_internal_CHANX_pushes crit_path_total_internal_CHANX_pops crit_path_total_external_CHANY_pushes crit_path_total_external_CHANY_pops crit_path_total_internal_CHANY_pushes crit_path_total_internal_CHANY_pops crit_path_rt_node_SOURCE_pushes crit_path_rt_node_SINK_pushes crit_path_rt_node_IPIN_pushes crit_path_rt_node_OPIN_pushes crit_path_rt_node_CHANX_pushes crit_path_rt_node_CHANY_pushes crit_path_adding_all_rt crit_path_adding_high_fanout_rt crit_path_total_number_of_adding_all_rt_from_calling_high_fanout_rt critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml test.v common 6.19 vpr 74.24 MiB -1 -1 0.16 18044 1 0.09 -1 -1 32460 -1 -1 12 130 0 -1 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 76020 130 40 596 562 1 356 185 14 14 196 dsp_top auto 35.4 MiB 0.10 1734 74.2 MiB 0.13 0.00 5.12303 -543.21 -5.12303 5.12303 0.45 0.000787704 0.000733664 0.0618895 0.0577764 82 3564 25 4.93594e+06 1.0962e+06 1.24853e+06 6370.04 3.47 0.374217 0.341793 33448 252102 -1 3384 9 709 754 192289 63961 0 0 192289 63961 754 713 0 0 18466 17836 0 0 19719 19117 0 0 755 715 0 0 69265 12759 0 0 83330 12821 0 0 754 0 0 45 152 212 1613 0 0 4.57723 4.57723 -644.847 -4.57723 0 0 1.53695e+06 7841.58 0.26 0.05 0.23 -1 -1 0.26 0.0256266 0.0242 +k6FracN10LB_mem20K_complexDSP_customSB_22nm.xml test.v common_--router_algorithm_parallel 4.88 vpr 74.18 MiB -1 -1 0.16 18028 1 0.09 -1 
-1 32508 -1 -1 12 130 0 -1 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 75956 130 40 596 562 1 356 185 14 14 196 dsp_top auto 35.4 MiB 0.10 1734 74.2 MiB 0.13 0.00 5.12303 -543.21 -5.12303 5.12303 0.45 0.000821246 0.000766795 0.0629599 0.0588552 82 3617 17 4.93594e+06 1.0962e+06 1.24853e+06 6370.04 2.14 0.308433 0.2822 33448 252102 -1 3414 14 731 776 220618 73159 0 0 220618 73159 776 735 0 0 20611 19950 0 0 21899 21238 0 0 777 738 0 0 80287 15212 0 0 96268 15286 0 0 776 0 0 45 151 219 1641 0 0 4.57723 4.57723 -637.466 -4.57723 0 0 1.53695e+06 7841.58 0.26 0.06 0.23 -1 -1 0.26 0.0330502 0.0309507 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/config.txt index caed2da9784..4e4071f98c2 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/config.txt @@ -24,5 +24,6 @@ qor_parse_file=qor_standard.txt # Pass requirements pass_requirements_file=pass_requirements.txt -script_params=-track_memory_usage --route_chan_width 100 --max_router_iterations 100 --router_lookahead map --flat_routing true - +script_params_common=-track_memory_usage --route_chan_width 100 --max_router_iterations 100 --router_lookahead map --flat_routing true +script_params_list_add = +script_params_list_add = --router_algorithm parallel --num_workers 4 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/golden_results.txt index 6a885701bc1..67f180f6b08 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/golden_results.txt +++ 
b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_flat_router/config/golden_results.txt @@ -1,2 +1,3 @@ - arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops crit_path_total_internal_heap_pushes crit_path_total_internal_heap_pops crit_path_total_external_heap_pushes crit_path_total_external_heap_pops crit_path_total_external_SOURCE_pushes crit_path_total_external_SOURCE_pops crit_path_total_internal_SOURCE_pushes crit_path_total_internal_SOURCE_pops crit_path_total_external_SINK_pushes crit_path_total_external_SINK_pops crit_path_total_internal_SINK_pushes 
crit_path_total_internal_SINK_pops crit_path_total_external_IPIN_pushes crit_path_total_external_IPIN_pops crit_path_total_internal_IPIN_pushes crit_path_total_internal_IPIN_pops crit_path_total_external_OPIN_pushes crit_path_total_external_OPIN_pops crit_path_total_internal_OPIN_pushes crit_path_total_internal_OPIN_pops crit_path_total_external_CHANX_pushes crit_path_total_external_CHANX_pops crit_path_total_internal_CHANX_pushes crit_path_total_internal_CHANX_pops crit_path_total_external_CHANY_pushes crit_path_total_external_CHANY_pops crit_path_total_internal_CHANY_pushes crit_path_total_internal_CHANY_pops crit_path_rt_node_SOURCE_pushes crit_path_rt_node_SINK_pushes crit_path_rt_node_IPIN_pushes crit_path_rt_node_OPIN_pushes crit_path_rt_node_CHANX_pushes crit_path_rt_node_CHANY_pushes crit_path_adding_all_rt crit_path_adding_high_fanout_rt crit_path_total_number_of_adding_all_rt_from_calling_high_fanout_rt critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time - k6_frac_N10_frac_chain_mem32K_40nm.xml spree.v common 12.82 vpr 76.19 MiB -1 -1 3.42 34124 16 0.76 -1 -1 37916 -1 -1 61 45 3 1 success 8528925 release IPO VTR_ASSERT_LEVEL=3 GNU 9.5.0 on Linux-5.10.35-v8 x86_64 2023-05-29T15:34:55 gh-actions-runner-vtr-auto-spawned83 /root/vtr-verilog-to-routing/vtr-verilog-to-routing 78016 45 32 1188 1147 1 781 142 14 14 196 memory auto 39.1 MiB 3.14 6687 76.2 MiB 0.85 0.01 9.87688 -6144.34 -9.87688 9.87688 0.04 0.00303074 0.00250348 0.260087 0.214733 -1 10707 13 9.20055e+06 5.32753e+06 1.21359e+06 5900 2.66 0.354898 0.295042 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops crit_path_total_internal_heap_pushes crit_path_total_internal_heap_pops crit_path_total_external_heap_pushes crit_path_total_external_heap_pops crit_path_total_external_SOURCE_pushes crit_path_total_external_SOURCE_pops crit_path_total_internal_SOURCE_pushes crit_path_total_internal_SOURCE_pops crit_path_total_external_SINK_pushes crit_path_total_external_SINK_pops 
crit_path_total_internal_SINK_pushes crit_path_total_internal_SINK_pops crit_path_total_external_IPIN_pushes crit_path_total_external_IPIN_pops crit_path_total_internal_IPIN_pushes crit_path_total_internal_IPIN_pops crit_path_total_external_OPIN_pushes crit_path_total_external_OPIN_pops crit_path_total_internal_OPIN_pushes crit_path_total_internal_OPIN_pops crit_path_total_external_CHANX_pushes crit_path_total_external_CHANX_pops crit_path_total_internal_CHANX_pushes crit_path_total_internal_CHANX_pops crit_path_total_external_CHANY_pushes crit_path_total_external_CHANY_pops crit_path_total_internal_CHANY_pushes crit_path_total_internal_CHANY_pops crit_path_rt_node_SOURCE_pushes crit_path_rt_node_SINK_pushes crit_path_rt_node_IPIN_pushes crit_path_rt_node_OPIN_pushes crit_path_rt_node_CHANX_pushes crit_path_rt_node_CHANY_pushes crit_path_adding_all_rt crit_path_adding_high_fanout_rt crit_path_total_number_of_adding_all_rt_from_calling_high_fanout_rt critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6_frac_N10_frac_chain_mem32K_40nm.xml spree.v common 6.77 vpr 75.04 MiB -1 -1 1.37 31788 16 1.37 -1 -1 35456 -1 -1 61 45 3 1 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 76840 45 32 1188 1147 1 781 142 14 14 196 memory auto 38.0 MiB 1.38 6687 75.0 MiB 0.32 0.00 9.87688 -6144.34 -9.87688 9.87688 0.02 0.00160353 0.00140955 0.146887 0.130179 -1 10701 12 9.20055e+06 5.32753e+06 1.47691e+06 7535.23 1.15 0.187274 
0.164699 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_frac_N10_frac_chain_mem32K_40nm.xml spree.v common_--router_algorithm_parallel_--num_workers_4 6.97 vpr 74.97 MiB -1 -1 1.39 31556 16 1.40 -1 -1 35520 -1 -1 61 45 3 1 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 76772 45 32 1188 1147 1 781 142 14 14 196 memory auto 37.9 MiB 1.39 6687 75.0 MiB 0.40 0.00 9.87688 -6144.34 -9.87688 9.87688 0.02 0.00245521 0.00214283 0.217122 0.186066 -1 10603 14 9.20055e+06 5.32753e+06 1.47691e+06 7535.23 1.21 0.280862 0.238432 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/config.txt index 0c49b4e3405..dbceb44a4dc 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/config.txt @@ -24,5 +24,6 @@ qor_parse_file=qor_multiclock.txt # Pass requirements pass_requirements_file=pass_requirements_multiclock.txt -script_params=-starting_stage vpr -sdc_file tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/multiclock.sdc - +script_params_common=-starting_stage vpr -sdc_file tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/multiclock.sdc +script_params_list_add = +script_params_list_add = --router_algorithm parallel --num_workers 4 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/golden_results.txt 
b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/golden_results.txt index a939a6842c7..ff260f23dac 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_multiclock/config/golden_results.txt @@ -1,2 +1,3 @@ arch circuit script_params crit_path_delay_mcw clk_to_clk_cpd clk_to_clk2_cpd clk_to_input_cpd clk_to_output_cpd clk2_to_clk2_cpd clk2_to_clk_cpd clk2_to_input_cpd clk2_to_output_cpd input_to_input_cpd input_to_clk_cpd input_to_clk2_cpd input_to_output_cpd output_to_output_cpd output_to_clk_cpd output_to_clk2_cpd output_to_input_cpd clk_to_clk_setup_slack clk_to_clk2_setup_slack clk_to_input_setup_slack clk_to_output_setup_slack clk2_to_clk2_setup_slack clk2_to_clk_setup_slack clk2_to_input_setup_slack clk2_to_output_setup_slack input_to_input_setup_slack input_to_clk_setup_slack input_to_clk2_setup_slack input_to_output_setup_slack output_to_output_setup_slack output_to_clk_setup_slack output_to_clk2_setup_slack output_to_input_setup_slack clk_to_clk_hold_slack clk_to_clk2_hold_slack clk_to_input_hold_slack clk_to_output_hold_slack clk2_to_clk2_hold_slack clk2_to_clk_hold_slack clk2_to_input_hold_slack clk2_to_output_hold_slack input_to_input_hold_slack input_to_clk_hold_slack input_to_clk2_hold_slack input_to_output_hold_slack output_to_output_hold_slack output_to_clk_hold_slack output_to_clk2_hold_slack output_to_input_hold_slack k6_frac_N10_mem32K_40nm.xml multiclock.blif common 1.31564 0.595 0.841581 -1 -1 0.57 0.814813 -1 1.31564 -1 1.07053 -1 1.76203 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0.243 1.71958 -1 -1 0.268 3.24281 -1 1.16427 -1 3.30853 -1 -1.48434 -1 -1 -1 -1 +k6_frac_N10_mem32K_40nm.xml multiclock.blif common_--router_algorithm_parallel_--num_workers_4 1.31564 0.595 0.841581 -1 -1 0.57 0.814813 -1 1.31564 -1 1.07053 -1 1.76203 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0.243 
1.71958 -1 -1 0.268 3.24281 -1 1.16427 -1 3.30853 -1 -1.48434 -1 -1 -1 -1 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/config.txt index 401c36ecd01..dac263af64c 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/config.txt @@ -24,5 +24,6 @@ qor_parse_file=qor_standard.txt pass_requirements_file=pass_requirements.txt # Script parameters -#script_params="" -script_params = -track_memory_usage +script_params_common = -track_memory_usage +script_params_list_add = +script_params_list_add = --router_algorithm parallel --num_workers 4 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/golden_results.txt index 0ec96460c6e..a5312f38fc9 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing/config/golden_results.txt @@ -1,2 +1,3 @@ - arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est 
place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time - k6_frac_N10_mem32K_40nm.xml ch_intrinsics.v common 2.81 vpr 64.00 MiB -1 -1 0.21 21792 3 0.07 -1 -1 36304 -1 -1 68 99 1 0 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 65540 99 130 343 473 1 225 298 12 12 144 clb auto 25.9 MiB 0.13 574 64.0 MiB 0.16 0.00 1.63028 -109.727 -1.63028 1.63028 0.24 0.000401182 0.000358398 0.0349089 0.0312228 40 1376 20 5.66058e+06 4.21279e+06 333335. 2314.82 1.17 0.219159 0.200557 1211 9 370 555 25048 7436 1.97803 1.97803 -136.611 -1.97803 -1.34293 -0.298787 419432. 
2912.72 0.10 0.02 0.0138875 0.0131731 +arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops crit_path_total_internal_heap_pushes crit_path_total_internal_heap_pops crit_path_total_external_heap_pushes crit_path_total_external_heap_pops crit_path_total_external_SOURCE_pushes crit_path_total_external_SOURCE_pops crit_path_total_internal_SOURCE_pushes crit_path_total_internal_SOURCE_pops crit_path_total_external_SINK_pushes crit_path_total_external_SINK_pops crit_path_total_internal_SINK_pushes crit_path_total_internal_SINK_pops crit_path_total_external_IPIN_pushes 
crit_path_total_external_IPIN_pops crit_path_total_internal_IPIN_pushes crit_path_total_internal_IPIN_pops crit_path_total_external_OPIN_pushes crit_path_total_external_OPIN_pops crit_path_total_internal_OPIN_pushes crit_path_total_internal_OPIN_pops crit_path_total_external_CHANX_pushes crit_path_total_external_CHANX_pops crit_path_total_internal_CHANX_pushes crit_path_total_internal_CHANX_pops crit_path_total_external_CHANY_pushes crit_path_total_external_CHANY_pops crit_path_total_internal_CHANY_pushes crit_path_total_internal_CHANY_pops crit_path_rt_node_SOURCE_pushes crit_path_rt_node_SINK_pushes crit_path_rt_node_IPIN_pushes crit_path_rt_node_OPIN_pushes crit_path_rt_node_CHANX_pushes crit_path_rt_node_CHANY_pushes crit_path_adding_all_rt crit_path_adding_high_fanout_rt crit_path_total_number_of_adding_all_rt_from_calling_high_fanout_rt critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6_frac_N10_mem32K_40nm.xml ch_intrinsics.v common 2.72 vpr 63.91 MiB -1 -1 0.18 18244 3 0.15 -1 -1 33476 -1 -1 68 99 1 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 65444 99 130 343 473 1 225 298 12 12 144 clb auto 25.5 MiB 0.10 599 63.9 MiB 0.15 0.00 1.62851 -108.153 -1.62851 1.62851 0.22 0.0005716 0.000537284 0.0435 0.0407889 36 1445 27 5.66058e+06 4.21279e+06 305235. 
2119.69 0.98 0.22339 0.203215 12238 58442 -1 1263 12 429 686 37045 11418 0 0 37045 11418 686 536 0 0 1992 1802 0 0 2359 1992 0 0 742 603 0 0 15126 3546 0 0 16140 2939 0 0 686 0 0 257 388 336 2661 0 0 1.99752 1.99752 -139.829 -1.99752 -0.305022 -0.0771249 378970. 2631.74 0.08 0.03 0.04 -1 -1 0.08 0.0189281 0.0175612 +k6_frac_N10_mem32K_40nm.xml ch_intrinsics.v common_--router_algorithm_parallel_--num_workers_4 2.26 vpr 64.16 MiB -1 -1 0.19 18428 3 0.15 -1 -1 33484 -1 -1 68 99 1 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 65700 99 130 343 473 1 225 298 12 12 144 clb auto 25.6 MiB 0.10 599 64.2 MiB 0.15 0.00 1.62851 -108.153 -1.62851 1.62851 0.22 0.000550016 0.000510375 0.0416236 0.0381932 36 1454 19 5.66058e+06 4.21279e+06 305235. 2119.69 0.52 0.126906 0.113733 12238 58442 -1 1272 12 419 668 36954 11455 0 0 36954 11455 668 526 0 0 2004 1826 0 0 2365 2004 0 0 724 593 0 0 15013 3573 0 0 16180 2933 0 0 668 0 0 249 384 322 2589 0 0 1.99231 1.99231 -140.914 -1.99231 -0.305022 -0.0771249 378970. 
2631.74 0.08 0.03 0.04 -1 -1 0.08 0.0174112 0.0155977 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/config.txt index 3c7366d98fe..17b20f60f24 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/config.txt @@ -29,3 +29,5 @@ script_params_list_add = --timing_update_type auto script_params_list_add = --timing_update_type full script_params_list_add = --timing_update_type incremental script_params_list_add = --timing_update_type incremental --quench_recompute_divider 999999999 #Do post-move incremental STA during quench +script_params_list_add = --timing_update_type incremental --router_algorithm parallel --num_workers 4 # rarely exercised code path +script_params_list_add = --timing_update_type full --router_algorithm parallel --num_workers 4 diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/golden_results.txt index e3f3510b5ec..37fb4f22f53 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/golden_results.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_timing_update_type/config/golden_results.txt @@ -1,5 +1,7 @@ - arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets 
num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time - k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_auto 1.16 vpr 62.23 MiB -1 -1 0.49 25724 5 0.12 -1 -1 35796 -1 -1 12 10 0 0 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 63728 10 2 181 183 1 40 24 6 6 36 clb auto 23.6 MiB 0.03 152 62.2 MiB 0.01 0.00 2.0099 -85.4829 -2.0099 2.0099 0.00 0.000148273 0.000125883 0.00293958 0.00254045 -1 137 15 646728 646728 138825. 
3856.24 0.01 0.00984083 0.00873552 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 - k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_full 1.15 vpr 62.10 MiB -1 -1 0.42 25388 5 0.14 -1 -1 35504 -1 -1 12 10 0 0 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 63592 10 2 181 183 1 40 24 6 6 36 clb auto 23.5 MiB 0.04 152 62.1 MiB 0.01 0.00 2.0099 -85.4829 -2.0099 2.0099 0.00 0.000124802 9.9759e-05 0.00754593 0.00711966 -1 137 15 646728 646728 138825. 3856.24 0.01 0.0162485 0.0150465 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 - k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_incremental 1.13 vpr 62.04 MiB -1 -1 0.42 25392 5 0.13 -1 -1 35796 -1 -1 12 10 0 0 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 63528 10 2 181 183 1 40 24 6 6 36 clb auto 23.4 MiB 0.04 152 62.0 MiB 0.01 0.00 2.0099 -85.4829 -2.0099 2.0099 0.00 1.3455e-05 9.755e-06 0.00205916 0.00187256 -1 137 15 646728 646728 138825. 3856.24 0.01 0.00865869 0.00791492 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 - k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_incremental_--quench_recompute_divider_999999999 1.08 vpr 61.99 MiB -1 -1 0.42 25488 5 0.12 -1 -1 35964 -1 -1 12 10 0 0 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 63480 10 2 181 183 1 40 24 6 6 36 clb auto 23.5 MiB 0.03 152 62.0 MiB 0.01 0.00 2.0099 -85.4829 -2.0099 2.0099 0.00 9.8431e-05 2.9719e-05 0.00312475 0.00288136 -1 137 15 646728 646728 138825. 
3856.24 0.01 0.00941001 0.0086255 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_num_rr_graph_nodes crit_path_num_rr_graph_edges crit_path_collapsed_nodes crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops crit_path_total_internal_heap_pushes crit_path_total_internal_heap_pops crit_path_total_external_heap_pushes crit_path_total_external_heap_pops crit_path_total_external_SOURCE_pushes crit_path_total_external_SOURCE_pops crit_path_total_internal_SOURCE_pushes crit_path_total_internal_SOURCE_pops crit_path_total_external_SINK_pushes crit_path_total_external_SINK_pops crit_path_total_internal_SINK_pushes 
crit_path_total_internal_SINK_pops crit_path_total_external_IPIN_pushes crit_path_total_external_IPIN_pops crit_path_total_internal_IPIN_pushes crit_path_total_internal_IPIN_pops crit_path_total_external_OPIN_pushes crit_path_total_external_OPIN_pops crit_path_total_internal_OPIN_pushes crit_path_total_internal_OPIN_pops crit_path_total_external_CHANX_pushes crit_path_total_external_CHANX_pops crit_path_total_internal_CHANX_pushes crit_path_total_internal_CHANX_pops crit_path_total_external_CHANY_pushes crit_path_total_external_CHANY_pops crit_path_total_internal_CHANY_pushes crit_path_total_internal_CHANY_pops crit_path_rt_node_SOURCE_pushes crit_path_rt_node_SINK_pushes crit_path_rt_node_IPIN_pushes crit_path_rt_node_OPIN_pushes crit_path_rt_node_CHANX_pushes crit_path_rt_node_CHANY_pushes crit_path_adding_all_rt crit_path_adding_high_fanout_rt crit_path_total_number_of_adding_all_rt_from_calling_high_fanout_rt critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_create_rr_graph_time crit_path_create_intra_cluster_rr_graph_time crit_path_tile_lookahead_computation_time crit_path_router_lookahead_computation_time crit_path_total_timing_analysis_time crit_path_total_sta_time +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_auto 1.43 vpr 61.82 MiB -1 -1 0.34 22744 5 0.31 -1 -1 33812 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63308 10 2 181 183 1 40 24 6 6 36 clb auto 23.2 MiB 0.03 171 61.8 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 0.00021372 0.000192501 0.00180931 0.00171502 -1 163 21 646728 646728 138825. 
3856.24 0.02 0.0128896 0.0113993 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_full 1.39 vpr 61.79 MiB -1 -1 0.34 22548 5 0.32 -1 -1 33868 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63276 10 2 181 183 1 40 24 6 6 36 clb auto 23.2 MiB 0.03 171 61.8 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 0.000213827 0.000192479 0.00182632 0.00173178 -1 163 21 646728 646728 138825. 3856.24 0.02 0.0128195 0.0112985 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_incremental 1.37 vpr 61.70 MiB -1 -1 0.32 22740 5 0.30 -1 -1 33952 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63184 10 2 181 183 1 40 24 6 6 36 clb auto 23.2 MiB 0.03 171 61.7 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 6.0326e-05 4.8099e-05 0.00129504 0.00122282 -1 163 21 646728 646728 138825. 
3856.24 0.01 0.00725272 0.00528838 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_incremental_--quench_recompute_divider_999999999 1.44 vpr 61.68 MiB -1 -1 0.34 22584 5 0.34 -1 -1 33880 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63156 10 2 181 183 1 40 24 6 6 36 clb auto 23.2 MiB 0.03 171 61.7 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 0.000173798 8.9569e-05 0.00138827 0.00124749 -1 163 21 646728 646728 138825. 3856.24 0.02 0.0112762 0.00755481 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_incremental_--router_algorithm_parallel_--num_workers_4 1.40 vpr 61.78 MiB -1 -1 0.33 22868 5 0.31 -1 -1 33924 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63264 10 2 181 183 1 40 24 6 6 36 clb auto 23.2 MiB 0.03 171 61.8 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 8.3175e-05 5.0203e-05 0.00133949 0.00121351 -1 163 21 646728 646728 138825. 
3856.24 0.01 0.00784067 0.00532275 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +k6_N10_mem32K_40nm.xml stereovision3.v common_--timing_update_type_full_--router_algorithm_parallel_--num_workers_4 1.42 vpr 61.66 MiB -1 -1 0.34 22744 5 0.30 -1 -1 33908 -1 -1 12 10 0 0 success v8.0.0-8293-gcafae33ff-dirty release IPO VTR_ASSERT_LEVEL=2 GNU 9.4.0 on Linux-4.15.0-197-generic x86_64 2023-08-02T01:36:29 redacted.eecg.utoronto.ca /home/redacted/par1/vtr-verilog-to-routing/vtr_flow/tasks 63140 10 2 181 183 1 40 24 6 6 36 clb auto 23.1 MiB 0.03 171 61.7 MiB 0.01 0.00 2.06897 -87.8888 -2.06897 2.06897 0.00 0.000453545 0.000422888 0.00262389 0.00239838 -1 163 21 646728 646728 138825. 3856.24 0.02 0.0186074 0.0163246 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1