From b77eecd450208c6b593e0f989fb5281e22064714 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 19 Jun 2023 13:37:40 -0400 Subject: [PATCH 01/35] Simple annealer for NoC routers in initial placement. Added a simple annealer for routers that starts with a random placement for routers and tries random swaps to improve the NoC cost function. During initial placement, routers are placed before other blocks. The starting temperature needed to be reduced so that SA does not destroy the initial router placement. --- vpr/src/place/initial_placement.cpp | 286 +++++++++++++++++++++++++++- vpr/src/place/initial_placement.h | 4 +- vpr/src/place/noc_place_utils.cpp | 2 +- vpr/src/place/noc_place_utils.h | 2 +- vpr/src/place/place.cpp | 13 +- vpr/test/test_noc_place_utils.cpp | 8 +- 6 files changed, 298 insertions(+), 17 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index cd484e7f7f3..388296ecb66 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -950,7 +950,7 @@ static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_type * logical_block_types contain empty type, needs to be ignored. 
* Not having any type in unplaced_blk_types_index means that it is the first iteration, hence all grids needs to be cleared */ - if (unplaced_blk_types_index.size() == device_ctx.logical_block_types.size() - 1 || unplaced_blk_types_index.size() == 0) { + if (unplaced_blk_types_index.size() == device_ctx.logical_block_types.size() - 1) { clear_all_block_types = true; } @@ -985,6 +985,23 @@ static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_type } } +static void initialize_grid_locs() { + auto& device_ctx = g_vpr_ctx.device(); + + std::unordered_set blk_types_to_be_cleared; + const auto& logical_block_types = device_ctx.logical_block_types; + + // Insert all the logical block types into the set except the empty type + // clear_block_type_grid_locs does not expect empty type to be among given types + for (const auto& logical_type : logical_block_types) { + if (!is_empty_type(&logical_type)) { + blk_types_to_be_cleared.insert(logical_type.index); + } + } + + clear_block_type_grid_locs(blk_types_to_be_cleared); +} + bool place_one_block(const ClusterBlockId& blk_id, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, @@ -1021,9 +1038,262 @@ bool place_one_block(const ClusterBlockId& blk_id, return placed_macro; } -void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints_file, bool noc_enabled) { + +static double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts) { + double noc_cost = 0.0; + noc_cost = (noc_opts.noc_placement_weighting) * ((costs.noc_aggregate_bandwidth_cost * costs.noc_aggregate_bandwidth_cost_norm) + (costs.noc_latency_cost * costs.noc_latency_cost_norm)); + return noc_cost; +} + + +static bool assess_noc_swap(double delta_cost, double prob) { + + if (delta_cost <= 0.0) { + return true; + } + + if (prob == 0.0) { + return false; + } + + float random_num = vtr::frand(); + if (random_num < prob) { + return true; + } else { + return false; + } +} + +static int 
findFirstInteger(const std::string& str) { + std::string numberString; + bool foundNumber = false; + + for (char c : str) { + if (isdigit(c)) { + numberString += c; + foundNumber = true; + } else if (foundNumber) { + // We encountered a non-digit character after finding a number, + // so we stop searching. + break; + } + } + + if (!numberString.empty()) { + // Convert the string to an integer using stoi() function + return std::stoi(numberString); + } else { + // If no integer is found, return a default value or handle the case + // according to your requirements. + return -1; + } +} + +#include + +void print_noc_grid() { + + auto& place_ctx = g_vpr_ctx.placement(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& noc_ctx = g_vpr_ctx.noc(); + + + const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); + const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; + + static int grid_arr[10][10]; + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + grid_arr[i][j] = -1; + } + } + + const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + // Iterate over all routers + for (auto router_bid : router_bids) { + + std::string router_name = cluster_ctx.clb_nlist.block_name(router_bid); + int router_id = findFirstInteger(router_name); + int placed_router_x = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_x, place_ctx.block_locs[router_bid].loc.x); + int placed_router_y = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_y, place_ctx.block_locs[router_bid].loc.y); + grid_arr[placed_router_x][placed_router_y] = router_id; + } + +// std::cout << "Router id " << router_id << " " << place_ctx.block_locs[blk_id].loc.x << " " << place_ctx.block_locs[blk_id].loc.y << std::endl; + + + std::cout << std::endl; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) 
{ + if (grid_arr[j][i] >= 0) { + std::cout << std::setw(2) << std::setfill('0') << grid_arr[j][i] << "\t"; + } else { + std::cout << std::setw(2) << std::setfill(' ') << "X-" << "\t"; + } + + } + std::cout << std::endl; + } + + std::cout << std::endl; + +} + +static void initial_noc_placement(const t_noc_opts& noc_opts) { + auto& place_ctx = g_vpr_ctx.placement(); + auto& noc_ctx = g_vpr_ctx.noc(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& device_ctx = g_vpr_ctx.device(); + + const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + std::vector unplaced_routers; + + for (auto router_blk_id : router_blk_ids) { + + if (is_block_placed((router_blk_id))) { + continue; + } + + if (is_cluster_constrained(router_blk_id)) { + // TODO: try to place the router in its region + } else { + unplaced_routers.push_back(router_blk_id); + } + } + + // Make a copy of NoC physical routers + vtr::vector noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Shuffle NoC physical routers + vtr::RandState rand_state = vtr::irand(1024); + vtr::shuffle(noc_phy_routers.begin(), noc_phy_routers.end(), rand_state); + + const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); + const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; + + std::cout << noc_phy_routers.size() << " " << unplaced_routers.size() << std::endl; + + // Iterate over shuffled physical routers to place logical routers + for (auto& phy_router : noc_phy_routers) { + + int x = phy_router.get_router_grid_position_x(); + int y = phy_router.get_router_grid_position_y(); + t_pl_loc loc(x, y, OPEN); + + if (place_ctx.grid_blocks[x][y].blocks[0] == EMPTY_BLOCK_ID) { + auto logical_router_bid = unplaced_routers.back(); + unplaced_routers.pop_back(); + + const auto& type = device_ctx.grid.get_physical_type(loc.x, loc.y); + auto& compatible_sub_tiles = 
compressed_noc_grid.compatible_sub_tiles_for_tile.at(type->index); + loc.sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; + + t_pl_macro_member macro_member; + + macro_member.blk_index = logical_router_bid; + macro_member.offset = t_pl_offset(0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool legal = try_place_macro(pl_macro, loc); + VTR_ASSERT(legal); + if (!legal) { + std::cout << "Illegal" << std::endl; + exit(0); + } + + if (unplaced_routers.empty()) { + break; + } + } + + } // end for of random router placement + + + std::cout << noc_phy_routers.size() << " " << unplaced_routers.size() << std::endl; + + initial_noc_routing(); + + print_noc_grid(); + + // Only NoC related costs are considered + t_placer_costs costs; + + costs.noc_aggregate_bandwidth_cost = comp_noc_aggregate_bandwidth_cost(); + costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); + update_noc_normalization_factors(costs); + costs.cost = calculate_noc_cost(costs, noc_opts); + + double best_agg_bw_cost = std::numeric_limits::infinity(); + double best_lat_cost = std::numeric_limits::infinity(); + + float r_lim = 9.0; + + // TODO: Can max_blocks be 2? Does it include only blocks that need to be moved or all the block whose timing is updated? 
+ t_pl_blocks_to_be_moved blocks_affected(1024); + constexpr int N_MOVES = 2500000; + const double starting_prob = 0.5; + const double prob_step = starting_prob / N_MOVES; + + // Random moves + for (int i_move = 0, n_accepted = 0; i_move < N_MOVES; i_move++) { + e_create_move create_move_outcome = e_create_move::ABORT; + clear_move_blocks(blocks_affected); + float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (r_lim/N_MOVES); + create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); + if (create_move_outcome != e_create_move::ABORT) { + + apply_move_blocks(blocks_affected); + + double noc_aggregate_bandwidth_delta_c = 0.0; + double noc_latency_delta_c = 0.0; + find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); + double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); + + + double prob = starting_prob - i_move*prob_step; + bool move_accepted = assess_noc_swap(delta_cost, prob); + + if (move_accepted) { + costs.cost += delta_cost; + n_accepted++; + commit_move_blocks(blocks_affected); + commit_noc_costs(); + costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; + costs.noc_latency_cost += noc_latency_delta_c; + best_agg_bw_cost = std::min(best_agg_bw_cost, costs.noc_aggregate_bandwidth_cost); + best_lat_cost = std::min(best_lat_cost, costs.noc_latency_cost); + if (n_accepted % 16 == 0) { + update_noc_normalization_factors(costs); + costs.cost = calculate_noc_cost(costs, noc_opts); +// print_noc_grid(); + } + } else { + revert_move_blocks(blocks_affected); + revert_noc_traffic_flow_routes(blocks_affected); + } + + } + } + + + std::cout << "Best BW cost: " << best_agg_bw_cost << std::endl; + std::cout << "Best latency cost: " << best_lat_cost << std::endl; + +} + + +void initial_placement(enum e_pad_loc_type pad_loc_type, const char* 
constraints_file, const t_noc_opts& noc_opts) { vtr::ScopedStartFinishTimer timer("Initial Placement"); + /* Initialize the grid blocks to empty. + * Initialize all the blocks to unplaced. + */ + initialize_grid_locs(); + /* Go through cluster blocks to calculate the tightest placement * floorplan constraint for each constrained block */ @@ -1033,6 +1303,12 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints * as fixed so they do not get moved during initial placement or later during the simulated annealing stage of placement*/ mark_fixed_blocks(); + if (noc_opts.noc) { + initial_noc_placement(noc_opts); + } + + print_noc_grid(); + //Assign scores to blocks and placement macros according to how difficult they are to place vtr::vector block_scores = assign_block_scores(); @@ -1043,9 +1319,9 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints check_initial_placement_legality(); // route all the traffic flows in the NoC now that all the router cluster block have been placed (this is done only if the noc optimization is enabled by the user) - if (noc_enabled) { - initial_noc_placement(); - } +// if (noc_enabled) { +// initial_noc_routing(); +// } //#ifdef VERBOSE // VTR_LOG("At end of initial_placement.\n"); diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index 994a8801da6..cd2ad03c6b5 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -38,6 +38,8 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; +void print_noc_grid(); + /** * @brief Tries to find an initial placement location for each block considering floorplanning constraints * and throws an error out if it fails after max number of attempts. @@ -51,7 +53,7 @@ struct t_grid_empty_locs_block_type { * @param noc_enabled Used to check whether the user turned on the noc * optimization during placement. 
*/ -void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints_file, bool noc_enabled); +void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints_file, const t_noc_opts& noc_opts); /** * @brief Looks for a valid placement location for block. diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index 0a53e936d51..05dcec13f0c 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -9,7 +9,7 @@ static vtr::vector traffic_flow_costs, p static std::vector affected_traffic_flows; /*********************************************************** *****************************/ -void initial_noc_placement(void) { +void initial_noc_routing(void) { // need to get placement information about where the router cluster blocks are placed on the device const auto& place_ctx = g_vpr_ctx.placement(); diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index f001dfce993..10d030c611d 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -49,7 +49,7 @@ struct TrafficFlowPlaceCost { * routed. This is why this function should only be used once. 
* */ -void initial_noc_placement(void); +void initial_noc_routing(void); /** * @brief Goes through all the cluster blocks that were moved diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index f4a6bf52dfb..2f07f832bdb 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -535,7 +535,7 @@ void try_place(const Netlist<>& net_list, vtr::ScopedStartFinishTimer timer("Placement"); - initial_placement(placer_opts.pad_loc_type, placer_opts.constraints_file.c_str(), noc_opts.noc); + initial_placement(placer_opts.pad_loc_type, placer_opts.constraints_file.c_str(), noc_opts); #ifdef ENABLE_ANALYTIC_PLACE /* @@ -693,6 +693,7 @@ void try_place(const Netlist<>& net_list, print_histogram( create_setup_slack_histogram(*timing_info->setup_analyzer())); } + size_t num_macro_members = 0; for (auto& macro : g_vpr_ctx.placement().pl_macros) { num_macro_members += macro.members.size(); @@ -1012,6 +1013,8 @@ void try_place(const Netlist<>& net_list, sprintf(msg, "\nNoC Placement Costs. noc_aggregate_bandwidth_cost: %g noc_latency_cost: %g noc_latency_constraints_cost: %d", costs.noc_aggregate_bandwidth_cost, costs.noc_latency_cost, get_number_of_traffic_flows_with_latency_cons_met()); VTR_LOG("NoC Placement Costs. noc_aggregate_bandwidth_cost: %g, noc_latency_cost: %g, noc_latency_constraints_cost: %d, \n", costs.noc_aggregate_bandwidth_cost, costs.noc_latency_cost, get_number_of_traffic_flows_with_latency_cons_met()); + + print_noc_grid(); } update_screen(ScreenUpdatePriority::MAJOR, msg, PLACEMENT, timing_info); // Print out swap statistics @@ -1341,11 +1344,11 @@ static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t /* We use a constructive initial placement and a low starting temperature * by default, but that can cause problems with NoCs as the initial logical * locations are random. Use a higher starting T in that case.*/ - if (noc_opts.noc) { - init_temp = 20. 
* std_dev; - } else { +// if (noc_opts.noc) { +// init_temp = 20. * std_dev; +// } else { init_temp = std_dev / 64; - } +// } return init_temp; } diff --git a/vpr/test/test_noc_place_utils.cpp b/vpr/test/test_noc_place_utils.cpp index 65e76fb4dbb..2186cb31701 100644 --- a/vpr/test/test_noc_place_utils.cpp +++ b/vpr/test/test_noc_place_utils.cpp @@ -173,7 +173,7 @@ TEST_CASE("test_initial_noc_placement", "[noc_place_utils]") { } // now call the test function - initial_noc_placement(); + initial_noc_routing(); // now verify the function by comparing the link bandwidths in the noc model (should have been updated by the test function) to the golden set int number_of_links = golden_link_bandwidths.size(); @@ -345,7 +345,7 @@ TEST_CASE("test_initial_comp_cost_functions", "[noc_place_utils]") { // assume this works // this is needed to set up the global noc packet router and also global datastructures - initial_noc_placement(); + initial_noc_routing(); SECTION("test_comp_noc_aggregate_bandwidth_cost") { //initialize all the cost calculator datastructures @@ -595,7 +595,7 @@ TEST_CASE("test_find_affected_noc_routers_and_update_noc_costs, test_commit_noc_ // assume this works // this is needed to set up the global noc packet router and also global datastructures - initial_noc_placement(); + initial_noc_routing(); // datastructure below will store the bandwidth usages of all the links // and will be updated throughout this test. @@ -1272,7 +1272,7 @@ TEST_CASE("test_revert_noc_traffic_flow_routes", "[noc_place_utils]") { // assume this works // this is needed to set up the global noc packet router and also global datastructures - initial_noc_placement(); + initial_noc_routing(); // datastructure below will store the bandwidth usages of all the links // and will be updated throughout this test. 
From f27c9dcd644b534c00ca591ce20f4831ce54f91b Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 19 Jun 2023 15:47:33 -0400 Subject: [PATCH 02/35] Clarify the description for propose_router_swap(). The description was somewhat misleading because the proposed move does not always involve swapping two router clusters. Sometimes it just moves a router cluster to an empty location. --- vpr/src/place/noc_place_utils.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index 10d030c611d..f255aee5cb9 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -391,13 +391,17 @@ void free_noc_placement_structs(void); bool check_for_router_swap(int user_supplied_noc_router_swap_percentage); /** - * @brief Generates a placement move by choosing two router cluster blocks to - * swap. First, a random router cluster block is chosen and then another router - * cluster block is chosen that can be swapped with the initial block such that - * the distance travelled by either block does not exceed rlim. - * - * @param blocks_affected The two router cluster blocks that are proposed to be - * swapped + * @brief Generates a placement move by first choosing a random router cluster + * and then choosing a random physical router where the selected router cluster + * can be moved to. If the selected physical router is already occupied, + * the proposed move requires swapping two router clusters. If the selected + * physical router is empty, the proposed move only requires changing the location + * of the random router cluster. The range in which the physical router is selected + * is limited such that them maximum distance travelled by the random router cluster + * does not exceed rlim. + * + * @param blocks_affected Contains one or two router clusters that are proposed + * to be moved or swapped. 
* @param rlim The maximum distance in the x and y direction that a router * cluster block can travel (this is within the compressed block space) * @return e_create_move Result of proposing the move From a4ef7a3e4d04c2b84cb3a1fb3d3a9b15973546d3 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 19 Jun 2023 15:53:33 -0400 Subject: [PATCH 03/35] Look for an empty location in find_compatible_compressed_loc_in_range(). find_compatible_compressed_loc_in_range() searches for a compatible location for a specific block type in a range around a given location. The returned location is not necessarily empty. This does not cause any problems for SA because two blocks can be easily swapped. However, in initial placement, we want to place blocks in empty locations. If this function returns a location which has already been occupied, initial placement resorts to random placement. The modified function gets a new argument (check_empty) to indicate that the returned location must be empty. --- vpr/src/place/initial_placement.cpp | 11 ++++++----- vpr/src/place/move_utils.cpp | 21 +++++++++++++++++---- vpr/src/place/move_utils.h | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 388296ecb66..d196537e17d 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -163,10 +163,11 @@ static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_l * * @param centroid_loc Calculated location in try_centroid_placement function for the block. * @param block_type Logical block type of the macro blocks. + * @param check_empty If set, the function tries to find an empty location. * * @return true if the function can find any location near the centroid one, false otherwise. 
*/ -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type); +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool check_empty); /** * @brief tries to place a macro at a centroid location of its placed connections. @@ -289,7 +290,7 @@ static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_typ return legal; } -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type) { +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool check_empty) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; //Determine centroid location in the compressed space of the current block @@ -320,7 +321,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ int cx_to, cy_to; - bool legal = find_compatible_compressed_loc_in_range(block_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false); + bool legal = find_compatible_compressed_loc_in_range(block_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false, check_empty); if (!legal) { return false; @@ -423,7 +424,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l //try to find a near location that meet these requirements bool neighbor_legal_loc = false; if (!is_loc_legal(centroid_loc, pr, block_type)) { - neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type); + neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, true); if (!neighbor_legal_loc) { //no neighbor candidate found return false; } @@ -598,7 +599,7 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log int cy_to; bool legal; - legal = find_compatible_compressed_loc_in_range(block_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, 
false); + legal = find_compatible_compressed_loc_in_range(block_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false, true); if (!legal) { //No valid position found return false; diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index 53a7c0ad248..2d9941a4fce 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -738,7 +738,7 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, } } - legal = find_compatible_compressed_loc_in_range(type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false); + legal = find_compatible_compressed_loc_in_range(type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false, false); if (!legal) { //No valid position found @@ -810,7 +810,7 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, } } - legal = find_compatible_compressed_loc_in_range(blk_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, true); + legal = find_compatible_compressed_loc_in_range(blk_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, true, false); if (!legal) { //No valid position found @@ -895,7 +895,7 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, } } - legal = find_compatible_compressed_loc_in_range(blk_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false); + legal = find_compatible_compressed_loc_in_range(blk_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false, false); if (!legal) { //No valid position found @@ -949,8 +949,10 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, int cx, int cy, t to_loc.sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; } -bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, int min_cx, int max_cx, int min_cy, int max_cy, int delta_cx, int cx_from, int cy_from, int& cx_to, int& 
cy_to, bool is_median) { +bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, int min_cx, int max_cx, int min_cy, int max_cy, int delta_cx, int cx_from, int cy_from, int& cx_to, int& cy_to, bool is_median, bool check_empty) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; + auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.placement(); std::unordered_set tried_cx_to; bool legal = false; @@ -1028,6 +1030,17 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, int if (cx_from == cx_to && cy_from == cy_to) { continue; //Same from/to location -- try again for new y-position + } else if (check_empty) { // Check if the location has at least one empty sub-tile + t_pl_loc to_loc; + compressed_grid_to_loc(type, cx_to, cy_to, to_loc); + const auto& phy_type = device_ctx.grid.get_physical_type(to_loc.x, to_loc.y); + const auto& compatible_sub_tiles = compressed_block_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + for (const auto& sub_tile : compatible_sub_tiles) { + if (place_ctx.grid_blocks[to_loc.x][to_loc.y].blocks[sub_tile] == EMPTY_BLOCK_ID) { + legal = true; + break; + } + } } else { legal = true; } diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h index 36733624eed..a32c8fa2406 100644 --- a/vpr/src/place/move_utils.h +++ b/vpr/src/place/move_utils.h @@ -225,7 +225,7 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, int cx, int cy, t * cx_to, cy_to: the x and y coordinates of the new location on the compressed grid * is_median: true if this is called from find_to_loc_median */ -bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, int min_cx, int max_cx, int min_cy, int max_cy, int delta_cx, int cx_from, int cy_from, int& cx_to, int& cy_to, bool is_median); +bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, int min_cx, int max_cx, int min_cy, int max_cy, int 
delta_cx, int cx_from, int cy_from, int& cx_to, int& cy_to, bool is_median, bool check_empty); /* * If the block to be moved (b_from) has a floorplan constraint, this routine changes the max and min coords From ade86b599512db2ec33f766b3197b380703da205 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 20 Jun 2023 12:47:04 -0400 Subject: [PATCH 04/35] Add router swap proposal based on the flow centroid. propose_router_swap_flow_centroid() randomly selects a router cluster and calculates a weighted average over the locations of the routers connected to it with flows. The weight of each router is determined by the bandwidth and latency of the flow that connects two routers. --- vpr/src/noc/noc_traffic_flows.cpp | 23 ++++ vpr/src/noc/noc_traffic_flows.h | 6 +- vpr/src/place/initial_placement.cpp | 31 +++-- vpr/src/place/noc_place_utils.cpp | 168 +++++++++++++++++++++++++--- vpr/src/place/noc_place_utils.h | 2 + 5 files changed, 202 insertions(+), 28 deletions(-) diff --git a/vpr/src/noc/noc_traffic_flows.cpp b/vpr/src/noc/noc_traffic_flows.cpp index 50e323cb716..367f4f30471 100644 --- a/vpr/src/noc/noc_traffic_flows.cpp +++ b/vpr/src/noc/noc_traffic_flows.cpp @@ -89,6 +89,29 @@ void NocTrafficFlows::finished_noc_traffic_flows_setup(void) { int number_of_traffic_flows = noc_traffic_flows.size(); traffic_flow_routes.resize(number_of_traffic_flows); + + const int num_flows = get_number_of_traffic_flows(); + double bandwidth_sum = 0.0; + double inverse_latency_sum = 0.0; + + // Iterate over all flows and calculate bandwidth and inverse latency sums + for (const auto& flow_id : noc_traffic_flows_ids) { + const auto& flow = get_single_noc_traffic_flow(flow_id); + bandwidth_sum += flow.traffic_flow_bandwidth; + inverse_latency_sum += 1.0 / flow.max_traffic_flow_latency; + } + + double bandwidth_norm_factor = bandwidth_sum / num_flows; + double inverse_latency_norm_factor = inverse_latency_sum / num_flows; + + // Iterate over all flows and assign their scores + for 
(const auto& flow_id : noc_traffic_flows_ids) { + auto& flow = noc_traffic_flows[flow_id]; + double normalized_bandwidth = flow.traffic_flow_bandwidth / bandwidth_norm_factor; + double normalized_inverse_latency = 1.0 / (flow.max_traffic_flow_latency * inverse_latency_norm_factor); + flow.score = flow.traffic_flow_priority * normalized_bandwidth * normalized_inverse_latency; + } + return; } diff --git a/vpr/src/noc/noc_traffic_flows.h b/vpr/src/noc/noc_traffic_flows.h index dae52b5184d..7749c332dc9 100644 --- a/vpr/src/noc/noc_traffic_flows.h +++ b/vpr/src/noc/noc_traffic_flows.h @@ -63,6 +63,9 @@ struct t_noc_traffic_flow { /** Indicates the importance of the traffic flow. Higher priority traffic flows will have more importance and will be more likely to have their latency reduced and constraints met. Range: [0-inf) */ int traffic_flow_priority; + /** When a weighted average is computed over flows or their properties, this score can be used as the contributing weight for its corresponding flow */ + double score; + /** Constructor initializes all variables*/ t_noc_traffic_flow(std::string source_router_name, std::string sink_router_name, ClusterBlockId source_router_id, ClusterBlockId sink_router_id, double flow_bandwidth, double max_flow_latency, int flow_priority) : source_router_module_name(source_router_name) @@ -71,7 +74,8 @@ struct t_noc_traffic_flow { , sink_router_cluster_id(sink_router_id) , traffic_flow_bandwidth(flow_bandwidth) , max_traffic_flow_latency(max_flow_latency) - , traffic_flow_priority(flow_priority) {} + , traffic_flow_priority(flow_priority) + , score(0.0) {} }; class NocTrafficFlows { diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index d196537e17d..a5946ada912 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -1142,6 +1142,8 @@ void print_noc_grid() { } +#include + static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& place_ctx = 
g_vpr_ctx.placement(); auto& noc_ctx = g_vpr_ctx.noc(); @@ -1149,6 +1151,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& device_ctx = g_vpr_ctx.device(); const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + const int num_router_clusters = router_blk_ids.size(); std::vector unplaced_routers; @@ -1175,8 +1178,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; - std::cout << noc_phy_routers.size() << " " << unplaced_routers.size() << std::endl; - // Iterate over shuffled physical routers to place logical routers for (auto& phy_router : noc_phy_routers) { @@ -1202,7 +1203,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { bool legal = try_place_macro(pl_macro, loc); VTR_ASSERT(legal); if (!legal) { - std::cout << "Illegal" << std::endl; exit(0); } @@ -1214,8 +1214,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { } // end for of random router placement - std::cout << noc_phy_routers.size() << " " << unplaced_routers.size() << std::endl; - initial_noc_routing(); print_noc_grid(); @@ -1227,15 +1225,20 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); update_noc_normalization_factors(costs); costs.cost = calculate_noc_cost(costs, noc_opts); +// double prev_cost = costs.cost; double best_agg_bw_cost = std::numeric_limits::infinity(); double best_lat_cost = std::numeric_limits::infinity(); float r_lim = 9.0; - // TODO: Can max_blocks be 2? Does it include only blocks that need to be moved or all the block whose timing is updated? 
- t_pl_blocks_to_be_moved blocks_affected(1024); - constexpr int N_MOVES = 2500000; + + // At most, two routers are swapped + t_pl_blocks_to_be_moved blocks_affected(2); + + // Total number of moves + const int N_MOVES = num_router_clusters * 35 * 1000; + const double starting_prob = 0.5; const double prob_step = starting_prob / N_MOVES; @@ -1243,8 +1246,13 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { for (int i_move = 0, n_accepted = 0; i_move < N_MOVES; i_move++) { e_create_move create_move_outcome = e_create_move::ABORT; clear_move_blocks(blocks_affected); - float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (r_lim/N_MOVES); - create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); + if (i_move % 2 == 0) { + float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (r_lim/N_MOVES); + create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); + } else { + create_move_outcome = propose_router_swap_flow_centroid(blocks_affected); + } + if (create_move_outcome != e_create_move::ABORT) { apply_move_blocks(blocks_affected); @@ -1254,7 +1262,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); - double prob = starting_prob - i_move*prob_step; bool move_accepted = assess_noc_swap(delta_cost, prob); @@ -1267,7 +1274,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { costs.noc_latency_cost += noc_latency_delta_c; best_agg_bw_cost = std::min(best_agg_bw_cost, costs.noc_aggregate_bandwidth_cost); best_lat_cost = std::min(best_lat_cost, costs.noc_latency_cost); - if (n_accepted % 16 == 0) { + if (n_accepted % 128 == 0) { update_noc_normalization_factors(costs); costs.cost = 
calculate_noc_cost(costs, noc_opts); // print_noc_grid(); diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index 05dcec13f0c..2ec4f147a59 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -447,36 +447,54 @@ bool check_for_router_swap(int user_supplied_noc_router_swap_percentage) { return (vtr::irand(99) < user_supplied_noc_router_swap_percentage) ? true : false; } -e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim) { +static bool select_random_router_cluster(ClusterBlockId& b_from, t_pl_loc& from, t_logical_block_type_ptr& cluster_from_type) { // need to access all the router cluster blocks in the design auto& noc_ctx = g_vpr_ctx.noc(); + // + auto& place_ctx = g_vpr_ctx.placement(); + // + auto& cluster_ctx = g_vpr_ctx.clustering(); + // get a reference to the collection of router cluster blocks in the design const std::vector& router_clusters = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - // if there are no router cluster blocks to swap then abort + // if there are no router cluster blocks, return false if (router_clusters.empty()) { - return e_create_move::ABORT; + return false; } - int number_of_router_blocks = router_clusters.size(); + const int number_of_router_blocks = router_clusters.size(); //randomly choose a router block to move - int random_cluster_block_index = vtr::irand(number_of_router_blocks - 1); - ClusterBlockId b_from = router_clusters[random_cluster_block_index]; + const int random_cluster_block_index = vtr::irand(number_of_router_blocks - 1); + b_from = router_clusters[random_cluster_block_index]; - auto& place_ctx = g_vpr_ctx.placement(); - auto& cluster_ctx = g_vpr_ctx.clustering(); + from = place_ctx.block_locs[b_from].loc; + cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from); + const auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type(from.x, from.y); + 
VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type)); - //check if the block is movable - if (place_ctx.block_locs[b_from].is_fixed) { + return true; +} + +e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim) { + + // block ID for the randomly selected router cluster + ClusterBlockId b_from; + // current location of the randomly selected router cluster + t_pl_loc from; + // logical block type of the randomly selected router cluster + t_logical_block_type_ptr cluster_from_type; + bool random_select_success = false; + + // Randomly select a router cluster + random_select_success = select_random_router_cluster(b_from, from, cluster_from_type); + + // If a random router cluster could not be selected, no move can be proposed + if (!random_select_success) { return e_create_move::ABORT; } - t_pl_loc from = place_ctx.block_locs[b_from].loc; - auto cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from); - auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type(from.x, from.y); - VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type)); - // now choose a compatible block to swap with t_pl_loc to; @@ -494,6 +512,126 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa return create_move; } +e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_affected) { + + auto& noc_ctx = g_vpr_ctx.noc(); + auto& place_ctx = g_vpr_ctx.placement(); + const auto& grid = g_vpr_ctx.device().grid; + + + // block ID for the randomly selected router cluster + ClusterBlockId b_from; + // current location of the randomly selected router cluster + t_pl_loc from; + // logical block type of the randomly selected router cluster + t_logical_block_type_ptr cluster_from_type; + bool random_select_success = false; + + // Randomly select a router cluster + random_select_success = select_random_router_cluster(b_from, from, cluster_from_type); + + const auto& compressed_noc_grid = 
g_vpr_ctx.placement().compressed_block_grids[cluster_from_type->index]; + + // If a random router cluster could not be selected, no move can be proposed + if (!random_select_success) { + return e_create_move::ABORT; + } + + // Get all the traffic flow associated with the selected router cluster + const std::vector* associated_flows = noc_ctx.noc_traffic_flows_storage.get_traffic_flows_associated_to_router_block(b_from); + + // There are no associated flows for this router. Centroid location cannot be calculated. + if (associated_flows == nullptr) { + return e_create_move::ABORT; + } + + double acc_x = 0.0; + double acc_y = 0.0; + double acc_weight = 0.0; + + // iterate over all the flows associated with the given router + for (auto flow_id : *associated_flows) { + auto& flow = noc_ctx.noc_traffic_flows_storage.get_single_noc_traffic_flow(flow_id); + ClusterBlockId source_blk_id = flow.source_router_cluster_id; + ClusterBlockId sink_blk_id = flow.sink_router_cluster_id; + + if (b_from == source_blk_id) { + acc_x += flow.score * place_ctx.block_locs[sink_blk_id].loc.x; + acc_y += flow.score * place_ctx.block_locs[sink_blk_id].loc.y; + acc_weight += flow.score; + } else if (b_from == sink_blk_id) { + acc_x += flow.score * place_ctx.block_locs[source_blk_id].loc.x; + acc_y += flow.score * place_ctx.block_locs[source_blk_id].loc.y; + acc_weight += flow.score; + } else { + VTR_ASSERT(false); + } + } + + + t_pl_loc centroid_loc(OPEN, OPEN, OPEN); + + if (acc_weight > 0.0) { + centroid_loc.x = (int)round(acc_x / acc_weight); + centroid_loc.y = (int)round(acc_y / acc_weight); + } else { + return e_create_move::ABORT; + } + + if (!is_loc_on_chip(centroid_loc.x, centroid_loc.y)) { + return e_create_move::ABORT; + } + + + const auto& physical_type = grid.get_physical_type(centroid_loc.x, centroid_loc.y); + + // If the calculated centroid does not have a compatible type, find a compatible location nearby + if (!is_tile_compatible(physical_type, cluster_from_type)) { + + 
//Determine centroid location in the compressed space of the current block + int cx_centroid = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_x, centroid_loc.x); + int cy_centroid = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_y, centroid_loc.y); + + const int r_lim = 1; + int r_lim_x = std::min(compressed_noc_grid.compressed_to_grid_x.size(), r_lim); + int r_lim_y = std::min(compressed_noc_grid.compressed_to_grid_y.size(), r_lim); + + //Determine the valid compressed grid location ranges + int min_cx, max_cx, delta_cx; + int min_cy, max_cy; + + min_cx = std::max(0, cx_centroid - r_lim_x); + max_cx = std::min(compressed_noc_grid.compressed_to_grid_x.size() - 1, cx_centroid + r_lim_x); + + min_cy = std::max(0, cy_centroid - r_lim_y); + max_cy = std::min(compressed_noc_grid.compressed_to_grid_y.size() - 1, cy_centroid + r_lim_y); + + delta_cx = max_cx - min_cx; + + int cx_from = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_x, from.x); + int cy_from = grid_to_compressed_approx(compressed_noc_grid.compressed_to_grid_y, from.y); + + int cx_to, cy_to; + + bool legal = find_compatible_compressed_loc_in_range(cluster_from_type, min_cx, max_cx, min_cy, max_cy, delta_cx, cx_from, cy_from, cx_to, cy_to, false, false); + + if (!legal) { + return e_create_move::ABORT; + } + + compressed_grid_to_loc(cluster_from_type, cx_to, cy_to, centroid_loc); + } + + e_create_move create_move = ::create_move(blocks_affected, b_from, centroid_loc); + + //Check that all the blocks affected by the move would still be in a legal floorplan region after the swap + if (!floorplan_legal(blocks_affected)) { + return e_create_move::ABORT; + } + + return create_move; +} + void write_noc_placement_file(std::string file_name) { // we need the clustered netlist to get the names of all the NoC router cluster blocks auto& cluster_ctx = g_vpr_ctx.clustering(); diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index 
f255aee5cb9..256ecfc45bf 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -408,6 +408,8 @@ bool check_for_router_swap(int user_supplied_noc_router_swap_percentage); */ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim); +e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_affected); + /** * @brief Writes out the locations of the router cluster blocks in the * final placement. This file contains only NoC routers and the From c11000e1776ad4de5aa7a479d7543ca87fb331cb Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 20 Jun 2023 16:07:48 -0400 Subject: [PATCH 05/35] Clean the code for initial router placement. --- vpr/src/place/initial_placement.cpp | 77 ++++++++++++++--------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index a5946ada912..aa9cf4b7bc4 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -17,6 +17,7 @@ #include #include +#include #ifdef VERBOSE void print_clb_placement(const char* fname); @@ -1142,21 +1143,22 @@ void print_noc_grid() { } -#include - static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& place_ctx = g_vpr_ctx.placement(); auto& noc_ctx = g_vpr_ctx.noc(); auto& cluster_ctx = g_vpr_ctx.clustering(); auto& device_ctx = g_vpr_ctx.device(); + // Get all the router clusters and figure out how many of them exist const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); const int num_router_clusters = router_blk_ids.size(); - std::vector unplaced_routers; + // Holds all the routers that are not fixed into a specific location by constraints + std::vector unfixed_routers; for (auto router_blk_id : router_blk_ids) { + // The block is fixed and was placed in mark_fixed_blocks() if (is_block_placed((router_blk_id))) { continue; } @@ -1164,56 
+1166,62 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (is_cluster_constrained(router_blk_id)) { // TODO: try to place the router in its region } else { - unplaced_routers.push_back(router_blk_id); + unfixed_routers.push_back(router_blk_id); } } - // Make a copy of NoC physical routers + // Make a copy of NoC physical routers because we want to change its order vtr::vector noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); // Shuffle NoC physical routers vtr::RandState rand_state = vtr::irand(1024); vtr::shuffle(noc_phy_routers.begin(), noc_phy_routers.end(), rand_state); + // Get the logical block type for router const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); + + // Get the compressed grid for NoC const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; // Iterate over shuffled physical routers to place logical routers - for (auto& phy_router : noc_phy_routers) { + // Since physical routers are shuffled, router placement would be random + for (const auto& phy_router : noc_phy_routers) { int x = phy_router.get_router_grid_position_x(); int y = phy_router.get_router_grid_position_y(); - t_pl_loc loc(x, y, OPEN); - if (place_ctx.grid_blocks[x][y].blocks[0] == EMPTY_BLOCK_ID) { - auto logical_router_bid = unplaced_routers.back(); - unplaced_routers.pop_back(); + // Find a compatible sub-tile + const auto& phy_type = device_ctx.grid.get_physical_type(x, y); + const auto& compatible_sub_tiles = compressed_noc_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + int sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; - const auto& type = device_ctx.grid.get_physical_type(loc.x, loc.y); - auto& compatible_sub_tiles = compressed_noc_grid.compatible_sub_tiles_for_tile.at(type->index); - loc.sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; + t_pl_loc 
loc(x, y, sub_tile); - t_pl_macro_member macro_member; + if (place_ctx.grid_blocks[x][y].blocks[sub_tile] == EMPTY_BLOCK_ID) { + // Pick one of the unplaced routers + auto logical_router_bid = unfixed_routers.back(); + unfixed_routers.pop_back(); + // Create a macro with a single member + t_pl_macro_member macro_member; macro_member.blk_index = logical_router_bid; macro_member.offset = t_pl_offset(0, 0, 0); t_pl_macro pl_macro; pl_macro.members.push_back(macro_member); bool legal = try_place_macro(pl_macro, loc); - VTR_ASSERT(legal); if (!legal) { - exit(0); + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster into an empty physical router."); } - if (unplaced_routers.empty()) { + // When all router clusters are placed, stop iterating over remaining physical routers + if (unfixed_routers.empty()) { break; } } - } // end for of random router placement - + // populate internal data structures to maintain route, bandwidth usage, and latencies initial_noc_routing(); print_noc_grid(); @@ -1225,13 +1233,12 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); update_noc_normalization_factors(costs); costs.cost = calculate_noc_cost(costs, noc_opts); -// double prev_cost = costs.cost; double best_agg_bw_cost = std::numeric_limits::infinity(); double best_lat_cost = std::numeric_limits::infinity(); - float r_lim = 9.0; - + // Maximum distance in each direction that a router can travel in a move + const float max_r_lim = ceil(sqrtf(noc_phy_routers.size())); // At most, two routers are swapped t_pl_blocks_to_be_moved blocks_affected(2); @@ -1242,16 +1249,16 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { const double starting_prob = 0.5; const double prob_step = starting_prob / N_MOVES; - // Random moves + // Generate and evaluate router moves for (int i_move = 0, n_accepted = 0; i_move < N_MOVES; i_move++) { e_create_move create_move_outcome = e_create_move::ABORT; 
clear_move_blocks(blocks_affected); - if (i_move % 2 == 0) { - float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (r_lim/N_MOVES); +// if (i_move % 2 == 0) { + float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (max_r_lim/N_MOVES); create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); - } else { - create_move_outcome = propose_router_swap_flow_centroid(blocks_affected); - } +// } else { +// create_move_outcome = propose_router_swap_flow_centroid(blocks_affected); +// } if (create_move_outcome != e_create_move::ABORT) { @@ -1277,20 +1284,14 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (n_accepted % 128 == 0) { update_noc_normalization_factors(costs); costs.cost = calculate_noc_cost(costs, noc_opts); -// print_noc_grid(); } } else { revert_move_blocks(blocks_affected); revert_noc_traffic_flow_routes(blocks_affected); } - } - } - - - std::cout << "Best BW cost: " << best_agg_bw_cost << std::endl; - std::cout << "Best latency cost: " << best_lat_cost << std::endl; + } } @@ -1312,6 +1313,7 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints mark_fixed_blocks(); if (noc_opts.noc) { + // NoC routers are placed before other blocks initial_noc_placement(noc_opts); } @@ -1326,11 +1328,6 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints //if any blocks remain unplaced, print an error check_initial_placement_legality(); - // route all the traffic flows in the NoC now that all the router cluster block have been placed (this is done only if the noc optimization is enabled by the user) -// if (noc_enabled) { -// initial_noc_routing(); -// } - //#ifdef VERBOSE // VTR_LOG("At end of initial_placement.\n"); // if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_INITIAL_CLB_PLACEMENT)) { From afc7e4e6bb1d6d3d67c9e3a74c932c331b0ef732 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 26 Jun 2023 11:35:08 -0400 Subject: [PATCH 06/35] Add RouterPlacementCheckpoint to store 
initial router placement checkpoints. --- vpr/src/place/initial_placement.cpp | 111 +++++++++++++++++++++++++--- vpr/src/place/initial_placement.h | 17 +++++ 2 files changed, 116 insertions(+), 12 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index aa9cf4b7bc4..f703761d4da 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -240,6 +240,89 @@ static void place_all_blocks(vtr::vector& block_s */ static void check_initial_placement_legality(); +RouterPlacementCheckpoint::RouterPlacementCheckpoint() : + valid_(false), + cost_(std::numeric_limits::infinity()) { + const auto& noc_ctx = g_vpr_ctx.noc(); + + // Get all router clusters in the net-list + const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + router_locations_.clear(); + + for (const auto& router_bid : router_bids) { + router_locations_[router_bid] = t_pl_loc(OPEN, OPEN, OPEN); + } +} + +void RouterPlacementCheckpoint::save_checkpoint(double cost) { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& place_ctx = g_vpr_ctx.placement(); + + const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + for (const auto& router_bid : router_bids) { + t_pl_loc loc = place_ctx.block_locs[router_bid].loc; + router_locations_[router_bid] = loc; + } + valid_ = true; + cost_ = cost; + + std::cout << "save checkpoint is called" << std::endl; + print_noc_grid(); +} + +void RouterPlacementCheckpoint::restore_checkpoint() { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.mutable_placement(); + + // Get all physical routers + const auto& noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Clear all physical routers in placement + for (const auto& phy_router : noc_phy_routers) { + + int x = phy_router.get_router_grid_position_x(); + int y = 
phy_router.get_router_grid_position_y(); + + place_ctx.grid_blocks[x][y].usage = 0; + + auto tile = device_ctx.grid.get_physical_type(x, y); + + for (auto sub_tile : tile->sub_tiles) { + auto capacity = sub_tile.capacity; + + for (int k = 0; k < capacity.total(); k++) { + if (place_ctx.grid_blocks[x][y].blocks[k + capacity.low] != INVALID_BLOCK_ID) { + place_ctx.grid_blocks[x][y].blocks[k + capacity.low] = EMPTY_BLOCK_ID; + } + } + } + } + + // Place routers based on router_locations_ + for (const auto& router_loc : router_locations_) { + ClusterBlockId router_blk_id = router_loc.first; + t_pl_loc location = router_loc.second; + +// place_ctx.grid_blocks[location.x][location.y].blocks[location.sub_tile] = router_blk_id; +// place_ctx.grid_blocks[location.x][location.y].usage++; + + set_block_location(router_blk_id, location); + } + + + std::cout << "restore checkpoint is called" << std::endl; +} + +bool RouterPlacementCheckpoint::is_valid() const{ + return valid_; +} +double RouterPlacementCheckpoint::get_cost() const { + return cost_; +} + static void check_initial_placement_legality() { auto& cluster_ctx = g_vpr_ctx.clustering(); auto& place_ctx = g_vpr_ctx.placement(); @@ -1234,11 +1317,8 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { update_noc_normalization_factors(costs); costs.cost = calculate_noc_cost(costs, noc_opts); - double best_agg_bw_cost = std::numeric_limits::infinity(); - double best_lat_cost = std::numeric_limits::infinity(); - // Maximum distance in each direction that a router can travel in a move - const float max_r_lim = ceil(sqrtf(noc_phy_routers.size())); + const float max_r_lim = ceilf(sqrtf(noc_phy_routers.size())); // At most, two routers are swapped t_pl_blocks_to_be_moved blocks_affected(2); @@ -1249,16 +1329,15 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { const double starting_prob = 0.5; const double prob_step = starting_prob / N_MOVES; + RouterPlacementCheckpoint checkpoint; + // Generate and 
evaluate router moves for (int i_move = 0, n_accepted = 0; i_move < N_MOVES; i_move++) { e_create_move create_move_outcome = e_create_move::ABORT; clear_move_blocks(blocks_affected); -// if (i_move % 2 == 0) { - float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (max_r_lim/N_MOVES); - create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); -// } else { -// create_move_outcome = propose_router_swap_flow_centroid(blocks_affected); -// } + // Shrink the range limit over time + float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (max_r_lim/N_MOVES); + create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); if (create_move_outcome != e_create_move::ABORT) { @@ -1279,9 +1358,11 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { commit_noc_costs(); costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; costs.noc_latency_cost += noc_latency_delta_c; - best_agg_bw_cost = std::min(best_agg_bw_cost, costs.noc_aggregate_bandwidth_cost); - best_lat_cost = std::min(best_lat_cost, costs.noc_latency_cost); if (n_accepted % 128 == 0) { + if (!checkpoint.is_valid() || costs.cost < checkpoint.get_cost()) { + checkpoint.save_checkpoint(costs.cost); + } + update_noc_normalization_factors(costs); costs.cost = calculate_noc_cost(costs, noc_opts); } @@ -1290,8 +1371,13 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { revert_noc_traffic_flow_routes(blocks_affected); } } + } + if (checkpoint.get_cost() < costs.cost) { + checkpoint.restore_checkpoint(); } + + } @@ -1335,3 +1421,4 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints // } //#endif } + diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index cd2ad03c6b5..7771da5175b 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -38,6 +38,23 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; +class RouterPlacementCheckpoint { + 
private: + std::unordered_map router_locations_; + bool valid_ = false; + double cost_; + + public: + RouterPlacementCheckpoint(); + RouterPlacementCheckpoint(const RouterPlacementCheckpoint& other) = delete; + RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; + + void save_checkpoint(double cost); + void restore_checkpoint(); + bool is_valid() const; + double get_cost() const; +}; + void print_noc_grid(); /** From a91abe28e46ef4aaa6818fd0a9ef5ab2d06201f6 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 26 Jun 2023 12:43:18 -0400 Subject: [PATCH 07/35] Solve non-decreasing NoC cost in initial placement Removed update_noc_normalization_factors() call in the loop. Aggregated bandwidth and latency usually decrease over SA, and as a result, the normalization factors extracted from them increase. This means that the NoC related cost may increase solely due to higher normalizing factors. --- vpr/src/place/initial_placement.cpp | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index f703761d4da..70836febf23 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef VERBOSE void print_clb_placement(const char* fname); @@ -265,10 +266,12 @@ void RouterPlacementCheckpoint::save_checkpoint(double cost) { t_pl_loc loc = place_ctx.block_locs[router_bid].loc; router_locations_[router_bid] = loc; } + + std::cout << "save checkpoint is called " << cost_ << " " << cost << std::endl; valid_ = true; cost_ = cost; - std::cout << "save checkpoint is called" << std::endl; +// std::cout << "save checkpoint is called" << std::endl; print_noc_grid(); } @@ -306,14 +309,12 @@ void RouterPlacementCheckpoint::restore_checkpoint() { ClusterBlockId router_blk_id = router_loc.first; t_pl_loc location = router_loc.second; -//
place_ctx.grid_blocks[location.x][location.y].blocks[location.sub_tile] = router_blk_id; -// place_ctx.grid_blocks[location.x][location.y].usage++; - set_block_location(router_blk_id, location); } std::cout << "restore checkpoint is called" << std::endl; + print_noc_grid(); } bool RouterPlacementCheckpoint::is_valid() const{ @@ -1174,7 +1175,7 @@ static int findFirstInteger(const std::string& str) { } } -#include + void print_noc_grid() { @@ -1208,7 +1209,6 @@ void print_noc_grid() { // std::cout << "Router id " << router_id << " " << place_ctx.block_locs[blk_id].loc.x << " " << place_ctx.block_locs[blk_id].loc.y << std::endl; - std::cout << std::endl; for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -1358,13 +1358,8 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { commit_noc_costs(); costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; costs.noc_latency_cost += noc_latency_delta_c; - if (n_accepted % 128 == 0) { - if (!checkpoint.is_valid() || costs.cost < checkpoint.get_cost()) { - checkpoint.save_checkpoint(costs.cost); - } - - update_noc_normalization_factors(costs); - costs.cost = calculate_noc_cost(costs, noc_opts); + if (costs.cost < checkpoint.get_cost() || !checkpoint.is_valid()) { + checkpoint.save_checkpoint(costs.cost); } } else { revert_move_blocks(blocks_affected); @@ -1374,6 +1369,8 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { } if (checkpoint.get_cost() < costs.cost) { + std::cout << costs.cost << std::endl; + print_noc_grid(); checkpoint.restore_checkpoint(); } From 845e04131e973efcbc440f13d3d243fd6f7a9892 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 27 Jun 2023 17:00:22 -0400 Subject: [PATCH 08/35] Re-initialize NoC-related costs when a checkpoint is restored. 
--- vpr/src/noc/noc_storage.cpp | 4 ++++ vpr/src/noc/noc_storage.h | 9 +++++++++ vpr/src/place/initial_placement.cpp | 23 ++++++++--------------- vpr/src/place/initial_placement.h | 3 ++- vpr/src/place/noc_place_utils.cpp | 17 +++++++++++++++++ vpr/src/place/noc_place_utils.h | 16 ++++++++++++++++ vpr/src/place/place.cpp | 2 +- vpr/src/place/place_checkpoint.cpp | 7 ++++++- vpr/src/place/place_checkpoint.h | 2 +- 9 files changed, 64 insertions(+), 19 deletions(-) diff --git a/vpr/src/noc/noc_storage.cpp b/vpr/src/noc/noc_storage.cpp index cb5382cd13e..10b68d521a5 100644 --- a/vpr/src/noc/noc_storage.cpp +++ b/vpr/src/noc/noc_storage.cpp @@ -23,6 +23,10 @@ const vtr::vector& NocStorage::get_noc_links(void) const { return link_storage; } +vtr::vector& NocStorage::get_mutable_noc_links(void) { + return link_storage; +} + int NocStorage::get_number_of_noc_links(void) const { return link_storage.size(); } diff --git a/vpr/src/noc/noc_storage.h b/vpr/src/noc/noc_storage.h index c1d1e025af0..defaf1e9578 100644 --- a/vpr/src/noc/noc_storage.h +++ b/vpr/src/noc/noc_storage.h @@ -183,6 +183,15 @@ class NocStorage { */ const vtr::vector& get_noc_links(void) const; + /** + * @brief Get all the links in the NoC. The links themselves can + * be modified. This function should be used when information on + * every link needs to be modified. + * + * @return A vector of links. + */ + vtr::vector& get_mutable_noc_links(void); + /** * @return An integer representing the total number of links within the * NoC. 
diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 70836febf23..34e3873f803 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -266,16 +266,11 @@ void RouterPlacementCheckpoint::save_checkpoint(double cost) { t_pl_loc loc = place_ctx.block_locs[router_bid].loc; router_locations_[router_bid] = loc; } - - std::cout << "save checkpoint is called " << cost_ << " " << cost << std::endl; valid_ = true; cost_ = cost; - -// std::cout << "save checkpoint is called" << std::endl; - print_noc_grid(); } -void RouterPlacementCheckpoint::restore_checkpoint() { +void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { const auto& noc_ctx = g_vpr_ctx.noc(); const auto& device_ctx = g_vpr_ctx.device(); auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -304,6 +299,7 @@ void RouterPlacementCheckpoint::restore_checkpoint() { } } + // Place routers based on router_locations_ for (const auto& router_loc : router_locations_) { ClusterBlockId router_blk_id = router_loc.first; @@ -312,14 +308,14 @@ void RouterPlacementCheckpoint::restore_checkpoint() { set_block_location(router_blk_id, location); } - - std::cout << "restore checkpoint is called" << std::endl; - print_noc_grid(); + // Re-initialize routes and static variables that keep track of NoC-related costs + reinitialize_noc_routing(noc_opts, costs); } bool RouterPlacementCheckpoint::is_valid() const{ return valid_; } + double RouterPlacementCheckpoint::get_cost() const { return cost_; } @@ -1332,7 +1328,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { RouterPlacementCheckpoint checkpoint; // Generate and evaluate router moves - for (int i_move = 0, n_accepted = 0; i_move < N_MOVES; i_move++) { + for (int i_move = 0; i_move < N_MOVES; i_move++) { e_create_move create_move_outcome = e_create_move::ABORT; clear_move_blocks(blocks_affected); // Shrink the range limit over time @@ 
-1353,7 +1349,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (move_accepted) { costs.cost += delta_cost; - n_accepted++; commit_move_blocks(blocks_affected); commit_noc_costs(); costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; @@ -1361,7 +1356,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (costs.cost < checkpoint.get_cost() || !checkpoint.is_valid()) { checkpoint.save_checkpoint(costs.cost); } - } else { + } else { // The proposed move is rejected revert_move_blocks(blocks_affected); revert_noc_traffic_flow_routes(blocks_affected); } @@ -1369,9 +1364,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { } if (checkpoint.get_cost() < costs.cost) { - std::cout << costs.cost << std::endl; - print_noc_grid(); - checkpoint.restore_checkpoint(); + checkpoint.restore_checkpoint(noc_opts, costs); } diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index 7771da5175b..61b2763a4cb 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -2,6 +2,7 @@ #define VPR_INITIAL_PLACEMENT_H #include "vpr_types.h" +#include "place_util.h" /** * @brief Used to assign each block a score for how difficult it is to place. 
@@ -50,7 +51,7 @@ class RouterPlacementCheckpoint { RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; void save_checkpoint(double cost); - void restore_checkpoint(); + void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); bool is_valid() const; double get_cost() const; }; diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index 2ec4f147a59..56fdc179f22 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -37,6 +37,23 @@ void initial_noc_routing(void) { return; } +void reinitialize_noc_routing(const t_noc_opts& noc_opts, t_placer_costs& costs) { + + auto& noc_ctx = g_vpr_ctx.mutable_noc(); + + // Zero out bandwidth usage for all links + for (auto& noc_link : noc_ctx.noc_model.get_mutable_noc_links()) { + noc_link.set_bandwidth_usage(0.0); + } + + // Route traffic flows and update link bandwidth usage + initial_noc_routing(); + + // Initialize traffic_flow_costs + costs.noc_aggregate_bandwidth_cost = comp_noc_aggregate_bandwidth_cost(); + costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); +} + void find_affected_noc_routers_and_update_noc_costs(const t_pl_blocks_to_be_moved& blocks_affected, double& noc_aggregate_bandwidth_delta_c, double& noc_latency_delta_c, const t_noc_opts& noc_opts) { // provides the positions where the affected blocks have moved to auto& place_ctx = g_vpr_ctx.placement(); diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index 256ecfc45bf..4d1f65adf17 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -51,6 +51,22 @@ struct TrafficFlowPlaceCost { */ void initial_noc_routing(void); +/** + * @brief Zeros out all link bandwidth usage an re-routes traffic flows. + * Initializes static variables in noc_place_utils.cpp that are used to + * keep track of NoC-related costs. + * + * This function should be called when a placement checkpoint is restored. 
+ * If the router placement in the checkpoint is different from the last + * router placement before the checkpoint is restored, link bandwidth usage, + * traffic flow routes, and static variable in noc_place_utils.cpp are no + * longer valid and need to be re-initialized. + * + * @param noc_opts NoC-related options used to calculated NoC costs + * @param costs Used to get aggregate bandwidth and latency costs. + */ +void reinitialize_noc_routing(const t_noc_opts& noc_opts, t_placer_costs& costs); + /** * @brief Goes through all the cluster blocks that were moved * in a single swap iteration during placement and checks to see diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 2f07f832bdb..1ca1f24a611 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -943,7 +943,7 @@ void try_place(const Netlist<>& net_list, if (placer_opts.place_checkpointing) restore_best_placement(placement_checkpoint, timing_info, costs, placer_criticalities, placer_setup_slacks, place_delay_model, - pin_timing_invalidator, crit_params); + pin_timing_invalidator, crit_params, noc_opts); if (placer_opts.placement_saves_per_temperature >= 1) { std::string filename = vtr::string_fmt("placement_%03d_%03d.place", diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index 971d1dcb946..0bc4b144bdf 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -1,4 +1,5 @@ #include "place_checkpoint.h" +#include "noc_place_utils.h" float t_placement_checkpoint::get_cp_cpd() { return cpd; } double t_placement_checkpoint::get_cp_bb_cost() { return costs.bb_cost; } @@ -26,7 +27,7 @@ void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_check } } -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, 
std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params) { +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, const t_noc_opts& noc_opts) { if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost < 1.05 * placement_checkpoint.get_cp_bb_cost()) { //restore the latest placement checkpoint costs = placement_checkpoint.restore_placement(); @@ -43,6 +44,10 @@ void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::s timing_info.get(), &costs); + // Re-initialize static variables that are used to keep track of NoC-related costs + // and re-compute NoC costs + reinitialize_noc_routing(noc_opts, costs); + VTR_LOG("\nCheckpoint restored\n"); } } diff --git a/vpr/src/place/place_checkpoint.h b/vpr/src/place/place_checkpoint.h index 368f7205bee..d3315aca2f2 100644 --- a/vpr/src/place/place_checkpoint.h +++ b/vpr/src/place/place_checkpoint.h @@ -50,5 +50,5 @@ class t_placement_checkpoint { void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_checkpoint, std::shared_ptr timing_info, t_placer_costs& costs, float cpd); //restore the checkpoint if it's better than the latest placement solution -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params); +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& 
placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, const t_noc_opts& noc_opts); #endif From f35b940fe2d2fed4d6ed61a658af01feac23640c Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 27 Jun 2023 17:23:48 -0400 Subject: [PATCH 09/35] Place constrained routers in their region --- vpr/src/place/initial_placement.cpp | 36 +++++++++++++++++++++++------ vpr/src/place/place_checkpoint.cpp | 7 +++++- vpr/src/place/place_checkpoint.h | 6 ++++- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 34e3873f803..6b93142ba7d 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -196,7 +196,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l * * @return true if the macro gets placed, false if not. */ -static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); +static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); /** * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. @@ -209,7 +209,7 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log * * @return true if the macro gets placed, false if not. 
*/ -static bool try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); +static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); /** * @brief Looks for a valid placement location for macro in second iteration, tries to place as many macros as possible in one column @@ -642,7 +642,7 @@ static inline void fix_IO_block_types(t_pl_macro pl_macro, t_pl_loc loc, enum e_ } } -static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; t_pl_loc loc; @@ -704,7 +704,7 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log return legal; } -static bool try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -1203,8 +1203,6 @@ void print_noc_grid() { grid_arr[placed_router_x][placed_router_y] = router_id; } -// std::cout << "Router id " << router_id << " " << place_ctx.block_locs[blk_id].loc.x << " " << place_ctx.block_locs[blk_id].loc.y << std::endl; - std::cout << std::endl; for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { @@ -1227,6 +1225,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& noc_ctx = 
g_vpr_ctx.noc(); auto& cluster_ctx = g_vpr_ctx.clustering(); auto& device_ctx = g_vpr_ctx.device(); + const auto& floorplanning_ctx = g_vpr_ctx.floorplanning(); // Get all the router clusters and figure out how many of them exist const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); @@ -1243,7 +1242,30 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { } if (is_cluster_constrained(router_blk_id)) { - // TODO: try to place the router in its region + + auto block_type = cluster_ctx.clb_nlist.block_type(router_blk_id); + const PartitionRegion& pr = floorplanning_ctx.cluster_constraints[router_blk_id]; + + // Create a macro with a single member + t_pl_macro_member macro_member; + macro_member.blk_index = router_blk_id; + macro_member.offset = t_pl_offset(0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool macro_placed = false; + for (int i_try = 0; i_try < MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY && !macro_placed; i_try++) { + macro_placed = try_random_placement(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + macro_placed = try_exhaustive_placement(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster within its constrained region"); + } + } else { unfixed_routers.push_back(router_blk_id); } diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index 0bc4b144bdf..c5be4c96411 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -27,7 +27,12 @@ void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_check } } -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, 
PlaceCritParams crit_params, const t_noc_opts& noc_opts) { +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, + t_placer_costs& costs, std::unique_ptr& placer_criticalities, + std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, + std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, + const t_noc_opts& noc_opts) { + if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost < 1.05 * placement_checkpoint.get_cp_bb_cost()) { //restore the latest placement checkpoint costs = placement_checkpoint.restore_placement(); diff --git a/vpr/src/place/place_checkpoint.h b/vpr/src/place/place_checkpoint.h index d3315aca2f2..4770ade2d9f 100644 --- a/vpr/src/place/place_checkpoint.h +++ b/vpr/src/place/place_checkpoint.h @@ -50,5 +50,9 @@ class t_placement_checkpoint { void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_checkpoint, std::shared_ptr timing_info, t_placer_costs& costs, float cpd); //restore the checkpoint if it's better than the latest placement solution -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, const t_noc_opts& noc_opts); +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, + t_placer_costs& costs, std::unique_ptr& placer_criticalities, + std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, + std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, + const t_noc_opts& noc_opts); #endif From da483b9b428234de73e86ff7d13a16797375310e Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 27 Jun 2023 18:20:24 -0400 
Subject: [PATCH 10/35] Moved RouterPlacementCheckpoint to a separate file. Used const reference to pass some arguments. Some functions in initial_placement.cpp were modified to receive const reference arguments. For instance, t_pl_macro is simply a std::vector that contains its members. Passing it by reference avoids copying the underlying std::vector. Converted some index-based loops to range-based loops for better readability. --- vpr/src/place/initial_placement.cpp | 146 ++++++------------------- vpr/src/place/initial_placement.h | 16 --- vpr/src/place/noc_place_checkpoint.cpp | 82 ++++++++++++++ vpr/src/place/noc_place_checkpoint.h | 24 ++++ 4 files changed, 140 insertions(+), 128 deletions(-) create mode 100644 vpr/src/place/noc_place_checkpoint.cpp create mode 100644 vpr/src/place/noc_place_checkpoint.h diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 6b93142ba7d..7b3012f239c 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -12,11 +12,11 @@ #include "region.h" #include "directed_moves_util.h" #include "noc_place_utils.h" +#include "noc_place_checkpoint.h" #include "echo_files.h" -#include -#include +#include #include #include @@ -47,7 +47,7 @@ constexpr int INVALID_X = -1; * @param unplaced_blk_types_index Block types that their grid locations must be cleared. * */ -static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_types_index); +static void clear_block_type_grid_locs(const std::unordered_set& unplaced_blk_types_index); /** * @brief Places the macro if the head position passed in is legal, and all the resulting @@ -58,7 +58,7 @@ static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_type * * @return true if macro was placed, false if not. */ -static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos); +static bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos); /** * @brief Control routine for placing a macro. 
@@ -78,7 +78,7 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos); * * @return true if macro was placed, false if not. */ -static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores); +static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores); /* * Assign scores to each block based on macro size and floorplanning constraints. @@ -96,7 +96,7 @@ static vtr::vector assign_block_scores(); * * @return y coordinate of the location that macro head should be placed */ -static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, t_pl_macro pl_macro); +static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, const t_pl_macro& pl_macro); /** * @brief Tries to get the first available location of a specific block type that can accomodate macro blocks @@ -107,7 +107,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first * * @return index to a column of blk_types_empty_locs_in_grid that can accomodate pl_macro and location of first available location returned by reference */ -static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid); +static int get_blk_type_first_loc(t_pl_loc& loc, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid); /** * @brief Updates the first available location (lowest y) and number of remaining blocks in the column that dense placement used to place the macro. 
@@ -118,7 +118,7 @@ static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vecto * @param blk_types_empty_locs_in_grid first location (lowest y) and number of remaining blocks in each column for the blk_id type * */ -static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid); +static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid); /** * @brief Initializes empty locations of the grid with a specific block type into vector for dense initial placement @@ -136,7 +136,7 @@ static std::vector init_blk_types_empty_locations( * @param loc The location at which the head of the macro is placed. * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. */ -static inline void fix_IO_block_types(t_pl_macro pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type); +static inline void fix_IO_block_types(const t_pl_macro& pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type); /** * @brief Determine whether a specific macro can be placed in a specific location. @@ -196,7 +196,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l * * @return true if the macro gets placed, false if not. */ -static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); +static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); /** * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. 
@@ -209,7 +209,7 @@ static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, * * @return true if the macro gets placed, false if not. */ -static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); +static bool try_exhaustive_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); /** * @brief Looks for a valid placement location for macro in second iteration, tries to place as many macros as possible in one column @@ -224,7 +224,7 @@ static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& * * @return true if the macro gets placed, false if not. */ -static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid); +static bool try_dense_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid); /** * @brief Tries for MAX_INIT_PLACE_ATTEMPTS times to place all blocks considering their floorplanning constraints and the device size @@ -241,84 +241,6 @@ static void place_all_blocks(vtr::vector& block_s */ static void check_initial_placement_legality(); -RouterPlacementCheckpoint::RouterPlacementCheckpoint() : - valid_(false), - cost_(std::numeric_limits::infinity()) { - const auto& noc_ctx = g_vpr_ctx.noc(); - - // Get all router clusters in the net-list - const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - - router_locations_.clear(); - - for (const auto& router_bid : router_bids) { - router_locations_[router_bid] = t_pl_loc(OPEN, OPEN, OPEN); - } -} - -void RouterPlacementCheckpoint::save_checkpoint(double cost) { - const auto& noc_ctx = g_vpr_ctx.noc(); - 
const auto& place_ctx = g_vpr_ctx.placement(); - - const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - - for (const auto& router_bid : router_bids) { - t_pl_loc loc = place_ctx.block_locs[router_bid].loc; - router_locations_[router_bid] = loc; - } - valid_ = true; - cost_ = cost; -} - -void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { - const auto& noc_ctx = g_vpr_ctx.noc(); - const auto& device_ctx = g_vpr_ctx.device(); - auto& place_ctx = g_vpr_ctx.mutable_placement(); - - // Get all physical routers - const auto& noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); - - // Clear all physical routers in placement - for (const auto& phy_router : noc_phy_routers) { - - int x = phy_router.get_router_grid_position_x(); - int y = phy_router.get_router_grid_position_y(); - - place_ctx.grid_blocks[x][y].usage = 0; - - auto tile = device_ctx.grid.get_physical_type(x, y); - - for (auto sub_tile : tile->sub_tiles) { - auto capacity = sub_tile.capacity; - - for (int k = 0; k < capacity.total(); k++) { - if (place_ctx.grid_blocks[x][y].blocks[k + capacity.low] != INVALID_BLOCK_ID) { - place_ctx.grid_blocks[x][y].blocks[k + capacity.low] = EMPTY_BLOCK_ID; - } - } - } - } - - - // Place routers based on router_locations_ - for (const auto& router_loc : router_locations_) { - ClusterBlockId router_blk_id = router_loc.first; - t_pl_loc location = router_loc.second; - - set_block_location(router_blk_id, location); - } - - // Re-initialize routes and static variables that keep track of NoC-related costs - reinitialize_noc_routing(noc_opts, costs); -} - -bool RouterPlacementCheckpoint::is_valid() const{ - return valid_; -} - -double RouterPlacementCheckpoint::get_cost() const { - return cost_; -} static void check_initial_placement_legality() { auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -344,7 +266,7 @@ static void check_initial_placement_legality() { static bool 
is_block_placed(ClusterBlockId blk_id) { auto& place_ctx = g_vpr_ctx.placement(); - return (!(place_ctx.block_locs[blk_id].loc.x == INVALID_X)); + return (place_ctx.block_locs[blk_id].loc.x != INVALID_X); } static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_type_ptr block_type) { @@ -546,7 +468,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l return legal; } -static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, t_pl_macro pl_macro) { +static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, const t_pl_macro& pl_macro) { int y = first_macro_loc.first_avail_loc.y; /* @@ -563,7 +485,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first return y; } -static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid) { +static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid) { //check if dense placement could place macro successfully if (blk_type_column_index == -1 || blk_types_empty_locs_in_grid->size() <= abs(blk_type_column_index)) { return; @@ -576,7 +498,7 @@ static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block blk_types_empty_locs_in_grid->at(blk_type_column_index).num_of_empty_locs_in_y_axis -= pl_macro.members.size(); } -static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid) { +static int get_blk_type_first_loc(t_pl_loc& loc, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid) { //loop over all empty locations and choose first column that can accomodate macro blocks for (unsigned int empty_loc_index = 0; empty_loc_index < blk_types_empty_locs_in_grid->size(); empty_loc_index++) { 
auto first_empty_loc = blk_types_empty_locs_in_grid->at(empty_loc_index); @@ -629,20 +551,20 @@ static std::vector init_blk_types_empty_locations( return block_type_empty_locs; } -static inline void fix_IO_block_types(t_pl_macro pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type) { +static inline void fix_IO_block_types(const t_pl_macro& pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type) { const auto& device_ctx = g_vpr_ctx.device(); auto& place_ctx = g_vpr_ctx.mutable_placement(); //If the user marked the IO block pad_loc_type as RANDOM, that means it should be randomly //placed and then stay fixed to that location, which is why the macro members are marked as fixed. const auto& type = device_ctx.grid.get_physical_type(loc.x, loc.y); if (is_io_type(type) && pad_loc_type == RANDOM) { - for (unsigned int imember = 0; imember < pl_macro.members.size(); imember++) { - place_ctx.block_locs[pl_macro.members[imember].blk_index].is_fixed = true; + for (const auto& pl_macro_member : pl_macro.members) { + place_ctx.block_locs[pl_macro_member.blk_index].is_fixed = true; } } } -static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; t_pl_loc loc; @@ -704,7 +626,7 @@ static bool try_random_placement(t_pl_macro pl_macro, const PartitionRegion& pr, return legal; } -static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +static bool try_exhaustive_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& 
compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -777,7 +699,7 @@ static bool try_exhaustive_placement(t_pl_macro pl_macro, const PartitionRegion& return placed; } -static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid) { +static bool try_dense_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid) { t_pl_loc loc; int column_index = get_blk_type_first_loc(loc, pl_macro, blk_types_empty_locs_in_grid); @@ -806,7 +728,7 @@ static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logi return legal; } -static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { +static bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos) { auto& place_ctx = g_vpr_ctx.mutable_placement(); bool macro_placed = false; @@ -821,10 +743,10 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { if (mac_can_be_placed) { // Place down the macro macro_placed = true; - for (size_t imember = 0; imember < pl_macro.members.size(); imember++) { - t_pl_loc member_pos = head_pos + pl_macro.members[imember].offset; + for (const auto& pl_macro_member : pl_macro.members) { + t_pl_loc member_pos = head_pos + pl_macro_member.offset; - ClusterBlockId iblk = pl_macro.members[imember].blk_index; + ClusterBlockId iblk = pl_macro_member.blk_index; set_block_location(iblk, member_pos); @@ -834,7 +756,7 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { return (macro_placed); } -static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores) { +static bool place_macro(int 
macros_max_num_tries, const t_pl_macro& pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores) { ClusterBlockId blk_id; blk_id = pl_macro.members[0].blk_index; @@ -865,7 +787,7 @@ static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pa //If blk_types_empty_locs_in_grid is not NULL, means that initial placement has been failed in first iteration for this block type //We need to place densely in second iteration to be able to find a legal initial placement solution - if (blk_types_empty_locs_in_grid != NULL && blk_types_empty_locs_in_grid->size() != 0) { + if (blk_types_empty_locs_in_grid != nullptr && !blk_types_empty_locs_in_grid->empty()) { macro_placed = try_dense_placement(pl_macro, pr, block_type, pad_loc_type, blk_types_empty_locs_in_grid); } @@ -926,10 +848,10 @@ static vtr::vector assign_block_scores() { } //go through placement macros and store size of macro for each block - for (auto pl_macro : pl_macros) { + for (const auto& pl_macro : pl_macros) { int size = pl_macro.members.size(); - for (unsigned int i = 0; i < pl_macro.members.size(); i++) { - block_scores[pl_macro.members[i].blk_index].macro_size = size; + for (const auto& pl_macro_member : pl_macro.members) { + block_scores[pl_macro_member.blk_index].macro_size = size; } } @@ -980,7 +902,7 @@ static void place_all_blocks(vtr::vector& block_s std::vector heap_blocks(blocks.begin(), blocks.end()); std::make_heap(heap_blocks.begin(), heap_blocks.end(), criteria); - while (heap_blocks.size()) { + while (!heap_blocks.empty()) { std::pop_heap(heap_blocks.begin(), heap_blocks.end(), criteria); auto blk_id = heap_blocks.back(); heap_blocks.pop_back(); @@ -1024,7 +946,7 @@ static void place_all_blocks(vtr::vector& block_s } } -static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_types_index) { +static void clear_block_type_grid_locs(const std::unordered_set& unplaced_blk_types_index) { auto& device_ctx = 
g_vpr_ctx.device(); bool clear_all_block_types = false; @@ -1336,7 +1258,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { costs.cost = calculate_noc_cost(costs, noc_opts); // Maximum distance in each direction that a router can travel in a move - const float max_r_lim = ceilf(sqrtf(noc_phy_routers.size())); + const float max_r_lim = ceilf(sqrtf((float)noc_phy_routers.size())); // At most, two routers are swapped t_pl_blocks_to_be_moved blocks_affected(2); diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index 61b2763a4cb..893a528a0ea 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -2,7 +2,6 @@ #define VPR_INITIAL_PLACEMENT_H #include "vpr_types.h" -#include "place_util.h" /** * @brief Used to assign each block a score for how difficult it is to place. @@ -39,22 +38,7 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; -class RouterPlacementCheckpoint { - private: - std::unordered_map router_locations_; - bool valid_ = false; - double cost_; - public: - RouterPlacementCheckpoint(); - RouterPlacementCheckpoint(const RouterPlacementCheckpoint& other) = delete; - RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; - - void save_checkpoint(double cost); - void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); - bool is_valid() const; - double get_cost() const; -}; void print_noc_grid(); diff --git a/vpr/src/place/noc_place_checkpoint.cpp b/vpr/src/place/noc_place_checkpoint.cpp new file mode 100644 index 00000000000..9cda3aa0adb --- /dev/null +++ b/vpr/src/place/noc_place_checkpoint.cpp @@ -0,0 +1,82 @@ + +#include "noc_place_checkpoint.h" +#include "noc_place_utils.h" + +RouterPlacementCheckpoint::RouterPlacementCheckpoint() : + valid_(false), + cost_(std::numeric_limits::infinity()) { + const auto& noc_ctx = g_vpr_ctx.noc(); + + // Get all router clusters in the net-list + const std::vector& 
router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + router_locations_.clear(); + + for (const auto& router_bid : router_bids) { + router_locations_[router_bid] = t_pl_loc(OPEN, OPEN, OPEN); + } +} + +void RouterPlacementCheckpoint::save_checkpoint(double cost) { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& place_ctx = g_vpr_ctx.placement(); + + const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + for (const auto& router_bid : router_bids) { + t_pl_loc loc = place_ctx.block_locs[router_bid].loc; + router_locations_[router_bid] = loc; + } + valid_ = true; + cost_ = cost; +} + +void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.mutable_placement(); + + // Get all physical routers + const auto& noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Clear all physical routers in placement + for (const auto& phy_router : noc_phy_routers) { + + int x = phy_router.get_router_grid_position_x(); + int y = phy_router.get_router_grid_position_y(); + + place_ctx.grid_blocks[x][y].usage = 0; + + auto tile = device_ctx.grid.get_physical_type(x, y); + + for (auto sub_tile : tile->sub_tiles) { + auto capacity = sub_tile.capacity; + + for (int k = 0; k < capacity.total(); k++) { + if (place_ctx.grid_blocks[x][y].blocks[k + capacity.low] != INVALID_BLOCK_ID) { + place_ctx.grid_blocks[x][y].blocks[k + capacity.low] = EMPTY_BLOCK_ID; + } + } + } + } + + + // Place routers based on router_locations_ + for (const auto& router_loc : router_locations_) { + ClusterBlockId router_blk_id = router_loc.first; + t_pl_loc location = router_loc.second; + + set_block_location(router_blk_id, location); + } + + // Re-initialize routes and static variables that keep track of NoC-related costs + reinitialize_noc_routing(noc_opts, 
costs); +} + +bool RouterPlacementCheckpoint::is_valid() const{ + return valid_; +} + +double RouterPlacementCheckpoint::get_cost() const { + return cost_; +} diff --git a/vpr/src/place/noc_place_checkpoint.h b/vpr/src/place/noc_place_checkpoint.h new file mode 100644 index 00000000000..6e29f45417f --- /dev/null +++ b/vpr/src/place/noc_place_checkpoint.h @@ -0,0 +1,24 @@ +#ifndef VTR_ROUTERPLACEMENTCHECKPOINT_H +#define VTR_ROUTERPLACEMENTCHECKPOINT_H + +#include "vpr_types.h" +#include "place_util.h" + +class RouterPlacementCheckpoint { + private: + std::unordered_map router_locations_; + bool valid_ = false; + double cost_; + + public: + RouterPlacementCheckpoint(); + RouterPlacementCheckpoint(const RouterPlacementCheckpoint& other) = delete; + RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; + + void save_checkpoint(double cost); + void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); + bool is_valid() const; + double get_cost() const; +}; + +#endif //VTR_ROUTERPLACEMENTCHECKPOINT_H From aea82216059d37fbdbb81defb5f199751255d4d7 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Thu, 29 Jun 2023 17:20:59 -0400 Subject: [PATCH 11/35] Call NoC-related functions only when NoC is enabled. Some NoC-related functions like print_noc_grid() and reinitialize_noc_routing() were called without checking whether NoC is enabled, causing illegal memory access. These function calls were moved into if statements to check that NoC option is enabled. 
--- vpr/src/place/initial_placement.cpp | 3 +-- vpr/src/place/place_checkpoint.cpp | 5 ++++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 7b3012f239c..f67d1ecb337 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -1335,10 +1335,9 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints if (noc_opts.noc) { // NoC routers are placed before other blocks initial_noc_placement(noc_opts); + print_noc_grid(); } - print_noc_grid(); - //Assign scores to blocks and placement macros according to how difficult they are to place vtr::vector block_scores = assign_block_scores(); diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index c5be4c96411..078c5c8973d 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -51,7 +51,10 @@ void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::s // Re-initialize static variables that are used to keep track of NoC-related costs // and re-compute NoC costs - reinitialize_noc_routing(noc_opts, costs); + if (noc_opts.noc) { + reinitialize_noc_routing(noc_opts, costs); + } + VTR_LOG("\nCheckpoint restored\n"); } From 49ef3c3a40e3a7d0fb8ffc4622be167332d8d6c5 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 10 Jul 2023 12:40:34 -0400 Subject: [PATCH 12/35] Cherry-picked packing changes. 
--- vpr/src/base/vpr_context.h | 5 + vpr/src/base/vpr_types.cpp | 247 ++++++++++++++++++++++++++++++- vpr/src/base/vpr_types.h | 19 ++- vpr/src/pack/cluster.cpp | 32 ++-- vpr/src/pack/cluster.h | 3 - vpr/src/pack/cluster_util.cpp | 24 ++- vpr/src/pack/cluster_util.h | 2 +- vpr/src/pack/pack.cpp | 269 +++------------------------------- 8 files changed, 318 insertions(+), 283 deletions(-) diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h index 7a2d8450819..8ef200b96ee 100644 --- a/vpr/src/base/vpr_context.h +++ b/vpr/src/base/vpr_context.h @@ -336,6 +336,11 @@ struct ClusteringHelperContext : public Context { // the utilization of external input/output pins during packing (between 0 and 1) t_ext_pin_util_targets target_external_pin_util; + // During clustering, a block is related to un-clustered primitives with nets. + // This relation has three types: low fanout, high fanout, and transitive + // high_fanout_thresholds stores the threshold for nets to a block type to be considered high fanout + t_pack_high_fanout_thresholds high_fanout_thresholds; + // A vector of unordered_sets of AtomBlockIds that are inside each clustered block [0 .. num_clustered_blocks-1] // unordered_set for faster insertion/deletion during the iterative improvement process of packing vtr::vector> atoms_lookup; diff --git a/vpr/src/base/vpr_types.cpp b/vpr/src/base/vpr_types.cpp index c6c688e97c3..10510c57b6c 100644 --- a/vpr/src/base/vpr_types.cpp +++ b/vpr/src/base/vpr_types.cpp @@ -7,7 +7,124 @@ t_ext_pin_util_targets::t_ext_pin_util_targets(float default_in_util, float defa defaults_.output_pin_util = default_out_util; } -t_ext_pin_util t_ext_pin_util_targets::get_pin_util(std::string block_type_name) const { +t_ext_pin_util_targets::t_ext_pin_util_targets(const std::vector& specs) + : t_ext_pin_util_targets(1., 1.) { + + if (specs.size() == 1 && specs[0] == "auto") { + //No user-specified pin utilizations, infer them automatically.
+ // + //We set a pin utilization target based on the block type, with + //the logic block having a lower utilization target and other blocks + //(e.g. hard blocks) having no limit. + + auto& device_ctx = g_vpr_ctx.device(); + auto& grid = device_ctx.grid; + t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); + + //Allowing 100% pin utilization of the logic block type can harm + //routability, since it may allow a few (typically outlier) clusters to + //use a very large number of pins -- causing routability issues. These + //clusters can cause failed routings where only a handful of routing + //resource nodes remain overused (and do not resolve). These can be + //avoided by putting a (soft) limit on the number of input pins which + //can be used, effectively clipping off the most egregious outliers. + // + //Experiments show that limiting input utilization produces better quality + //than limiting output utilization (limiting input utilization implicitly + //also limits output utilization). + // + //For relatively high pin utilizations (e.g. > 70%) this has little-to-no + //impact on the number of clusters required. As a result we set a default + //input pin utilization target which is high, but less than 100%.
+ if (logic_block_type != nullptr) { + constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8; + constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0; + + t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL); + + set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util); + } else { + VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n"); + } + + } else { + //Process user specified overrides + + bool default_set = false; + std::set seen_block_types; + + for (const auto& spec : specs) { + t_ext_pin_util target_ext_pin_util(1., 1.); + + auto block_values = vtr::split(spec, ":"); + std::string block_type; + std::string values; + if (block_values.size() == 2) { + block_type = block_values[0]; + values = block_values[1]; + } else if (block_values.size() == 1) { + values = block_values[0]; + } else { + std::stringstream msg; + msg << "In valid block pin utilization specification '" << spec << "' (expected at most one ':' between block name and values"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + auto elements = vtr::split(values, ","); + if (elements.size() == 1) { + target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); + } else if (elements.size() == 2) { + target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); + target_ext_pin_util.output_pin_util = vtr::atof(elements[1]); + } else { + std::stringstream msg; + msg << "Invalid conversion from '" << spec << "' to external pin util (expected either a single float value, or two float values separted by a comma)"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + if (target_ext_pin_util.input_pin_util < 0. || target_ext_pin_util.input_pin_util > 1.) 
{ + std::stringstream msg; + msg << "Out of range target input pin utilization '" << target_ext_pin_util.input_pin_util << "' (expected within range [0.0, 1.0])"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + if (target_ext_pin_util.output_pin_util < 0. || target_ext_pin_util.output_pin_util > 1.) { + std::stringstream msg; + msg << "Out of range target output pin utilization '" << target_ext_pin_util.output_pin_util << "' (expected within range [0.0, 1.0])"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + if (block_type.empty()) { + //Default value + if (default_set) { + std::stringstream msg; + msg << "Only one default pin utilization should be specified"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + set_default_pin_util(target_ext_pin_util); + default_set = true; + } else { + if (seen_block_types.count(block_type)) { + std::stringstream msg; + msg << "Only one pin utilization should be specified for block type '" << block_type << "'"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + set_block_pin_util(block_type, target_ext_pin_util); + seen_block_types.insert(block_type); + } + } + } +} + +t_ext_pin_util_targets& t_ext_pin_util_targets::operator=(t_ext_pin_util_targets&& other) noexcept { + if (this != &other) { + defaults_ = std::move(other.defaults_); + overrides_ = std::move(other.overrides_); + } + return *this; +} + +t_ext_pin_util t_ext_pin_util_targets::get_pin_util(const std::string& block_type_name) const { auto itr = overrides_.find(block_type_name); if (itr != overrides_.end()) { return itr->second; @@ -15,7 +132,30 @@ t_ext_pin_util t_ext_pin_util_targets::get_pin_util(std::string block_type_name) return defaults_; } -void t_ext_pin_util_targets::set_block_pin_util(std::string block_type_name, t_ext_pin_util target) { +std::string t_ext_pin_util_targets::to_string() const { + std::stringstream ss; + + auto& device_ctx = g_vpr_ctx.device(); + + for (unsigned int itype = 0; itype < 
device_ctx.physical_tile_types.size(); ++itype) { + if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; + + auto blk_name = device_ctx.physical_tile_types[itype].name; + + ss << blk_name << ":"; + + auto pin_util = get_pin_util(blk_name); + ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util; + + if (itype != device_ctx.physical_tile_types.size() - 1) { + ss << " "; + } + } + + return ss.str(); +} + +void t_ext_pin_util_targets::set_block_pin_util(const std::string& block_type_name, t_ext_pin_util target) { overrides_[block_type_name] = target; } @@ -26,15 +166,91 @@ void t_ext_pin_util_targets::set_default_pin_util(t_ext_pin_util default_target) t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(int threshold) : default_(threshold) {} +t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(const std::vector& specs) + : t_pack_high_fanout_thresholds(128) { + + if (specs.size() == 1 && specs[0] == "auto") { + //No user-specified high fanout thresholds, infer them automatically. + // + //We set the high fanout threshold based on the block type, with + //the logic block having a lower threshold than other blocks. + //(Since logic blocks are the ones which tend to be too densely + //clustered.)
+ + auto& device_ctx = g_vpr_ctx.device(); + auto& grid = device_ctx.grid; + t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); + + if (logic_block_type != nullptr) { + constexpr float LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32; + + set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); + } else { + VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n"); + } + } else { + //Process user specified overrides + + bool default_set = false; + std::set seen_block_types; + + for (const auto& spec : specs) { + auto block_values = vtr::split(spec, ":"); + std::string block_type; + std::string value; + if (block_values.size() == 1) { + value = block_values[0]; + } else if (block_values.size() == 2) { + block_type = block_values[0]; + value = block_values[1]; + } else { + std::stringstream msg; + msg << "In valid block high fanout threshold specification '" << spec << "' (expected at most one ':' between block name and value"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + int threshold = vtr::atoi(value); + + if (block_type.empty()) { + //Default value + if (default_set) { + std::stringstream msg; + msg << "Only one default high fanout threshold should be specified"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + set_default(threshold); + default_set = true; + } else { + if (seen_block_types.count(block_type)) { + std::stringstream msg; + msg << "Only one high fanout threshold should be specified for block type '" << block_type << "'"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + set(block_type, threshold); + seen_block_types.insert(block_type); + } + } + } +} + +t_pack_high_fanout_thresholds& t_pack_high_fanout_thresholds::operator=(t_pack_high_fanout_thresholds&& other) noexcept { + if (this != &other) { + default_ = std::move(other.default_); + overrides_ = std::move(other.overrides_); + } + return 
*this; +} + void t_pack_high_fanout_thresholds::set_default(int threshold) { default_ = threshold; } -void t_pack_high_fanout_thresholds::set(std::string block_type_name, int threshold) { +void t_pack_high_fanout_thresholds::set(const std::string& block_type_name, int threshold) { overrides_[block_type_name] = threshold; } -int t_pack_high_fanout_thresholds::get_threshold(std::string block_type_name) const { +int t_pack_high_fanout_thresholds::get_threshold(const std::string& block_type_name) const { auto itr = overrides_.find(block_type_name); if (itr != overrides_.end()) { return itr->second; @@ -42,6 +258,29 @@ int t_pack_high_fanout_thresholds::get_threshold(std::string block_type_name) co return default_; } +std::string t_pack_high_fanout_thresholds::to_string() const { + std::stringstream ss; + + auto& device_ctx = g_vpr_ctx.device(); + + for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { + if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; + + auto blk_name = device_ctx.physical_tile_types[itype].name; + + ss << blk_name << ":"; + + auto threshold = get_threshold(blk_name); + ss << threshold; + + if (itype != device_ctx.physical_tile_types.size() - 1) { + ss << " "; + } + } + + return ss.str(); +} + /* * t_pb structure function definitions */ diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 1fc87f71e7c..d609e2f6344 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -186,16 +186,21 @@ class t_ext_pin_util_targets { public: t_ext_pin_util_targets() = default; t_ext_pin_util_targets(float default_in_util, float default_out_util); + t_ext_pin_util_targets(const std::vector& specs); + t_ext_pin_util_targets& operator=(t_ext_pin_util_targets&& other) noexcept; ///@brief Returns the input pin util of the specified block (or default if unspecified) - t_ext_pin_util get_pin_util(std::string block_type_name) const; + t_ext_pin_util get_pin_util(const std::string& 
block_type_name) const; + + ///@brief Returns a string describing input/output pin utilization targets + std::string to_string() const; public: /** * @brief Sets the pin util for the specified block type * @return true if non-default was previously set */ - void set_block_pin_util(std::string block_type_name, t_ext_pin_util target); + void set_block_pin_util(const std::string& block_type_name, t_ext_pin_util target); /** * @brief Sets the default pin util @@ -212,15 +217,21 @@ class t_pack_high_fanout_thresholds { public: t_pack_high_fanout_thresholds() = default; t_pack_high_fanout_thresholds(int threshold); + t_pack_high_fanout_thresholds(const std::vector& specs); + t_pack_high_fanout_thresholds& operator=(t_pack_high_fanout_thresholds&& other) noexcept; + + ///@brief Returns the high fanout threshold of the specified block + int get_threshold(const std::string& block_type_name) const; - int get_threshold(std::string block_type_name) const; + ///@brief Returns a string describing high fanout thresholds for different block types + std::string to_string() const; public: /** * @brief Sets the pin util for the specified block type * @return true if non-default was previously set */ - void set(std::string block_type_name, int threshold); + void set(const std::string& block_type_name, int threshold); /** * @brief Sets the default pin util diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 4f1382a990d..9651b576084 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -88,14 +88,11 @@ std::map do_clustering(const t_packer_opts& pa const t_analysis_opts& analysis_opts, const t_arch* arch, t_pack_molecule* molecule_head, - int num_models, const std::unordered_set& is_clock, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, - const t_ext_pin_util_targets& ext_pin_util_targets, - const t_pack_high_fanout_thresholds& 
high_fanout_thresholds, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, t_clustering_data& clustering_data) { @@ -123,9 +120,7 @@ std::map do_clustering(const t_packer_opts& pa t_cluster_progress_stats cluster_stats; //int num_molecules, num_molecules_processed, mols_since_last_print, blocks_since_last_analysis, - int num_blocks_hill_added, max_pb_depth, - seedindex, savedseedindex /* index of next most timing critical block */, - detailed_routing_stage; + int num_blocks_hill_added, max_pb_depth, detailed_routing_stage; const int verbosity = packer_opts.pack_verbosity; @@ -137,7 +132,6 @@ std::map do_clustering(const t_packer_opts& pa std::map num_used_type_instances; - bool is_cluster_legal; enum e_block_pack_status block_pack_status; t_cluster_placement_stats* cur_cluster_placement_stats_ptr; @@ -178,8 +172,6 @@ std::map do_clustering(const t_packer_opts& pa helper_ctx.max_cluster_size = 0; max_pb_depth = 0; - seedindex = 0; - const t_molecule_stats max_molecule_stats = calc_max_molecules_stats(molecule_head); mark_all_molecules_valid(molecule_head); @@ -224,9 +216,12 @@ std::map do_clustering(const t_packer_opts& pa clustering_delay_calc, timing_info, atom_criticality); } + // Assign gain scores to atoms and sort them based on the scores. 
auto seed_atoms = initialize_seed_atoms(packer_opts.cluster_seed_type, max_molecule_stats, atom_criticality); - istart = get_highest_gain_seed_molecule(&seedindex, seed_atoms); + /* index of next most timing critical block */ + int seed_index = 0; + istart = get_highest_gain_seed_molecule(seed_index, seed_atoms); print_pack_status_header(); @@ -235,9 +230,10 @@ std::map do_clustering(const t_packer_opts& pa *****************************************************************/ while (istart != nullptr) { - is_cluster_legal = false; - savedseedindex = seedindex; + bool is_cluster_legal = false; + int saved_seed_index = seed_index; for (detailed_routing_stage = (int)E_DETAILED_ROUTE_AT_END_ONLY; !is_cluster_legal && detailed_routing_stage != (int)E_DETAILED_ROUTE_INVALID; detailed_routing_stage++) { + // Use the total number created clusters so far as the ID for the new cluster ClusterBlockId clb_index(helper_ctx.total_clb_num); VTR_LOGV(verbosity > 2, "Complex block %d:\n", helper_ctx.total_clb_num); @@ -251,7 +247,7 @@ std::map do_clustering(const t_packer_opts& pa clb_index, istart, num_used_type_instances, packer_opts.target_device_utilization, - num_models, helper_ctx.max_cluster_size, + helper_ctx.num_models, helper_ctx.max_cluster_size, arch, packer_opts.device_layout, lb_type_rr_graphs, &router_data, detailed_routing_stage, &cluster_ctx.clb_nlist, @@ -281,8 +277,8 @@ std::map do_clustering(const t_packer_opts& pa //Progress dot for seed-block fflush(stdout); - t_ext_pin_util target_ext_pin_util = ext_pin_util_targets.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); - int high_fanout_threshold = high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); + t_ext_pin_util target_ext_pin_util = helper_ctx.target_external_pin_util.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); + int high_fanout_threshold = helper_ctx.high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); 
update_cluster_stats(istart, clb_index, is_clock, //Set of clock nets is_clock, //Set of global nets (currently all clocks) @@ -345,7 +341,7 @@ std::map do_clustering(const t_packer_opts& pa helper_ctx.primitives_list, cluster_stats, helper_ctx.total_clb_num, - num_models, + helper_ctx.num_models, helper_ctx.max_cluster_size, clb_index, detailed_routing_stage, @@ -368,10 +364,10 @@ std::map do_clustering(const t_packer_opts& pa is_cluster_legal = check_cluster_legality(verbosity, detailed_routing_stage, router_data); if (is_cluster_legal) { - istart = save_cluster_routing_and_pick_new_seed(packer_opts, helper_ctx.total_clb_num, seed_atoms, num_blocks_hill_added, clustering_data.intra_lb_routing, seedindex, cluster_stats, router_data); + istart = save_cluster_routing_and_pick_new_seed(packer_opts, helper_ctx.total_clb_num, seed_atoms, num_blocks_hill_added, clustering_data.intra_lb_routing, seed_index, cluster_stats, router_data); store_cluster_info_and_free(packer_opts, clb_index, logic_block_type, le_pb_type, le_count, clb_inter_blk_nets); } else { - free_data_and_requeue_used_mols_if_illegal(clb_index, savedseedindex, num_used_type_instances, helper_ctx.total_clb_num, seedindex); + free_data_and_requeue_used_mols_if_illegal(clb_index, saved_seed_index, num_used_type_instances, helper_ctx.total_clb_num, seed_index); } free_router_data(router_data); router_data = nullptr; diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index a9f2c1df689..e18783264b8 100644 --- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -15,14 +15,11 @@ std::map do_clustering(const t_packer_opts& pa const t_analysis_opts& analysis_opts, const t_arch* arch, t_pack_molecule* molecule_head, - int num_models, const std::unordered_set& is_clock, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, - const t_ext_pin_util_targets& ext_pin_util_targets, - const 
t_pack_high_fanout_thresholds& high_fanout_thresholds, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, t_clustering_data& clustering_data); diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index 8bc8e87923d..6c5ecb59fe0 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1639,7 +1639,7 @@ t_pack_molecule* save_cluster_routing_and_pick_new_seed(const t_packer_opts& pac router_data->saved_lb_nets = nullptr; //Pick a new seed - next_seed = get_highest_gain_seed_molecule(&seedindex, seed_atoms); + next_seed = get_highest_gain_seed_molecule(seedindex, seed_atoms); if (packer_opts.timing_driven) { if (num_blocks_hill_added > 0) { @@ -1776,7 +1776,10 @@ void mark_and_update_partial_gain(const AtomNetId net_id, /* Optimization: It can be too runtime costly for marking all sinks for * a high fanout-net that probably has no hope of ever getting packed, * thus ignore those high fanout nets */ - if (!is_global.count(net_id)) { + /* There are VCC and GND nets in the netlist. These nets have a high fanout, + * but their sinks do not necessarily have a logical relation with each other. + * Therefore, we exclude constant nets when evaluating high fanout connectivity. 
*/ + if (!is_global.count(net_id) && !atom_ctx.nlist.net_is_constant(net_id)) { /* If no low/medium fanout nets, we may need to consider * high fan-out nets for packing, so select one and store it */ AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; @@ -2098,9 +2101,7 @@ void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, //Try packing into each candidate type bool success = false; - for (size_t i = 0; i < candidate_types.size(); i++) { - auto type = candidate_types[i]; - + for (auto type : candidate_types) { t_pb* pb = new t_pb; pb->pb_graph_node = type->pb_graph_head; alloc_and_load_pb_stats(pb, feasible_block_array_size); @@ -2823,15 +2824,18 @@ std::vector initialize_seed_atoms(const e_cluster_seed seed_type, return seed_atoms; } -t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::vector seed_atoms) { +t_pack_molecule* get_highest_gain_seed_molecule(int& seed_index, const std::vector& seed_atoms) { auto& atom_ctx = g_vpr_ctx.atom(); - while (*seedindex < static_cast(seed_atoms.size())) { - AtomBlockId blk_id = seed_atoms[(*seedindex)++]; + while (seed_index < static_cast(seed_atoms.size())) { + AtomBlockId blk_id = seed_atoms[seed_index++]; + // Check if the atom has already been assigned to a cluster if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { t_pack_molecule* best = nullptr; + // Iterate over all the molecules associated with the selected atom + // and select the one with the highest gain auto rng = atom_ctx.atom_molecules.equal_range(blk_id); for (const auto& kv : vtr::make_range(rng.first, rng.second)) { t_pack_molecule* molecule = kv.second; @@ -3343,11 +3347,15 @@ std::map> identify_primiti auto& device_ctx = g_vpr_ctx.device(); std::set unique_models; + // Find all logic models used in the netlist for (auto blk : atom_nlist.blocks()) { auto model = atom_nlist.block_model(blk); unique_models.insert(model); } + /* For each technology-mapped logic model, find logical 
block types + * that can accommodate that logic model + */ for (auto model : unique_models) { model_candidates[model] = {}; diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index 1316229abc5..0d18b8708da 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -407,7 +407,7 @@ std::vector initialize_seed_atoms(const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, const vtr::vector& atom_criticality); -t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::vector seed_atoms); +t_pack_molecule* get_highest_gain_seed_molecule(int& seed_index, const std::vector& seed_atoms); float get_molecule_gain(t_pack_molecule* molecule, std::map& blk_gain, AttractGroupId cluster_attraction_group_id, AttractionInfo& attraction_groups, int num_molecule_failures); diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index a1868c80778..0f505e748a4 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -29,11 +29,13 @@ static bool try_size_device_grid(const t_arch& arch, const std::map& num_type_instances, float target_device_utilization, std::string device_layout_name); -static t_ext_pin_util_targets parse_target_external_pin_util(std::vector specs); -static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils); - -static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector specs); -static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds); +/** + * @brief Counts the total number of models + * + * @param user_models A linked list of models + * @return int The total number of models in the linked list + */ +static int count_models(const t_model* user_models); bool try_pack(t_packer_opts* packer_opts, const t_analysis_opts* analysis_opts, @@ -46,23 +48,13 @@ bool try_pack(t_packer_opts* packer_opts, std::unordered_set is_clock; std::unordered_map expected_lowest_cost_pb_gnode; //The 
molecules associated with each atom block - const t_model* cur_model; t_clustering_data clustering_data; std::vector list_of_packing_patterns; VTR_LOG("Begin packing '%s'.\n", packer_opts->circuit_file_name.c_str()); /* determine number of models in the architecture */ - helper_ctx.num_models = 0; - cur_model = user_models; - while (cur_model) { - helper_ctx.num_models++; - cur_model = cur_model->next; - } - cur_model = library_models; - while (cur_model) { - helper_ctx.num_models++; - cur_model = cur_model->next; - } + helper_ctx.num_models = count_models(user_models); + helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); @@ -113,11 +105,11 @@ bool try_pack(t_packer_opts* packer_opts, VTR_LOG("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay); } - helper_ctx.target_external_pin_util = parse_target_external_pin_util(packer_opts->target_external_pin_util); - t_pack_high_fanout_thresholds high_fanout_thresholds = parse_high_fanout_thresholds(packer_opts->high_fanout_threshold); + helper_ctx.target_external_pin_util = t_ext_pin_util_targets(packer_opts->target_external_pin_util); + helper_ctx.high_fanout_thresholds = t_pack_high_fanout_thresholds(packer_opts->high_fanout_threshold); - VTR_LOG("Packing with pin utilization targets: %s\n", target_external_pin_util_to_string(helper_ctx.target_external_pin_util).c_str()); - VTR_LOG("Packing with high fanout thresholds: %s\n", high_fanout_thresholds_to_string(high_fanout_thresholds).c_str()); + VTR_LOG("Packing with pin utilization targets: %s\n", helper_ctx.target_external_pin_util.to_string().c_str()); + VTR_LOG("Packing with high fanout thresholds: %s\n", helper_ctx.high_fanout_thresholds.to_string().c_str()); bool allow_unrelated_clustering = false; if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::ON) { @@ -143,14 +135,12 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_used_type_instances = 
do_clustering( *packer_opts, *analysis_opts, - arch, atom_mutable_ctx.list_of_pack_molecules.get(), helper_ctx.num_models, + arch, atom_mutable_ctx.list_of_pack_molecules.get(), is_clock, expected_lowest_cost_pb_gnode, allow_unrelated_clustering, balance_block_type_util, lb_type_rr_graphs, - helper_ctx.target_external_pin_util, - high_fanout_thresholds, attraction_groups, floorplan_regions_overfull, clustering_data); @@ -387,229 +377,18 @@ static bool try_size_device_grid(const t_arch& arch, const std::map specs) { - t_ext_pin_util_targets targets(1., 1.); - - if (specs.size() == 1 && specs[0] == "auto") { - //No user-specified pin utilizations, infer them automatically. - // - //We set a pin utilization target based on the block type, with - //the logic block having a lower utilization target and other blocks - //(e.g. hard blocks) having no limit. - - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); - - //Allowing 100% pin utilization of the logic block type can harm - //routability, since it may allow a few (typically outlier) clusters to - //use a very large number of pins -- causing routability issues. These - //clusters can cause failed routings where only a handful of routing - //resource nodes remain overused (and do not resolve) These can be - //avoided by putting a (soft) limit on the number of input pins which - //can be used, effectively clipping off the most egregeous outliers. - // - //Experiments show that limiting input utilization produces better quality - //than limiting output utilization (limiting input utilization implicitly - //also limits output utilization). - // - //For relatively high pin utilizations (e.g. > 70%) this has little-to-no - //impact on the number of clusters required. As a result we set a default - //input pin utilization target which is high, but less than 100%. 
- if (logic_block_type != nullptr) { - constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8; - constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0; - - t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL); - - targets.set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util); - } else { - VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n"); - } - - } else { - //Process user specified overrides - - bool default_set = false; - std::set seen_block_types; - - for (auto spec : specs) { - t_ext_pin_util target_ext_pin_util(1., 1.); - - auto block_values = vtr::split(spec, ":"); - std::string block_type; - std::string values; - if (block_values.size() == 2) { - block_type = block_values[0]; - values = block_values[1]; - } else if (block_values.size() == 1) { - values = block_values[0]; - } else { - std::stringstream msg; - msg << "In valid block pin utilization specification '" << spec << "' (expected at most one ':' between block name and values"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - auto elements = vtr::split(values, ","); - if (elements.size() == 1) { - target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); - } else if (elements.size() == 2) { - target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); - target_ext_pin_util.output_pin_util = vtr::atof(elements[1]); - } else { - std::stringstream msg; - msg << "Invalid conversion from '" << spec << "' to external pin util (expected either a single float value, or two float values separted by a comma)"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - if (target_ext_pin_util.input_pin_util < 0. || target_ext_pin_util.input_pin_util > 1.) 
{ - std::stringstream msg; - msg << "Out of range target input pin utilization '" << target_ext_pin_util.input_pin_util << "' (expected within range [0.0, 1.0])"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - if (target_ext_pin_util.output_pin_util < 0. || target_ext_pin_util.output_pin_util > 1.) { - std::stringstream msg; - msg << "Out of range target output pin utilization '" << target_ext_pin_util.output_pin_util << "' (expected within range [0.0, 1.0])"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - if (block_type.empty()) { - //Default value - if (default_set) { - std::stringstream msg; - msg << "Only one default pin utilization should be specified"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - targets.set_default_pin_util(target_ext_pin_util); - default_set = true; - } else { - if (seen_block_types.count(block_type)) { - std::stringstream msg; - msg << "Only one pin utilization should be specified for block type '" << block_type << "'"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - targets.set_block_pin_util(block_type, target_ext_pin_util); - seen_block_types.insert(block_type); - } - } - } - - return targets; -} - -static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils) { - std::stringstream ss; - - auto& device_ctx = g_vpr_ctx.device(); - - for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { - if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; - - auto blk_name = device_ctx.physical_tile_types[itype].name; - - ss << blk_name << ":"; - - auto pin_util = ext_pin_utils.get_pin_util(blk_name); - ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util; - - if (itype != device_ctx.physical_tile_types.size() - 1) { - ss << " "; - } - } - - return ss.str(); -} - -static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector specs) { - t_pack_high_fanout_thresholds 
high_fanout_thresholds(128); - - if (specs.size() == 1 && specs[0] == "auto") { - //No user-specified high fanout thresholds, infer them automatically. - // - //We set the high fanout threshold a based on the block type, with - //the logic block having a lower threshold than other blocks. - //(Since logic blocks are the ones which tend to be too densely - //clustered.) - - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); - - if (logic_block_type != nullptr) { - constexpr float LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32; - - high_fanout_thresholds.set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); - } else { - VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n"); - } - } else { - //Process user specified overrides - - bool default_set = false; - std::set seen_block_types; - - for (auto spec : specs) { - auto block_values = vtr::split(spec, ":"); - std::string block_type; - std::string value; - if (block_values.size() == 1) { - value = block_values[0]; - } else if (block_values.size() == 2) { - block_type = block_values[0]; - value = block_values[1]; - } else { - std::stringstream msg; - msg << "In valid block high fanout threshold specification '" << spec << "' (expected at most one ':' between block name and value"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - int threshold = vtr::atoi(value); - - if (block_type.empty()) { - //Default value - if (default_set) { - std::stringstream msg; - msg << "Only one default high fanout threshold should be specified"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - high_fanout_thresholds.set_default(threshold); - default_set = true; - } else { - if (seen_block_types.count(block_type)) { - std::stringstream msg; - msg << "Only one high fanout threshold should be specified for block type '" 
<< block_type << "'"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - high_fanout_thresholds.set(block_type, threshold); - seen_block_types.insert(block_type); - } - } +static int count_models(const t_model* user_models) { + if (user_models == nullptr) { + return 0; } - return high_fanout_thresholds; -} - -static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds) { - std::stringstream ss; - - auto& device_ctx = g_vpr_ctx.device(); - - for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { - if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; - - auto blk_name = device_ctx.physical_tile_types[itype].name; - - ss << blk_name << ":"; - - auto threshold = hf_thresholds.get_threshold(blk_name); - ss << threshold; + const t_model* cur_model = user_models; + int n_models = 0; - if (itype != device_ctx.physical_tile_types.size() - 1) { - ss << " "; - } + while (cur_model) { + n_models++; + cur_model = cur_model->next; } - return ss.str(); -} + return n_models; +} \ No newline at end of file From 824b6b65dadbd4889c06e23eb723082299acbe14 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 10 Jul 2023 16:07:18 -0400 Subject: [PATCH 13/35] Add prototypes and comments for functions related to initial NoC placement. --- vpr/src/place/initial_placement.cpp | 38 ++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index f67d1ecb337..aec96a6759c 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -50,6 +50,43 @@ constexpr int INVALID_X = -1; static void clear_block_type_grid_locs(const std::unordered_set& unplaced_blk_types_index); /** + * @brief Initializes the grid to empty. It also initializes the location for + * all blocks to unplaced.
+ */ +static void initialize_grid_locs(); + +/** + * @brief Calculates total NoC cost. + * + * @param costs Contains latency and aggregate bandwidth costs + * along with their corresponding normalization factors. + * @param noc_opts Contains NoC placement weighting factor. + * + * @return Calculated total NoC cost. + */ +static double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts); + +/** + * @brief Evaluates whether a NoC router swap should be accepted or not. + * + * @param delta_cost Specifies how much the total cost would change if + * the proposed swap is accepted. + * @param prob The probability by which a router swap that increases + * the cost is accepted. + * + * @return true if the proposed swap is accepted, false if not. + */ +static bool assess_noc_swap(double delta_cost, double prob); + +/** + * @brief Randomly places NoC routers, then runs a quick simulated annealing + * to minimize NoC costs. + * + * @param noc_opts NoC-related options. Used to calculate NoC-related costs. + */ +static void initial_noc_placement(const t_noc_opts& noc_opts); + + /** * @brief Places the macro if the head position passed in is legal, and all the resulting * member positions are legal * @@ -1311,7 +1348,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { checkpoint.restore_checkpoint(noc_opts, costs); } - } From 1547c0cfeda747c4da8db45b61e7fbf94b748484 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 10 Jul 2023 17:30:02 -0400 Subject: [PATCH 14/35] Change the condition for restoring a placement checkpoint. With the previous condition, if a checkpoint has a better CPD and its WL is more than 5% better than the current solution, it would not be restored. 
Also pass some arguments by const reference in initial_placement.cpp. --- vpr/src/place/initial_placement.cpp | 11 +++++------ vpr/src/place/place_checkpoint.cpp | 21 ++++++++++++++------- vpr/src/place/place_checkpoint.h | 12 ++++++++---- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index aec96a6759c..b2df4e112e3 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -86,7 +86,7 @@ static bool assess_noc_swap(double delta_cost, double prob); */ static void initial_noc_placement(const t_noc_opts& noc_opts); - /** +/** * @brief Places the macro if the head position passed in is legal, and all the resulting * member positions are legal * @@ -195,7 +195,7 @@ static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_typ * * @return a vector of blocks that are connected to this block but not yet placed so their scores can later be updated. */ -static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_loc& centroid); +static std::vector find_centroid_loc(const t_pl_macro& pl_macro, t_pl_loc& centroid); /** * @brief Tries to find a nearest location to the centroid location if calculated centroid location is not legal or is occupied. @@ -220,7 +220,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ * * @return true if the macro gets placed, false if not.
*/ -static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores); +static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores); /** * @brief tries to place a macro at a random location @@ -278,7 +278,6 @@ static void place_all_blocks(vtr::vector& block_s */ static void check_initial_placement_legality(); - static void check_initial_placement_legality() { auto& cluster_ctx = g_vpr_ctx.clustering(); auto& place_ctx = g_vpr_ctx.placement(); @@ -372,7 +371,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ return legal; } -static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_loc& centroid) { +static std::vector find_centroid_loc(const t_pl_macro& pl_macro, t_pl_loc& centroid) { auto& cluster_ctx = g_vpr_ctx.clustering(); int x, y; @@ -449,7 +448,7 @@ static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_l return connected_blocks_to_update; } -static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores) { +static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores) { t_pl_loc centroid_loc(OPEN, OPEN, OPEN); std::vector unplaced_blocks_to_update_their_score; diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index 078c5c8973d..3506a93a7e2 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -27,13 +27,21 @@ void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_check } } -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& 
timing_info, - t_placer_costs& costs, std::unique_ptr& placer_criticalities, - std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, - std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, + std::shared_ptr& timing_info, + t_placer_costs& costs, + std::unique_ptr& placer_criticalities, + std::unique_ptr& placer_setup_slacks, + std::unique_ptr& place_delay_model, + std::unique_ptr& pin_timing_invalidator, + PlaceCritParams crit_params, const t_noc_opts& noc_opts) { - - if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost < 1.05 * placement_checkpoint.get_cp_bb_cost()) { + /* The (valid) checkpoint is restored if the following conditions are met: + * 1) The checkpoint has a lower critical path delay. + * 2) The checkpoint's wire-length cost is either better than the current solution, + * or at least is not more than 5% worse than the current solution. 
+ */ + if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost * 1.05 > placement_checkpoint.get_cp_bb_cost()) { //restore the latest placement checkpoint costs = placement_checkpoint.restore_placement(); @@ -55,7 +63,6 @@ void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::s reinitialize_noc_routing(noc_opts, costs); } - VTR_LOG("\nCheckpoint restored\n"); } } diff --git a/vpr/src/place/place_checkpoint.h b/vpr/src/place/place_checkpoint.h index 4770ade2d9f..28c5b78641f 100644 --- a/vpr/src/place/place_checkpoint.h +++ b/vpr/src/place/place_checkpoint.h @@ -50,9 +50,13 @@ class t_placement_checkpoint { void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_checkpoint, std::shared_ptr timing_info, t_placer_costs& costs, float cpd); //restore the checkpoint if it's better than the latest placement solution -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, - t_placer_costs& costs, std::unique_ptr& placer_criticalities, - std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, - std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, + std::shared_ptr& timing_info, + t_placer_costs& costs, + std::unique_ptr& placer_criticalities, + std::unique_ptr& placer_setup_slacks, + std::unique_ptr& place_delay_model, + std::unique_ptr& pin_timing_invalidator, + PlaceCritParams crit_params, const t_noc_opts& noc_opts); #endif From 2597853bfadf908f3f7d0abeb81706cf2f06e4ba Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 11 Jul 2023 10:56:46 -0400 Subject: [PATCH 15/35] Avoid using constant and very high fanout (the same order as clock) nets to infer high fanout connectivity between molecules. 
--- vpr/src/pack/cluster.cpp | 4 ++- vpr/src/pack/cluster.h | 1 + vpr/src/pack/cluster_util.cpp | 10 +++---- vpr/src/pack/cluster_util.h | 1 + vpr/src/pack/pack.cpp | 53 ++++++++++++++++++++++++++++++++--- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 9651b576084..65023c54aed 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -89,6 +89,7 @@ std::map do_clustering(const t_packer_opts& pa const t_arch* arch, t_pack_molecule* molecule_head, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, @@ -281,7 +282,7 @@ std::map do_clustering(const t_packer_opts& pa int high_fanout_threshold = helper_ctx.high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); update_cluster_stats(istart, clb_index, is_clock, //Set of clock nets - is_clock, //Set of global nets (currently all clocks) + is_global, //Set of global nets (currently all clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, packer_opts.connection_driven, @@ -350,6 +351,7 @@ std::map do_clustering(const t_packer_opts& pa allow_unrelated_clustering, high_fanout_threshold, is_clock, + is_global, timing_info, router_data, target_ext_pin_util, diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index e18783264b8..e08e58dac50 100644 --- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -16,6 +16,7 @@ std::map do_clustering(const t_packer_opts& pa const t_arch* arch, t_pack_molecule* molecule_head, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index 
6c5ecb59fe0..157948201e5 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1497,6 +1497,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::shared_ptr& timing_info, t_lb_router_data* router_data, t_ext_pin_util target_ext_pin_util, @@ -1592,7 +1593,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, update_cluster_stats(next_molecule, clb_index, is_clock, //Set of all clocks - is_clock, //Set of all global signals (currently clocks) + is_global, //Set of all global signals (currently clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, packer_opts.connection_driven, high_fanout_threshold, @@ -2668,13 +2669,10 @@ t_molecule_stats calc_max_molecules_stats(const t_pack_molecule* molecule_head) std::vector initialize_seed_atoms(const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, const vtr::vector& atom_criticality) { - std::vector seed_atoms; + auto& atom_ctx = g_vpr_ctx.atom(); //Put all atoms in seed list - auto& atom_ctx = g_vpr_ctx.atom(); - for (auto blk : atom_ctx.nlist.blocks()) { - seed_atoms.emplace_back(blk); - } + std::vector seed_atoms(atom_ctx.nlist.blocks().begin(), atom_ctx.nlist.blocks().end()); //Initially all gains are zero vtr::vector atom_gains(atom_ctx.nlist.blocks().size(), 0.); diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index 0d18b8708da..eecbaf98b07 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -232,6 +232,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::shared_ptr& timing_info, t_lb_router_data* router_data, t_ext_pin_util target_ext_pin_util, diff --git 
a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 0f505e748a4..0b7775228f9 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -37,6 +37,20 @@ static bool try_size_device_grid(const t_arch& arch, const std::map A set containing all nets that are + * likely to be global control nets (e.g. reset, clock enable). + */ +static std::unordered_set find_likely_global_ctrl_nets(const std::unordered_set& clocks); + bool try_pack(t_packer_opts* packer_opts, const t_analysis_opts* analysis_opts, const t_arch* arch, @@ -45,8 +59,10 @@ bool try_pack(t_packer_opts* packer_opts, float interc_delay, std::vector* lb_type_rr_graphs) { auto& helper_ctx = g_vpr_ctx.mutable_cl_helper(); + auto& atom_ctx = g_vpr_ctx.atom(); + auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); - std::unordered_set is_clock; + std::unordered_set is_clock, is_global; std::unordered_map expected_lowest_cost_pb_gnode; //The molecules associated with each atom block t_clustering_data clustering_data; std::vector list_of_packing_patterns; @@ -57,9 +73,8 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); - - auto& atom_ctx = g_vpr_ctx.atom(); - auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); + is_global = find_likely_global_ctrl_nets(is_clock); + is_global.insert(is_clock.begin(), is_clock.end()); size_t num_p_inputs = 0; size_t num_p_outputs = 0; @@ -137,6 +152,7 @@ bool try_pack(t_packer_opts* packer_opts, *analysis_opts, arch, atom_mutable_ctx.list_of_pack_molecules.get(), is_clock, + is_global, expected_lowest_cost_pb_gnode, allow_unrelated_clustering, balance_block_type_util, @@ -332,6 +348,35 @@ std::unordered_set alloc_and_load_is_clock(bool global_clocks) { return (is_clock); } +std::unordered_set find_likely_global_ctrl_nets(const std::unordered_set& clocks) { + auto& atom_ctx = g_vpr_ctx.atom(); + + std::unordered_set likely_reset; + + if (clocks.empty()) { + return 
likely_reset; + } + + size_t max_clk_sinks = 0; + + for (auto clk_net_id : clocks) { + size_t n_sinks = atom_ctx.nlist.net_sinks(clk_net_id).size(); + max_clk_sinks = std::max(max_clk_sinks, n_sinks); + } + + constexpr float high_fanout_reset_sinks_ratio = 0.6; + for (auto net_id : atom_ctx.nlist.nets()) { + size_t n_sinks = atom_ctx.nlist.net_sinks(net_id).size(); + bool is_net_clock = clocks.count(net_id); + + if (n_sinks > high_fanout_reset_sinks_ratio * max_clk_sinks && !is_net_clock) { + likely_reset.insert(net_id); + } + } + + return likely_reset; +} + static bool try_size_device_grid(const t_arch& arch, const std::map& num_type_instances, float target_device_utilization, std::string device_layout_name) { auto& device_ctx = g_vpr_ctx.mutable_device(); From 192151b9b59d6d3db1685d89e044d8182c3dea44 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 14 Jul 2023 14:52:56 -0400 Subject: [PATCH 16/35] Disable high fanout connectivity in the first packing attempt. --- vpr/src/pack/cluster.cpp | 3 +++ vpr/src/pack/cluster.h | 1 + vpr/src/pack/cluster_util.cpp | 11 ++++++++--- vpr/src/pack/cluster_util.h | 3 +++ vpr/src/pack/pack.cpp | 16 +++++++++++++--- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 65023c54aed..cf819224c94 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -91,6 +91,7 @@ std::map do_clustering(const t_packer_opts& pa const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, + bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, @@ -301,6 +302,7 @@ std::map do_clustering(const t_packer_opts& pa cluster_stats.num_unrelated_clustering_attempts = 0; next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, + 
allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -348,6 +350,7 @@ std::map do_clustering(const t_packer_opts& pa detailed_routing_stage, attraction_groups, clb_inter_blk_nets, + allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, high_fanout_threshold, is_clock, diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index e08e58dac50..e05d6066ffd 100644 --- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -18,6 +18,7 @@ std::map do_clustering(const t_packer_opts& pa const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, + bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index 157948201e5..bc75f5880c6 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1494,6 +1494,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, const int detailed_routing_stage, AttractionInfo& attraction_groups, vtr::vector>& clb_inter_blk_nets, + bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, @@ -1554,6 +1555,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, + allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -1607,6 +1609,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, } next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, + allow_high_fanout_connectivity_clustering, 
allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -2204,6 +2207,7 @@ t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, vtr::vector>& clb_inter_blk_nets, const ClusterBlockId cluster_index, bool prioritize_transitive_connectivity, + bool allow_high_fanout_connectivity_clustering, int transitive_fanout_threshold, const int feasible_block_array_size, std::map>& primitive_candidate_block_types) { @@ -2230,12 +2234,12 @@ t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, } // 3. Find unpacked molecules based on weak connectedness (connected by high fanout nets) with current cluster - if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { + if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net && allow_high_fanout_connectivity_clustering) { add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, feasible_block_array_size, attraction_groups); } } else { //Reverse order // 3. 
Find unpacked molecules based on weak connectedness (connected by high fanout nets) with current cluster - if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { + if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net && allow_high_fanout_connectivity_clustering) { add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, feasible_block_array_size, attraction_groups); } @@ -2508,6 +2512,7 @@ bool check_free_primitives_for_molecule_atoms(t_pack_molecule* molecule, t_clust /*****************************************/ t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, AttractionInfo& attraction_groups, + const bool allow_high_fanout_connectivity_clustering, const bool allow_unrelated_clustering, const bool prioritize_transitive_connectivity, const int transitive_fanout_threshold, @@ -2531,7 +2536,7 @@ t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, auto best_molecule = get_highest_gain_molecule(cur_pb, attraction_groups, NOT_HILL_CLIMBING, cluster_placement_stats_ptr, clb_inter_blk_nets, - cluster_index, prioritize_transitive_connectivity, + cluster_index, prioritize_transitive_connectivity, allow_high_fanout_connectivity_clustering, transitive_fanout_threshold, feasible_block_array_size, primitive_candidate_block_types); /* If no blocks have any gain to the current cluster, the code above * diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index eecbaf98b07..ed33b6c369d 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -229,6 +229,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, const int detailed_routing_stage, AttractionInfo& attraction_groups, vtr::vector>& clb_inter_blk_nets, + bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, @@ -350,6 +351,7 @@ t_pack_molecule* 
get_highest_gain_molecule(t_pb* cur_pb, vtr::vector>& clb_inter_blk_nets, const ClusterBlockId cluster_index, bool prioritize_transitive_connectivity, + bool allow_high_fanout_connectivity_clustering, int transitive_fanout_threshold, const int feasible_block_array_size, std::map>& primitive_candidate_block_types); @@ -383,6 +385,7 @@ bool check_free_primitives_for_molecule_atoms(t_pack_molecule* molecule, t_clust t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, AttractionInfo& attraction_groups, + const bool allow_high_fanout_connectivity_clustering, const bool allow_unrelated_clustering, const bool prioritize_transitive_connectivity, const int transitive_fanout_threshold, diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 0b7775228f9..cf0106f52b9 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -73,7 +73,7 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); - is_global = find_likely_global_ctrl_nets(is_clock); +// is_global = find_likely_global_ctrl_nets(is_clock); is_global.insert(is_clock.begin(), is_clock.end()); size_t num_p_inputs = 0; @@ -142,6 +142,7 @@ bool try_pack(t_packer_opts* packer_opts, int pack_iteration = 1; bool floorplan_regions_overfull = false; + bool allow_high_fanout_connectivity_clustering = false; while (true) { free_clustering_data(*packer_opts, clustering_data); @@ -154,6 +155,7 @@ bool try_pack(t_packer_opts* packer_opts, is_clock, is_global, expected_lowest_cost_pb_gnode, + allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, balance_block_type_util, lb_type_rr_graphs, @@ -173,6 +175,13 @@ bool try_pack(t_packer_opts* packer_opts, if (fits_on_device && !floorplan_regions_overfull) { break; //Done } else if (pack_iteration == 1 && !floorplan_not_fitting) { + VTR_ASSERT(allow_high_fanout_connectivity_clustering == false); + allow_high_fanout_connectivity_clustering = true; 
+ VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s allow_high_fanout_connectivity_clustering=%s\n", + (allow_unrelated_clustering ? "true" : "false"), + (balance_block_type_util ? "true" : "false"), + (allow_high_fanout_connectivity_clustering ? "true" : "false")); + } else if (pack_iteration == 2 && !floorplan_not_fitting) { //1st pack attempt was unsucessful (i.e. not dense enough) and we have control of unrelated clustering // //Turn it on to increase packing density @@ -184,9 +193,10 @@ bool try_pack(t_packer_opts* packer_opts, VTR_ASSERT(balance_block_type_util == false); balance_block_type_util = true; } - VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s\n", + VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s allow_high_fanout_connectivity_clustering=%s\n", (allow_unrelated_clustering ? "true" : "false"), - (balance_block_type_util ? "true" : "false")); + (balance_block_type_util ? "true" : "false"), + (allow_high_fanout_connectivity_clustering ? "true" : "false")); /* * When running with tight floorplan constraints, some regions may become overfull with clusters (i.e. * the number of blocks assigned to the region exceeds the number of blocks available). When this occurs, we From cfe51513ca89eab8816148a4d59fe397c9fe3d18 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Sat, 29 Jul 2023 20:55:56 -0400 Subject: [PATCH 17/35] Removed initial temperature boosting for NoC designs. Removed print_noc_grid() calls. 
--- vpr/src/place/initial_placement.cpp | 103 +++++++++++++--------------- vpr/src/place/initial_placement.h | 4 -- vpr/src/place/place.cpp | 13 +--- 3 files changed, 50 insertions(+), 70 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 30b0aa4c059..fc943605fae 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -1192,57 +1192,55 @@ static int findFirstInteger(const std::string& str) { } } - - -void print_noc_grid() { - - auto& place_ctx = g_vpr_ctx.placement(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& noc_ctx = g_vpr_ctx.noc(); - const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); - - const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); - const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; - - static int grid_arr[10][10]; - - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - grid_arr[i][j] = -1; - } - } - - const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - - // Iterate over all routers - for (auto router_bid : router_bids) { - - std::string router_name = cluster_ctx.clb_nlist.block_name(router_bid); - int router_id = findFirstInteger(router_name); - - auto compressed_loc = get_compressed_loc_approx(compressed_noc_grid,place_ctx.block_locs[router_bid].loc, num_layers); - - int placed_router_x = compressed_loc[0].x; - int placed_router_y = compressed_loc[0].y; - grid_arr[placed_router_x][placed_router_y] = router_id; - } - - std::cout << std::endl; - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 10; j++) { - if (grid_arr[j][i] >= 0) { - std::cout << std::setw(2) << std::setfill('0') << grid_arr[j][i] << "\t"; - } else { - std::cout << std::setw(2) << std::setfill(' ') << "X-" << "\t"; - } - - } - std::cout << std::endl; - } - - std::cout << 
std::endl; - -} +//void print_noc_grid() { +// +// auto& place_ctx = g_vpr_ctx.placement(); +// auto& cluster_ctx = g_vpr_ctx.clustering(); +// auto& noc_ctx = g_vpr_ctx.noc(); +// const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); +// +// const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); +// const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; +// +// static int grid_arr[10][10]; +// +// for (int i = 0; i < 10; i++) { +// for (int j = 0; j < 10; j++) { +// grid_arr[i][j] = -1; +// } +// } +// +// const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); +// +// // Iterate over all routers +// for (auto router_bid : router_bids) { +// +// std::string router_name = cluster_ctx.clb_nlist.block_name(router_bid); +// int router_id = findFirstInteger(router_name); +// +// auto compressed_loc = get_compressed_loc_approx(compressed_noc_grid,place_ctx.block_locs[router_bid].loc, num_layers); +// +// int placed_router_x = compressed_loc[0].x; +// int placed_router_y = compressed_loc[0].y; +// grid_arr[placed_router_x][placed_router_y] = router_id; +// } +// +// std::cout << std::endl; +// for (int i = 0; i < 10; i++) { +// for (int j = 0; j < 10; j++) { +// if (grid_arr[j][i] >= 0) { +// std::cout << std::setw(2) << std::setfill('0') << grid_arr[j][i] << "\t"; +// } else { +// std::cout << std::setw(2) << std::setfill(' ') << "X-" << "\t"; +// } +// +// } +// std::cout << std::endl; +// } +// +// std::cout << std::endl; +// +//} static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& place_ctx = g_vpr_ctx.placement(); @@ -1347,8 +1345,6 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { // populate internal data structures to maintain route, bandwidth usage, and latencies initial_noc_routing(); - print_noc_grid(); - // Only NoC related costs are considered 
t_placer_costs costs; @@ -1434,7 +1430,6 @@ void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints if (noc_opts.noc) { // NoC routers are placed before other blocks initial_noc_placement(noc_opts); - print_noc_grid(); } //Assign scores to blocks and placement macros according to how difficult they are to place diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index 893a528a0ea..9fae8c8599a 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -38,10 +38,6 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; - - -void print_noc_grid(); - /** * @brief Tries to find an initial placement location for each block considering floorplanning constraints * and throws an error out if it fails after max number of attempts. diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index de9a15bbe9e..243c35ab666 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -1024,8 +1024,6 @@ void try_place(const Netlist<>& net_list, sprintf(msg, "\nNoC Placement Costs. noc_aggregate_bandwidth_cost: %g noc_latency_cost: %g noc_latency_constraints_cost: %d", costs.noc_aggregate_bandwidth_cost, costs.noc_latency_cost, get_number_of_traffic_flows_with_latency_cons_met()); VTR_LOG("NoC Placement Costs. noc_aggregate_bandwidth_cost: %g, noc_latency_cost: %g, noc_latency_constraints_cost: %d, \n", costs.noc_aggregate_bandwidth_cost, costs.noc_latency_cost, get_number_of_traffic_flows_with_latency_cons_met()); - - print_noc_grid(); } update_screen(ScreenUpdatePriority::MAJOR, msg, PLACEMENT, timing_info); // Print out swap statistics @@ -1350,16 +1348,7 @@ static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t VTR_LOG("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. 
* std_dev); #endif - float init_temp = 0.0; - - /* We use a constructive initial placement and a low starting temperature - * by default, but that can cause problems with NoCs as the initial logical - * locations are random. Use a higher starting T in that case.*/ -// if (noc_opts.noc) { -// init_temp = 20. * std_dev; -// } else { - init_temp = std_dev / 64; -// } + float init_temp = std_dev / 64; return init_temp; } From c9cc32d842f27dc4dd791b8f74f21f39e7cd1ffc Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 7 Aug 2023 13:28:44 -0400 Subject: [PATCH 18/35] make format --- vpr/src/base/vpr_types.cpp | 2 - vpr/src/noc/noc_router.cpp | 1 - vpr/src/noc/noc_router.h | 1 - vpr/src/noc/noc_traffic_flows.cpp | 1 - vpr/src/pack/cluster.cpp | 2 +- vpr/src/pack/cluster_util.cpp | 2 +- vpr/src/pack/pack.cpp | 2 +- vpr/src/place/initial_placement.cpp | 68 ++------------------------ vpr/src/place/move_utils.cpp | 4 +- vpr/src/place/noc_place_checkpoint.cpp | 10 ++-- vpr/src/place/noc_place_checkpoint.h | 2 +- vpr/src/place/noc_place_utils.cpp | 9 +--- 12 files changed, 16 insertions(+), 88 deletions(-) diff --git a/vpr/src/base/vpr_types.cpp b/vpr/src/base/vpr_types.cpp index 10510c57b6c..ed3fc40f9d0 100644 --- a/vpr/src/base/vpr_types.cpp +++ b/vpr/src/base/vpr_types.cpp @@ -9,7 +9,6 @@ t_ext_pin_util_targets::t_ext_pin_util_targets(float default_in_util, float defa t_ext_pin_util_targets::t_ext_pin_util_targets(const std::vector& specs) : t_ext_pin_util_targets(1., 1.) { - if (specs.size() == 1 && specs[0] == "auto") { //No user-specified pin utilizations, infer them automatically. // @@ -168,7 +167,6 @@ t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(int threshold) t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(const std::vector& specs) : t_pack_high_fanout_thresholds(128) { - if (specs.size() == 1 && specs[0] == "auto") { //No user-specified high fanout thresholds, infer them automatically. 
// diff --git a/vpr/src/noc/noc_router.cpp b/vpr/src/noc/noc_router.cpp index 78c2da72617..b0aa166aac5 100644 --- a/vpr/src/noc/noc_router.cpp +++ b/vpr/src/noc/noc_router.cpp @@ -28,7 +28,6 @@ int NocRouter::get_router_layer_position(void) const { } t_physical_tile_loc NocRouter::get_router_physical_location(void) const { - const int x = get_router_grid_position_x(); const int y = get_router_grid_position_y(); const int layer = get_router_layer_position(); diff --git a/vpr/src/noc/noc_router.h b/vpr/src/noc/noc_router.h index c29c3af6cdf..0feb397bdd2 100644 --- a/vpr/src/noc/noc_router.h +++ b/vpr/src/noc/noc_router.h @@ -86,7 +86,6 @@ class NocRouter { */ int get_router_layer_position(void) const; - /** * @brief Gets the physical location where the the physical router is located * @return t_physical_tile_loc that contains x-y coordinates and the layer number diff --git a/vpr/src/noc/noc_traffic_flows.cpp b/vpr/src/noc/noc_traffic_flows.cpp index df211982ae4..1285fc0e474 100644 --- a/vpr/src/noc/noc_traffic_flows.cpp +++ b/vpr/src/noc/noc_traffic_flows.cpp @@ -89,7 +89,6 @@ void NocTrafficFlows::finished_noc_traffic_flows_setup(void) { int number_of_traffic_flows = noc_traffic_flows.size(); traffic_flow_routes.resize(number_of_traffic_flows); - const int num_flows = get_number_of_traffic_flows(); double bandwidth_sum = 0.0; double inverse_latency_sum = 0.0; diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index cf819224c94..34c791d0bac 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -282,7 +282,7 @@ std::map do_clustering(const t_packer_opts& pa t_ext_pin_util target_ext_pin_util = helper_ctx.target_external_pin_util.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); int high_fanout_threshold = helper_ctx.high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); update_cluster_stats(istart, clb_index, - is_clock, //Set of clock nets + is_clock, //Set of clock nets is_global, //Set 
of global nets (currently all clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index bbf642bce43..a5613e5a194 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1594,7 +1594,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, attraction_groups); update_cluster_stats(next_molecule, clb_index, - is_clock, //Set of all clocks + is_clock, //Set of all clocks is_global, //Set of all global signals (currently clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, packer_opts.connection_driven, diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 577334bf34d..bfef3fd14f4 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -73,7 +73,7 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); -// is_global = find_likely_global_ctrl_nets(is_clock); + // is_global = find_likely_global_ctrl_nets(is_clock); is_global.insert(is_clock.begin(), is_clock.end()); size_t num_p_inputs = 0; diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index fc943605fae..fe9250b6432 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -1141,16 +1141,13 @@ bool place_one_block(const ClusterBlockId& blk_id, return placed_macro; } - static double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts) { double noc_cost = 0.0; noc_cost = (noc_opts.noc_placement_weighting) * ((costs.noc_aggregate_bandwidth_cost * costs.noc_aggregate_bandwidth_cost_norm) + (costs.noc_latency_cost * costs.noc_latency_cost_norm)); return noc_cost; } - static bool assess_noc_swap(double delta_cost, double prob) { - if (delta_cost <= 0.0) { return true; } @@ -1161,7 +1158,7 @@ static bool assess_noc_swap(double 
delta_cost, double prob) { float random_num = vtr::frand(); if (random_num < prob) { - return true; + return true; } else { return false; } @@ -1192,56 +1189,6 @@ static int findFirstInteger(const std::string& str) { } } -//void print_noc_grid() { -// -// auto& place_ctx = g_vpr_ctx.placement(); -// auto& cluster_ctx = g_vpr_ctx.clustering(); -// auto& noc_ctx = g_vpr_ctx.noc(); -// const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); -// -// const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); -// const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; -// -// static int grid_arr[10][10]; -// -// for (int i = 0; i < 10; i++) { -// for (int j = 0; j < 10; j++) { -// grid_arr[i][j] = -1; -// } -// } -// -// const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); -// -// // Iterate over all routers -// for (auto router_bid : router_bids) { -// -// std::string router_name = cluster_ctx.clb_nlist.block_name(router_bid); -// int router_id = findFirstInteger(router_name); -// -// auto compressed_loc = get_compressed_loc_approx(compressed_noc_grid,place_ctx.block_locs[router_bid].loc, num_layers); -// -// int placed_router_x = compressed_loc[0].x; -// int placed_router_y = compressed_loc[0].y; -// grid_arr[placed_router_x][placed_router_y] = router_id; -// } -// -// std::cout << std::endl; -// for (int i = 0; i < 10; i++) { -// for (int j = 0; j < 10; j++) { -// if (grid_arr[j][i] >= 0) { -// std::cout << std::setw(2) << std::setfill('0') << grid_arr[j][i] << "\t"; -// } else { -// std::cout << std::setw(2) << std::setfill(' ') << "X-" << "\t"; -// } -// -// } -// std::cout << std::endl; -// } -// -// std::cout << std::endl; -// -//} - static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& place_ctx = g_vpr_ctx.placement(); auto& noc_ctx = g_vpr_ctx.noc(); @@ -1257,14 +1204,12 @@ 
static void initial_noc_placement(const t_noc_opts& noc_opts) { std::vector unfixed_routers; for (auto router_blk_id : router_blk_ids) { - // The block is fixed and was placed in mark_fixed_blocks() if (is_block_placed((router_blk_id))) { continue; } if (is_cluster_constrained(router_blk_id)) { - auto block_type = cluster_ctx.clb_nlist.block_type(router_blk_id); const PartitionRegion& pr = floorplanning_ctx.cluster_constraints[router_blk_id]; @@ -1340,7 +1285,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { break; } } - } // end for of random router placement + } // end for of random router placement // populate internal data structures to maintain route, bandwidth usage, and latencies initial_noc_routing(); @@ -1372,11 +1317,10 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { e_create_move create_move_outcome = e_create_move::ABORT; clear_move_blocks(blocks_affected); // Shrink the range limit over time - float r_lim_decayed = 1.0f + (N_MOVES-i_move) * (max_r_lim/N_MOVES); + float r_lim_decayed = 1.0f + (N_MOVES - i_move) * (max_r_lim / N_MOVES); create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); if (create_move_outcome != e_create_move::ABORT) { - apply_move_blocks(blocks_affected); double noc_aggregate_bandwidth_delta_c = 0.0; @@ -1384,7 +1328,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); - double prob = starting_prob - i_move*prob_step; + double prob = starting_prob - i_move * prob_step; bool move_accepted = assess_noc_swap(delta_cost, prob); if (move_accepted) { @@ -1396,7 +1340,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (costs.cost < 
checkpoint.get_cost() || !checkpoint.is_valid()) { checkpoint.save_checkpoint(costs.cost); } - } else { // The proposed move is rejected + } else { // The proposed move is rejected revert_move_blocks(blocks_affected); revert_noc_traffic_flow_routes(blocks_affected); } @@ -1406,10 +1350,8 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { if (checkpoint.get_cost() < costs.cost) { checkpoint.restore_checkpoint(noc_opts, costs); } - } - void initial_placement(enum e_pad_loc_type pad_loc_type, const char* constraints_file, const t_noc_opts& noc_opts) { vtr::ScopedStartFinishTimer timer("Initial Placement"); diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index 485320b816a..f00b6f1a5b9 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -1067,8 +1067,8 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, VTR_ASSERT(to_loc.y <= search_range.ymax); if (from_loc.x == to_loc.x && from_loc.y == to_loc.y && from_loc.layer_num == to_layer_num) { - continue; //Same from/to location -- try again for new y-position - } else if (check_empty) { // Check if the location has at least one empty sub-tile + continue; //Same from/to location -- try again for new y-position + } else if (check_empty) { // Check if the location has at least one empty sub-tile t_pl_loc to_uncompressed_loc; compressed_grid_to_loc(type, to_loc, to_uncompressed_loc); const t_physical_tile_loc to_phy_uncompressed_loc{to_uncompressed_loc.x, to_uncompressed_loc.y, to_uncompressed_loc.layer}; diff --git a/vpr/src/place/noc_place_checkpoint.cpp b/vpr/src/place/noc_place_checkpoint.cpp index a20275757cc..ff9b75b9e87 100644 --- a/vpr/src/place/noc_place_checkpoint.cpp +++ b/vpr/src/place/noc_place_checkpoint.cpp @@ -2,9 +2,9 @@ #include "noc_place_checkpoint.h" #include "noc_place_utils.h" -RouterPlacementCheckpoint::RouterPlacementCheckpoint() : - valid_(false), - cost_(std::numeric_limits::infinity()) { 
+RouterPlacementCheckpoint::RouterPlacementCheckpoint() + : valid_(false) + , cost_(std::numeric_limits::infinity()) { const auto& noc_ctx = g_vpr_ctx.noc(); // Get all router clusters in the net-list @@ -41,7 +41,6 @@ void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t // Clear all physical routers in placement for (const auto& phy_router : noc_phy_routers) { - auto phy_loc = phy_router.get_router_physical_location(); place_ctx.grid_blocks.set_usage(phy_loc, 0); @@ -59,7 +58,6 @@ void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t } } - // Place routers based on router_locations_ for (const auto& router_loc : router_locations_) { ClusterBlockId router_blk_id = router_loc.first; @@ -72,7 +70,7 @@ void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t reinitialize_noc_routing(noc_opts, costs); } -bool RouterPlacementCheckpoint::is_valid() const{ +bool RouterPlacementCheckpoint::is_valid() const { return valid_; } diff --git a/vpr/src/place/noc_place_checkpoint.h b/vpr/src/place/noc_place_checkpoint.h index 6e29f45417f..ef016c64ad3 100644 --- a/vpr/src/place/noc_place_checkpoint.h +++ b/vpr/src/place/noc_place_checkpoint.h @@ -16,7 +16,7 @@ class RouterPlacementCheckpoint { RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; void save_checkpoint(double cost); - void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); + void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); bool is_valid() const; double get_cost() const; }; diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index ae8662294e6..a11c42996e6 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -499,7 +499,6 @@ static bool select_random_router_cluster(ClusterBlockId& b_from, t_pl_loc& from, } e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim) { - 
// block ID for the randomly selected router cluster ClusterBlockId b_from; // current location of the randomly selected router cluster @@ -534,7 +533,6 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa } e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_affected) { - auto& noc_ctx = g_vpr_ctx.noc(); auto& place_ctx = g_vpr_ctx.placement(); const auto& grid = g_vpr_ctx.device().grid; @@ -590,7 +588,6 @@ e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_ } } - t_pl_loc centroid_loc(OPEN, OPEN, OPEN, OPEN); if (acc_weight > 0.0) { @@ -609,14 +606,10 @@ e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_ return e_create_move::ABORT; } - const auto& physical_type = grid.get_physical_type({centroid_loc.x, centroid_loc.y, centroid_loc.layer}); // If the calculated centroid does not have a compatible type, find a compatible location nearby if (!is_tile_compatible(physical_type, cluster_from_type)) { - - - //Determine centroid location in the compressed space of the current block auto compressed_centroid_loc = get_compressed_loc_approx(compressed_noc_grid, {centroid_loc.x, centroid_loc.y, 0, centroid_loc.layer}, @@ -649,7 +642,7 @@ e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_ bool legal = find_compatible_compressed_loc_in_range(cluster_from_type, delta_cx, compressed_from_loc[0], - {min_cx, max_cx, min_cy, max_cy}, + {min_cx, max_cx, min_cy, max_cy}, compressed_to_loc, false, compressed_from_loc[0].layer_num, From 634d852ecb429b243705fa4b9d6791ce092067f0 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 11 Aug 2023 09:18:13 -0400 Subject: [PATCH 19/35] Temporarily consider constant nets for clustering. 
--- vpr/src/pack/cluster_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index a5613e5a194..fcfbd2d15c1 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1783,7 +1783,7 @@ void mark_and_update_partial_gain(const AtomNetId net_id, /* There are VCC and GND nets in the netlist. These nets have a high fanout, * but their sinks do not necessarily have a logical relation with each other. * Therefore, we exclude constant nets when evaluating high fanout connectivity. */ - if (!is_global.count(net_id) && !atom_ctx.nlist.net_is_constant(net_id)) { + if (!is_global.count(net_id)) { /* If no low/medium fanout nets, we may need to consider * high fan-out nets for packing, so select one and store it */ AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; From f715a537f442d2299d841d916d6e3e5a0ad33d62 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 18 Aug 2023 10:21:05 -0400 Subject: [PATCH 20/35] cherry-picked some packing updates from noc congestion branch --- vpr/src/base/vpr_api.cpp | 3 ++- vpr/src/pack/pack.cpp | 47 ++++++++++++++++++++++++++++------------ vpr/src/pack/pack.h | 3 ++- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index cd1722cb80c..2505895a7c6 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -652,7 +652,8 @@ bool vpr_pack(t_vpr_setup& vpr_setup, const t_arch& arch) { return try_pack(&vpr_setup.PackerOpts, &vpr_setup.AnalysisOpts, &arch, vpr_setup.user_models, vpr_setup.library_models, inter_cluster_delay, - vpr_setup.PackerRRGraph); + vpr_setup.PackerRRGraph, + vpr_setup.NocOpts.noc); } void vpr_load_packing(t_vpr_setup& vpr_setup, const t_arch& arch) { diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index bfef3fd14f4..6e31b0675f3 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -57,7 +57,8 @@ 
bool try_pack(t_packer_opts* packer_opts, const t_model* user_models, const t_model* library_models, float interc_delay, - std::vector* lb_type_rr_graphs) { + std::vector* lb_type_rr_graphs, + bool noc_enabled) { auto& helper_ctx = g_vpr_ctx.mutable_cl_helper(); auto& atom_ctx = g_vpr_ctx.atom(); auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); @@ -142,7 +143,22 @@ bool try_pack(t_packer_opts* packer_opts, int pack_iteration = 1; bool floorplan_regions_overfull = false; - bool allow_high_fanout_connectivity_clustering = false; + bool allow_high_fanout_connectivity_clustering; + + /* If the design contains NoC routers, don't use high fanout connectivity + * to find candidate atoms for growing the current cluster. In NoC-based designs, + * modules connected to different routers may not have any shared net except for + * clock and other global signals. By not using high fanout connectivity, + * atoms belonging to different modules become less likely to be clustered together. + * This allows the placement engine to place clustered blocks closer to their + * corresponding NoC router, reducing WL as a result. Otherwise, blocks containing + * atoms from two different modules would be stretched between two routers. + */ + if (noc_enabled) { + allow_high_fanout_connectivity_clustering = false; + } else { + allow_high_fanout_connectivity_clustering = true; + } while (true) { free_clustering_data(*packer_opts, clustering_data); @@ -174,15 +190,20 @@ bool try_pack(t_packer_opts* packer_opts, if (fits_on_device && !floorplan_regions_overfull) { break; //Done - } else if (pack_iteration == 1 && !floorplan_not_fitting) { + /* + * If NoC is enabled and the first packing attempt has failed, we don't care whether a floorplan constraint couldn't be satisfied + * or the clustered netlist does not fit into the target FPGA device. Enabling high fanout connectivity clustering + * can help with both, so we enable it. 
+ */ + } else if (noc_enabled && pack_iteration == 1) { VTR_ASSERT(allow_high_fanout_connectivity_clustering == false); allow_high_fanout_connectivity_clustering = true; VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s allow_high_fanout_connectivity_clustering=%s\n", (allow_unrelated_clustering ? "true" : "false"), (balance_block_type_util ? "true" : "false"), (allow_high_fanout_connectivity_clustering ? "true" : "false")); - } else if (pack_iteration == 2 && !floorplan_not_fitting) { - //1st pack attempt was unsucessful (i.e. not dense enough) and we have control of unrelated clustering + } else if (!floorplan_not_fitting && ((noc_enabled && pack_iteration == 2) || (!noc_enabled && pack_iteration == 1))) { + //1st pack attempt was unsuccessful (i.e. not dense enough) and we have control of unrelated clustering // //Turn it on to increase packing density if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::AUTO) { @@ -207,21 +228,21 @@ bool try_pack(t_packer_opts* packer_opts, * we create attraction groups for partitions with overfull regions (pack those atoms more densely). We continue this way * until the last iteration, when we create attraction groups for every partition, if needed. */ - } else if (pack_iteration == 1 && floorplan_not_fitting) { + } else if (floorplan_not_fitting && ((!noc_enabled && pack_iteration == 1) || (noc_enabled && pack_iteration == 2))) { VTR_LOG("Floorplan regions are overfull: trying to pack again using cluster attraction groups. 
\n"); attraction_groups.create_att_groups_for_overfull_regions(); attraction_groups.set_att_group_pulls(1); - } else if (pack_iteration >= 2 && pack_iteration < 5 && floorplan_not_fitting) { - if (pack_iteration == 2) { + } else if (floorplan_not_fitting && ((!noc_enabled && pack_iteration >= 2 && pack_iteration < 5) || (noc_enabled && pack_iteration >= 3 && pack_iteration < 6))) { + if ((!noc_enabled && pack_iteration == 2) || (noc_enabled && pack_iteration == 3)) { VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); attraction_groups.create_att_groups_for_overfull_regions(); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if (pack_iteration == 3) { + } else if ((!noc_enabled && pack_iteration == 3) || (noc_enabled && pack_iteration == 4)) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if (pack_iteration == 4) { + } else if ((!noc_enabled && pack_iteration == 4) || (noc_enabled && pack_iteration == 5)) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration and higher target pin utilization. 
\n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); @@ -230,9 +251,7 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.target_external_pin_util.set_block_pin_util("clb", pin_util); } - } else { - //Unable to pack densely enough: Give Up - + } else { //Unable to pack densely enough: Give Up if (floorplan_regions_overfull) { VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find pack clusters densely enough to fit in the designated floorplan regions.\n" @@ -258,7 +277,7 @@ bool try_pack(t_packer_opts* packer_opts, resource_avail += std::string(iter->first->name) + ": " + std::to_string(num_instances); } - VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisifies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str()); + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisfies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str()); } //Reset clustering for re-packing diff --git a/vpr/src/pack/pack.h b/vpr/src/pack/pack.h index df99104512b..0f294e34edf 100644 --- a/vpr/src/pack/pack.h +++ b/vpr/src/pack/pack.h @@ -11,7 +11,8 @@ bool try_pack(t_packer_opts* packer_opts, const t_model* user_models, const t_model* library_models, float interc_delay, - std::vector* lb_type_rr_graphs); + std::vector* lb_type_rr_graphs, + bool noc_enabled); float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch); From 7feda705d13df0e131d6b16e3f125766b47a66ce Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Wed, 23 Aug 2023 15:36:13 -0400 Subject: [PATCH 21/35] fixed out of range NoCRouterId bug in link removal test --- vpr/test/test_noc_storage.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpr/test/test_noc_storage.cpp b/vpr/test/test_noc_storage.cpp index a0a3e2ee54b..27ce5647d62 100644 --- a/vpr/test/test_noc_storage.cpp +++ b/vpr/test/test_noc_storage.cpp @@ -279,7 
+279,7 @@ TEST_CASE("test_remove_link", "[vpr_noc]") { std::mt19937 rand_num_gen(device()); // random number generation to determine routers of the link to remove - std::uniform_int_distribution src_router(0, NUM_OF_ROUTERS); + std::uniform_int_distribution src_router(0, NUM_OF_ROUTERS-1); std::uniform_int_distribution sink_router(1, NOC_CONNECTIVITY - 1); // create the NoC @@ -349,9 +349,9 @@ TEST_CASE("test_remove_link", "[vpr_noc]") { auto& outgoing_links = test_noc.get_noc_router_connections(link_to_remove_src_router); // go through all the outgoing links of the source router in the link we removed and check that the link does not exist there as well. - for (auto outgoing_link_id = outgoing_links.begin(); outgoing_link_id != outgoing_links.end(); outgoing_link_id++) { + for (auto outgoing_link : outgoing_links) { // get the current outgoing link - const NocLink& curr_outgoing_link = test_noc.get_single_noc_link(*outgoing_link_id); + const NocLink& curr_outgoing_link = test_noc.get_single_noc_link(outgoing_link); if ((curr_outgoing_link.get_source_router() == link_to_remove_src_router) && (curr_outgoing_link.get_sink_router() == link_to_remove_sink_router)) { link_removed_from_outgoing_vector = false; @@ -363,9 +363,9 @@ TEST_CASE("test_remove_link", "[vpr_noc]") { const auto& links_in_noc = test_noc.get_noc_links(); // go through the links and make sure that none of them have the source and sink router of the link // that we removed. THe removed link should have the source and sink routers set to invalid values. 
- for (auto single_link = links_in_noc.begin(); single_link != links_in_noc.end(); single_link++) { + for (auto single_link : links_in_noc) { // check whether the source and sink router of the current link matches the routers in the link to remove - if ((single_link->get_source_router() == link_to_remove_src_router) && (single_link->get_sink_router() == link_to_remove_sink_router)) { + if ((single_link.get_source_router() == link_to_remove_src_router) && (single_link.get_sink_router() == link_to_remove_sink_router)) { // this indicates that the link was not set to an invalid state and not removed properly link_removed_from_outgoing_vector = false; break; From a31fe806436e7973dcb951b38a2f440a7851261a Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Thu, 24 Aug 2023 18:56:01 -0400 Subject: [PATCH 22/35] don't consider constant net candidates when NoC is enabled --- vpr/src/pack/cluster.cpp | 9 ++++++--- vpr/src/pack/cluster.h | 3 ++- vpr/src/pack/cluster_util.cpp | 29 +++++++++++++++++++---------- vpr/src/pack/cluster_util.h | 9 ++++++--- vpr/src/pack/pack.cpp | 3 ++- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 34c791d0bac..853137c93f1 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -97,7 +97,8 @@ std::map do_clustering(const t_packer_opts& pa std::vector* lb_type_rr_graphs, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, - t_clustering_data& clustering_data) { + t_clustering_data& clustering_data, + bool noc_enabled) { /* Does the actual work of clustering multiple netlist blocks * * into clusters. 
*/ @@ -290,7 +291,8 @@ std::map do_clustering(const t_packer_opts& pa high_fanout_threshold, *timing_info, attraction_groups, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); helper_ctx.total_clb_num++; if (packer_opts.timing_driven) { @@ -363,7 +365,8 @@ std::map do_clustering(const t_packer_opts& pa clustering_data.unclustered_list_head, unclustered_list_head_size, net_output_feeds_driving_block_input, - primitive_candidate_block_types); + primitive_candidate_block_types, + noc_enabled); } is_cluster_legal = check_cluster_legality(verbosity, detailed_routing_stage, router_data); diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index e05d6066ffd..a41b7f9bd4f 100644 --- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -24,7 +24,8 @@ std::map do_clustering(const t_packer_opts& pa std::vector* lb_type_rr_graphs, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, - t_clustering_data& clustering_data); + t_clustering_data& clustering_data, + bool noc_enabled); int get_cluster_of_block(int blkidx); diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index fcfbd2d15c1..b2d306180f7 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1507,7 +1507,8 @@ void try_fill_cluster(const t_packer_opts& packer_opts, t_molecule_link* unclustered_list_head, const int& unclustered_list_head_size, std::unordered_map& net_output_feeds_driving_block_input, - std::map>& primitive_candidate_block_types) { + std::map>& primitive_candidate_block_types, + bool noc_enabled) { auto& atom_ctx = g_vpr_ctx.atom(); auto& device_ctx = g_vpr_ctx.mutable_device(); auto& cluster_ctx = g_vpr_ctx.mutable_clustering(); @@ -1601,7 +1602,8 @@ void try_fill_cluster(const t_packer_opts& packer_opts, high_fanout_threshold, *timing_info, attraction_groups, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); 
cluster_stats.num_unrelated_clustering_attempts = 0; if (packer_opts.timing_driven) { @@ -1763,7 +1765,8 @@ void mark_and_update_partial_gain(const AtomNetId net_id, const SetupTimingInfo& timing_info, const std::unordered_set& is_global, const int high_fanout_net_threshold, - std::unordered_map& net_output_feeds_driving_block_input) { + std::unordered_map& net_output_feeds_driving_block_input, + bool noc_enabled) { /* Updates the marked data structures, and if gain_flag is GAIN, * * the gain when an atom block is added to a cluster. The * * sharinggain is the number of inputs that a atom block shares with * @@ -1783,7 +1786,7 @@ void mark_and_update_partial_gain(const AtomNetId net_id, /* There are VCC and GND nets in the netlist. These nets have a high fanout, * but their sinks do not necessarily have a logical relation with each other. * Therefore, we exclude constant nets when evaluating high fanout connectivity. */ - if (!is_global.count(net_id)) { + if (!is_global.count(net_id) && (!noc_enabled || !atom_ctx.nlist.net_is_constant(net_id))) { /* If no low/medium fanout nets, we may need to consider * high fan-out nets for packing, so select one and store it */ AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; @@ -1919,7 +1922,8 @@ void update_cluster_stats(const t_pack_molecule* molecule, const int high_fanout_net_threshold, const SetupTimingInfo& timing_info, AttractionInfo& attraction_groups, - std::unordered_map& net_output_feeds_driving_block_input) { + std::unordered_map& net_output_feeds_driving_block_input, + bool noc_enabled) { /* Routine that is called each time a new molecule is added to the cluster. * Makes calls to update cluster stats such as the gain map for atoms, used pins, and clock structures, * in order to reflect the new content of the cluster. 
@@ -1976,7 +1980,8 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); } else { mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, timing_driven, @@ -1984,7 +1989,8 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); } } @@ -1997,7 +2003,8 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); } /* Finally Clocks */ @@ -2009,14 +2016,16 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); } else { mark_and_update_partial_gain(net_id, GAIN, blk_id, timing_driven, connection_driven, INPUT, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input); + net_output_feeds_driving_block_input, + noc_enabled); } } diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index 467c34a2a03..fe110845e4d 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -242,7 +242,8 @@ void try_fill_cluster(const t_packer_opts& packer_opts, t_molecule_link* unclustered_list_head, const int& unclustered_list_head_size, std::unordered_map& net_output_feeds_driving_block_input, - std::map>& primitive_candidate_block_types); + std::map>& primitive_candidate_block_types, + bool noc_enabled); t_pack_molecule* save_cluster_routing_and_pick_new_seed(const t_packer_opts& packer_opts, const int& num_clb, @@ -305,7 +306,8 @@ void mark_and_update_partial_gain(const AtomNetId net_id, const SetupTimingInfo& 
timing_info, const std::unordered_set& is_global, const int high_fanout_net_threshold, - std::unordered_map& net_output_feeds_driving_block_input); + std::unordered_map& net_output_feeds_driving_block_input, + bool noc_enabled); void update_total_gain(float alpha, float beta, bool timing_driven, bool connection_driven, t_pb* pb, AttractionInfo& attraction_groups); @@ -321,7 +323,8 @@ void update_cluster_stats(const t_pack_molecule* molecule, const int high_fanout_net_threshold, const SetupTimingInfo& timing_info, AttractionInfo& attraction_groups, - std::unordered_map& net_output_feeds_driving_block_input); + std::unordered_map& net_output_feeds_driving_block_input, + bool noc_enabled); void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, t_pb_graph_node** primitives_list, diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 6e31b0675f3..dc4bb700b56 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -177,7 +177,8 @@ bool try_pack(t_packer_opts* packer_opts, lb_type_rr_graphs, attraction_groups, floorplan_regions_overfull, - clustering_data); + clustering_data, + noc_enabled); //Try to size/find a device bool fits_on_device = try_size_device_grid(*arch, helper_ctx.num_used_type_instances, packer_opts->target_device_utilization, packer_opts->device_layout); From ea9f5be4a3e5aded7c55351b42e89fe7cc2b81d5 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Thu, 24 Aug 2023 20:23:09 -0400 Subject: [PATCH 23/35] temporarily don't check emptiness --- vpr/src/place/initial_placement.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index fe9250b6432..0d82e6215c4 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -493,7 +493,7 @@ static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& //try to find a near location that meet these requirements bool 
neighbor_legal_loc = false; if (!is_loc_legal(centroid_loc, pr, block_type)) { - neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, true); + neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, false); if (!neighbor_legal_loc) { //no neighbor candidate found return false; } @@ -540,7 +540,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first /* * if the macro member offset is positive, it means that macro head should be placed at the first location of first_macro_loc. * otherwise, macro head should be placed at the last available location to ensure macro_can_be_placed can check macro location correctly. - * + * */ if (pl_macro.members.size() > 1) { if (pl_macro.members.at(1).offset.y < 0) { @@ -683,7 +683,7 @@ static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegi to_compressed_loc, false, reg_coord.layer_num, - true); + false); if (!legal) { //No valid position found return false; From 8f3b2fafcd5e00e6f2aa2fb65701f0f244e70af1 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Wed, 30 Aug 2023 13:45:45 -0400 Subject: [PATCH 24/35] Revert "temporarily don't check emptiness" This reverts commit ea9f5be4a3e5aded7c55351b42e89fe7cc2b81d5. 
--- vpr/src/place/initial_placement.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 0d82e6215c4..fe9250b6432 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -493,7 +493,7 @@ static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& //try to find a near location that meet these requirements bool neighbor_legal_loc = false; if (!is_loc_legal(centroid_loc, pr, block_type)) { - neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, false); + neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, true); if (!neighbor_legal_loc) { //no neighbor candidate found return false; } @@ -540,7 +540,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first /* * if the macro member offset is positive, it means that macro head should be placed at the first location of first_macro_loc. * otherwise, macro head should be placed at the last available location to ensure macro_can_be_placed can check macro location correctly. 
- * + * */ if (pl_macro.members.size() > 1) { if (pl_macro.members.at(1).offset.y < 0) { @@ -683,7 +683,7 @@ static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegi to_compressed_loc, false, reg_coord.layer_num, - false); + true); if (!legal) { //No valid position found return false; From 355ba50695e5a72d50824d0989a13851cfedb8fb Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Wed, 30 Aug 2023 14:30:51 -0400 Subject: [PATCH 25/35] make format --- vpr/src/place/initial_placement.cpp | 25 ------------------- vpr/test/test_noc_storage.cpp | 2 +- .../titan_quick_qor/config/config.txt | 5 +++- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index fe9250b6432..6c793981166 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -1164,31 +1164,6 @@ static bool assess_noc_swap(double delta_cost, double prob) { } } -static int findFirstInteger(const std::string& str) { - std::string numberString; - bool foundNumber = false; - - for (char c : str) { - if (isdigit(c)) { - numberString += c; - foundNumber = true; - } else if (foundNumber) { - // We encountered a non-digit character after finding a number, - // so we stop searching. - break; - } - } - - if (!numberString.empty()) { - // Convert the string to an integer using stoi() function - return std::stoi(numberString); - } else { - // If no integer is found, return a default value or handle the case - // according to your requirements. 
- return -1; - } -} - static void initial_noc_placement(const t_noc_opts& noc_opts) { auto& place_ctx = g_vpr_ctx.placement(); auto& noc_ctx = g_vpr_ctx.noc(); diff --git a/vpr/test/test_noc_storage.cpp b/vpr/test/test_noc_storage.cpp index 27ce5647d62..882ba37385c 100644 --- a/vpr/test/test_noc_storage.cpp +++ b/vpr/test/test_noc_storage.cpp @@ -279,7 +279,7 @@ TEST_CASE("test_remove_link", "[vpr_noc]") { std::mt19937 rand_num_gen(device()); // random number generation to determine routers of the link to remove - std::uniform_int_distribution src_router(0, NUM_OF_ROUTERS-1); + std::uniform_int_distribution src_router(0, NUM_OF_ROUTERS - 1); std::uniform_int_distribution sink_router(1, NOC_CONNECTIVITY - 1); // create the NoC diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt index 2edd2dc32be..3dad722821a 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt @@ -67,4 +67,7 @@ pass_requirements_file=pass_requirements_vpr_titan.txt #A large number of routing iterations is set to ensure the router doesn't give up to easily on the larger benchmarks #To be more run-time comparable to commercial tools like Quartus, we run with higher placer effort (inner_num=2) and lower astar_fac (1.0) #Set a 24hr timeout so they don't run forever -script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 --seed 3 +script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 +script_params_list_add=-starting_stage vpr --seed 3 +script_params_list_add=-starting_stage vpr --seed 
4 +script_params_list_add=-starting_stage vpr --seed 5 \ No newline at end of file From 72954479d0b6d0b68d08a35eda0512de945c1911 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Wed, 30 Aug 2023 14:46:57 -0400 Subject: [PATCH 26/35] weigh sink block locations in centroid calculation When averaging over the sinks connected to block, I use the inverse of the number of sinks driven by a net as weight --- vpr/src/place/initial_placement.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 6c793981166..a84ca9ac3bd 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -419,6 +419,9 @@ static std::vector find_centroid_loc(const t_pl_macro& pl_macro, if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) { continue; } + + float n_sinks_inv = 1.0f / (float)cluster_ctx.clb_nlist.net_sinks(net_id).size(); + for (auto sink_pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) { /* Ignore if one of the sinks is the block itself*/ if (pin_id == sink_pin_id) @@ -435,9 +438,9 @@ static std::vector find_centroid_loc(const t_pl_macro& pl_macro, VTR_ASSERT(tile_loc.layer_num != OPEN); layer_count[tile_loc.layer_num]++; } - acc_x += tile_loc.x; - acc_y += tile_loc.y; - acc_weight++; + acc_x += (float)tile_loc.x * n_sinks_inv; + acc_y += (float)tile_loc.y * n_sinks_inv; + acc_weight += n_sinks_inv; } } From a1188ce54ff30e0a6e246af72dfe285fe3c4be67 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Thu, 19 Oct 2023 17:30:55 -0400 Subject: [PATCH 27/35] Revert "weigh sink block locations in centroid calculation" This reverts commit 72954479d0b6d0b68d08a35eda0512de945c1911. 
--- vpr/src/place/initial_placement.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index a84ca9ac3bd..6c793981166 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -419,9 +419,6 @@ static std::vector find_centroid_loc(const t_pl_macro& pl_macro, if (cluster_ctx.clb_nlist.net_is_ignored(net_id)) { continue; } - - float n_sinks_inv = 1.0f / (float)cluster_ctx.clb_nlist.net_sinks(net_id).size(); - for (auto sink_pin_id : cluster_ctx.clb_nlist.net_sinks(net_id)) { /* Ignore if one of the sinks is the block itself*/ if (pin_id == sink_pin_id) @@ -438,9 +435,9 @@ static std::vector find_centroid_loc(const t_pl_macro& pl_macro, VTR_ASSERT(tile_loc.layer_num != OPEN); layer_count[tile_loc.layer_num]++; } - acc_x += (float)tile_loc.x * n_sinks_inv; - acc_y += (float)tile_loc.y * n_sinks_inv; - acc_weight += n_sinks_inv; + acc_x += tile_loc.x; + acc_y += tile_loc.y; + acc_weight++; } } From e4ff9344a84211b81c90343ca701ce7e63326ec0 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 20 Oct 2023 20:49:26 -0400 Subject: [PATCH 28/35] Disable high fanout connectivity clustering for logic blocks --- vpr/src/base/vpr_types.cpp | 2 +- vpr/src/pack/pack.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vpr/src/base/vpr_types.cpp b/vpr/src/base/vpr_types.cpp index ed3fc40f9d0..7618ba9fa6a 100644 --- a/vpr/src/base/vpr_types.cpp +++ b/vpr/src/base/vpr_types.cpp @@ -180,7 +180,7 @@ t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(const std::vectorname, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); } else { diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index dc4bb700b56..7bf96c43294 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -160,6 +160,8 @@ bool try_pack(t_packer_opts* packer_opts, allow_high_fanout_connectivity_clustering = true; } + 
allow_high_fanout_connectivity_clustering = true; + while (true) { free_clustering_data(*packer_opts, clustering_data); From 375db5880b770f4758330cff4d0f650fc68d3ed0 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Tue, 7 Nov 2023 14:08:37 -0500 Subject: [PATCH 29/35] Reverted packing optimizations for NoC benchmarks --- vpr/src/base/vpr_api.cpp | 3 +- vpr/src/pack/cluster.cpp | 12 +-- vpr/src/pack/cluster.h | 4 +- vpr/src/pack/cluster_util.cpp | 43 +++----- vpr/src/pack/cluster_util.h | 12 +-- vpr/src/pack/pack.cpp | 98 ++----------------- vpr/src/pack/pack.h | 3 +- .../titan_quick_qor/config/config.txt | 5 +- 8 files changed, 33 insertions(+), 147 deletions(-) diff --git a/vpr/src/base/vpr_api.cpp b/vpr/src/base/vpr_api.cpp index 034d6889ad9..9f379f84e42 100644 --- a/vpr/src/base/vpr_api.cpp +++ b/vpr/src/base/vpr_api.cpp @@ -659,8 +659,7 @@ bool vpr_pack(t_vpr_setup& vpr_setup, const t_arch& arch) { return try_pack(&vpr_setup.PackerOpts, &vpr_setup.AnalysisOpts, &arch, vpr_setup.user_models, vpr_setup.library_models, inter_cluster_delay, - vpr_setup.PackerRRGraph, - vpr_setup.NocOpts.noc); + vpr_setup.PackerRRGraph); } void vpr_load_packing(t_vpr_setup& vpr_setup, const t_arch& arch) { diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 853137c93f1..b19aa4e7f99 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -91,14 +91,12 @@ std::map do_clustering(const t_packer_opts& pa const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, - bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, - t_clustering_data& clustering_data, - bool noc_enabled) { + t_clustering_data& clustering_data) { /* Does the actual work of clustering multiple netlist blocks * * into clusters. 
*/ @@ -291,8 +289,7 @@ std::map do_clustering(const t_packer_opts& pa high_fanout_threshold, *timing_info, attraction_groups, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); helper_ctx.total_clb_num++; if (packer_opts.timing_driven) { @@ -304,7 +301,6 @@ std::map do_clustering(const t_packer_opts& pa cluster_stats.num_unrelated_clustering_attempts = 0; next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, - allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -352,7 +348,6 @@ std::map do_clustering(const t_packer_opts& pa detailed_routing_stage, attraction_groups, clb_inter_blk_nets, - allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, high_fanout_threshold, is_clock, @@ -365,8 +360,7 @@ std::map do_clustering(const t_packer_opts& pa clustering_data.unclustered_list_head, unclustered_list_head_size, net_output_feeds_driving_block_input, - primitive_candidate_block_types, - noc_enabled); + primitive_candidate_block_types); } is_cluster_legal = check_cluster_legality(verbosity, detailed_routing_stage, router_data); diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index a41b7f9bd4f..e08e58dac50 100644 --- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -18,14 +18,12 @@ std::map do_clustering(const t_packer_opts& pa const std::unordered_set& is_clock, const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, - bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, - t_clustering_data& clustering_data, - bool noc_enabled); + t_clustering_data& clustering_data); int get_cluster_of_block(int blkidx); diff --git 
a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index b2d306180f7..d04e08bd74f 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1494,7 +1494,6 @@ void try_fill_cluster(const t_packer_opts& packer_opts, const int detailed_routing_stage, AttractionInfo& attraction_groups, vtr::vector>& clb_inter_blk_nets, - bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, @@ -1507,8 +1506,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, t_molecule_link* unclustered_list_head, const int& unclustered_list_head_size, std::unordered_map& net_output_feeds_driving_block_input, - std::map>& primitive_candidate_block_types, - bool noc_enabled) { + std::map>& primitive_candidate_block_types) { auto& atom_ctx = g_vpr_ctx.atom(); auto& device_ctx = g_vpr_ctx.mutable_device(); auto& cluster_ctx = g_vpr_ctx.mutable_clustering(); @@ -1556,7 +1554,6 @@ void try_fill_cluster(const t_packer_opts& packer_opts, next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, - allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -1602,8 +1599,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, high_fanout_threshold, *timing_info, attraction_groups, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); cluster_stats.num_unrelated_clustering_attempts = 0; if (packer_opts.timing_driven) { @@ -1611,7 +1607,6 @@ void try_fill_cluster(const t_packer_opts& packer_opts, } next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), attraction_groups, - allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, packer_opts.prioritize_transitive_connectivity, packer_opts.transitive_fanout_threshold, @@ -1765,8 +1760,7 @@ 
void mark_and_update_partial_gain(const AtomNetId net_id, const SetupTimingInfo& timing_info, const std::unordered_set& is_global, const int high_fanout_net_threshold, - std::unordered_map& net_output_feeds_driving_block_input, - bool noc_enabled) { + std::unordered_map& net_output_feeds_driving_block_input) { /* Updates the marked data structures, and if gain_flag is GAIN, * * the gain when an atom block is added to a cluster. The * * sharinggain is the number of inputs that a atom block shares with * @@ -1783,10 +1777,7 @@ void mark_and_update_partial_gain(const AtomNetId net_id, /* Optimization: It can be too runtime costly for marking all sinks for * a high fanout-net that probably has no hope of ever getting packed, * thus ignore those high fanout nets */ - /* There are VCC and GND nets in the netlist. These nets have a high fanout, - * but their sinks do not necessarily have a logical relation with each other. - * Therefore, we exclude constant nets when evaluating high fanout connectivity. */ - if (!is_global.count(net_id) && (!noc_enabled || !atom_ctx.nlist.net_is_constant(net_id))) { + if (!is_global.count(net_id)) { /* If no low/medium fanout nets, we may need to consider * high fan-out nets for packing, so select one and store it */ AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; @@ -1922,8 +1913,7 @@ void update_cluster_stats(const t_pack_molecule* molecule, const int high_fanout_net_threshold, const SetupTimingInfo& timing_info, AttractionInfo& attraction_groups, - std::unordered_map& net_output_feeds_driving_block_input, - bool noc_enabled) { + std::unordered_map& net_output_feeds_driving_block_input) { /* Routine that is called each time a new molecule is added to the cluster. * Makes calls to update cluster stats such as the gain map for atoms, used pins, and clock structures, * in order to reflect the new content of the cluster. 
@@ -1980,8 +1970,7 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); } else { mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, timing_driven, @@ -1989,8 +1978,7 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); } } @@ -2003,8 +1991,7 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); } /* Finally Clocks */ @@ -2016,16 +2003,14 @@ void update_cluster_stats(const t_pack_molecule* molecule, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); } else { mark_and_update_partial_gain(net_id, GAIN, blk_id, timing_driven, connection_driven, INPUT, timing_info, is_global, high_fanout_net_threshold, - net_output_feeds_driving_block_input, - noc_enabled); + net_output_feeds_driving_block_input); } } @@ -2216,7 +2201,6 @@ t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, vtr::vector>& clb_inter_blk_nets, const ClusterBlockId cluster_index, bool prioritize_transitive_connectivity, - bool allow_high_fanout_connectivity_clustering, int transitive_fanout_threshold, const int feasible_block_array_size, std::map>& primitive_candidate_block_types) { @@ -2243,12 +2227,12 @@ t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, } // 3. 
Find unpacked molecules based on weak connectedness (connected by high fanout nets) with current cluster - if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net && allow_high_fanout_connectivity_clustering) { + if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, feasible_block_array_size, attraction_groups); } } else { //Reverse order // 3. Find unpacked molecules based on weak connectedness (connected by high fanout nets) with current cluster - if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net && allow_high_fanout_connectivity_clustering) { + if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, feasible_block_array_size, attraction_groups); } @@ -2521,7 +2505,6 @@ bool check_free_primitives_for_molecule_atoms(t_pack_molecule* molecule, t_clust /*****************************************/ t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, AttractionInfo& attraction_groups, - const bool allow_high_fanout_connectivity_clustering, const bool allow_unrelated_clustering, const bool prioritize_transitive_connectivity, const int transitive_fanout_threshold, @@ -2545,7 +2528,7 @@ t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, auto best_molecule = get_highest_gain_molecule(cur_pb, attraction_groups, NOT_HILL_CLIMBING, cluster_placement_stats_ptr, clb_inter_blk_nets, - cluster_index, prioritize_transitive_connectivity, allow_high_fanout_connectivity_clustering, + cluster_index, prioritize_transitive_connectivity, transitive_fanout_threshold, feasible_block_array_size, primitive_candidate_block_types); /* If no blocks have any gain to the current cluster, the code above * diff --git 
a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index fe110845e4d..6c05272e1e7 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -229,7 +229,6 @@ void try_fill_cluster(const t_packer_opts& packer_opts, const int detailed_routing_stage, AttractionInfo& attraction_groups, vtr::vector>& clb_inter_blk_nets, - bool allow_high_fanout_connectivity_clustering, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, @@ -242,8 +241,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, t_molecule_link* unclustered_list_head, const int& unclustered_list_head_size, std::unordered_map& net_output_feeds_driving_block_input, - std::map>& primitive_candidate_block_types, - bool noc_enabled); + std::map>& primitive_candidate_block_types); t_pack_molecule* save_cluster_routing_and_pick_new_seed(const t_packer_opts& packer_opts, const int& num_clb, @@ -306,8 +304,7 @@ void mark_and_update_partial_gain(const AtomNetId net_id, const SetupTimingInfo& timing_info, const std::unordered_set& is_global, const int high_fanout_net_threshold, - std::unordered_map& net_output_feeds_driving_block_input, - bool noc_enabled); + std::unordered_map& net_output_feeds_driving_block_input); void update_total_gain(float alpha, float beta, bool timing_driven, bool connection_driven, t_pb* pb, AttractionInfo& attraction_groups); @@ -323,8 +320,7 @@ void update_cluster_stats(const t_pack_molecule* molecule, const int high_fanout_net_threshold, const SetupTimingInfo& timing_info, AttractionInfo& attraction_groups, - std::unordered_map& net_output_feeds_driving_block_input, - bool noc_enabled); + std::unordered_map& net_output_feeds_driving_block_input); void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, t_pb_graph_node** primitives_list, @@ -354,7 +350,6 @@ t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, vtr::vector>& clb_inter_blk_nets, const ClusterBlockId cluster_index, bool 
prioritize_transitive_connectivity, - bool allow_high_fanout_connectivity_clustering, int transitive_fanout_threshold, const int feasible_block_array_size, std::map>& primitive_candidate_block_types); @@ -388,7 +383,6 @@ bool check_free_primitives_for_molecule_atoms(t_pack_molecule* molecule, t_clust t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, AttractionInfo& attraction_groups, - const bool allow_high_fanout_connectivity_clustering, const bool allow_unrelated_clustering, const bool prioritize_transitive_connectivity, const int transitive_fanout_threshold, diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 7bf96c43294..3210c8ad011 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -37,28 +37,13 @@ static bool try_size_device_grid(const t_arch& arch, const std::map A set containing all nets that are - * likely to be global control nets (e.g. reset, clock enable). - */ -static std::unordered_set find_likely_global_ctrl_nets(const std::unordered_set& clocks); - bool try_pack(t_packer_opts* packer_opts, const t_analysis_opts* analysis_opts, const t_arch* arch, const t_model* user_models, const t_model* library_models, float interc_delay, - std::vector* lb_type_rr_graphs, - bool noc_enabled) { + std::vector* lb_type_rr_graphs) { auto& helper_ctx = g_vpr_ctx.mutable_cl_helper(); auto& atom_ctx = g_vpr_ctx.atom(); auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); @@ -74,7 +59,6 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); - // is_global = find_likely_global_ctrl_nets(is_clock); is_global.insert(is_clock.begin(), is_clock.end()); size_t num_p_inputs = 0; @@ -143,24 +127,6 @@ bool try_pack(t_packer_opts* packer_opts, int pack_iteration = 1; bool floorplan_regions_overfull = false; - bool allow_high_fanout_connectivity_clustering; - - /* If the design contains NoC routers, don't use high fanout connectivity - * to 
find candidate atoms for growing the current cluster. In NoC-based designs, - * modules connected to different routers may not have any shared net except for - * clock and other global signals. By not using high fanout connectivity, - * atoms belonging to different modules become less likely to be clustered together. - * This allows the placement engine to place clustered blocks closer to their - * corresponding NoC router, reducing WL as a result. Otherwise, blocks containing - * atoms from two different modules would be stretched between two routers. - */ - if (noc_enabled) { - allow_high_fanout_connectivity_clustering = false; - } else { - allow_high_fanout_connectivity_clustering = true; - } - - allow_high_fanout_connectivity_clustering = true; while (true) { free_clustering_data(*packer_opts, clustering_data); @@ -173,14 +139,12 @@ bool try_pack(t_packer_opts* packer_opts, is_clock, is_global, expected_lowest_cost_pb_gnode, - allow_high_fanout_connectivity_clustering, allow_unrelated_clustering, balance_block_type_util, lb_type_rr_graphs, attraction_groups, floorplan_regions_overfull, - clustering_data, - noc_enabled); + clustering_data); //Try to size/find a device bool fits_on_device = try_size_device_grid(*arch, helper_ctx.num_used_type_instances, packer_opts->target_device_utilization, packer_opts->device_layout); @@ -193,19 +157,7 @@ bool try_pack(t_packer_opts* packer_opts, if (fits_on_device && !floorplan_regions_overfull) { break; //Done - /* - * If NoC is enabled and the first packing attempt has failed, we don't care whether a floorplan constraint couldn't be satisfied - * or the clustered netlist does not fit into the target FPGA device. Enabling high fanout connectivity clustering - * can help with both, so we enable it. - */ - } else if (noc_enabled && pack_iteration == 1) { - VTR_ASSERT(allow_high_fanout_connectivity_clustering == false); - allow_high_fanout_connectivity_clustering = true; - VTR_LOG("Packing failed to fit on device. 
Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s allow_high_fanout_connectivity_clustering=%s\n", - (allow_unrelated_clustering ? "true" : "false"), - (balance_block_type_util ? "true" : "false"), - (allow_high_fanout_connectivity_clustering ? "true" : "false")); - } else if (!floorplan_not_fitting && ((noc_enabled && pack_iteration == 2) || (!noc_enabled && pack_iteration == 1))) { + } else if (pack_iteration == 1 && !floorplan_not_fitting) { //1st pack attempt was unsuccessful (i.e. not dense enough) and we have control of unrelated clustering // //Turn it on to increase packing density @@ -217,10 +169,9 @@ bool try_pack(t_packer_opts* packer_opts, VTR_ASSERT(balance_block_type_util == false); balance_block_type_util = true; } - VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s allow_high_fanout_connectivity_clustering=%s\n", + VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s\n", (allow_unrelated_clustering ? "true" : "false"), - (balance_block_type_util ? "true" : "false"), - (allow_high_fanout_connectivity_clustering ? "true" : "false")); + (balance_block_type_util ? "true" : "false")); /* * When running with tight floorplan constraints, some regions may become overfull with clusters (i.e. * the number of blocks assigned to the region exceeds the number of blocks available). When this occurs, we @@ -231,21 +182,21 @@ bool try_pack(t_packer_opts* packer_opts, * we create attraction groups for partitions with overfull regions (pack those atoms more densely). We continue this way * until the last iteration, when we create attraction groups for every partition, if needed. 
*/ - } else if (floorplan_not_fitting && ((!noc_enabled && pack_iteration == 1) || (noc_enabled && pack_iteration == 2))) { + } else if (pack_iteration == 1 && floorplan_not_fitting) { VTR_LOG("Floorplan regions are overfull: trying to pack again using cluster attraction groups. \n"); attraction_groups.create_att_groups_for_overfull_regions(); attraction_groups.set_att_group_pulls(1); - } else if (floorplan_not_fitting && ((!noc_enabled && pack_iteration >= 2 && pack_iteration < 5) || (noc_enabled && pack_iteration >= 3 && pack_iteration < 6))) { - if ((!noc_enabled && pack_iteration == 2) || (noc_enabled && pack_iteration == 3)) { + } else if (pack_iteration >= 2 && pack_iteration < 5 && floorplan_not_fitting) { + if (pack_iteration == 2) { VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); attraction_groups.create_att_groups_for_overfull_regions(); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if ((!noc_enabled && pack_iteration == 3) || (noc_enabled && pack_iteration == 4)) { + } else if (pack_iteration == 3) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if ((!noc_enabled && pack_iteration == 4) || (noc_enabled && pack_iteration == 5)) { + } else if (pack_iteration == 4) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration and higher target pin utilization. 
\n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); @@ -380,35 +331,6 @@ std::unordered_set alloc_and_load_is_clock(bool global_clocks) { return (is_clock); } -std::unordered_set find_likely_global_ctrl_nets(const std::unordered_set& clocks) { - auto& atom_ctx = g_vpr_ctx.atom(); - - std::unordered_set likely_reset; - - if (clocks.empty()) { - return likely_reset; - } - - size_t max_clk_sinks = 0; - - for (auto clk_net_id : clocks) { - size_t n_sinks = atom_ctx.nlist.net_sinks(clk_net_id).size(); - max_clk_sinks = std::max(max_clk_sinks, n_sinks); - } - - constexpr float high_fanout_reset_sinks_ratio = 0.6; - for (auto net_id : atom_ctx.nlist.nets()) { - size_t n_sinks = atom_ctx.nlist.net_sinks(net_id).size(); - bool is_net_clock = clocks.count(net_id); - - if (n_sinks > high_fanout_reset_sinks_ratio * max_clk_sinks && !is_net_clock) { - likely_reset.insert(net_id); - } - } - - return likely_reset; -} - static bool try_size_device_grid(const t_arch& arch, const std::map& num_type_instances, float target_device_utilization, std::string device_layout_name) { auto& device_ctx = g_vpr_ctx.mutable_device(); diff --git a/vpr/src/pack/pack.h b/vpr/src/pack/pack.h index 0f294e34edf..df99104512b 100644 --- a/vpr/src/pack/pack.h +++ b/vpr/src/pack/pack.h @@ -11,8 +11,7 @@ bool try_pack(t_packer_opts* packer_opts, const t_model* user_models, const t_model* library_models, float interc_delay, - std::vector* lb_type_rr_graphs, - bool noc_enabled); + std::vector* lb_type_rr_graphs); float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch); diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt index 3dad722821a..ca38c172c7f 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt +++ 
b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt @@ -67,7 +67,4 @@ pass_requirements_file=pass_requirements_vpr_titan.txt #A large number of routing iterations is set to ensure the router doesn't give up to easily on the larger benchmarks #To be more run-time comparable to commercial tools like Quartus, we run with higher placer effort (inner_num=2) and lower astar_fac (1.0) #Set a 24hr timeout so they don't run forever -script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 -script_params_list_add=-starting_stage vpr --seed 3 -script_params_list_add=-starting_stage vpr --seed 4 -script_params_list_add=-starting_stage vpr --seed 5 \ No newline at end of file +script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 \ No newline at end of file From 39c80f815475b5e4c3afe205e1f251ac90c33831 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Thu, 9 Nov 2023 17:08:35 -0500 Subject: [PATCH 30/35] Reverted LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD value to 32. 
--- vpr/src/base/vpr_types.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpr/src/base/vpr_types.cpp b/vpr/src/base/vpr_types.cpp index 7618ba9fa6a..ed3fc40f9d0 100644 --- a/vpr/src/base/vpr_types.cpp +++ b/vpr/src/base/vpr_types.cpp @@ -180,7 +180,7 @@ t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(const std::vectorname, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); } else { From d2d3904c8c3f916b152f720d82f81ef8977b98ca Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 10 Nov 2023 13:59:33 -0500 Subject: [PATCH 31/35] Don't look for empty locs in init placement --- vpr/src/place/initial_placement.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index a581f36bfce..a37896cc985 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -493,7 +493,7 @@ static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& //try to find a near location that meet these requirements bool neighbor_legal_loc = false; if (!is_loc_legal(centroid_loc, pr, block_type)) { - neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, true); + neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, false); if (!neighbor_legal_loc) { //no neighbor candidate found return false; } @@ -1311,7 +1311,7 @@ static void initial_noc_placement(const t_noc_opts& noc_opts) { const double starting_prob = 0.5; const double prob_step = starting_prob / N_MOVES; - RouterPlacementCheckpoint checkpoint; + NoCPlacementCheckpoint checkpoint; // Generate and evaluate router moves for (int i_move = 0; i_move < N_MOVES; i_move++) { From 81224e3b5ecb5c93497cdddf5e2a06fa2af6432f Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Fri, 10 Nov 2023 18:44:08 -0500 Subject: [PATCH 32/35] Applied the comments. 
--- vpr/src/base/vpr_types.h | 6 +- vpr/src/noc/noc_traffic_flows.cpp | 22 -- vpr/src/noc/noc_traffic_flows.h | 5 +- vpr/src/pack/pack.cpp | 6 +- vpr/src/place/initial_noc_placement.cpp | 277 ++++++++++++++++ vpr/src/place/initial_noc_placment.h | 15 + vpr/src/place/initial_placement.cpp | 300 ++---------------- vpr/src/place/initial_placement.h | 64 +++- vpr/src/place/move_utils.cpp | 41 ++- vpr/src/place/move_utils.h | 17 +- vpr/src/place/noc_place_checkpoint.cpp | 13 +- vpr/src/place/noc_place_checkpoint.h | 66 +++- vpr/src/place/noc_place_utils.cpp | 153 ++------- vpr/src/place/noc_place_utils.h | 15 +- vpr/src/place/place.cpp | 7 +- .../titan_quick_qor/config/config.txt | 2 +- 16 files changed, 520 insertions(+), 489 deletions(-) create mode 100644 vpr/src/place/initial_noc_placement.cpp create mode 100644 vpr/src/place/initial_noc_placment.h diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index e5e63c915a9..1e6823178fb 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -216,11 +216,11 @@ class t_ext_pin_util_targets { class t_pack_high_fanout_thresholds { public: t_pack_high_fanout_thresholds() = default; - t_pack_high_fanout_thresholds(int threshold); - t_pack_high_fanout_thresholds(const std::vector& specs); + explicit t_pack_high_fanout_thresholds(int threshold); + explicit t_pack_high_fanout_thresholds(const std::vector& specs); t_pack_high_fanout_thresholds& operator=(t_pack_high_fanout_thresholds&& other) noexcept; - ///@brief Returns the high fanout threshold of the specified block + ///@brief Returns the high fanout threshold of the specified block int get_threshold(const std::string& block_type_name) const; ///@brief Returns a string describing high fanout thresholds for different block types diff --git a/vpr/src/noc/noc_traffic_flows.cpp b/vpr/src/noc/noc_traffic_flows.cpp index 1285fc0e474..426597bd71c 100644 --- a/vpr/src/noc/noc_traffic_flows.cpp +++ b/vpr/src/noc/noc_traffic_flows.cpp @@ -89,28 +89,6 @@
void NocTrafficFlows::finished_noc_traffic_flows_setup(void) { int number_of_traffic_flows = noc_traffic_flows.size(); traffic_flow_routes.resize(number_of_traffic_flows); - const int num_flows = get_number_of_traffic_flows(); - double bandwidth_sum = 0.0; - double inverse_latency_sum = 0.0; - - // Iterate over all flows and calculate bandwidth and inverse latency sums - for (const auto& flow_id : noc_traffic_flows_ids) { - const auto& flow = get_single_noc_traffic_flow(flow_id); - bandwidth_sum += flow.traffic_flow_bandwidth; - inverse_latency_sum += 1.0 / flow.max_traffic_flow_latency; - } - - double bandwidth_norm_factor = bandwidth_sum / num_flows; - double inverse_latency_norm_factor = inverse_latency_sum / num_flows; - - // Iterate over all flows and assign their scores - for (const auto& flow_id : noc_traffic_flows_ids) { - auto& flow = noc_traffic_flows[flow_id]; - double normalized_bandwidth = flow.traffic_flow_bandwidth / bandwidth_norm_factor; - double normalized_inverse_latency = 1.0 / (flow.max_traffic_flow_latency * inverse_latency_norm_factor); - flow.score = flow.traffic_flow_priority * normalized_bandwidth * normalized_inverse_latency; - } - return; } diff --git a/vpr/src/noc/noc_traffic_flows.h b/vpr/src/noc/noc_traffic_flows.h index 6b3d6483a29..d5a879ea6f5 100644 --- a/vpr/src/noc/noc_traffic_flows.h +++ b/vpr/src/noc/noc_traffic_flows.h @@ -66,9 +66,6 @@ struct t_noc_traffic_flow { /** Indicates the importance of the traffic flow. Higher priority traffic flows will have more importance and will be more likely to have their latency reduced and constraints met. 
Range: [0-inf) */ int traffic_flow_priority; - /** When a weighted average is computed over flows or their properties, this score can be used as the contributing weight for its corresponding flow */ - double score; - /** Constructor initializes all variables*/ t_noc_traffic_flow(std::string source_router_name, std::string sink_router_name, ClusterBlockId source_router_id, ClusterBlockId sink_router_id, double flow_bandwidth, double max_flow_latency, int flow_priority) : source_router_module_name(std::move(source_router_name)) @@ -78,7 +75,7 @@ struct t_noc_traffic_flow { , traffic_flow_bandwidth(flow_bandwidth) , max_traffic_flow_latency(max_flow_latency) , traffic_flow_priority(flow_priority) - , score(0.0) {} + {} }; class NocTrafficFlows { diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 3210c8ad011..88a177f4ba6 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include "vtr_assert.h" @@ -30,9 +30,9 @@ static bool try_size_device_grid(const t_arch& arch, const std::map& num_type_instances, float target_device_utilization, std::string device_layout_name); /** - * @brief Counts the total number of models + * @brief Counts the total number of logic models that the architecture can implement. * - * @param user_models A linked list of models + * @param user_models A linked list of logic models. * @return int The total number of models in the linked list */ static int count_models(const t_model* user_models); diff --git a/vpr/src/place/initial_noc_placement.cpp b/vpr/src/place/initial_noc_placement.cpp new file mode 100644 index 00000000000..b2ee3f50d88 --- /dev/null +++ b/vpr/src/place/initial_noc_placement.cpp @@ -0,0 +1,277 @@ + +#include "initial_noc_placment.h" +#include "initial_placement.h" +#include "noc_place_utils.h" +#include "noc_place_checkpoint.h" + +/** + * @brief Evaluates whether a NoC router swap should be accepted or not. 
+ * If delta cost is non-positive, the move is always accepted. If the cost + * has increased, the probability of accepting the move is prob. + * + * @param delta_cost Specifies how much the total cost would change if + * the proposed swap is accepted. + * @param prob The probability by which a router swap that increases + * the cost is accepted. The passed value should be in range [0, 1]. + * + * @return true if the proposed swap is accepted, false if not. + */ +static bool accept_noc_swap(double delta_cost, double prob); + +/** + * @brief Places a constrained NoC router within its partition region. + * + * @param router_blk_id NoC router cluster block ID + */ +static void place_constrained_noc_router(ClusterBlockId router_blk_id); + +/** + * @brief Randomly places unconstrained NoC routers. + * + * @param unfixed_routers Contains the cluster block ID for all unconstrained + * NoC routers. + * @param seed Used for shuffling NoC routers. + */ +static void place_noc_routers_randomly (std::vector& unfixed_routers, int seed); + +/** + * @brief Runs a simulated annealing optimizer for NoC routers. + * + * @param noc_opts Contains weighting factors for NoC cost terms. 
+ */ +static void noc_routers_anneal(const t_noc_opts& noc_opts); + +static bool accept_noc_swap(double delta_cost, double prob) { + if (delta_cost <= 0.0) { + return true; + } + + if (prob == 0.0) { + return false; + } + + float random_num = vtr::frand(); + if (random_num < prob) { + return true; + } else { + return false; + } +} + +static void place_constrained_noc_router(ClusterBlockId router_blk_id) +{ + auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& floorplanning_ctx = g_vpr_ctx.floorplanning(); + + auto block_type = cluster_ctx.clb_nlist.block_type(router_blk_id); + const PartitionRegion& pr = floorplanning_ctx.cluster_constraints[router_blk_id]; + + // Create a macro with a single member + t_pl_macro_member macro_member; + macro_member.blk_index = router_blk_id; + macro_member.offset = t_pl_offset(0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool macro_placed = false; + for (int i_try = 0; i_try < MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY && !macro_placed; i_try++) { + macro_placed = try_place_macro_randomly(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + macro_placed = try_place_macro_exhaustively(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster within its constrained region"); + } +} + +static void place_noc_routers_randomly (std::vector& unfixed_routers, int seed) +{ + auto& place_ctx = g_vpr_ctx.placement(); + auto& noc_ctx = g_vpr_ctx.noc(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& device_ctx = g_vpr_ctx.device(); + + /* + * Unconstrained NoC routers are placed randomly, then NoC cost is optimized using simulated annealing. + * For random placement, physical NoC routers are shuffled, the logical NoC routers are assigned + * to shuffled physical routers. This is equivalent to placing each logical NoC router at a + * randomly selected physical router. 
The only difference is that an occupied physical NoC router + * might be selected multiple times. Shuffling makes sure that each physical NoC router is evaluated + * only once. + */ + + // Make a copy of NoC physical routers because we want to change its order + vtr::vector noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Shuffle physical NoC routers + vtr::RandState rand_state = seed; + vtr::shuffle(noc_phy_routers.begin(), noc_phy_routers.end(), rand_state); + + // Get the logical block type for router + const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); + + // Get the compressed grid for NoC + const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; + + // Iterate over shuffled physical routers to place logical routers + // Since physical routers are shuffled, router placement would be random + for (const auto& phy_router : noc_phy_routers) { + t_physical_tile_loc router_phy_loc = phy_router.get_router_physical_location(); + + // Find a compatible sub-tile + const auto& phy_type = device_ctx.grid.get_physical_type(router_phy_loc); + const auto& compatible_sub_tiles = compressed_noc_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + int sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; + + t_pl_loc loc(router_phy_loc, sub_tile); + + if (place_ctx.grid_blocks.is_sub_tile_empty(router_phy_loc, sub_tile)) { + // Pick one of the unplaced routers + auto logical_router_bid = unfixed_routers.back(); + unfixed_routers.pop_back(); + + // Create a macro with a single member + t_pl_macro_member macro_member; + macro_member.blk_index = logical_router_bid; + macro_member.offset = t_pl_offset(0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool legal = try_place_macro(pl_macro, loc); + if (!legal) { + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster into 
an empty physical router."); + } + + // When all router clusters are placed, stop iterating over remaining physical routers + if (unfixed_routers.empty()) { + break; + } + } + } // end for of random router placement +} + +static void noc_routers_anneal(const t_noc_opts& noc_opts) +{ + auto& noc_ctx = g_vpr_ctx.noc(); + + // Only NoC related costs are considered + t_placer_costs costs; + + // Initialize NoC-related costs + costs.noc_aggregate_bandwidth_cost = comp_noc_aggregate_bandwidth_cost(); + costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); + update_noc_normalization_factors(costs); + costs.cost = calculate_noc_cost(costs, noc_opts); + + // Maximum distance in each direction that a router can travel in a move + // It is assumed that NoC routers are organized in a square grid. + // Each router can initially move within the entire grid with a single swap. + const size_t n_physical_routers = noc_ctx.noc_model.get_noc_routers().size(); + const float max_r_lim = ceilf(sqrtf((float)n_physical_routers)); + + // At most, two routers are swapped + t_pl_blocks_to_be_moved blocks_affected(2); + + // Total number of moves grows linearly with the number of logical NoC routers. + // The constant factor was selected experimentally by running the algorithm on + // synthetic benchmarks. NoC-related metrics did not improve after increasing + // the constant factor above 35000. + // Get all the router clusters and figure out how many of them exist + const int num_router_clusters = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist().size(); + const int N_MOVES = num_router_clusters * 35000; + + const double starting_prob = 0.5; + const double prob_step = starting_prob / N_MOVES; + + // The checkpoint stored the placement with the lowest cost. + NoCPlacementCheckpoint checkpoint; + + /* Algorithm overview: + * In each iteration, one logical NoC router and a physical NoC router are selected randomly. 
+ * If the selected physical NoC router is occupied, two logical NoC routers are swapped. + * If not, the selected logical NoC router is moved to the vacant physical router. + * Then, the cost difference of this swap is computed. If the swap reduces the cost, + * it is always accepted. Swaps that increase the cost are accepted with a + * gradually decreasing probability. The placement with the lowest cost is saved + * as a checkpoint. When the annealing is over, if the checkpoint has a better + * cost than the current placement, the checkpoint is restored. + * Range limit and the probability of accepting swaps with positive delta cost + * decrease linearly as more swaps are evaluated. Late in the annealing, + * NoC routers are swapped only with their neighbors as the range limit approaches 1. + */ + + // Generate and evaluate router moves + for (int i_move = 0; i_move < N_MOVES; i_move++) { + e_create_move create_move_outcome = e_create_move::ABORT; + clear_move_blocks(blocks_affected); + // Shrink the range limit over time + float r_lim_decayed = 1.0f + (N_MOVES - i_move) * (max_r_lim / N_MOVES); + create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); + + if (create_move_outcome != e_create_move::ABORT) { + apply_move_blocks(blocks_affected); + + double noc_aggregate_bandwidth_delta_c = 0.0; + double noc_latency_delta_c = 0.0; + find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); + double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); + + double prob = starting_prob - i_move * prob_step; + bool move_accepted = accept_noc_swap(delta_cost, prob); + + if (move_accepted) { + costs.cost += delta_cost; + commit_move_blocks(blocks_affected); + commit_noc_costs(); + costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; + 
costs.noc_latency_cost += noc_latency_delta_c; + if (costs.cost < checkpoint.get_cost() || !checkpoint.is_valid()) { + checkpoint.save_checkpoint(costs.cost); + } + } else { // The proposed move is rejected + revert_move_blocks(blocks_affected); + revert_noc_traffic_flow_routes(blocks_affected); + } + } + } + + if (checkpoint.get_cost() < costs.cost) { + checkpoint.restore_checkpoint(noc_opts, costs); + } +} + +void initial_noc_placement(const t_noc_opts& noc_opts, int seed) { + auto& noc_ctx = g_vpr_ctx.noc(); + + // Get all the router clusters + const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + // Holds all the routers that are not fixed into a specific location by constraints + std::vector unfixed_routers; + + // Check for floorplanning constraints and place constrained NoC routers + for (auto router_blk_id : router_blk_ids) { + // The block is fixed and was placed in mark_fixed_blocks() + if (is_block_placed((router_blk_id))) { + continue; + } + + if (is_cluster_constrained(router_blk_id)) { + place_constrained_noc_router(router_blk_id); + } else { + unfixed_routers.push_back(router_blk_id); + } + } + + // Place unconstrained NoC routers randomly + place_noc_routers_randomly(unfixed_routers,seed); + + // populate internal data structures to maintain route, bandwidth usage, and latencies + initial_noc_routing(); + + // Run the simulated annealing optimizer for NoC routers + noc_routers_anneal(noc_opts); +} \ No newline at end of file diff --git a/vpr/src/place/initial_noc_placment.h b/vpr/src/place/initial_noc_placment.h new file mode 100644 index 00000000000..4f060a14277 --- /dev/null +++ b/vpr/src/place/initial_noc_placment.h @@ -0,0 +1,15 @@ + +#ifndef VTR_INITIAL_NOC_PLACMENT_H +#define VTR_INITIAL_NOC_PLACMENT_H + +#include "vpr_types.h" + +/** + * @brief Randomly places NoC routers, then runs a quick simulated annealing + * to minimize NoC costs. + * + * @param noc_opts NoC-related options. 
Used to calculate NoC-related costs. + */ +void initial_noc_placement(const t_noc_opts& noc_opts, int seed); + +#endif //VTR_INITIAL_NOC_PLACMENT_H diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index a37896cc985..a8cd383fc4e 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -5,14 +5,13 @@ #include "globals.h" #include "read_place.h" #include "initial_placement.h" +#include "initial_noc_placment.h" #include "vpr_utils.h" #include "place_util.h" #include "place_constraints.h" #include "move_utils.h" #include "region.h" #include "directed_moves_util.h" -#include "noc_place_utils.h" -#include "noc_place_checkpoint.h" #include "echo_files.h" @@ -36,13 +35,8 @@ constexpr int INVALID_X = -1; // The amount of weight that will added to each tile which is outside of the floorplanning constraints #define SORT_WEIGHT_PER_TILES_OUTSIDE_OF_PR 100 -/* The maximum number of tries when trying to place a macro at a * - * random location before trying exhaustive placement - find the first * - * legal position and place it during initial placement. */ -#define MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY 8 - /** - * @brief Set choosen grid locations to EMPTY block id before each placement iteration + * @brief Set chosen grid locations to EMPTY block id before each placement iteration * * @param unplaced_blk_types_index Block types that their grid locations must be cleared. * @@ -53,57 +47,15 @@ static void clear_block_type_grid_locs(const std::unordered_set& unplaced_b * @brief Initializes the grid to empty. It also initialized the location for * all blocks to unplaced. */ -static void initialize_grid_locs(); - -/** - * @brief Calculates total NoC cost. - * - * @param costs Contains latency and aggregate bandwidth costs - * along with their corresponding normalization factors. - * @param noc_opts Contains NoC placement weighting factor. - * - * @return Calculated total NoC cost. 
- */ -static double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts); - -/** - * @brief Evaluates whether a NoC router swap should be accepted or not. - * - * @param delta_cost Specifies how much the total cost would change if - * the proposed swap is accepted. - * @param prob The probability by which a router swap that increases - * the cost is accepted. - * - * @return true if the proposed swap is accepted, false if not. - */ -static bool assess_noc_swap(double delta_cost, double prob); - -/** - * @brief Randomly places NoC routers, then runs a quick simulated annealing - * to minimize NoC costs. - * - * @param noc_opts NoC-related options. Used to calculate NoC-related costs. - */ -static void initial_noc_placement(const t_noc_opts& noc_opts); - -/** - * @brief Places the macro if the head position passed in is legal, and all the resulting - * member positions are legal - * - * @param pl_macro The macro to be placed. - * @param head_pos The location of the macro head member. - * - * @return true if macro was placed, false if not. - */ -static bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos); +static void clear_all_grid_locs(); /** * @brief Control routine for placing a macro. * First iteration of place_marco performs the following steps to place a macro: * 1) try_centroid_placement : tries to find a location based on the macro's logical connections. - * 2) try_random_placement : if no smart location found in the centroid placement, the function tries + * 2) try_place_macro_randomly : if no smart location found in the centroid placement, the function tries * to place it randomly for the max number of tries. - * 3) try_exhaustive_placement : if neither placement alogrithms work, the function will find a location + * 3) try_place_macro_exhaustively : if neither placement algorithms work, the function will find a location * for the macro by exhaustively searching all available locations.
* If first iteration failed, next iteration calls dense placement for specific block types. * @@ -202,11 +154,11 @@ static std::vector find_centroid_loc(const t_pl_macro& pl_macro, * * @param centroid_loc Calculated location in try_centroid_placement function for the block. * @param block_type Logical block type of the macro blocks. - * @param check_empty If set, the function tries to find an empty location. + * @param search_for_empty If set, the function tries to find an empty location. * * @return true if the function can find any location near the centroid one, false otherwise. */ -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool check_empty); +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool search_for_empty); /** * @brief tries to place a macro at a centroid location of its placed connections. @@ -222,32 +174,6 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ */ static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores); -/** - * @brief tries to place a macro at a random location - * - * @param pl_macro The macro to be placed. - * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not - * constrained. - * @param block_type Logical block type of the macro blocks. - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. - * - * @return true if the macro gets placed, false if not. - */ -static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); - -/** - * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. 
- * - * @param pl_macro The macro to be placed. - * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not - * constrained. - * @param block_type Logical block type of the macro blocks. - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. - * - * @return true if the macro gets placed, false if not. - */ -static bool try_exhaustive_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); - /** * @brief Looks for a valid placement location for macro in second iteration, tries to place as many macros as possible in one column * and avoids fragmenting the available locations in one column. @@ -299,7 +225,7 @@ static void check_initial_placement_legality() { } } -static bool is_block_placed(ClusterBlockId blk_id) { +bool is_block_placed(ClusterBlockId blk_id) { auto& place_ctx = g_vpr_ctx.placement(); return (place_ctx.block_locs[blk_id].loc.x != INVALID_X); @@ -332,7 +258,7 @@ static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_typ return legal; } -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool check_empty) { +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool search_for_empty) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); const int centroid_loc_layer_num = centroid_loc.layer; @@ -367,7 +293,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ to_compressed_loc, false, centroid_loc_layer_num, - check_empty); + search_for_empty); if (!legal) { return false; @@ -640,7 +566,7 @@ static inline void fix_IO_block_types(const t_pl_macro& pl_macro, t_pl_loc loc, } } -static bool try_random_placement(const 
t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +bool try_place_macro_randomly(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; t_pl_loc loc; @@ -707,7 +633,7 @@ static bool try_random_placement(const t_pl_macro& pl_macro, const PartitionRegi return legal; } -static bool try_exhaustive_placement(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +bool try_place_macro_exhaustively(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -819,7 +745,7 @@ static bool try_dense_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, return legal; } -static bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos) { +bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos) { auto& place_ctx = g_vpr_ctx.mutable_placement(); VTR_LOGV_DEBUG(place_ctx.f_placer_debug, "\t\t\t\tTry to place the macro at %dx%dx%dx%d\n", @@ -907,7 +833,7 @@ static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, en // If macro is not placed yet, try to place the macro randomly for the max number of random tries for (int itry = 0; itry < macros_max_num_tries && macro_placed == false; itry++) { VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry random place iter: %d\n", itry); - macro_placed = try_random_placement(pl_macro, pr, block_type, pad_loc_type); + macro_placed = try_place_macro_randomly(pl_macro, pr, block_type, pad_loc_type); } // Finished all tries if (!macro_placed) { @@ -919,7 
+845,7 @@ static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, en // Exhaustive placement of carry macros VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry exhaustive placement\n"); - macro_placed = try_exhaustive_placement(pl_macro, pr, block_type, pad_loc_type); + macro_placed = try_place_macro_exhaustively(pl_macro, pr, block_type, pad_loc_type); } return macro_placed; } @@ -1113,7 +1039,7 @@ static void clear_block_type_grid_locs(const std::unordered_set& unplaced_b } } -static void initialize_grid_locs() { +static void clear_all_grid_locs() { auto& device_ctx = g_vpr_ctx.device(); std::unordered_set blk_types_to_be_cleared; @@ -1167,195 +1093,7 @@ bool place_one_block(const ClusterBlockId& blk_id, return placed_macro; } -static double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts) { - double noc_cost = 0.0; - noc_cost = (noc_opts.noc_placement_weighting) * ((costs.noc_aggregate_bandwidth_cost * costs.noc_aggregate_bandwidth_cost_norm) + (costs.noc_latency_cost * costs.noc_latency_cost_norm)); - return noc_cost; -} - -static bool assess_noc_swap(double delta_cost, double prob) { - if (delta_cost <= 0.0) { - return true; - } - - if (prob == 0.0) { - return false; - } - - float random_num = vtr::frand(); - if (random_num < prob) { - return true; - } else { - return false; - } -} - -static void initial_noc_placement(const t_noc_opts& noc_opts) { - auto& place_ctx = g_vpr_ctx.placement(); - auto& noc_ctx = g_vpr_ctx.noc(); - auto& cluster_ctx = g_vpr_ctx.clustering(); - auto& device_ctx = g_vpr_ctx.device(); - const auto& floorplanning_ctx = g_vpr_ctx.floorplanning(); - - // Get all the router clusters and figure out how many of them exist - const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - const int num_router_clusters = router_blk_ids.size(); - - // Holds all the routers that are not fixed into a specific location by constraints - 
std::vector unfixed_routers; - - for (auto router_blk_id : router_blk_ids) { - // The block is fixed and was placed in mark_fixed_blocks() - if (is_block_placed((router_blk_id))) { - continue; - } - - if (is_cluster_constrained(router_blk_id)) { - auto block_type = cluster_ctx.clb_nlist.block_type(router_blk_id); - const PartitionRegion& pr = floorplanning_ctx.cluster_constraints[router_blk_id]; - - // Create a macro with a single member - t_pl_macro_member macro_member; - macro_member.blk_index = router_blk_id; - macro_member.offset = t_pl_offset(0, 0, 0); - t_pl_macro pl_macro; - pl_macro.members.push_back(macro_member); - - bool macro_placed = false; - for (int i_try = 0; i_try < MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY && !macro_placed; i_try++) { - macro_placed = try_random_placement(pl_macro, pr, block_type, FREE); - } - - if (!macro_placed) { - macro_placed = try_exhaustive_placement(pl_macro, pr, block_type, FREE); - } - - if (!macro_placed) { - VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster within its constrained region"); - } - - } else { - unfixed_routers.push_back(router_blk_id); - } - } - - // Make a copy of NoC physical routers because we want to change its order - vtr::vector noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); - - // Shuffle NoC physical routers - vtr::RandState rand_state = vtr::irand(1024); - vtr::shuffle(noc_phy_routers.begin(), noc_phy_routers.end(), rand_state); - - // Get the logical block type for router - const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); - - // Get the compressed grid for NoC - const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; - - // Iterate over shuffled physical routers to place logical routers - // Since physical routers are shuffled, router placement would be random - for (const auto& phy_router : noc_phy_routers) { - t_physical_tile_loc router_phy_loc = 
phy_router.get_router_physical_location(); - - // Find a compatible sub-tile - const auto& phy_type = device_ctx.grid.get_physical_type(router_phy_loc); - const auto& compatible_sub_tiles = compressed_noc_grid.compatible_sub_tiles_for_tile.at(phy_type->index); - int sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; - - t_pl_loc loc(router_phy_loc, sub_tile); - - if (place_ctx.grid_blocks.is_sub_tile_empty(router_phy_loc, sub_tile)) { - // Pick one of the unplaced routers - auto logical_router_bid = unfixed_routers.back(); - unfixed_routers.pop_back(); - - // Create a macro with a single member - t_pl_macro_member macro_member; - macro_member.blk_index = logical_router_bid; - macro_member.offset = t_pl_offset(0, 0, 0); - t_pl_macro pl_macro; - pl_macro.members.push_back(macro_member); - - bool legal = try_place_macro(pl_macro, loc); - if (!legal) { - VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster into an empty physical router."); - } - - // When all router clusters are placed, stop iterating over remaining physical routers - if (unfixed_routers.empty()) { - break; - } - } - } // end for of random router placement - - // populate internal data structures to maintain route, bandwidth usage, and latencies - initial_noc_routing(); - - // Only NoC related costs are considered - t_placer_costs costs; - - costs.noc_aggregate_bandwidth_cost = comp_noc_aggregate_bandwidth_cost(); - costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); - update_noc_normalization_factors(costs); - costs.cost = calculate_noc_cost(costs, noc_opts); - - // Maximum distance in each direction that a router can travel in a move - const float max_r_lim = ceilf(sqrtf((float)noc_phy_routers.size())); - - // At most, two routers are swapped - t_pl_blocks_to_be_moved blocks_affected(2); - - // Total number of moves - const int N_MOVES = num_router_clusters * 35 * 1000; - - const double starting_prob = 0.5; - const double prob_step = starting_prob / 
N_MOVES; - - NoCPlacementCheckpoint checkpoint; - - // Generate and evaluate router moves - for (int i_move = 0; i_move < N_MOVES; i_move++) { - e_create_move create_move_outcome = e_create_move::ABORT; - clear_move_blocks(blocks_affected); - // Shrink the range limit over time - float r_lim_decayed = 1.0f + (N_MOVES - i_move) * (max_r_lim / N_MOVES); - create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); - - if (create_move_outcome != e_create_move::ABORT) { - apply_move_blocks(blocks_affected); - - double noc_aggregate_bandwidth_delta_c = 0.0; - double noc_latency_delta_c = 0.0; - find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); - double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); - - double prob = starting_prob - i_move * prob_step; - bool move_accepted = assess_noc_swap(delta_cost, prob); - - if (move_accepted) { - costs.cost += delta_cost; - commit_move_blocks(blocks_affected); - commit_noc_costs(); - costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; - costs.noc_latency_cost += noc_latency_delta_c; - if (costs.cost < checkpoint.get_cost() || !checkpoint.is_valid()) { - checkpoint.save_checkpoint(costs.cost); - } - } else { // The proposed move is rejected - revert_move_blocks(blocks_affected); - revert_noc_traffic_flow_routes(blocks_affected); - } - } - } - - if (checkpoint.get_cost() < costs.cost) { - checkpoint.restore_checkpoint(noc_opts, costs); - } -} - - void initial_placement(const t_placer_opts& placer_opts, - enum e_pad_loc_type pad_loc_type, const char* constraints_file, const t_noc_opts& noc_opts) { vtr::ScopedStartFinishTimer timer("Initial Placement"); @@ -1363,7 +1101,7 @@ void initial_placement(const t_placer_opts& placer_opts, /* Initialize the grid blocks to empty. 
* Initialize all the blocks to unplaced. */ - initialize_grid_locs(); + clear_all_grid_locs(); /* Go through cluster blocks to calculate the tightest placement * floorplan constraint for each constrained block @@ -1376,14 +1114,14 @@ void initial_placement(const t_placer_opts& placer_opts, if (noc_opts.noc) { // NoC routers are placed before other blocks - initial_noc_placement(noc_opts); + initial_noc_placement(noc_opts, placer_opts.seed); } //Assign scores to blocks and placement macros according to how difficult they are to place vtr::vector block_scores = assign_block_scores(); //Place all blocks - place_all_blocks(placer_opts, block_scores, pad_loc_type, constraints_file); + place_all_blocks(placer_opts, block_scores, placer_opts.pad_loc_type, constraints_file); //if any blocks remain unplaced, print an error check_initial_placement_legality(); diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index cd1d3b13440..44a3772087d 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -2,6 +2,13 @@ #define VPR_INITIAL_PLACEMENT_H #include "vpr_types.h" +#include "place_macro.h" +#include "partition_region.h" + +/* The maximum number of tries when trying to place a macro at a * + * random location before trying exhaustive placement - find the first * + * legal position and place it during initial placement. */ +constexpr int MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY = 8; /** * @brief Used to assign each block a score for how difficult it is to place. @@ -38,6 +45,58 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; +/** + * @brief tries to place a macro at a random location + * + * @param pl_macro The macro to be placed. + * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not + * constrained. + * @param block_type Logical block type of the macro blocks. 
+ * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * + * @return true if the macro gets placed, false if not. + */ +bool try_place_macro_randomly(const t_pl_macro& pl_macro, + const PartitionRegion& pr, + t_logical_block_type_ptr block_type, + enum e_pad_loc_type pad_loc_type); + +/** + * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. + * + * @param pl_macro The macro to be placed. + * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not + * constrained. + * @param block_type Logical block type of the macro blocks. + * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * + * @return true if the macro gets placed, false if not. + */ +bool try_place_macro_exhaustively(const t_pl_macro& pl_macro, + const PartitionRegion& pr, + t_logical_block_type_ptr block_type, + enum e_pad_loc_type pad_loc_type); + +/** + * @brief Places the macro if the head position passed in is legal, and all the resulting + * member positions are legal + * + * @param pl_macro The macro to be placed. + * @param head_pos The location of the macro head member. + * + * @return true if macro was placed, false if not. + */ +bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos); + +/** + * @brief Checks whether the block is already placed + * + * @param blk_id block id of the block to be checked + * + * @return true if the block was placed, false if not. + */ +bool is_block_placed(ClusterBlockId blk_id); + /** * @brief Tries to find an initial placement location for each block considering floorplanning constraints * and throws an error out if it fails after max number of attempts. @@ -46,14 +105,13 @@ struct t_grid_empty_locs_block_type { * flows and updating the bandwidths used by the links due to the * traffic flows. 
* - * @param placer_opts Required by the function that set the status of f_placer_debug - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * @param placer_opts Required by the function that set the status of f_placer_debug. + * Also used to access pad_loc_type to see if a block needs to be marked fixed. * @param constraints_file Used to read block locations if any constraints is available. * @param noc_enabled Used to check whether the user turned on the noc * optimization during placement. */ void initial_placement(const t_placer_opts& placer_opts, - enum e_pad_loc_type pad_loc_type, const char* constraints_file, const t_noc_opts& noc_opts); diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index 933de84c382..0bc001e42da 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -1031,6 +1031,29 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, to_loc = t_pl_loc(grid_loc.x, grid_loc.y, sub_tile, grid_loc.layer_num); } +bool has_empty_compatible_subtile(t_logical_block_type_ptr type, const t_physical_tile_loc& to_loc) +{ + auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.placement(); + + const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; + bool legal = false; + + t_pl_loc to_uncompressed_loc; + compressed_grid_to_loc(type, to_loc, to_uncompressed_loc); + const t_physical_tile_loc to_phy_uncompressed_loc{to_uncompressed_loc.x, to_uncompressed_loc.y, to_uncompressed_loc.layer}; + const auto& phy_type = device_ctx.grid.get_physical_type(to_phy_uncompressed_loc); + const auto& compatible_sub_tiles = compressed_block_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + for (const auto& sub_tile : compatible_sub_tiles) { + if (place_ctx.grid_blocks.is_sub_tile_empty(to_phy_uncompressed_loc, sub_tile)) { + legal = true; + break; + } + } + + return legal; +} + bool 
find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, const int delta_cx, const t_physical_tile_loc& from_loc, @@ -1038,9 +1061,7 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, t_physical_tile_loc& to_loc, bool is_median, int to_layer_num, - bool check_empty) { - auto& device_ctx = g_vpr_ctx.device(); - auto& place_ctx = g_vpr_ctx.placement(); + bool search_for_empty) { //TODO For the time being, the blocks only moved in the same layer. This assertion should be removed after VPR is updated to move blocks between layers VTR_ASSERT(to_layer_num == from_loc.layer_num); const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; @@ -1122,18 +1143,8 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, if (from_loc.x == to_loc.x && from_loc.y == to_loc.y && from_loc.layer_num == to_layer_num) { continue; //Same from/to location -- try again for new y-position - } else if (check_empty) { // Check if the location has at least one empty sub-tile - t_pl_loc to_uncompressed_loc; - compressed_grid_to_loc(type, to_loc, to_uncompressed_loc); - const t_physical_tile_loc to_phy_uncompressed_loc{to_uncompressed_loc.x, to_uncompressed_loc.y, to_uncompressed_loc.layer}; - const auto& phy_type = device_ctx.grid.get_physical_type(to_phy_uncompressed_loc); - const auto& compatible_sub_tiles = compressed_block_grid.compatible_sub_tiles_for_tile.at(phy_type->index); - for (const auto& sub_tile : compatible_sub_tiles) { - if (place_ctx.grid_blocks.is_sub_tile_empty(to_phy_uncompressed_loc, sub_tile)) { - legal = true; - break; - } - } + } else if (search_for_empty) { // Check if the location has at least one empty sub-tile + legal = has_empty_compatible_subtile(type, to_loc); } else { legal = true; } diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h index dbf28c01458..39d6621f697 100644 --- a/vpr/src/place/move_utils.h +++ b/vpr/src/place/move_utils.h @@ -222,6 
+222,19 @@ const std::string& move_type_to_string(e_move_type); void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, t_physical_tile_loc compressed_loc, t_pl_loc& to_loc); + +/** + * @brief Checks whether the given location has a compatible empty subtile with + * the given type. + * + * @param type logical block type + * @param to_loc The location to be checked + * + * @return bool True if the given location has at least one empty compatible subtile. + */ +bool has_empty_compatible_subtile(t_logical_block_type_ptr type, + const t_physical_tile_loc& to_loc); + /** * @brief find compressed location in a compressed range for a specific type in the given layer (to_layer_num) * @@ -231,7 +244,7 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, * to_loc: the coordinates of the new location on the compressed grid * is_median: true if this is called from find_to_loc_median * to_layer_num: the layer number of the new location (set by the caller) - * check_empty: indicates that the returned location must be empty + * search_for_empty: indicates that the returned location must be empty */ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, const int delta_cx, @@ -240,7 +253,7 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, t_physical_tile_loc& to_loc, bool is_median, int to_layer_num, - bool check_empty); + bool search_for_empty); /** * @brief Get the the compressed loc from the uncompressed loc (grid_loc) diff --git a/vpr/src/place/noc_place_checkpoint.cpp b/vpr/src/place/noc_place_checkpoint.cpp index ff9b75b9e87..a25cd9ec82c 100644 --- a/vpr/src/place/noc_place_checkpoint.cpp +++ b/vpr/src/place/noc_place_checkpoint.cpp @@ -2,7 +2,7 @@ #include "noc_place_checkpoint.h" #include "noc_place_utils.h" -RouterPlacementCheckpoint::RouterPlacementCheckpoint() +NoCPlacementCheckpoint::NoCPlacementCheckpoint() : valid_(false) , cost_(std::numeric_limits::infinity()) { const auto& noc_ctx = 
g_vpr_ctx.noc(); @@ -12,12 +12,13 @@ RouterPlacementCheckpoint::RouterPlacementCheckpoint() router_locations_.clear(); + // Initializes checkpoint locations to invalid for (const auto& router_bid : router_bids) { router_locations_[router_bid] = t_pl_loc(OPEN, OPEN, OPEN, OPEN); } } -void RouterPlacementCheckpoint::save_checkpoint(double cost) { +void NoCPlacementCheckpoint::save_checkpoint(double cost) { const auto& noc_ctx = g_vpr_ctx.noc(); const auto& place_ctx = g_vpr_ctx.placement(); @@ -31,7 +32,7 @@ void RouterPlacementCheckpoint::save_checkpoint(double cost) { cost_ = cost; } -void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { +void NoCPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { const auto& noc_ctx = g_vpr_ctx.noc(); const auto& device_ctx = g_vpr_ctx.device(); auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -46,7 +47,7 @@ void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t place_ctx.grid_blocks.set_usage(phy_loc, 0); auto tile = device_ctx.grid.get_physical_type(phy_loc); - for (auto sub_tile : tile->sub_tiles) { + for (const auto& sub_tile : tile->sub_tiles) { auto capacity = sub_tile.capacity; for (int k = 0; k < capacity.total(); k++) { @@ -70,10 +71,10 @@ void RouterPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t reinitialize_noc_routing(noc_opts, costs); } -bool RouterPlacementCheckpoint::is_valid() const { +bool NoCPlacementCheckpoint::is_valid() const { return valid_; } -double RouterPlacementCheckpoint::get_cost() const { +double NoCPlacementCheckpoint::get_cost() const { return cost_; } diff --git a/vpr/src/place/noc_place_checkpoint.h b/vpr/src/place/noc_place_checkpoint.h index ef016c64ad3..6c160ef4d1d 100644 --- a/vpr/src/place/noc_place_checkpoint.h +++ b/vpr/src/place/noc_place_checkpoint.h @@ -1,24 +1,72 @@ #ifndef VTR_ROUTERPLACEMENTCHECKPOINT_H #define VTR_ROUTERPLACEMENTCHECKPOINT_H 
+/** + * @brief NoC router placement checkpoint + * + * This class stores a checkpoint only for NoC router placement. + * If a checkpoint for all block types is needed, refer to place_checkpoint.h file. + * + * The initial placement for NoC routers is done before conventional blocks. Therefore, + * t_placement_checkpoint could not be used to store a checkpoint as t_placement_checkpoint + * assumes all blocks are placed. + * + * This class should only be used during initial NoC placement as it does not update + * bounding box and timing costs. + */ + #include "vpr_types.h" #include "place_util.h" -class RouterPlacementCheckpoint { - private: - std::unordered_map router_locations_; - bool valid_ = false; - double cost_; - +/** + * @brief A NoC router placement checkpoint + * + * The class stores a NoC router placement and its corresponding cost. + * The checkpoint can be restored to replace the current placement. + */ +class NoCPlacementCheckpoint { public: - RouterPlacementCheckpoint(); - RouterPlacementCheckpoint(const RouterPlacementCheckpoint& other) = delete; - RouterPlacementCheckpoint& operator=(const RouterPlacementCheckpoint& other) = delete; + /** + * @brief Default constructor initializes private member variables. + */ + NoCPlacementCheckpoint(); + NoCPlacementCheckpoint(const NoCPlacementCheckpoint& other) = delete; + NoCPlacementCheckpoint& operator=(const NoCPlacementCheckpoint& other) = delete; + /** + * @brief Saves the current NoC router placement as a checkpoint + * + * @param cost: The placement cost associated with the current placement + */ void save_checkpoint(double cost); + + /** + * @brief Loads the saved checkpoint into global placement data structures. 
+ * + * @param noc_opts: Contains weighting factors for different NoC cost terms + * @param costs: Used to load NoC related costs for the checkpoint + */ void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); + + /** + * @brief Indicates whether the object is empty or it has already stored a + * checkpoint. + * + * @return bool True if there is a saved checkpoint. + */ bool is_valid() const; + + /** + * @brief Return the cost associated with the checkpoint + * + * @return double Saved checkpoint's cost + */ double get_cost() const; + + private: + std::unordered_map router_locations_; + bool valid_ = false; + double cost_; }; #endif //VTR_ROUTERPLACEMENTCHECKPOINT_H diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index a11c42996e6..3f1bca50ab2 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -9,6 +9,18 @@ static vtr::vector traffic_flow_costs, p static std::vector affected_traffic_flows; /*********************************************************** *****************************/ +/** + * @brief Randomly selects a movable NoC router cluster block + * + * @param b_from The cluster block ID of the selected NoC router + * @param from The current location of the selected NoC router + * @param cluster_from_type Block type of the selected block + * @return bool True if a block was selected successfully. 
+ * False if there are no NoC routers in the netlist or the + * selected NoC router is fixed. + */ +static bool select_random_router_cluster(ClusterBlockId& b_from, t_pl_loc& from, t_logical_block_type_ptr& cluster_from_type); + void initial_noc_routing(void) { // need to get placement information about where the router cluster blocks are placed on the device const auto& place_ctx = g_vpr_ctx.placement(); @@ -249,6 +261,12 @@ void update_noc_normalization_factors(t_placer_costs& costs) { return; } +double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts) { + double noc_cost; + noc_cost = (noc_opts.noc_placement_weighting) * ((costs.noc_aggregate_bandwidth_cost * costs.noc_aggregate_bandwidth_cost_norm) + (costs.noc_latency_cost * costs.noc_latency_cost_norm)); + return noc_cost; +} + double comp_noc_aggregate_bandwidth_cost(void) { // used to get traffic flow route information auto& noc_ctx = g_vpr_ctx.mutable_noc(); @@ -532,140 +550,7 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa return create_move; } -e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_affected) { - auto& noc_ctx = g_vpr_ctx.noc(); - auto& place_ctx = g_vpr_ctx.placement(); - const auto& grid = g_vpr_ctx.device().grid; - - const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); - - // block ID for the randomly selected router cluster - ClusterBlockId b_from; - // current location of the randomly selected router cluster - t_pl_loc from; - // logical block type of the randomly selected router cluster - t_logical_block_type_ptr cluster_from_type; - bool random_select_success = false; - - // Randomly select a router cluster - random_select_success = select_random_router_cluster(b_from, from, cluster_from_type); - - const auto& compressed_noc_grid = g_vpr_ctx.placement().compressed_block_grids[cluster_from_type->index]; - - // If a random router cluster could not be selected, no move can be proposed - 
if (!random_select_success) { - return e_create_move::ABORT; - } - - // Get all the traffic flow associated with the selected router cluster - const std::vector* associated_flows = noc_ctx.noc_traffic_flows_storage.get_traffic_flows_associated_to_router_block(b_from); - - // There are no associated flows for this router. Centroid location cannot be calculated. - if (associated_flows == nullptr) { - return e_create_move::ABORT; - } - - double acc_x = 0.0; - double acc_y = 0.0; - double acc_weight = 0.0; - - // iterate over all the flows associated with the given router - for (auto flow_id : *associated_flows) { - auto& flow = noc_ctx.noc_traffic_flows_storage.get_single_noc_traffic_flow(flow_id); - ClusterBlockId source_blk_id = flow.source_router_cluster_id; - ClusterBlockId sink_blk_id = flow.sink_router_cluster_id; - - if (b_from == source_blk_id) { - acc_x += flow.score * place_ctx.block_locs[sink_blk_id].loc.x; - acc_y += flow.score * place_ctx.block_locs[sink_blk_id].loc.y; - acc_weight += flow.score; - } else if (b_from == sink_blk_id) { - acc_x += flow.score * place_ctx.block_locs[source_blk_id].loc.x; - acc_y += flow.score * place_ctx.block_locs[source_blk_id].loc.y; - acc_weight += flow.score; - } else { - VTR_ASSERT(false); - } - } - - t_pl_loc centroid_loc(OPEN, OPEN, OPEN, OPEN); - - if (acc_weight > 0.0) { - centroid_loc.x = (int)round(acc_x / acc_weight); - centroid_loc.y = (int)round(acc_y / acc_weight); - // NoC routers are not swapped across layers - // TODO: Is a 3d NoC feasible? 
If so, calculate the target layer - centroid_loc.layer = from.layer; - } else { - return e_create_move::ABORT; - } - - t_physical_tile_loc phy_centroid_loc{centroid_loc.x, centroid_loc.y, centroid_loc.y}; - - if (!is_loc_on_chip(phy_centroid_loc)) { - return e_create_move::ABORT; - } - - const auto& physical_type = grid.get_physical_type({centroid_loc.x, centroid_loc.y, centroid_loc.layer}); - - // If the calculated centroid does not have a compatible type, find a compatible location nearby - if (!is_tile_compatible(physical_type, cluster_from_type)) { - //Determine centroid location in the compressed space of the current block - auto compressed_centroid_loc = get_compressed_loc_approx(compressed_noc_grid, - {centroid_loc.x, centroid_loc.y, 0, centroid_loc.layer}, - num_layers); - int cx_centroid = compressed_centroid_loc[0].x; - int cy_centroid = compressed_centroid_loc[0].y; - - const int r_lim = 1; - int r_lim_x = std::min(compressed_noc_grid.compressed_to_grid_x.size(), r_lim); - int r_lim_y = std::min(compressed_noc_grid.compressed_to_grid_y.size(), r_lim); - - //Determine the valid compressed grid location ranges - int min_cx, max_cx, delta_cx; - int min_cy, max_cy; - - min_cx = std::max(0, cx_centroid - r_lim_x); - max_cx = std::min(compressed_noc_grid.compressed_to_grid_x.size() - 1, cx_centroid + r_lim_x); - - min_cy = std::max(0, cy_centroid - r_lim_y); - max_cy = std::min(compressed_noc_grid.compressed_to_grid_y.size() - 1, cy_centroid + r_lim_y); - - delta_cx = max_cx - min_cx; - - auto compressed_from_loc = get_compressed_loc_approx(compressed_noc_grid, - {from.x, from.y, 0, from.layer}, - num_layers); - - t_physical_tile_loc compressed_to_loc; - - bool legal = find_compatible_compressed_loc_in_range(cluster_from_type, - delta_cx, - compressed_from_loc[0], - {min_cx, max_cx, min_cy, max_cy}, - compressed_to_loc, - false, - compressed_from_loc[0].layer_num, - false); - - if (!legal) { - return e_create_move::ABORT; - } - - 
compressed_grid_to_loc(cluster_from_type, compressed_to_loc, centroid_loc); - } - - e_create_move create_move = ::create_move(blocks_affected, b_from, centroid_loc); - - //Check that all the blocks affected by the move would still be in a legal floorplan region after the swap - if (!floorplan_legal(blocks_affected)) { - return e_create_move::ABORT; - } - - return create_move; -} - -void write_noc_placement_file(std::string file_name) { +void write_noc_placement_file(const std::string& file_name) { // we need the clustered netlist to get the names of all the NoC router cluster blocks auto& cluster_ctx = g_vpr_ctx.clustering(); // we need to the placement context to determine the final placed locations of the NoC router cluster blocks diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index 4d1f65adf17..5dbaed43f8f 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -270,6 +270,17 @@ void recompute_noc_costs(double& new_noc_aggregate_bandwidth_cost, double& new_n */ void update_noc_normalization_factors(t_placer_costs& costs); +/** + * @brief Calculates total NoC cost. + * + * @param costs Contains latency and aggregate bandwidth costs + * along with their corresponding normalization factors. + * @param noc_opts Contains NoC placement weighting factor. + * + * @return Calculated total NoC cost. + */ +double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts); + /** * @brief Calculates the aggregate bandwidth of each traffic flow in the NoC * and initializes local variables that keep track of the traffic flow @@ -424,8 +435,6 @@ bool check_for_router_swap(int user_supplied_noc_router_swap_percentage); */ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim); -e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_affected); - /** * @brief Writes out the locations of the router cluster blocks in the * final placement. 
This file contains only NoC routers and the @@ -442,5 +451,5 @@ e_create_move propose_router_swap_flow_centroid(t_pl_blocks_to_be_moved& blocks_ * information. * */ -void write_noc_placement_file(std::string file_name); +void write_noc_placement_file(const std::string& file_name); #endif \ No newline at end of file diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 1b321a75e6e..cb67c1171e9 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -545,7 +545,6 @@ void try_place(const Netlist<>& net_list, vtr::ScopedStartFinishTimer timer("Placement"); initial_placement(placer_opts, - placer_opts.pad_loc_type, placer_opts.constraints_file.c_str(), noc_opts); @@ -1357,6 +1356,8 @@ static float starting_t(const t_annealing_state* state, t_placer_costs* costs, t VTR_LOG("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. * std_dev); #endif + // Improved initial placement uses a fast SA for NoC routers and centroid placement + // for other blocks. 
The temperature is reduced to prevent SA from destroying the initial placement float init_temp = std_dev / 64; return init_temp; @@ -2011,8 +2012,8 @@ static double get_total_cost(t_placer_costs* costs, const t_placer_opts& placer_ } if (noc_opts.noc) { - // in noc mode we include noc agggregate bandwidth and noc latency - total_cost += (noc_opts.noc_placement_weighting) * ((costs->noc_aggregate_bandwidth_cost * costs->noc_aggregate_bandwidth_cost_norm) + (costs->noc_latency_cost * costs->noc_latency_cost_norm)); + // in noc mode we include noc aggregate bandwidth and noc latency + total_cost += calculate_noc_cost(*costs, noc_opts); } return total_cost; diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt index ca38c172c7f..6e8a089ab8d 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt @@ -67,4 +67,4 @@ pass_requirements_file=pass_requirements_vpr_titan.txt #A large number of routing iterations is set to ensure the router doesn't give up to easily on the larger benchmarks #To be more run-time comparable to commercial tools like Quartus, we run with higher placer effort (inner_num=2) and lower astar_fac (1.0) #Set a 24hr timeout so they don't run forever -script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 \ No newline at end of file +script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 --seed 3 \ No newline at end of file From 4439184e7560828164ce842f8ffa2e506615196b Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Sat, 11 Nov 2023 
13:02:52 -0500 Subject: [PATCH 33/35] make format --- vpr/src/noc/noc_traffic_flows.h | 3 +-- vpr/src/pack/pack.cpp | 4 ++-- vpr/src/place/initial_noc_placement.cpp | 13 +++++-------- vpr/src/place/initial_placement.cpp | 2 +- vpr/src/place/move_utils.cpp | 5 ++--- vpr/src/place/noc_place_checkpoint.h | 6 +++--- 6 files changed, 14 insertions(+), 19 deletions(-) diff --git a/vpr/src/noc/noc_traffic_flows.h b/vpr/src/noc/noc_traffic_flows.h index d5a879ea6f5..8b433ef3599 100644 --- a/vpr/src/noc/noc_traffic_flows.h +++ b/vpr/src/noc/noc_traffic_flows.h @@ -74,8 +74,7 @@ struct t_noc_traffic_flow { , sink_router_cluster_id(sink_router_id) , traffic_flow_bandwidth(flow_bandwidth) , max_traffic_flow_latency(max_flow_latency) - , traffic_flow_priority(flow_priority) - {} + , traffic_flow_priority(flow_priority) {} }; class NocTrafficFlows { diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 88a177f4ba6..9fd61587cde 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -192,11 +192,11 @@ bool try_pack(t_packer_opts* packer_opts, VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); attraction_groups.create_att_groups_for_overfull_regions(); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if (pack_iteration == 3) { + } else if (pack_iteration == 3) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration. \n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); - } else if (pack_iteration == 4) { + } else if (pack_iteration == 4) { attraction_groups.create_att_groups_for_all_regions(); VTR_LOG("Floorplan regions are overfull: trying to pack again with more attraction groups exploration and higher target pin utilization. 
\n"); VTR_LOG("Pack iteration is %d\n", pack_iteration); diff --git a/vpr/src/place/initial_noc_placement.cpp b/vpr/src/place/initial_noc_placement.cpp index b2ee3f50d88..7f6c75a8f48 100644 --- a/vpr/src/place/initial_noc_placement.cpp +++ b/vpr/src/place/initial_noc_placement.cpp @@ -32,7 +32,7 @@ static void place_constrained_noc_router(ClusterBlockId router_blk_id); * NoC routers. * @param seed Used for shuffling NoC routers. */ -static void place_noc_routers_randomly (std::vector& unfixed_routers, int seed); +static void place_noc_routers_randomly(std::vector& unfixed_routers, int seed); /** * @brief Runs a simulated annealing optimizer for NoC routers. @@ -58,8 +58,7 @@ static bool accept_noc_swap(double delta_cost, double prob) { } } -static void place_constrained_noc_router(ClusterBlockId router_blk_id) -{ +static void place_constrained_noc_router(ClusterBlockId router_blk_id) { auto& cluster_ctx = g_vpr_ctx.clustering(); const auto& floorplanning_ctx = g_vpr_ctx.floorplanning(); @@ -87,8 +86,7 @@ static void place_constrained_noc_router(ClusterBlockId router_blk_id) } } -static void place_noc_routers_randomly (std::vector& unfixed_routers, int seed) -{ +static void place_noc_routers_randomly(std::vector& unfixed_routers, int seed) { auto& place_ctx = g_vpr_ctx.placement(); auto& noc_ctx = g_vpr_ctx.noc(); auto& cluster_ctx = g_vpr_ctx.clustering(); @@ -153,8 +151,7 @@ static void place_noc_routers_randomly (std::vector& unfixed_rou } // end for of random router placement } -static void noc_routers_anneal(const t_noc_opts& noc_opts) -{ +static void noc_routers_anneal(const t_noc_opts& noc_opts) { auto& noc_ctx = g_vpr_ctx.noc(); // Only NoC related costs are considered @@ -267,7 +264,7 @@ void initial_noc_placement(const t_noc_opts& noc_opts, int seed) { } // Place unconstrained NoC routers randomly - place_noc_routers_randomly(unfixed_routers,seed); + place_noc_routers_randomly(unfixed_routers, seed); // populate internal data structures to maintain route, 
bandwidth usage, and latencies initial_noc_routing(); diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index a8cd383fc4e..15a9c1dae1b 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -821,7 +821,7 @@ static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, en //If blk_types_empty_locs_in_grid is not NULL, means that initial placement has been failed in first iteration for this block type //We need to place densely in second iteration to be able to find a legal initial placement solution if (blk_types_empty_locs_in_grid != nullptr && !blk_types_empty_locs_in_grid->empty()) { - VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry dense placement\n"); + VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry dense placement\n"); macro_placed = try_dense_placement(pl_macro, pr, block_type, pad_loc_type, blk_types_empty_locs_in_grid); } diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index 0bc001e42da..8e187908968 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -1031,8 +1031,7 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, to_loc = t_pl_loc(grid_loc.x, grid_loc.y, sub_tile, grid_loc.layer_num); } -bool has_empty_compatible_subtile(t_logical_block_type_ptr type, const t_physical_tile_loc& to_loc) -{ +bool has_empty_compatible_subtile(t_logical_block_type_ptr type, const t_physical_tile_loc& to_loc) { auto& device_ctx = g_vpr_ctx.device(); auto& place_ctx = g_vpr_ctx.placement(); @@ -1142,7 +1141,7 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, VTR_ASSERT(to_loc.y <= search_range.ymax); if (from_loc.x == to_loc.x && from_loc.y == to_loc.y && from_loc.layer_num == to_layer_num) { - continue; //Same from/to location -- try again for new y-position + continue; //Same from/to location -- try again for new y-position } else if (search_for_empty) { // 
Check if the location has at least one empty sub-tile legal = has_empty_compatible_subtile(type, to_loc); } else { diff --git a/vpr/src/place/noc_place_checkpoint.h b/vpr/src/place/noc_place_checkpoint.h index 6c160ef4d1d..bf5c4305616 100644 --- a/vpr/src/place/noc_place_checkpoint.h +++ b/vpr/src/place/noc_place_checkpoint.h @@ -26,9 +26,9 @@ */ class NoCPlacementCheckpoint { public: - /** - * @brief Default constructor initializes private member variables. - */ + /** + * @brief Default constructor initializes private member variables. + */ NoCPlacementCheckpoint(); NoCPlacementCheckpoint(const NoCPlacementCheckpoint& other) = delete; NoCPlacementCheckpoint& operator=(const NoCPlacementCheckpoint& other) = delete; From 67f55f0d9c49de3157dc5573518a77be04376fc5 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Sat, 11 Nov 2023 18:16:08 -0500 Subject: [PATCH 34/35] set search_for_empty to false. --- vpr/src/place/initial_placement.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index 15a9c1dae1b..460c40a8807 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -609,7 +609,7 @@ bool try_place_macro_randomly(const t_pl_macro& pl_macro, const PartitionRegion& to_compressed_loc, false, reg_coord.layer_num, - true); + false); if (!legal) { //No valid position found return false; From 12b681ace9235a12042c1cbe9d5f2d8e06251163 Mon Sep 17 00:00:00 2001 From: soheilshahrouz Date: Mon, 13 Nov 2023 11:49:25 -0500 Subject: [PATCH 35/35] updated the default value for --noc_swap_percentage option With NoC initial placement, we no longer need to move NoC routers more than what the RL agent suggests. 
--- vpr/src/base/read_options.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index 32929c4fc9a..ef9aaa14234 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -2823,7 +2823,7 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .help( "Sets the minimum fraction of swaps attempted by the placer that are NoC blocks." "This value is an integer ranging from 0-100. 0 means NoC blocks will be moved at the same rate as other blocks. 100 means all swaps attempted by the placer are NoC router blocks.") - .default_value("40") + .default_value("0") .show_in(argparse::ShowIn::HELP_ONLY); noc_grp.add_argument(args.noc_placement_file_name, "--noc_placement_file_name")