diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index b734b82b850..d62b812e4b8 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -2823,7 +2823,7 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg .help( "Sets the minimum fraction of swaps attempted by the placer that are NoC blocks." "This value is an integer ranging from 0-100. 0 means NoC blocks will be moved at the same rate as other blocks. 100 means all swaps attempted by the placer are NoC router blocks.") - .default_value("40") + .default_value("0") .show_in(argparse::ShowIn::HELP_ONLY); noc_grp.add_argument(args.noc_placement_file_name, "--noc_placement_file_name") diff --git a/vpr/src/base/vpr_context.h b/vpr/src/base/vpr_context.h index a07a73e2827..6a07f367e13 100644 --- a/vpr/src/base/vpr_context.h +++ b/vpr/src/base/vpr_context.h @@ -344,6 +344,11 @@ struct ClusteringHelperContext : public Context { // the utilization of external input/output pins during packing (between 0 and 1) t_ext_pin_util_targets target_external_pin_util; + // During clustering, a block is related to un-clustered primitives with nets. + // This relation has three types: low fanout, high fanout, and transitive + // high_fanout_thresholds stores the threshold for nets to a block type to be considered high fanout + t_pack_high_fanout_thresholds high_fanout_thresholds; + // A vector of unordered_sets of AtomBlockIds that are inside each clustered block [0 .. 
num_clustered_blocks-1] // unordered_set for faster insertion/deletion during the iterative improvement process of packing vtr::vector> atoms_lookup; diff --git a/vpr/src/base/vpr_types.cpp b/vpr/src/base/vpr_types.cpp index c6c688e97c3..ed3fc40f9d0 100644 --- a/vpr/src/base/vpr_types.cpp +++ b/vpr/src/base/vpr_types.cpp @@ -7,7 +7,123 @@ t_ext_pin_util_targets::t_ext_pin_util_targets(float default_in_util, float defa defaults_.output_pin_util = default_out_util; } -t_ext_pin_util t_ext_pin_util_targets::get_pin_util(std::string block_type_name) const { +t_ext_pin_util_targets::t_ext_pin_util_targets(const std::vector& specs) + : t_ext_pin_util_targets(1., 1.) { + if (specs.size() == 1 && specs[0] == "auto") { + //No user-specified pin utilizations, infer them automatically. + // + //We set a pin utilization target based on the block type, with + //the logic block having a lower utilization target and other blocks + //(e.g. hard blocks) having no limit. + + auto& device_ctx = g_vpr_ctx.device(); + auto& grid = device_ctx.grid; + t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); + + //Allowing 100% pin utilization of the logic block type can harm + //routability, since it may allow a few (typically outlier) clusters to + //use a very large number of pins -- causing routability issues. These + //clusters can cause failed routings where only a handful of routing + //resource nodes remain overused (and do not resolve) These can be + //avoided by putting a (soft) limit on the number of input pins which + //can be used, effectively clipping off the most egregious outliers. + // + //Experiments show that limiting input utilization produces better quality + //than limiting output utilization (limiting input utilization implicitly + //also limits output utilization). + // + //For relatively high pin utilizations (e.g. > 70%) this has little-to-no + //impact on the number of clusters required. 
As a result we set a default + //input pin utilization target which is high, but less than 100%. + if (logic_block_type != nullptr) { + constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8; + constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0; + + t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL); + + set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util); + } else { + VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n"); + } + + } else { + //Process user specified overrides + + bool default_set = false; + std::set seen_block_types; + + for (const auto& spec : specs) { + t_ext_pin_util target_ext_pin_util(1., 1.); + + auto block_values = vtr::split(spec, ":"); + std::string block_type; + std::string values; + if (block_values.size() == 2) { + block_type = block_values[0]; + values = block_values[1]; + } else if (block_values.size() == 1) { + values = block_values[0]; + } else { + std::stringstream msg; + msg << "In valid block pin utilization specification '" << spec << "' (expected at most one ':' between block name and values"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + auto elements = vtr::split(values, ","); + if (elements.size() == 1) { + target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); + } else if (elements.size() == 2) { + target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); + target_ext_pin_util.output_pin_util = vtr::atof(elements[1]); + } else { + std::stringstream msg; + msg << "Invalid conversion from '" << spec << "' to external pin util (expected either a single float value, or two float values separted by a comma)"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + if (target_ext_pin_util.input_pin_util < 0. || target_ext_pin_util.input_pin_util > 1.) 
{ + std::stringstream msg; + msg << "Out of range target input pin utilization '" << target_ext_pin_util.input_pin_util << "' (expected within range [0.0, 1.0])"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + if (target_ext_pin_util.output_pin_util < 0. || target_ext_pin_util.output_pin_util > 1.) { + std::stringstream msg; + msg << "Out of range target output pin utilization '" << target_ext_pin_util.output_pin_util << "' (expected within range [0.0, 1.0])"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + if (block_type.empty()) { + //Default value + if (default_set) { + std::stringstream msg; + msg << "Only one default pin utilization should be specified"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + set_default_pin_util(target_ext_pin_util); + default_set = true; + } else { + if (seen_block_types.count(block_type)) { + std::stringstream msg; + msg << "Only one pin utilization should be specified for block type '" << block_type << "'"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + set_block_pin_util(block_type, target_ext_pin_util); + seen_block_types.insert(block_type); + } + } + } +} + +t_ext_pin_util_targets& t_ext_pin_util_targets::operator=(t_ext_pin_util_targets&& other) noexcept { + if (this != &other) { + defaults_ = std::move(other.defaults_); + overrides_ = std::move(other.overrides_); + } + return *this; +} + +t_ext_pin_util t_ext_pin_util_targets::get_pin_util(const std::string& block_type_name) const { auto itr = overrides_.find(block_type_name); if (itr != overrides_.end()) { return itr->second; @@ -15,7 +131,30 @@ t_ext_pin_util t_ext_pin_util_targets::get_pin_util(std::string block_type_name) return defaults_; } -void t_ext_pin_util_targets::set_block_pin_util(std::string block_type_name, t_ext_pin_util target) { +std::string t_ext_pin_util_targets::to_string() const { + std::stringstream ss; + + auto& device_ctx = g_vpr_ctx.device(); + + for (unsigned int itype = 0; itype < 
device_ctx.physical_tile_types.size(); ++itype) { + if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; + + auto blk_name = device_ctx.physical_tile_types[itype].name; + + ss << blk_name << ":"; + + auto pin_util = get_pin_util(blk_name); + ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util; + + if (itype != device_ctx.physical_tile_types.size() - 1) { + ss << " "; + } + } + + return ss.str(); +} + +void t_ext_pin_util_targets::set_block_pin_util(const std::string& block_type_name, t_ext_pin_util target) { overrides_[block_type_name] = target; } @@ -26,15 +165,90 @@ void t_ext_pin_util_targets::set_default_pin_util(t_ext_pin_util default_target) t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(int threshold) : default_(threshold) {} +t_pack_high_fanout_thresholds::t_pack_high_fanout_thresholds(const std::vector& specs) + : t_pack_high_fanout_thresholds(128) { + if (specs.size() == 1 && specs[0] == "auto") { + //No user-specified high fanout thresholds, infer them automatically. + // + //We set the high fanout threshold a based on the block type, with + //the logic block having a lower threshold than other blocks. + //(Since logic blocks are the ones which tend to be too densely + //clustered.) 
+ + auto& device_ctx = g_vpr_ctx.device(); + auto& grid = device_ctx.grid; + t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); + + if (logic_block_type != nullptr) { + constexpr float LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32; + + set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); + } else { + VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n"); + } + } else { + //Process user specified overrides + + bool default_set = false; + std::set seen_block_types; + + for (const auto& spec : specs) { + auto block_values = vtr::split(spec, ":"); + std::string block_type; + std::string value; + if (block_values.size() == 1) { + value = block_values[0]; + } else if (block_values.size() == 2) { + block_type = block_values[0]; + value = block_values[1]; + } else { + std::stringstream msg; + msg << "In valid block high fanout threshold specification '" << spec << "' (expected at most one ':' between block name and value"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + int threshold = vtr::atoi(value); + + if (block_type.empty()) { + //Default value + if (default_set) { + std::stringstream msg; + msg << "Only one default high fanout threshold should be specified"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + set_default(threshold); + default_set = true; + } else { + if (seen_block_types.count(block_type)) { + std::stringstream msg; + msg << "Only one high fanout threshold should be specified for block type '" << block_type << "'"; + VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); + } + + set(block_type, threshold); + seen_block_types.insert(block_type); + } + } + } +} + +t_pack_high_fanout_thresholds& t_pack_high_fanout_thresholds::operator=(t_pack_high_fanout_thresholds&& other) noexcept { + if (this != &other) { + default_ = std::move(other.default_); + overrides_ = std::move(other.overrides_); + } + return 
*this; +} + void t_pack_high_fanout_thresholds::set_default(int threshold) { default_ = threshold; } -void t_pack_high_fanout_thresholds::set(std::string block_type_name, int threshold) { +void t_pack_high_fanout_thresholds::set(const std::string& block_type_name, int threshold) { overrides_[block_type_name] = threshold; } -int t_pack_high_fanout_thresholds::get_threshold(std::string block_type_name) const { +int t_pack_high_fanout_thresholds::get_threshold(const std::string& block_type_name) const { auto itr = overrides_.find(block_type_name); if (itr != overrides_.end()) { return itr->second; @@ -42,6 +256,29 @@ int t_pack_high_fanout_thresholds::get_threshold(std::string block_type_name) co return default_; } +std::string t_pack_high_fanout_thresholds::to_string() const { + std::stringstream ss; + + auto& device_ctx = g_vpr_ctx.device(); + + for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { + if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; + + auto blk_name = device_ctx.physical_tile_types[itype].name; + + ss << blk_name << ":"; + + auto threshold = get_threshold(blk_name); + ss << threshold; + + if (itype != device_ctx.physical_tile_types.size() - 1) { + ss << " "; + } + } + + return ss.str(); +} + /* * t_pb structure function definitions */ diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 438b81086cc..d2f86c0af47 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -194,16 +194,21 @@ class t_ext_pin_util_targets { public: t_ext_pin_util_targets() = default; t_ext_pin_util_targets(float default_in_util, float default_out_util); + t_ext_pin_util_targets(const std::vector& specs); + t_ext_pin_util_targets& operator=(t_ext_pin_util_targets&& other) noexcept; ///@brief Returns the input pin util of the specified block (or default if unspecified) - t_ext_pin_util get_pin_util(std::string block_type_name) const; + t_ext_pin_util get_pin_util(const std::string& 
block_type_name) const; + + ///@brief Returns a string describing input/output pin utilization targets + std::string to_string() const; public: /** * @brief Sets the pin util for the specified block type * @return true if non-default was previously set */ - void set_block_pin_util(std::string block_type_name, t_ext_pin_util target); + void set_block_pin_util(const std::string& block_type_name, t_ext_pin_util target); /** * @brief Sets the default pin util @@ -219,16 +224,22 @@ class t_ext_pin_util_targets { class t_pack_high_fanout_thresholds { public: t_pack_high_fanout_thresholds() = default; - t_pack_high_fanout_thresholds(int threshold); + explicit t_pack_high_fanout_thresholds(int threshold); + explicit t_pack_high_fanout_thresholds(const std::vector& specs); + t_pack_high_fanout_thresholds& operator=(t_pack_high_fanout_thresholds&& other) noexcept; + + ///@brief Returns the high fanout threshold of the specified block + int get_threshold(const std::string& block_type_name) const; - int get_threshold(std::string block_type_name) const; + ///@brief Returns a string describing high fanout thresholds for different block types + std::string to_string() const; public: /** * @brief Sets the pin util for the specified block type * @return true if non-default was previously set */ - void set(std::string block_type_name, int threshold); + void set(const std::string& block_type_name, int threshold); /** * @brief Sets the default pin util @@ -723,6 +734,11 @@ struct t_pl_loc { , y(yloc) , sub_tile(sub_tile_loc) , layer(layer_num) {} + t_pl_loc(const t_physical_tile_loc& phy_loc, int sub_tile_loc) + : x(phy_loc.x) + , y(phy_loc.y) + , sub_tile(sub_tile_loc) + , layer(phy_loc.layer_num) {} int x = OPEN; int y = OPEN; diff --git a/vpr/src/noc/noc_router.cpp b/vpr/src/noc/noc_router.cpp index 5ea2c05c1b9..b0aa166aac5 100644 --- a/vpr/src/noc/noc_router.cpp +++ b/vpr/src/noc/noc_router.cpp @@ -27,6 +27,15 @@ int NocRouter::get_router_layer_position(void) const { return 
router_layer_position; } +t_physical_tile_loc NocRouter::get_router_physical_location(void) const { + const int x = get_router_grid_position_x(); + const int y = get_router_grid_position_y(); + const int layer = get_router_layer_position(); + t_physical_tile_loc phy_loc{x, y, layer}; + + return phy_loc; +} + ClusterBlockId NocRouter::get_router_block_ref(void) const { return router_block_ref; } diff --git a/vpr/src/noc/noc_router.h b/vpr/src/noc/noc_router.h index 8cbda26e0bc..0feb397bdd2 100644 --- a/vpr/src/noc/noc_router.h +++ b/vpr/src/noc/noc_router.h @@ -86,6 +86,12 @@ class NocRouter { */ int get_router_layer_position(void) const; + /** + * @brief Gets the physical location where the physical router is located + * @return t_physical_tile_loc that contains x-y coordinates and the layer number + */ + t_physical_tile_loc get_router_physical_location(void) const; + /** * @brief Gets the unique id of the router block that is current placed on the physical router * @return A ClusterBlockId that identifies a router block in the clustered netlist diff --git a/vpr/src/pack/cluster.cpp b/vpr/src/pack/cluster.cpp index 4f1382a990d..b19aa4e7f99 100644 --- a/vpr/src/pack/cluster.cpp +++ b/vpr/src/pack/cluster.cpp @@ -88,14 +88,12 @@ std::map do_clustering(const t_packer_opts& pa const t_analysis_opts& analysis_opts, const t_arch* arch, t_pack_molecule* molecule_head, - int num_models, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, - const t_ext_pin_util_targets& ext_pin_util_targets, - const t_pack_high_fanout_thresholds& high_fanout_thresholds, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, t_clustering_data& clustering_data) { @@ -123,9 +121,7 @@ std::map do_clustering(const t_packer_opts& pa t_cluster_progress_stats cluster_stats; //int num_molecules, 
num_molecules_processed, mols_since_last_print, blocks_since_last_analysis, - int num_blocks_hill_added, max_pb_depth, - seedindex, savedseedindex /* index of next most timing critical block */, - detailed_routing_stage; + int num_blocks_hill_added, max_pb_depth, detailed_routing_stage; const int verbosity = packer_opts.pack_verbosity; @@ -137,7 +133,6 @@ std::map do_clustering(const t_packer_opts& pa std::map num_used_type_instances; - bool is_cluster_legal; enum e_block_pack_status block_pack_status; t_cluster_placement_stats* cur_cluster_placement_stats_ptr; @@ -178,8 +173,6 @@ std::map do_clustering(const t_packer_opts& pa helper_ctx.max_cluster_size = 0; max_pb_depth = 0; - seedindex = 0; - const t_molecule_stats max_molecule_stats = calc_max_molecules_stats(molecule_head); mark_all_molecules_valid(molecule_head); @@ -224,9 +217,12 @@ std::map do_clustering(const t_packer_opts& pa clustering_delay_calc, timing_info, atom_criticality); } + // Assign gain scores to atoms and sort them based on the scores. 
auto seed_atoms = initialize_seed_atoms(packer_opts.cluster_seed_type, max_molecule_stats, atom_criticality); - istart = get_highest_gain_seed_molecule(&seedindex, seed_atoms); + /* index of next most timing critical block */ + int seed_index = 0; + istart = get_highest_gain_seed_molecule(seed_index, seed_atoms); print_pack_status_header(); @@ -235,9 +231,10 @@ std::map do_clustering(const t_packer_opts& pa *****************************************************************/ while (istart != nullptr) { - is_cluster_legal = false; - savedseedindex = seedindex; + bool is_cluster_legal = false; + int saved_seed_index = seed_index; for (detailed_routing_stage = (int)E_DETAILED_ROUTE_AT_END_ONLY; !is_cluster_legal && detailed_routing_stage != (int)E_DETAILED_ROUTE_INVALID; detailed_routing_stage++) { + // Use the total number created clusters so far as the ID for the new cluster ClusterBlockId clb_index(helper_ctx.total_clb_num); VTR_LOGV(verbosity > 2, "Complex block %d:\n", helper_ctx.total_clb_num); @@ -251,7 +248,7 @@ std::map do_clustering(const t_packer_opts& pa clb_index, istart, num_used_type_instances, packer_opts.target_device_utilization, - num_models, helper_ctx.max_cluster_size, + helper_ctx.num_models, helper_ctx.max_cluster_size, arch, packer_opts.device_layout, lb_type_rr_graphs, &router_data, detailed_routing_stage, &cluster_ctx.clb_nlist, @@ -281,11 +278,11 @@ std::map do_clustering(const t_packer_opts& pa //Progress dot for seed-block fflush(stdout); - t_ext_pin_util target_ext_pin_util = ext_pin_util_targets.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); - int high_fanout_threshold = high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); + t_ext_pin_util target_ext_pin_util = helper_ctx.target_external_pin_util.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); + int high_fanout_threshold = 
helper_ctx.high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); update_cluster_stats(istart, clb_index, - is_clock, //Set of clock nets - is_clock, //Set of global nets (currently all clocks) + is_clock, //Set of clock nets + is_global, //Set of global nets (currently all clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, packer_opts.connection_driven, @@ -345,7 +342,7 @@ std::map do_clustering(const t_packer_opts& pa helper_ctx.primitives_list, cluster_stats, helper_ctx.total_clb_num, - num_models, + helper_ctx.num_models, helper_ctx.max_cluster_size, clb_index, detailed_routing_stage, @@ -354,6 +351,7 @@ std::map do_clustering(const t_packer_opts& pa allow_unrelated_clustering, high_fanout_threshold, is_clock, + is_global, timing_info, router_data, target_ext_pin_util, @@ -368,10 +366,10 @@ std::map do_clustering(const t_packer_opts& pa is_cluster_legal = check_cluster_legality(verbosity, detailed_routing_stage, router_data); if (is_cluster_legal) { - istart = save_cluster_routing_and_pick_new_seed(packer_opts, helper_ctx.total_clb_num, seed_atoms, num_blocks_hill_added, clustering_data.intra_lb_routing, seedindex, cluster_stats, router_data); + istart = save_cluster_routing_and_pick_new_seed(packer_opts, helper_ctx.total_clb_num, seed_atoms, num_blocks_hill_added, clustering_data.intra_lb_routing, seed_index, cluster_stats, router_data); store_cluster_info_and_free(packer_opts, clb_index, logic_block_type, le_pb_type, le_count, clb_inter_blk_nets); } else { - free_data_and_requeue_used_mols_if_illegal(clb_index, savedseedindex, num_used_type_instances, helper_ctx.total_clb_num, seedindex); + free_data_and_requeue_used_mols_if_illegal(clb_index, saved_seed_index, num_used_type_instances, helper_ctx.total_clb_num, seed_index); } free_router_data(router_data); router_data = nullptr; diff --git a/vpr/src/pack/cluster.h b/vpr/src/pack/cluster.h index a9f2c1df689..e08e58dac50 100644 
--- a/vpr/src/pack/cluster.h +++ b/vpr/src/pack/cluster.h @@ -15,14 +15,12 @@ std::map do_clustering(const t_packer_opts& pa const t_analysis_opts& analysis_opts, const t_arch* arch, t_pack_molecule* molecule_head, - int num_models, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::unordered_map& expected_lowest_cost_pb_gnode, bool allow_unrelated_clustering, bool balance_block_type_utilization, std::vector* lb_type_rr_graphs, - const t_ext_pin_util_targets& ext_pin_util_targets, - const t_pack_high_fanout_thresholds& high_fanout_thresholds, AttractionInfo& attraction_groups, bool& floorplan_regions_overfull, t_clustering_data& clustering_data); diff --git a/vpr/src/pack/cluster_util.cpp b/vpr/src/pack/cluster_util.cpp index 0e12305dc70..d04e08bd74f 100644 --- a/vpr/src/pack/cluster_util.cpp +++ b/vpr/src/pack/cluster_util.cpp @@ -1497,6 +1497,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::shared_ptr& timing_info, t_lb_router_data* router_data, t_ext_pin_util target_ext_pin_util, @@ -1591,8 +1592,8 @@ void try_fill_cluster(const t_packer_opts& packer_opts, attraction_groups); update_cluster_stats(next_molecule, clb_index, - is_clock, //Set of all clocks - is_clock, //Set of all global signals (currently clocks) + is_clock, //Set of all clocks + is_global, //Set of all global signals (currently clocks) packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, packer_opts.connection_driven, high_fanout_threshold, @@ -1639,7 +1640,7 @@ t_pack_molecule* save_cluster_routing_and_pick_new_seed(const t_packer_opts& pac router_data->saved_lb_nets = nullptr; //Pick a new seed - next_seed = get_highest_gain_seed_molecule(&seedindex, seed_atoms); + next_seed = get_highest_gain_seed_molecule(seedindex, seed_atoms); if (packer_opts.timing_driven) 
{ if (num_blocks_hill_added > 0) { @@ -2098,9 +2099,7 @@ void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, //Try packing into each candidate type bool success = false; - for (size_t i = 0; i < candidate_types.size(); i++) { - auto type = candidate_types[i]; - + for (auto type : candidate_types) { t_pb* pb = new t_pb; pb->pb_graph_node = type->pb_graph_head; alloc_and_load_pb_stats(pb, feasible_block_array_size); @@ -2667,13 +2666,10 @@ t_molecule_stats calc_max_molecules_stats(const t_pack_molecule* molecule_head) std::vector initialize_seed_atoms(const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, const vtr::vector& atom_criticality) { - std::vector seed_atoms; + auto& atom_ctx = g_vpr_ctx.atom(); //Put all atoms in seed list - auto& atom_ctx = g_vpr_ctx.atom(); - for (auto blk : atom_ctx.nlist.blocks()) { - seed_atoms.emplace_back(blk); - } + std::vector seed_atoms(atom_ctx.nlist.blocks().begin(), atom_ctx.nlist.blocks().end()); //Initially all gains are zero vtr::vector atom_gains(atom_ctx.nlist.blocks().size(), 0.); @@ -2823,15 +2819,18 @@ std::vector initialize_seed_atoms(const e_cluster_seed seed_type, return seed_atoms; } -t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::vector seed_atoms) { +t_pack_molecule* get_highest_gain_seed_molecule(int& seed_index, const std::vector& seed_atoms) { auto& atom_ctx = g_vpr_ctx.atom(); - while (*seedindex < static_cast(seed_atoms.size())) { - AtomBlockId blk_id = seed_atoms[(*seedindex)++]; + while (seed_index < static_cast(seed_atoms.size())) { + AtomBlockId blk_id = seed_atoms[seed_index++]; + // Check if the atom has already been assigned to a cluster if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { t_pack_molecule* best = nullptr; + // Iterate over all the molecules associated with the selected atom + // and select the one with the highest gain auto rng = atom_ctx.atom_molecules.equal_range(blk_id); for (const auto& kv : 
vtr::make_range(rng.first, rng.second)) { t_pack_molecule* molecule = kv.second; @@ -3343,11 +3342,15 @@ std::map> identify_primiti auto& device_ctx = g_vpr_ctx.device(); std::set unique_models; + // Find all logic models used in the netlist for (auto blk : atom_nlist.blocks()) { auto model = atom_nlist.block_model(blk); unique_models.insert(model); } + /* For each technology-mapped logic model, find logical block types + * that can accommodate that logic model + */ for (auto model : unique_models) { model_candidates[model] = {}; diff --git a/vpr/src/pack/cluster_util.h b/vpr/src/pack/cluster_util.h index 9a91e47ea7a..6c05272e1e7 100644 --- a/vpr/src/pack/cluster_util.h +++ b/vpr/src/pack/cluster_util.h @@ -232,6 +232,7 @@ void try_fill_cluster(const t_packer_opts& packer_opts, bool allow_unrelated_clustering, const int& high_fanout_threshold, const std::unordered_set& is_clock, + const std::unordered_set& is_global, const std::shared_ptr& timing_info, t_lb_router_data* router_data, t_ext_pin_util target_ext_pin_util, @@ -407,7 +408,7 @@ std::vector initialize_seed_atoms(const e_cluster_seed seed_type, const t_molecule_stats& max_molecule_stats, const vtr::vector& atom_criticality); -t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::vector seed_atoms); +t_pack_molecule* get_highest_gain_seed_molecule(int& seed_index, const std::vector& seed_atoms); float get_molecule_gain(t_pack_molecule* molecule, std::map& blk_gain, AttractGroupId cluster_attraction_group_id, AttractionInfo& attraction_groups, int num_molecule_failures); diff --git a/vpr/src/pack/pack.cpp b/vpr/src/pack/pack.cpp index 252dc37a98d..9fd61587cde 100644 --- a/vpr/src/pack/pack.cpp +++ b/vpr/src/pack/pack.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include "vtr_assert.h" @@ -29,11 +29,13 @@ static bool try_size_device_grid(const t_arch& arch, const std::map& num_type_instances, float target_device_utilization, std::string device_layout_name); 
-static t_ext_pin_util_targets parse_target_external_pin_util(std::vector specs); -static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils); - -static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector specs); -static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds); +/** + * @brief Counts the total number of logic models that the architecture can implement. + * + * @param user_models A linked list of logic models. + * @return int The total number of models in the linked list + */ +static int count_models(const t_model* user_models); bool try_pack(t_packer_opts* packer_opts, const t_analysis_opts* analysis_opts, @@ -43,31 +45,21 @@ bool try_pack(t_packer_opts* packer_opts, float interc_delay, std::vector* lb_type_rr_graphs) { auto& helper_ctx = g_vpr_ctx.mutable_cl_helper(); + auto& atom_ctx = g_vpr_ctx.atom(); + auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); - std::unordered_set is_clock; + std::unordered_set is_clock, is_global; std::unordered_map expected_lowest_cost_pb_gnode; //The molecules associated with each atom block - const t_model* cur_model; t_clustering_data clustering_data; std::vector list_of_packing_patterns; VTR_LOG("Begin packing '%s'.\n", packer_opts->circuit_file_name.c_str()); /* determine number of models in the architecture */ - helper_ctx.num_models = 0; - cur_model = user_models; - while (cur_model) { - helper_ctx.num_models++; - cur_model = cur_model->next; - } - cur_model = library_models; - while (cur_model) { - helper_ctx.num_models++; - cur_model = cur_model->next; - } + helper_ctx.num_models = count_models(user_models); + helper_ctx.num_models += count_models(library_models); is_clock = alloc_and_load_is_clock(packer_opts->global_clocks); - - auto& atom_ctx = g_vpr_ctx.atom(); - auto& atom_mutable_ctx = g_vpr_ctx.mutable_atom(); + is_global.insert(is_clock.begin(), is_clock.end()); size_t num_p_inputs = 0; size_t 
num_p_outputs = 0; @@ -113,11 +105,11 @@ bool try_pack(t_packer_opts* packer_opts, VTR_LOG("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay); } - helper_ctx.target_external_pin_util = parse_target_external_pin_util(packer_opts->target_external_pin_util); - t_pack_high_fanout_thresholds high_fanout_thresholds = parse_high_fanout_thresholds(packer_opts->high_fanout_threshold); + helper_ctx.target_external_pin_util = t_ext_pin_util_targets(packer_opts->target_external_pin_util); + helper_ctx.high_fanout_thresholds = t_pack_high_fanout_thresholds(packer_opts->high_fanout_threshold); - VTR_LOG("Packing with pin utilization targets: %s\n", target_external_pin_util_to_string(helper_ctx.target_external_pin_util).c_str()); - VTR_LOG("Packing with high fanout thresholds: %s\n", high_fanout_thresholds_to_string(high_fanout_thresholds).c_str()); + VTR_LOG("Packing with pin utilization targets: %s\n", helper_ctx.target_external_pin_util.to_string().c_str()); + VTR_LOG("Packing with high fanout thresholds: %s\n", helper_ctx.high_fanout_thresholds.to_string().c_str()); bool allow_unrelated_clustering = false; if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::ON) { @@ -143,14 +135,13 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.num_used_type_instances = do_clustering( *packer_opts, *analysis_opts, - arch, atom_mutable_ctx.list_of_pack_molecules.get(), helper_ctx.num_models, + arch, atom_mutable_ctx.list_of_pack_molecules.get(), is_clock, + is_global, expected_lowest_cost_pb_gnode, allow_unrelated_clustering, balance_block_type_util, lb_type_rr_graphs, - helper_ctx.target_external_pin_util, - high_fanout_thresholds, attraction_groups, floorplan_regions_overfull, clustering_data); @@ -167,7 +158,7 @@ bool try_pack(t_packer_opts* packer_opts, if (fits_on_device && !floorplan_regions_overfull) { break; //Done } else if (pack_iteration == 1 && !floorplan_not_fitting) { - //1st pack attempt was unsucessful (i.e. 
not dense enough) and we have control of unrelated clustering + //1st pack attempt was unsuccessful (i.e. not dense enough) and we have control of unrelated clustering // //Turn it on to increase packing density if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::AUTO) { @@ -214,9 +205,7 @@ bool try_pack(t_packer_opts* packer_opts, helper_ctx.target_external_pin_util.set_block_pin_util("clb", pin_util); } - } else { - //Unable to pack densely enough: Give Up - + } else { //Unable to pack densely enough: Give Up if (floorplan_regions_overfull) { VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find pack clusters densely enough to fit in the designated floorplan regions.\n" @@ -242,7 +231,7 @@ bool try_pack(t_packer_opts* packer_opts, resource_avail += std::string(iter->first->name) + ": " + std::to_string(num_instances); } - VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisifies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str()); + VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisfies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str()); } //Reset clustering for re-packing @@ -387,229 +376,18 @@ static bool try_size_device_grid(const t_arch& arch, const std::map specs) { - t_ext_pin_util_targets targets(1., 1.); - - if (specs.size() == 1 && specs[0] == "auto") { - //No user-specified pin utilizations, infer them automatically. - // - //We set a pin utilization target based on the block type, with - //the logic block having a lower utilization target and other blocks - //(e.g. hard blocks) having no limit. 
- - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); - - //Allowing 100% pin utilization of the logic block type can harm - //routability, since it may allow a few (typically outlier) clusters to - //use a very large number of pins -- causing routability issues. These - //clusters can cause failed routings where only a handful of routing - //resource nodes remain overused (and do not resolve) These can be - //avoided by putting a (soft) limit on the number of input pins which - //can be used, effectively clipping off the most egregeous outliers. - // - //Experiments show that limiting input utilization produces better quality - //than limiting output utilization (limiting input utilization implicitly - //also limits output utilization). - // - //For relatively high pin utilizations (e.g. > 70%) this has little-to-no - //impact on the number of clusters required. As a result we set a default - //input pin utilization target which is high, but less than 100%. 
- if (logic_block_type != nullptr) { - constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8; - constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0; - - t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL); - - targets.set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util); - } else { - VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n"); - } - - } else { - //Process user specified overrides - - bool default_set = false; - std::set seen_block_types; - - for (auto spec : specs) { - t_ext_pin_util target_ext_pin_util(1., 1.); - - auto block_values = vtr::split(spec, ":"); - std::string block_type; - std::string values; - if (block_values.size() == 2) { - block_type = block_values[0]; - values = block_values[1]; - } else if (block_values.size() == 1) { - values = block_values[0]; - } else { - std::stringstream msg; - msg << "In valid block pin utilization specification '" << spec << "' (expected at most one ':' between block name and values"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - auto elements = vtr::split(values, ","); - if (elements.size() == 1) { - target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); - } else if (elements.size() == 2) { - target_ext_pin_util.input_pin_util = vtr::atof(elements[0]); - target_ext_pin_util.output_pin_util = vtr::atof(elements[1]); - } else { - std::stringstream msg; - msg << "Invalid conversion from '" << spec << "' to external pin util (expected either a single float value, or two float values separted by a comma)"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - if (target_ext_pin_util.input_pin_util < 0. || target_ext_pin_util.input_pin_util > 1.) 
{ - std::stringstream msg; - msg << "Out of range target input pin utilization '" << target_ext_pin_util.input_pin_util << "' (expected within range [0.0, 1.0])"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - if (target_ext_pin_util.output_pin_util < 0. || target_ext_pin_util.output_pin_util > 1.) { - std::stringstream msg; - msg << "Out of range target output pin utilization '" << target_ext_pin_util.output_pin_util << "' (expected within range [0.0, 1.0])"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - if (block_type.empty()) { - //Default value - if (default_set) { - std::stringstream msg; - msg << "Only one default pin utilization should be specified"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - targets.set_default_pin_util(target_ext_pin_util); - default_set = true; - } else { - if (seen_block_types.count(block_type)) { - std::stringstream msg; - msg << "Only one pin utilization should be specified for block type '" << block_type << "'"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - targets.set_block_pin_util(block_type, target_ext_pin_util); - seen_block_types.insert(block_type); - } - } +static int count_models(const t_model* user_models) { + if (user_models == nullptr) { + return 0; } - return targets; -} - -static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils) { - std::stringstream ss; - - auto& device_ctx = g_vpr_ctx.device(); - - for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { - if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; - - auto blk_name = device_ctx.physical_tile_types[itype].name; - - ss << blk_name << ":"; + const t_model* cur_model = user_models; + int n_models = 0; - auto pin_util = ext_pin_utils.get_pin_util(blk_name); - ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util; - - if (itype != device_ctx.physical_tile_types.size() - 1) { - ss << " "; - } - } - - return 
ss.str(); -} - -static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector specs) { - t_pack_high_fanout_thresholds high_fanout_thresholds(128); - - if (specs.size() == 1 && specs[0] == "auto") { - //No user-specified high fanout thresholds, infer them automatically. - // - //We set the high fanout threshold a based on the block type, with - //the logic block having a lower threshold than other blocks. - //(Since logic blocks are the ones which tend to be too densely - //clustered.) - - auto& device_ctx = g_vpr_ctx.device(); - auto& grid = device_ctx.grid; - t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid); - - if (logic_block_type != nullptr) { - constexpr float LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32; - - high_fanout_thresholds.set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD); - } else { - VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n"); - } - } else { - //Process user specified overrides - - bool default_set = false; - std::set seen_block_types; - - for (auto spec : specs) { - auto block_values = vtr::split(spec, ":"); - std::string block_type; - std::string value; - if (block_values.size() == 1) { - value = block_values[0]; - } else if (block_values.size() == 2) { - block_type = block_values[0]; - value = block_values[1]; - } else { - std::stringstream msg; - msg << "In valid block high fanout threshold specification '" << spec << "' (expected at most one ':' between block name and value"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - int threshold = vtr::atoi(value); - - if (block_type.empty()) { - //Default value - if (default_set) { - std::stringstream msg; - msg << "Only one default high fanout threshold should be specified"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - high_fanout_thresholds.set_default(threshold); - default_set = true; - } else { - if 
(seen_block_types.count(block_type)) { - std::stringstream msg; - msg << "Only one high fanout threshold should be specified for block type '" << block_type << "'"; - VPR_FATAL_ERROR(VPR_ERROR_PACK, msg.str().c_str()); - } - - high_fanout_thresholds.set(block_type, threshold); - seen_block_types.insert(block_type); - } - } - } - - return high_fanout_thresholds; -} - -static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds) { - std::stringstream ss; - - auto& device_ctx = g_vpr_ctx.device(); - - for (unsigned int itype = 0; itype < device_ctx.physical_tile_types.size(); ++itype) { - if (is_empty_type(&device_ctx.physical_tile_types[itype])) continue; - - auto blk_name = device_ctx.physical_tile_types[itype].name; - - ss << blk_name << ":"; - - auto threshold = hf_thresholds.get_threshold(blk_name); - ss << threshold; - - if (itype != device_ctx.physical_tile_types.size() - 1) { - ss << " "; - } + while (cur_model) { + n_models++; + cur_model = cur_model->next; } - return ss.str(); -} + return n_models; +} \ No newline at end of file diff --git a/vpr/src/place/initial_noc_placement.cpp b/vpr/src/place/initial_noc_placement.cpp new file mode 100644 index 00000000000..55d3c6296d1 --- /dev/null +++ b/vpr/src/place/initial_noc_placement.cpp @@ -0,0 +1,274 @@ + +#include "initial_noc_placment.h" +#include "initial_placement.h" +#include "noc_place_utils.h" +#include "noc_place_checkpoint.h" + +/** + * @brief Evaluates whether a NoC router swap should be accepted or not. + * If delta cost is non-positive, the move is always accepted. If the cost + * has increased, the probability of accepting the move is prob. + * + * @param delta_cost Specifies how much the total cost would change if + * the proposed swap is accepted. + * @param prob The probability by which a router swap that increases + * the cost is accepted. The passed value should be in range [0, 1]. + * + * @return true if the proposed swap is accepted, false if not. 
+ */ +static bool accept_noc_swap(double delta_cost, double prob); + +/** + * @brief Places a constrained NoC router within its partition region. + * + * @param router_blk_id NoC router cluster block ID + */ +static void place_constrained_noc_router(ClusterBlockId router_blk_id); + +/** + * @brief Randomly places unconstrained NoC routers. + * + * @param unfixed_routers Contains the cluster block ID for all unconstrained + * NoC routers. + * @param seed Used for shuffling NoC routers. + */ +static void place_noc_routers_randomly(std::vector& unfixed_routers, int seed); + +/** + * @brief Runs a simulated annealing optimizer for NoC routers. + * + * @param noc_opts Contains weighting factors for NoC cost terms. + */ +static void noc_routers_anneal(const t_noc_opts& noc_opts); + +static bool accept_noc_swap(double delta_cost, double prob) { + if (delta_cost <= 0.0) { + return true; + } + + if (prob == 0.0) { + return false; + } + + float random_num = vtr::frand(); + if (random_num < prob) { + return true; + } else { + return false; + } +} + +static void place_constrained_noc_router(ClusterBlockId router_blk_id) { + auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& floorplanning_ctx = g_vpr_ctx.floorplanning(); + + auto block_type = cluster_ctx.clb_nlist.block_type(router_blk_id); + const PartitionRegion& pr = floorplanning_ctx.cluster_constraints[router_blk_id]; + + // Create a macro with a single member + t_pl_macro_member macro_member; + macro_member.blk_index = router_blk_id; + macro_member.offset = t_pl_offset(0, 0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool macro_placed = false; + for (int i_try = 0; i_try < MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY && !macro_placed; i_try++) { + macro_placed = try_place_macro_randomly(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + macro_placed = try_place_macro_exhaustively(pl_macro, pr, block_type, FREE); + } + + if (!macro_placed) { + 
VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster within its constrained region"); + } +} + +static void place_noc_routers_randomly(std::vector& unfixed_routers, int seed) { + auto& place_ctx = g_vpr_ctx.placement(); + auto& noc_ctx = g_vpr_ctx.noc(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + auto& device_ctx = g_vpr_ctx.device(); + + /* + * Unconstrained NoC routers are placed randomly, then NoC cost is optimized using simulated annealing. + * For random placement, physical NoC routers are shuffled, the logical NoC routers are assigned + * to shuffled physical routers. This is equivalent to placing each logical NoC router at a + * randomly selected physical router. The only difference is that an occupied physical NoC router + * might be selected multiple times. Shuffling makes sure that each physical NoC router is evaluated + * only once. + */ + + // Make a copy of NoC physical routers because we want to change its order + vtr::vector noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Shuffle physical NoC routers + vtr::RandState rand_state = seed; + vtr::shuffle(noc_phy_routers.begin(), noc_phy_routers.end(), rand_state); + + // Get the logical block type for router + const auto router_block_type = cluster_ctx.clb_nlist.block_type(noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist()[0]); + + // Get the compressed grid for NoC + const auto& compressed_noc_grid = place_ctx.compressed_block_grids[router_block_type->index]; + + // Iterate over shuffled physical routers to place logical routers + // Since physical routers are shuffled, router placement would be random + for (const auto& phy_router : noc_phy_routers) { + t_physical_tile_loc router_phy_loc = phy_router.get_router_physical_location(); + + // Find a compatible sub-tile + const auto& phy_type = device_ctx.grid.get_physical_type(router_phy_loc); + const auto& compatible_sub_tiles = compressed_noc_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + int 
sub_tile = compatible_sub_tiles[vtr::irand((int)compatible_sub_tiles.size() - 1)]; + + t_pl_loc loc(router_phy_loc, sub_tile); + + if (place_ctx.grid_blocks.is_sub_tile_empty(router_phy_loc, sub_tile)) { + // Pick one of the unplaced routers + auto logical_router_bid = unfixed_routers.back(); + unfixed_routers.pop_back(); + + // Create a macro with a single member + t_pl_macro_member macro_member; + macro_member.blk_index = logical_router_bid; + macro_member.offset = t_pl_offset(0, 0, 0, 0); + t_pl_macro pl_macro; + pl_macro.members.push_back(macro_member); + + bool legal = try_place_macro(pl_macro, loc); + if (!legal) { + VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Could not place a router cluster into an empty physical router."); + } + + // When all router clusters are placed, stop iterating over remaining physical routers + if (unfixed_routers.empty()) { + break; + } + } + } // end for of random router placement +} + +static void noc_routers_anneal(const t_noc_opts& noc_opts) { + auto& noc_ctx = g_vpr_ctx.noc(); + + // Only NoC related costs are considered + t_placer_costs costs; + + // Initialize NoC-related costs + costs.noc_aggregate_bandwidth_cost = comp_noc_aggregate_bandwidth_cost(); + costs.noc_latency_cost = comp_noc_latency_cost(noc_opts); + update_noc_normalization_factors(costs); + costs.cost = calculate_noc_cost(costs, noc_opts); + + // Maximum distance in each direction that a router can travel in a move + // It is assumed that NoC routers are organized in a square grid. + // Each router can initially move within the entire grid with a single swap. + const size_t n_physical_routers = noc_ctx.noc_model.get_noc_routers().size(); + const float max_r_lim = ceilf(sqrtf((float)n_physical_routers)); + + // At most, two routers are swapped + t_pl_blocks_to_be_moved blocks_affected(2); + + // Total number of moves grows linearly with the number of logical NoC routers. 
+ // The constant factor was selected experimentally by running the algorithm on + // synthetic benchmarks. NoC-related metrics did not improve after increasing + // the constant factor above 35000. + // Get all the router clusters and figure out how many of them exist + const int num_router_clusters = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist().size(); + const int N_MOVES = num_router_clusters * 35000; + + const double starting_prob = 0.5; + const double prob_step = starting_prob / N_MOVES; + + // The checkpoint stores the placement with the lowest cost. + NoCPlacementCheckpoint checkpoint; + + /* Algorithm overview: + * In each iteration, one logical NoC router and a physical NoC router are selected randomly. + * If the selected physical NoC router is occupied, two logical NoC routers are swapped. + * If not, the selected logical NoC router is moved to the vacant physical router. + * Then, the cost difference of this swap is computed. If the swap reduces the cost, + * it is always accepted. Swaps that increase the cost are accepted with a + * gradually decreasing probability. The placement with the lowest cost is saved + * as a checkpoint. When the annealing is over, if the checkpoint has a better + * cost than the current placement, the checkpoint is restored. + * Range limit and the probability of accepting swaps with positive delta cost + * decrease linearly as more swaps are evaluated. Late in the annealing, + * NoC routers are swapped only with their neighbors as the range limit approaches 1. 
+ */ + + // Generate and evaluate router moves + for (int i_move = 0; i_move < N_MOVES; i_move++) { + e_create_move create_move_outcome = e_create_move::ABORT; + clear_move_blocks(blocks_affected); + // Shrink the range limit over time + float r_lim_decayed = 1.0f + (N_MOVES - i_move) * (max_r_lim / N_MOVES); + create_move_outcome = propose_router_swap(blocks_affected, r_lim_decayed); + + if (create_move_outcome != e_create_move::ABORT) { + apply_move_blocks(blocks_affected); + + double noc_aggregate_bandwidth_delta_c = 0.0; + double noc_latency_delta_c = 0.0; + find_affected_noc_routers_and_update_noc_costs(blocks_affected, noc_aggregate_bandwidth_delta_c, noc_latency_delta_c, noc_opts); + double delta_cost = (noc_opts.noc_placement_weighting) * (noc_latency_delta_c * costs.noc_latency_cost_norm + noc_aggregate_bandwidth_delta_c * costs.noc_aggregate_bandwidth_cost_norm); + + double prob = starting_prob - i_move * prob_step; + bool move_accepted = accept_noc_swap(delta_cost, prob); + + if (move_accepted) { + costs.cost += delta_cost; + commit_move_blocks(blocks_affected); + commit_noc_costs(); + costs.noc_aggregate_bandwidth_cost += noc_aggregate_bandwidth_delta_c; + costs.noc_latency_cost += noc_latency_delta_c; + if (costs.cost < checkpoint.get_cost() || !checkpoint.is_valid()) { + checkpoint.save_checkpoint(costs.cost); + } + } else { // The proposed move is rejected + revert_move_blocks(blocks_affected); + revert_noc_traffic_flow_routes(blocks_affected); + } + } + } + + if (checkpoint.get_cost() < costs.cost) { + checkpoint.restore_checkpoint(noc_opts, costs); + } +} + +void initial_noc_placement(const t_noc_opts& noc_opts, int seed) { + auto& noc_ctx = g_vpr_ctx.noc(); + + // Get all the router clusters + const std::vector& router_blk_ids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + // Holds all the routers that are not fixed into a specific location by constraints + std::vector unfixed_routers; + + // Check for floorplanning 
constraints and place constrained NoC routers + for (auto router_blk_id : router_blk_ids) { + // The block is fixed and was placed in mark_fixed_blocks() + if (is_block_placed((router_blk_id))) { + continue; + } + + if (is_cluster_constrained(router_blk_id)) { + place_constrained_noc_router(router_blk_id); + } else { + unfixed_routers.push_back(router_blk_id); + } + } + + // Place unconstrained NoC routers randomly + place_noc_routers_randomly(unfixed_routers, seed); + + // populate internal data structures to maintain route, bandwidth usage, and latencies + initial_noc_routing(); + + // Run the simulated annealing optimizer for NoC routers + noc_routers_anneal(noc_opts); +} \ No newline at end of file diff --git a/vpr/src/place/initial_noc_placment.h b/vpr/src/place/initial_noc_placment.h new file mode 100644 index 00000000000..4f060a14277 --- /dev/null +++ b/vpr/src/place/initial_noc_placment.h @@ -0,0 +1,15 @@ + +#ifndef VTR_INITIAL_NOC_PLACMENT_H +#define VTR_INITIAL_NOC_PLACMENT_H + +#include "vpr_types.h" + +/** + * @brief Randomly places NoC routers, then runs a quick simulated annealing + * to minimize NoC costs. + * + * @param noc_opts NoC-related options. Used to calculate NoC-related costs. 
+ */ +void initial_noc_placement(const t_noc_opts& noc_opts, int seed); + +#endif //VTR_INITIAL_NOC_PLACMENT_H diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp index c80d5ff245b..7e67f169ef2 100644 --- a/vpr/src/place/initial_placement.cpp +++ b/vpr/src/place/initial_placement.cpp @@ -5,18 +5,19 @@ #include "globals.h" #include "read_place.h" #include "initial_placement.h" +#include "initial_noc_placment.h" #include "vpr_utils.h" #include "place_util.h" #include "place_constraints.h" #include "move_utils.h" #include "region.h" #include "directed_moves_util.h" -#include "noc_place_utils.h" #include "echo_files.h" -#include -#include +#include +#include +#include #ifdef VERBOSE void print_clb_placement(const char* fname); @@ -34,37 +35,27 @@ constexpr int INVALID_X = -1; // The amount of weight that will added to each tile which is outside of the floorplanning constraints #define SORT_WEIGHT_PER_TILES_OUTSIDE_OF_PR 100 -/* The maximum number of tries when trying to place a macro at a * - * random location before trying exhaustive placement - find the first * - * legal position and place it during initial placement. */ -#define MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY 8 - /** - * @brief Set choosen grid locations to EMPTY block id before each placement iteration + * @brief Set chosen grid locations to EMPTY block id before each placement iteration * * @param unplaced_blk_types_index Block types that their grid locations must be cleared. * */ -static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_types_index); +static void clear_block_type_grid_locs(const std::unordered_set& unplaced_blk_types_index); /** - * @brief Places the macro if the head position passed in is legal, and all the resulting - * member positions are legal - * - * @param pl_macro The macro to be placed. - * @param head_pos The location of the macro head member. - * - * @return true if macro was placed, false if not. 
+ * @brief Initializes the grid to empty. It also initializes the location for + * all blocks to unplaced. */ -static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos); +static void clear_all_grid_locs(); /** * @brief Control routine for placing a macro. * First iteration of place_marco performs the following steps to place a macro: * 1) try_centroid_placement : tries to find a location based on the macro's logical connections. - * 2) try_random_placement : if no smart location found in the centroid placement, the function tries + * 2) try_place_macro_randomly : if no smart location found in the centroid placement, the function tries * to place it randomly for the max number of tries. - * 3) try_exhaustive_placement : if neither placement alogrithms work, the function will find a location + * 3) try_place_macro_exhaustively : if neither placement algorithms work, the function will find a location * for the macro by exhaustively searching all available locations. * If first iteration failed, next iteration calls dense placement for specific block types. * @@ -76,7 +67,7 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos); * * @return true if macro was placed, false if not. */ -static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores); +static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores); /* * Assign scores to each block based on macro size and floorplanning constraints. 
@@ -94,7 +85,7 @@ static vtr::vector assign_block_scores(); * * @return y coordinate of the location that macro head should be placed */ -static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, t_pl_macro pl_macro); +static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, const t_pl_macro& pl_macro); /** * @brief Tries to get the first available location of a specific block type that can accomodate macro blocks @@ -105,7 +96,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first * * @return index to a column of blk_types_empty_locs_in_grid that can accomodate pl_macro and location of first available location returned by reference */ -static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid); +static int get_blk_type_first_loc(t_pl_loc& loc, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid); /** * @brief Updates the first available location (lowest y) and number of remaining blocks in the column that dense placement used to place the macro. @@ -116,7 +107,7 @@ static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vecto * @param blk_types_empty_locs_in_grid first location (lowest y) and number of remaining blocks in each column for the blk_id type * */ -static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid); +static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid); /** * @brief Initializes empty locations of the grid with a specific block type into vector for dense initial placement @@ -134,7 +125,7 @@ static std::vector init_blk_types_empty_locations( * @param loc The location at which the head of the macro is placed. 
* @param pad_loc_type Used to check whether an io block needs to be marked as fixed. */ -static inline void fix_IO_block_types(t_pl_macro pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type); +static inline void fix_IO_block_types(const t_pl_macro& pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type); /** * @brief Determine whether a specific macro can be placed in a specific location. @@ -156,17 +147,18 @@ static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_typ * * @return a vector of blocks that are connected to this block but not yet placed so their scores can later be updated. */ -static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_loc& centroid); +static std::vector find_centroid_loc(const t_pl_macro& pl_macro, t_pl_loc& centroid); /** * @brief Tries to find a nearest location to the centroid location if calculated centroid location is not legal or is occupied. * * @param centroid_loc Calculated location in try_centroid_placement function for the block. * @param block_type Logical block type of the macro blocks. + * @param search_for_empty If set, the function tries to find an empty location. * * @return true if the function can find any location near the centroid one, false otherwise. */ -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type); +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool search_for_empty); /** * @brief tries to place a macro at a centroid location of its placed connections. @@ -180,33 +172,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ * * @return true if the macro gets placed, false if not. 
*/ -static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores); - -/** - * @brief tries to place a macro at a random location - * - * @param pl_macro The macro to be placed. - * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not - * constrained. - * @param block_type Logical block type of the macro blocks. - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. - * - * @return true if the macro gets placed, false if not. - */ -static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); - -/** - * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. - * - * @param pl_macro The macro to be placed. - * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not - * constrained. - * @param block_type Logical block type of the macro blocks. - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. - * - * @return true if the macro gets placed, false if not. 
- */ -static bool try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type); +static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores); /** * @brief Looks for a valid placement location for macro in second iteration, tries to place as many macros as possible in one column @@ -221,7 +187,7 @@ static bool try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t * * @return true if the macro gets placed, false if not. */ -static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid); +static bool try_dense_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid); /** * @brief Tries for MAX_INIT_PLACE_ATTEMPTS times to place all blocks considering their floorplanning constraints and the device size @@ -259,10 +225,10 @@ static void check_initial_placement_legality() { } } -static bool is_block_placed(ClusterBlockId blk_id) { +bool is_block_placed(ClusterBlockId blk_id) { auto& place_ctx = g_vpr_ctx.placement(); - return (!(place_ctx.block_locs[blk_id].loc.x == INVALID_X)); + return (place_ctx.block_locs[blk_id].loc.x != INVALID_X); } static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_type_ptr block_type) { @@ -292,7 +258,7 @@ static bool is_loc_legal(t_pl_loc& loc, PartitionRegion& pr, t_logical_block_typ return legal; } -static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type) { +static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ptr block_type, bool search_for_empty) { const auto& compressed_block_grid = 
g_vpr_ctx.placement().compressed_block_grids[block_type->index]; const int num_layers = g_vpr_ctx.device().grid.get_num_layers(); const int centroid_loc_layer_num = centroid_loc.layer; @@ -325,7 +291,8 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ search_range, to_compressed_loc, false, - centroid_loc_layer_num); + centroid_loc_layer_num, + search_for_empty); if (!legal) { return false; @@ -336,7 +303,7 @@ static bool find_centroid_neighbor(t_pl_loc& centroid_loc, t_logical_block_type_ return legal; } -static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_loc& centroid) { +static std::vector find_centroid_loc(const t_pl_macro& pl_macro, t_pl_loc& centroid) { auto& cluster_ctx = g_vpr_ctx.clustering(); t_physical_tile_loc tile_loc; @@ -435,7 +402,7 @@ static std::vector find_centroid_loc(t_pl_macro pl_macro, t_pl_l return connected_blocks_to_update; } -static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores) { +static bool try_centroid_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, vtr::vector& block_scores) { t_pl_loc centroid_loc(OPEN, OPEN, OPEN, OPEN); std::vector unplaced_blocks_to_update_their_score; @@ -450,7 +417,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l //try to find a near location that meet these requirements bool neighbor_legal_loc = false; if (!is_loc_legal(centroid_loc, pr, block_type)) { - neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type); + neighbor_legal_loc = find_centroid_neighbor(centroid_loc, block_type, false); if (!neighbor_legal_loc) { //no neighbor candidate found return false; } @@ -491,7 +458,7 @@ static bool try_centroid_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_l return legal; } -static int 
get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, t_pl_macro pl_macro) { +static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first_macro_loc, const t_pl_macro& pl_macro) { int y = first_macro_loc.first_avail_loc.y; /* @@ -508,7 +475,7 @@ static int get_y_loc_based_on_macro_direction(t_grid_empty_locs_block_type first return y; } -static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid) { +static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block_type_ptr block_type, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid) { //check if dense placement could place macro successfully if (blk_type_column_index == -1 || blk_types_empty_locs_in_grid->size() <= abs(blk_type_column_index)) { return; @@ -521,7 +488,7 @@ static void update_blk_type_first_loc(int blk_type_column_index, t_logical_block blk_types_empty_locs_in_grid->at(blk_type_column_index).num_of_empty_locs_in_y_axis -= pl_macro.members.size(); } -static int get_blk_type_first_loc(t_pl_loc& loc, t_pl_macro pl_macro, std::vector* blk_types_empty_locs_in_grid) { +static int get_blk_type_first_loc(t_pl_loc& loc, const t_pl_macro& pl_macro, std::vector* blk_types_empty_locs_in_grid) { //loop over all empty locations and choose first column that can accomodate macro blocks for (unsigned int empty_loc_index = 0; empty_loc_index < blk_types_empty_locs_in_grid->size(); empty_loc_index++) { auto first_empty_loc = blk_types_empty_locs_in_grid->at(empty_loc_index); @@ -584,20 +551,20 @@ static std::vector init_blk_types_empty_locations( return block_type_empty_locs; } -static inline void fix_IO_block_types(t_pl_macro pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type) { +static inline void fix_IO_block_types(const t_pl_macro& pl_macro, t_pl_loc loc, enum e_pad_loc_type pad_loc_type) { const auto& device_ctx = 
g_vpr_ctx.device(); auto& place_ctx = g_vpr_ctx.mutable_placement(); //If the user marked the IO block pad_loc_type as RANDOM, that means it should be randomly //placed and then stay fixed to that location, which is why the macro members are marked as fixed. const auto& type = device_ctx.grid.get_physical_type({loc.x, loc.y, loc.layer}); if (is_io_type(type) && pad_loc_type == RANDOM) { - for (unsigned int imember = 0; imember < pl_macro.members.size(); imember++) { - place_ctx.block_locs[pl_macro.members[imember].blk_index].is_fixed = true; + for (const auto& pl_macro_member : pl_macro.members) { + place_ctx.block_locs[pl_macro_member.blk_index].is_fixed = true; } } } -static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +bool try_place_macro_randomly(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; t_pl_loc loc; @@ -641,7 +608,8 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log reg_coord.layer_num, reg_coord.layer_num}, to_compressed_loc, false, - reg_coord.layer_num); + reg_coord.layer_num, + false); if (!legal) { //No valid position found return false; @@ -665,7 +633,7 @@ static bool try_random_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_log return legal; } -static bool try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { +bool try_place_macro_exhaustively(const t_pl_macro& pl_macro, const PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type) { const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[block_type->index]; auto& place_ctx = g_vpr_ctx.mutable_placement(); @@ -748,7 +716,7 @@ static bool 
try_exhaustive_placement(t_pl_macro pl_macro, PartitionRegion& pr, t return placed; } -static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid) { +static bool try_dense_placement(const t_pl_macro& pl_macro, PartitionRegion& pr, t_logical_block_type_ptr block_type, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid) { t_pl_loc loc; int column_index = get_blk_type_first_loc(loc, pl_macro, blk_types_empty_locs_in_grid); @@ -777,7 +745,7 @@ static bool try_dense_placement(t_pl_macro pl_macro, PartitionRegion& pr, t_logi return legal; } -static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { +bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos) { auto& place_ctx = g_vpr_ctx.mutable_placement(); VTR_LOGV_DEBUG(place_ctx.f_placer_debug, "\t\t\t\tTry to place the macro at %dx%dx%dx%d\n", @@ -799,10 +767,10 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { // Place down the macro macro_placed = true; VTR_LOGV_DEBUG(place_ctx.f_placer_debug, "\t\t\t\tMacro is placed at the given location\n"); - for (size_t imember = 0; imember < pl_macro.members.size(); imember++) { - t_pl_loc member_pos = head_pos + pl_macro.members[imember].offset; + for (const auto& pl_macro_member : pl_macro.members) { + t_pl_loc member_pos = head_pos + pl_macro_member.offset; - ClusterBlockId iblk = pl_macro.members[imember].blk_index; + ClusterBlockId iblk = pl_macro_member.blk_index; set_block_location(iblk, member_pos); @@ -812,7 +780,7 @@ static bool try_place_macro(t_pl_macro pl_macro, t_pl_loc head_pos) { return (macro_placed); } -static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores) { +static bool place_macro(int macros_max_num_tries, const t_pl_macro& pl_macro, enum 
e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, vtr::vector& block_scores) { ClusterBlockId blk_id; blk_id = pl_macro.members[0].blk_index; VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\tHead of the macro is Block %d\n", size_t(blk_id)); @@ -852,7 +820,7 @@ static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pa //If blk_types_empty_locs_in_grid is not NULL, means that initial placement has been failed in first iteration for this block type //We need to place densely in second iteration to be able to find a legal initial placement solution - if (blk_types_empty_locs_in_grid != NULL && blk_types_empty_locs_in_grid->size() != 0) { + if (blk_types_empty_locs_in_grid != nullptr && !blk_types_empty_locs_in_grid->empty()) { VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry dense placement\n"); macro_placed = try_dense_placement(pl_macro, pr, block_type, pad_loc_type, blk_types_empty_locs_in_grid); } @@ -865,7 +833,7 @@ static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pa // If macro is not placed yet, try to place the macro randomly for the max number of random tries for (int itry = 0; itry < macros_max_num_tries && macro_placed == false; itry++) { VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry random place iter: %d\n", itry); - macro_placed = try_random_placement(pl_macro, pr, block_type, pad_loc_type); + macro_placed = try_place_macro_randomly(pl_macro, pr, block_type, pad_loc_type); } // Finished all tries if (!macro_placed) { @@ -877,7 +845,7 @@ static bool place_macro(int macros_max_num_tries, t_pl_macro pl_macro, enum e_pa // Exhaustive placement of carry macros VTR_LOGV_DEBUG(g_vpr_ctx.placement().f_placer_debug, "\t\t\tTry exhaustive placement\n"); - macro_placed = try_exhaustive_placement(pl_macro, pr, block_type, pad_loc_type); + macro_placed = try_place_macro_exhaustively(pl_macro, pr, block_type, pad_loc_type); } return macro_placed; } @@ 
-917,10 +885,10 @@ static vtr::vector assign_block_scores() { } //go through placement macros and store size of macro for each block - for (auto pl_macro : pl_macros) { + for (const auto& pl_macro : pl_macros) { int size = pl_macro.members.size(); - for (unsigned int i = 0; i < pl_macro.members.size(); i++) { - block_scores[pl_macro.members[i].blk_index].macro_size = size; + for (const auto& pl_macro_member : pl_macro.members) { + block_scores[pl_macro_member.blk_index].macro_size = size; } } @@ -975,7 +943,7 @@ static void place_all_blocks(const t_placer_opts& /* placer_opts */, vtr::vector std::vector heap_blocks(blocks.begin(), blocks.end()); std::make_heap(heap_blocks.begin(), heap_blocks.end(), criteria); - while (heap_blocks.size()) { + while (!heap_blocks.empty()) { std::pop_heap(heap_blocks.begin(), heap_blocks.end(), criteria); auto blk_id = heap_blocks.back(); heap_blocks.pop_back(); @@ -1026,7 +994,7 @@ static void place_all_blocks(const t_placer_opts& /* placer_opts */, vtr::vector } } -static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_types_index) { +static void clear_block_type_grid_locs(const std::unordered_set& unplaced_blk_types_index) { auto& device_ctx = g_vpr_ctx.device(); bool clear_all_block_types = false; @@ -1034,7 +1002,7 @@ static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_type * logical_block_types contain empty type, needs to be ignored. 
* Not having any type in unplaced_blk_types_index means that it is the first iteration, hence all grids needs to be cleared */ - if (unplaced_blk_types_index.size() == device_ctx.logical_block_types.size() - 1 || unplaced_blk_types_index.size() == 0) { + if (unplaced_blk_types_index.size() == device_ctx.logical_block_types.size() - 1) { clear_all_block_types = true; } @@ -1071,6 +1039,23 @@ static void clear_block_type_grid_locs(std::unordered_set unplaced_blk_type } } +static void clear_all_grid_locs() { + auto& device_ctx = g_vpr_ctx.device(); + + std::unordered_set blk_types_to_be_cleared; + const auto& logical_block_types = device_ctx.logical_block_types; + + // Insert all the logical block types into the set except the empty type + // clear_block_type_grid_locs does not expect empty type to be among given types + for (const auto& logical_type : logical_block_types) { + if (!is_empty_type(&logical_type)) { + blk_types_to_be_cleared.insert(logical_type.index); + } + } + + clear_block_type_grid_locs(blk_types_to_be_cleared); +} + bool place_one_block(const ClusterBlockId& blk_id, enum e_pad_loc_type pad_loc_type, std::vector* blk_types_empty_locs_in_grid, @@ -1109,11 +1094,15 @@ bool place_one_block(const ClusterBlockId& blk_id, } void initial_placement(const t_placer_opts& placer_opts, - enum e_pad_loc_type pad_loc_type, const char* constraints_file, - bool noc_enabled) { + const t_noc_opts& noc_opts) { vtr::ScopedStartFinishTimer timer("Initial Placement"); + /* Initialize the grid blocks to empty. + * Initialize all the blocks to unplaced. 
+ */ + clear_all_grid_locs(); + /* Go through cluster blocks to calculate the tightest placement * floorplan constraint for each constrained block */ @@ -1123,20 +1112,20 @@ void initial_placement(const t_placer_opts& placer_opts, * as fixed so they do not get moved during initial placement or later during the simulated annealing stage of placement*/ mark_fixed_blocks(); + if (noc_opts.noc) { + // NoC routers are placed before other blocks + initial_noc_placement(noc_opts, placer_opts.seed); + } + //Assign scores to blocks and placement macros according to how difficult they are to place vtr::vector block_scores = assign_block_scores(); //Place all blocks - place_all_blocks(placer_opts, block_scores, pad_loc_type, constraints_file); + place_all_blocks(placer_opts, block_scores, placer_opts.pad_loc_type, constraints_file); //if any blocks remain unplaced, print an error check_initial_placement_legality(); - // route all the traffic flows in the NoC now that all the router cluster block have been placed (this is done only if the noc optimization is enabled by the user) - if (noc_enabled) { - initial_noc_routing(); - } - //#ifdef VERBOSE // VTR_LOG("At end of initial_placement.\n"); // if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_INITIAL_CLB_PLACEMENT)) { diff --git a/vpr/src/place/initial_placement.h b/vpr/src/place/initial_placement.h index e33677611cb..44a3772087d 100644 --- a/vpr/src/place/initial_placement.h +++ b/vpr/src/place/initial_placement.h @@ -2,6 +2,13 @@ #define VPR_INITIAL_PLACEMENT_H #include "vpr_types.h" +#include "place_macro.h" +#include "partition_region.h" + +/* The maximum number of tries when trying to place a macro at a * + * random location before trying exhaustive placement - find the first * + * legal position and place it during initial placement. */ +constexpr int MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY = 8; /** * @brief Used to assign each block a score for how difficult it is to place. 
@@ -38,6 +45,58 @@ struct t_grid_empty_locs_block_type { int num_of_empty_locs_in_y_axis; }; +/** + * @brief tries to place a macro at a random location + * + * @param pl_macro The macro to be placed. + * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not + * constrained. + * @param block_type Logical block type of the macro blocks. + * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * + * @return true if the macro gets placed, false if not. + */ +bool try_place_macro_randomly(const t_pl_macro& pl_macro, + const PartitionRegion& pr, + t_logical_block_type_ptr block_type, + enum e_pad_loc_type pad_loc_type); + +/** + * @brief Looks for a valid placement location for macro exhaustively once the maximum number of random locations have been tried. + * + * @param pl_macro The macro to be placed. + * @param pr The PartitionRegion of the macro - represents its floorplanning constraints, is the size of the whole chip if the macro is not + * constrained. + * @param block_type Logical block type of the macro blocks. + * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * + * @return true if the macro gets placed, false if not. + */ +bool try_place_macro_exhaustively(const t_pl_macro& pl_macro, + const PartitionRegion& pr, + t_logical_block_type_ptr block_type, + enum e_pad_loc_type pad_loc_type); + +/** + * @brief Places the macro if the head position passed in is legal, and all the resulting + * member positions are legal + * + * @param pl_macro The macro to be placed. + * @param head_pos The location of the macro head member. + * + * @return true if macro was placed, false if not. 
+ */ +bool try_place_macro(const t_pl_macro& pl_macro, t_pl_loc head_pos); + +/** + * @brief Checks whether the block is already placed + * + * @param blk_id block id of the block to be checked + * + * @return true if the block was placed, false if not. + */ +bool is_block_placed(ClusterBlockId blk_id); + /** * @brief Tries to find an initial placement location for each block considering floorplanning constraints * and throws an error out if it fails after max number of attempts. @@ -46,16 +105,15 @@ struct t_grid_empty_locs_block_type { * flows and updating the bandwidths used by the links due to the * traffic flows. * - * @param placer_opts Required by the function that set the status of f_placer_debug - * @param pad_loc_type Used to check whether an io block needs to be marked as fixed. + * @param placer_opts Required by the function that set the status of f_placer_debug. + * Also used to access pad_loc_type to see if a block needs to be marked fixed. * @param constraints_file Used to read block locations if any constraints is available. * @param noc_enabled Used to check whether the user turned on the noc * optimization during placement. */ void initial_placement(const t_placer_opts& placer_opts, - enum e_pad_loc_type pad_loc_type, const char* constraints_file, - bool noc_enabled); + const t_noc_opts& noc_opts); /** * @brief Looks for a valid placement location for block. 
diff --git a/vpr/src/place/move_utils.cpp b/vpr/src/place/move_utils.cpp index ca10cfc500b..2c62d6ec371 100644 --- a/vpr/src/place/move_utils.cpp +++ b/vpr/src/place/move_utils.cpp @@ -790,7 +790,8 @@ bool find_to_loc_uniform(t_logical_block_type_ptr type, search_range, to_compressed_loc, false, - to_layer_num); + to_layer_num, + false); if (!legal) { //No valid position found @@ -887,7 +888,8 @@ bool find_to_loc_median(t_logical_block_type_ptr blk_type, search_range, to_compressed_loc, true, - to_layer_num); + to_layer_num, + false); if (!legal) { //No valid position found @@ -973,7 +975,8 @@ bool find_to_loc_centroid(t_logical_block_type_ptr blk_type, search_range, to_compressed_loc, false, - to_layer_num); + to_layer_num, + false); if (!legal) { //No valid position found @@ -1032,13 +1035,36 @@ void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, to_loc = t_pl_loc(grid_loc.x, grid_loc.y, sub_tile, grid_loc.layer_num); } +bool has_empty_compatible_subtile(t_logical_block_type_ptr type, const t_physical_tile_loc& to_loc) { + auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.placement(); + + const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; + bool legal = false; + + t_pl_loc to_uncompressed_loc; + compressed_grid_to_loc(type, to_loc, to_uncompressed_loc); + const t_physical_tile_loc to_phy_uncompressed_loc{to_uncompressed_loc.x, to_uncompressed_loc.y, to_uncompressed_loc.layer}; + const auto& phy_type = device_ctx.grid.get_physical_type(to_phy_uncompressed_loc); + const auto& compatible_sub_tiles = compressed_block_grid.compatible_sub_tiles_for_tile.at(phy_type->index); + for (const auto& sub_tile : compatible_sub_tiles) { + if (place_ctx.grid_blocks.is_sub_tile_empty(to_phy_uncompressed_loc, sub_tile)) { + legal = true; + break; + } + } + + return legal; +} + bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, const int delta_cx, const t_physical_tile_loc& from_loc, 
t_bb search_range, t_physical_tile_loc& to_loc, bool is_median, - int to_layer_num) { + int to_layer_num, + bool search_for_empty) { //TODO For the time being, the blocks only moved in the same layer. This assertion should be removed after VPR is updated to move blocks between layers VTR_ASSERT(to_layer_num == from_loc.layer_num); const auto& compressed_block_grid = g_vpr_ctx.placement().compressed_block_grids[type->index]; @@ -1119,7 +1145,9 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, VTR_ASSERT(to_loc.y <= search_range.ymax); if (from_loc.x == to_loc.x && from_loc.y == to_loc.y && from_loc.layer_num == to_layer_num) { - continue; //Same from/to location -- try again for new y-position + continue; //Same from/to location -- try again for new y-position + } else if (search_for_empty) { // Check if the location has at least one empty sub-tile + legal = has_empty_compatible_subtile(type, to_loc); } else { legal = true; } diff --git a/vpr/src/place/move_utils.h b/vpr/src/place/move_utils.h index f9369acd4f7..3ff8e729833 100644 --- a/vpr/src/place/move_utils.h +++ b/vpr/src/place/move_utils.h @@ -222,16 +222,29 @@ const std::string& move_type_to_string(e_move_type); void compressed_grid_to_loc(t_logical_block_type_ptr blk_type, t_physical_tile_loc compressed_loc, t_pl_loc& to_loc); + +/** + * @brief Checks whether the given location has a compatible empty subtile with + * the given type. + * + * @param type logical block type + * @param to_loc The location to be checked + * + * @return bool True if the given location has at least one empty compatible subtile. 
+ */ +bool has_empty_compatible_subtile(t_logical_block_type_ptr type, + const t_physical_tile_loc& to_loc); + /** * @brief find compressed location in a compressed range for a specific type in the given layer (to_layer_num) * * type: defines the moving block type - * min_cx, max_cx: the minimum and maximum x coordinates of the range in the compressed grid - * min_cy, max_cx: the minimum and maximum y coordinates of the range in the compressed grid - * cx_from, cy_from: the x and y coordinates of the old location - * cx_to, cy_to: the x and y coordinates of the new location on the compressed grid + * search_range: the minimum and maximum coordinates of the search range in the compressed grid + * from_loc: the coordinates of the old location + * to_loc: the coordinates of the new location on the compressed grid * is_median: true if this is called from find_to_loc_median * to_layer_num: the layer number of the new location (set by the caller) + * search_for_empty: indicates that the returned location must be empty */ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, const int delta_cx, @@ -239,7 +252,8 @@ bool find_compatible_compressed_loc_in_range(t_logical_block_type_ptr type, t_bb search_range, t_physical_tile_loc& to_loc, bool is_median, - int to_layer_num); + int to_layer_num, + bool search_for_empty); /** * @brief Get the the compressed loc from the uncompressed loc (grid_loc) diff --git a/vpr/src/place/noc_place_checkpoint.cpp b/vpr/src/place/noc_place_checkpoint.cpp new file mode 100644 index 00000000000..a25cd9ec82c --- /dev/null +++ b/vpr/src/place/noc_place_checkpoint.cpp @@ -0,0 +1,80 @@ + +#include "noc_place_checkpoint.h" +#include "noc_place_utils.h" + +NoCPlacementCheckpoint::NoCPlacementCheckpoint() + : valid_(false) + , cost_(std::numeric_limits::infinity()) { + const auto& noc_ctx = g_vpr_ctx.noc(); + + // Get all router clusters in the net-list + const std::vector& router_bids = 
noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + router_locations_.clear(); + + // Initializes checkpoint locations to invalid + for (const auto& router_bid : router_bids) { + router_locations_[router_bid] = t_pl_loc(OPEN, OPEN, OPEN, OPEN); + } +} + +void NoCPlacementCheckpoint::save_checkpoint(double cost) { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& place_ctx = g_vpr_ctx.placement(); + + const std::vector& router_bids = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); + + for (const auto& router_bid : router_bids) { + t_pl_loc loc = place_ctx.block_locs[router_bid].loc; + router_locations_[router_bid] = loc; + } + valid_ = true; + cost_ = cost; +} + +void NoCPlacementCheckpoint::restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs) { + const auto& noc_ctx = g_vpr_ctx.noc(); + const auto& device_ctx = g_vpr_ctx.device(); + auto& place_ctx = g_vpr_ctx.mutable_placement(); + + // Get all physical routers + const auto& noc_phy_routers = noc_ctx.noc_model.get_noc_routers(); + + // Clear all physical routers in placement + for (const auto& phy_router : noc_phy_routers) { + auto phy_loc = phy_router.get_router_physical_location(); + + place_ctx.grid_blocks.set_usage(phy_loc, 0); + auto tile = device_ctx.grid.get_physical_type(phy_loc); + + for (const auto& sub_tile : tile->sub_tiles) { + auto capacity = sub_tile.capacity; + + for (int k = 0; k < capacity.total(); k++) { + const t_pl_loc loc(phy_loc, k + capacity.low); + if (place_ctx.grid_blocks.block_at_location(loc) != INVALID_BLOCK_ID) { + place_ctx.grid_blocks.set_block_at_location(loc, EMPTY_BLOCK_ID); + } + } + } + } + + // Place routers based on router_locations_ + for (const auto& router_loc : router_locations_) { + ClusterBlockId router_blk_id = router_loc.first; + t_pl_loc location = router_loc.second; + + set_block_location(router_blk_id, location); + } + + // Re-initialize routes and static variables that keep track of NoC-related costs + 
reinitialize_noc_routing(noc_opts, costs); +} + +bool NoCPlacementCheckpoint::is_valid() const { + return valid_; +} + +double NoCPlacementCheckpoint::get_cost() const { + return cost_; +} diff --git a/vpr/src/place/noc_place_checkpoint.h b/vpr/src/place/noc_place_checkpoint.h new file mode 100644 index 00000000000..bf5c4305616 --- /dev/null +++ b/vpr/src/place/noc_place_checkpoint.h @@ -0,0 +1,72 @@ +#ifndef VTR_ROUTERPLACEMENTCHECKPOINT_H +#define VTR_ROUTERPLACEMENTCHECKPOINT_H + +/** + * @brief NoC router placement checkpoint + * + * This class stores a checkpoint only for NoC router placement. + * If a checkpoint for all block types is needed, refer to place_checkpoint.h file. + * + * The initial placement for NoC routers is done before conventional blocks. Therefore, + * t_placement_checkpoint could not be used to store a checkpoint as t_placement_checkpoint + * assumes all blocks are placed. + * + * This class should only be used during initial NoC placement as it does not update + * bounding box and timing costs. + */ + +#include "vpr_types.h" +#include "place_util.h" + +/** + * @brief A NoC router placement checkpoint + * + * The class stores a NoC router placement and its corresponding cost. + * The checkpoint can be restored to replace the current placement. + */ +class NoCPlacementCheckpoint { + public: + /** + * @brief Default constructor initializes private member variables. + */ + NoCPlacementCheckpoint(); + NoCPlacementCheckpoint(const NoCPlacementCheckpoint& other) = delete; + NoCPlacementCheckpoint& operator=(const NoCPlacementCheckpoint& other) = delete; + + /** + * @brief Saves the current NoC router placement as a checkpoint + * + * @param cost: The placement cost associated with the current placement + */ + void save_checkpoint(double cost); + + /** + * @brief Loads the saved checkpoint into global placement data structures. 
+ * + * @param noc_opts: Contains weighting factors for different NoC cost terms + * @param costs: Used to load NoC related costs for the checkpoint + */ + void restore_checkpoint(const t_noc_opts& noc_opts, t_placer_costs& costs); + + /** + * @brief Indicates whether the object is empty or it has already stored a + * checkpoint. + * + * @return bool True if there is a save checkpoint. + */ + bool is_valid() const; + + /** + * @brief Return the cost associated with the checkpoint + * + * @return double Saved checkpoint's cost + */ + double get_cost() const; + + private: + std::unordered_map router_locations_; + bool valid_ = false; + double cost_; +}; + +#endif //VTR_ROUTERPLACEMENTCHECKPOINT_H diff --git a/vpr/src/place/noc_place_utils.cpp b/vpr/src/place/noc_place_utils.cpp index 17d96dd3677..24745755123 100644 --- a/vpr/src/place/noc_place_utils.cpp +++ b/vpr/src/place/noc_place_utils.cpp @@ -9,6 +9,18 @@ static vtr::vector traffic_flow_costs, p static std::vector affected_traffic_flows; /*********************************************************** *****************************/ +/** + * @brief Randomly select a moveable NoC router cluster blocks + * + * @param b_from The cluster block ID of the selected NoC router + * @param from The current location of the selected NoC router + * @param cluster_from_type Block type of the selected block + * @return bool True if a block was selected successfully. 
+ * False if there are no NoC routers in the netlist or the + * selected NoC router is fixed. + */ +static bool select_random_router_cluster(ClusterBlockId& b_from, t_pl_loc& from, t_logical_block_type_ptr& cluster_from_type); + void initial_noc_routing(void) { // need to get placement information about where the router cluster blocks are placed on the device const auto& place_ctx = g_vpr_ctx.placement(); @@ -249,6 +261,12 @@ void update_noc_normalization_factors(t_placer_costs& costs) { return; } +double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts) { + double noc_cost; + noc_cost = (noc_opts.noc_placement_weighting) * ((costs.noc_aggregate_bandwidth_cost * costs.noc_aggregate_bandwidth_cost_norm) + (costs.noc_latency_cost * costs.noc_latency_cost_norm)); + return noc_cost; +} + double comp_noc_aggregate_bandwidth_cost(void) { // used to get traffic flow route information auto& noc_ctx = g_vpr_ctx.mutable_noc(); @@ -463,36 +481,58 @@ bool check_for_router_swap(int user_supplied_noc_router_swap_percentage) { return (vtr::irand(99) < user_supplied_noc_router_swap_percentage) ? 
true : false; } -e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim) { +static bool select_random_router_cluster(ClusterBlockId& b_from, t_pl_loc& from, t_logical_block_type_ptr& cluster_from_type) { // need to access all the router cluster blocks in the design auto& noc_ctx = g_vpr_ctx.noc(); + // + auto& place_ctx = g_vpr_ctx.placement(); + // + auto& cluster_ctx = g_vpr_ctx.clustering(); + // get a reference to the collection of router cluster blocks in the design const std::vector& router_clusters = noc_ctx.noc_traffic_flows_storage.get_router_clusters_in_netlist(); - // if there are no router cluster blocks to swap then abort + // if there are no router cluster blocks, return false if (router_clusters.empty()) { - return e_create_move::ABORT; + return false; } - int number_of_router_blocks = router_clusters.size(); + const int number_of_router_blocks = router_clusters.size(); //randomly choose a router block to move - int random_cluster_block_index = vtr::irand(number_of_router_blocks - 1); - ClusterBlockId b_from = router_clusters[random_cluster_block_index]; - - auto& place_ctx = g_vpr_ctx.placement(); - auto& cluster_ctx = g_vpr_ctx.clustering(); + const int random_cluster_block_index = vtr::irand(number_of_router_blocks - 1); + b_from = router_clusters[random_cluster_block_index]; //check if the block is movable if (place_ctx.block_locs[b_from].is_fixed) { - return e_create_move::ABORT; + return false; } - t_pl_loc from = place_ctx.block_locs[b_from].loc; - auto cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from); + from = place_ctx.block_locs[b_from].loc; + cluster_from_type = cluster_ctx.clb_nlist.block_type(b_from); auto grid_from_type = g_vpr_ctx.device().grid.get_physical_type({from.x, from.y, from.layer}); VTR_ASSERT(is_tile_compatible(grid_from_type, cluster_from_type)); + return true; +} + +e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, float rlim) { + // block ID for the 
randomly selected router cluster + ClusterBlockId b_from; + // current location of the randomly selected router cluster + t_pl_loc from; + // logical block type of the randomly selected router cluster + t_logical_block_type_ptr cluster_from_type; + bool random_select_success = false; + + // Randomly select a router cluster + random_select_success = select_random_router_cluster(b_from, from, cluster_from_type); + + // If a random router cluster could not be selected, no move can be proposed + if (!random_select_success) { + return e_create_move::ABORT; + } + // now choose a compatible block to swap with t_pl_loc to; to.layer = from.layer; @@ -510,7 +550,7 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa return create_move; } -void write_noc_placement_file(std::string file_name) { +void write_noc_placement_file(const std::string& file_name) { // we need the clustered netlist to get the names of all the NoC router cluster blocks auto& cluster_ctx = g_vpr_ctx.clustering(); // we need to the placement context to determine the final placed locations of the NoC router cluster blocks diff --git a/vpr/src/place/noc_place_utils.h b/vpr/src/place/noc_place_utils.h index 3f27ca63ec5..5dbaed43f8f 100644 --- a/vpr/src/place/noc_place_utils.h +++ b/vpr/src/place/noc_place_utils.h @@ -270,6 +270,17 @@ void recompute_noc_costs(double& new_noc_aggregate_bandwidth_cost, double& new_n */ void update_noc_normalization_factors(t_placer_costs& costs); +/** + * @brief Calculates total NoC cost. + * + * @param costs Contains latency and aggregate bandwidth costs + * along with their corresponding normalization factors. + * @param noc_opts Contains NoC placement weighting factor. + * + * @return Calculated total NoC cost. 
+ */ +double calculate_noc_cost(const t_placer_costs& costs, const t_noc_opts& noc_opts); + /** * @brief Calculates the aggregate bandwidth of each traffic flow in the NoC * and initializes local variables that keep track of the traffic flow @@ -407,13 +418,17 @@ void free_noc_placement_structs(void); bool check_for_router_swap(int user_supplied_noc_router_swap_percentage); /** - * @brief Generates a placement move by choosing two router cluster blocks to - * swap. First, a random router cluster block is chosen and then another router - * cluster block is chosen that can be swapped with the initial block such that - * the distance travelled by either block does not exceed rlim. - * - * @param blocks_affected The two router cluster blocks that are proposed to be - * swapped + * @brief Generates a placement move by first choosing a random router cluster + * and then choosing a random physical router where the selected router cluster + * can be moved to. If the selected physical router is already occupied, + * the proposed move requires swapping two router clusters. If the selected + * physical router is empty, the proposed move only requires changing the location + * of the random router cluster. The range in which the physical router is selected + * is limited such that the maximum distance travelled by the random router cluster + * does not exceed rlim. + * + * @param blocks_affected Contains one or two router clusters that are proposed + * to be moved or swapped. + * @param rlim The maximum distance in the x and y direction that a router * cluster block can travel (this is within the compressed block space) * @return e_create_move Result of proposing the move @@ -436,5 +451,5 @@ e_create_move propose_router_swap(t_pl_blocks_to_be_moved& blocks_affected, floa * information. 
* */ -void write_noc_placement_file(std::string file_name); +void write_noc_placement_file(const std::string& file_name); #endif \ No newline at end of file diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 51dfce9ee32..203d5d6cac8 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -665,9 +665,8 @@ void try_place(const Netlist<>& net_list, vtr::ScopedStartFinishTimer timer("Placement"); initial_placement(placer_opts, - placer_opts.pad_loc_type, placer_opts.constraints_file.c_str(), - noc_opts.noc); + noc_opts); if (!placer_opts.write_initial_place_file.empty()) { print_place(nullptr, @@ -848,6 +847,7 @@ void try_place(const Netlist<>& net_list, print_histogram( create_setup_slack_histogram(*timing_info->setup_analyzer())); } + size_t num_macro_members = 0; for (auto& macro : g_vpr_ctx.placement().pl_macros) { num_macro_members += macro.members.size(); @@ -1512,16 +1512,9 @@ static float starting_t(const t_annealing_state* state, VTR_LOG("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. * std_dev); #endif - float init_temp = 0.0; - - /* We use a constructive initial placement and a low starting temperature - * by default, but that can cause problems with NoCs as the initial logical - * locations are random. Use a higher starting T in that case.*/ - if (noc_opts.noc) { - init_temp = 20. * std_dev; - } else { - init_temp = std_dev / 64; - } + // Improved initial placement uses a fast SA for NoC routers and centroid placement + // for other blocks. 
The temperature is reduced to prevent SA from destroying the initial placement + float init_temp = std_dev / 64; return init_temp; } @@ -2289,8 +2282,8 @@ static double get_total_cost(t_placer_costs* costs, const t_placer_opts& placer_ } if (noc_opts.noc) { - // in noc mode we include noc agggregate bandwidth and noc latency - total_cost += (noc_opts.noc_placement_weighting) * ((costs->noc_aggregate_bandwidth_cost * costs->noc_aggregate_bandwidth_cost_norm) + (costs->noc_latency_cost * costs->noc_latency_cost_norm)); + // in noc mode we include noc aggregate bandwidth and noc latency + total_cost += calculate_noc_cost(*costs, noc_opts); } return total_cost; diff --git a/vpr/src/place/place_checkpoint.cpp b/vpr/src/place/place_checkpoint.cpp index 976b130a517..dd9b9a0d9f1 100644 --- a/vpr/src/place/place_checkpoint.cpp +++ b/vpr/src/place/place_checkpoint.cpp @@ -27,8 +27,21 @@ void save_placement_checkpoint_if_needed(t_placement_checkpoint& placement_check } } -void restore_best_placement(t_placement_checkpoint& placement_checkpoint, std::shared_ptr& timing_info, t_placer_costs& costs, std::unique_ptr& placer_criticalities, std::unique_ptr& placer_setup_slacks, std::unique_ptr& place_delay_model, std::unique_ptr& pin_timing_invalidator, PlaceCritParams crit_params, const t_noc_opts& noc_opts) { - if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost < 1.05 * placement_checkpoint.get_cp_bb_cost()) { +void restore_best_placement(t_placement_checkpoint& placement_checkpoint, + std::shared_ptr& timing_info, + t_placer_costs& costs, + std::unique_ptr& placer_criticalities, + std::unique_ptr& placer_setup_slacks, + std::unique_ptr& place_delay_model, + std::unique_ptr& pin_timing_invalidator, + PlaceCritParams crit_params, + const t_noc_opts& noc_opts) { + /* The (valid) checkpoint is restored if the following conditions are met: + * 1) The checkpoint has a lower critical path 
delay. + * 2) The checkpoint's wire-length cost is either better than the current solution, + * or at least is not more than 5% worse than the current solution. + */ + if (placement_checkpoint.cp_is_valid() && timing_info->least_slack_critical_path().delay() > placement_checkpoint.get_cp_cpd() && costs.bb_cost * 1.05 > placement_checkpoint.get_cp_bb_cost()) { //restore the latest placement checkpoint costs = placement_checkpoint.restore_placement(); diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt index 2fe7e70eb51..6a47ac0657e 100644 --- a/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt +++ b/vtr_flow/tasks/regression_tests/vtr_reg_nightly_test2/titan_quick_qor/config/config.txt @@ -67,4 +67,4 @@ pass_requirements_file=pass_requirements_vpr_titan.txt #A large number of routing iterations is set to ensure the router doesn't give up to easily on the larger benchmarks #To be more run-time comparable to commercial tools like Quartus, we run with higher placer effort (inner_num=2) and lower astar_fac (1.0) #Set a 24hr timeout so they don't run forever -script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 --seed 3 +script_params=-starting_stage vpr --route_chan_width 300 --max_router_iterations 400 --router_lookahead map -timeout 86400 --initial_pres_fac 1.0 --router_profiler_astar_fac 1.5 --seed 3 \ No newline at end of file