From 296be30f283714aa516a8c9c2d3d16c86c4c5c31 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:49:34 -0500 Subject: [PATCH 1/4] performance improvements --- src/gpgpu-sim/gpu-sim.cc | 8 +++-- src/gpgpu-sim/local_interconnect.cc | 46 +++++++++++++++++++---------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 47c0b4a89..13029e337 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1935,8 +1935,10 @@ void gpgpu_sim::cycle() { if (mf) partiton_reqs_in_parallel_per_cycle++; } m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); - m_memory_sub_partition[i]->accumulate_L2cache_stats( - m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + if (m_config.g_power_simulation_enabled) { + m_memory_sub_partition[i]->accumulate_L2cache_stats( + m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + } } } partiton_reqs_in_parallel += partiton_reqs_in_parallel_per_cycle; @@ -1958,11 +1960,13 @@ void gpgpu_sim::cycle() { *active_sms += m_cluster[i]->get_n_active_sms(); } // Update core icnt/cache stats for AccelWattch + if (m_config.g_power_simulation_enabled) { m_cluster[i]->get_icnt_stats( m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); m_cluster[i]->get_cache_stats( m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); + } m_cluster[i]->get_current_occupancy( gpu_occupancy.aggregate_warp_slot_filled, gpu_occupancy.aggregate_theoretical_warp_slots); diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index fe7bc74fb..b51d0fa5c 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -184,15 +184,21 @@ void xbar_router::iSLIP_Advance() { unsigned reqs = 0; // calcaulte how many conflicts are there for stats + // prebuild a 
set than contains only the nodes that have packets + std::set node_set; + std::set destination_set; // a set with output_nodes as destination for (unsigned i = 0; i < total_nodes; ++i) { if (!in_buffers[i].empty()) { + node_set.insert(i); + destination_set.insert(in_buffers[i].front().output_deviceID); Packet _packet_tmp = in_buffers[i].front(); if (!node_tmp.empty()) { if (std::find(node_tmp.begin(), node_tmp.end(), _packet_tmp.output_deviceID) != node_tmp.end()) { conflict_sub++; - } else + } else { node_tmp.push_back(_packet_tmp.output_deviceID); + } } else { node_tmp.push_back(_packet_tmp.output_deviceID); } @@ -206,41 +212,51 @@ void xbar_router::iSLIP_Advance() { cycles_util++; } // do iSLIP - for (unsigned i = 0; i < total_nodes; ++i) { - if (Has_Buffer_Out(i, 1)) { - for (unsigned j = 0; j < total_nodes; ++j) { - unsigned node_id = (j + next_node[i]) % total_nodes; - - if (!in_buffers[node_id].empty()) { + // for (unsigned i = 0; i < total_nodes; ++i) { + for (auto dest : destination_set) { + if (Has_Buffer_Out(dest, 1)) { + unsigned start_node = next_node[dest]; + auto it = std::upper_bound(node_set.begin(), node_set.end(), + start_node); + for (unsigned j = 0; j < node_set.size(); j++) { + if (it == node_set.end()) { + it = node_set.begin(); + } + unsigned node_id = *it; + assert(!in_buffers[node_id].empty()); Packet _packet = in_buffers[node_id].front(); - if (_packet.output_deviceID == i) { + if (_packet.output_deviceID == dest) { out_buffers[_packet.output_deviceID].push(_packet); in_buffers[node_id].pop(); if (verbose) printf("%d : cycle %llu : send req from %d to %d\n", m_id, cycles, - node_id, i - _n_shader); + node_id, dest - _n_shader); if (grant_cycles_count == 1) - next_node[i] = (++node_id % total_nodes); + next_node[dest] = (++node_id % total_nodes); if (verbose) { for (unsigned k = j + 1; k < total_nodes; ++k) { - unsigned node_id2 = (k + next_node[i]) % total_nodes; + unsigned node_id2 = (k + next_node[dest]) % total_nodes; if 
(!in_buffers[node_id2].empty()) { Packet _packet2 = in_buffers[node_id2].front(); - if (_packet2.output_deviceID == i) + if (_packet2.output_deviceID == dest) printf("%d : cycle %llu : cannot send req from %d to %d\n", - m_id, cycles, node_id2, i - _n_shader); + m_id, cycles, node_id2, dest - _n_shader); } } } reqs++; + if (in_buffers[node_id].empty()) { + node_set.erase(node_id); + } break; } - } + it++; } - } else + } else { out_buffer_full++; + } } if (active) { From ccc6a9b858477bc079f1442c2dc1c04ad52a2519 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:27:10 -0500 Subject: [PATCH 2/4] use node_id before incremented --- src/gpgpu-sim/local_interconnect.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index b51d0fa5c..9ed2b5bad 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -228,6 +228,9 @@ void xbar_router::iSLIP_Advance() { if (_packet.output_deviceID == dest) { out_buffers[_packet.output_deviceID].push(_packet); in_buffers[node_id].pop(); + if (in_buffers[node_id].empty()) { + node_set.erase(node_id); + } if (verbose) printf("%d : cycle %llu : send req from %d to %d\n", m_id, cycles, node_id, dest - _n_shader); @@ -247,9 +250,6 @@ void xbar_router::iSLIP_Advance() { } reqs++; - if (in_buffers[node_id].empty()) { - node_set.erase(node_id); - } break; } it++; From 229eddaf27676666233f547227595abfb4bc128e Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:15:22 -0500 Subject: [PATCH 3/4] Cleanup iSLIP --- src/gpgpu-sim/local_interconnect.cc | 38 ++++++++++------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index 9ed2b5bad..1cd7e9de8 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ 
b/src/gpgpu-sim/local_interconnect.cc @@ -177,31 +177,23 @@ void xbar_router::RR_Advance() { // IEEE/ACM transactions on networking 2 (1999): 188-201. // https://www.cs.rutgers.edu/~sn624/552-F18/papers/islip.pdf void xbar_router::iSLIP_Advance() { - vector node_tmp; bool active = false; unsigned conflict_sub = 0; unsigned reqs = 0; // calcaulte how many conflicts are there for stats - // prebuild a set than contains only the nodes that have packets - std::set node_set; - std::set destination_set; // a set with output_nodes as destination + std::set input_nodes; + std::set destination_set; for (unsigned i = 0; i < total_nodes; ++i) { if (!in_buffers[i].empty()) { - node_set.insert(i); - destination_set.insert(in_buffers[i].front().output_deviceID); - Packet _packet_tmp = in_buffers[i].front(); - if (!node_tmp.empty()) { - if (std::find(node_tmp.begin(), node_tmp.end(), - _packet_tmp.output_deviceID) != node_tmp.end()) { - conflict_sub++; - } else { - node_tmp.push_back(_packet_tmp.output_deviceID); - } - } else { - node_tmp.push_back(_packet_tmp.output_deviceID); + input_nodes.insert(i); + unsigned out_node = in_buffers[i].front().output_deviceID; + + if(destination_set.find(out_node) != destination_set.end()) { + conflict_sub++; } + destination_set.insert(out_node); active = true; } } @@ -212,15 +204,14 @@ void xbar_router::iSLIP_Advance() { cycles_util++; } // do iSLIP - // for (unsigned i = 0; i < total_nodes; ++i) { for (auto dest : destination_set) { if (Has_Buffer_Out(dest, 1)) { unsigned start_node = next_node[dest]; - auto it = std::upper_bound(node_set.begin(), node_set.end(), + auto it = std::upper_bound(input_nodes.begin(), input_nodes.end(), start_node); - for (unsigned j = 0; j < node_set.size(); j++) { - if (it == node_set.end()) { - it = node_set.begin(); + for (unsigned j = 0; j < input_nodes.size(); j++, it++) { + if (it == input_nodes.end()) { + it = input_nodes.begin(); } unsigned node_id = *it; assert(!in_buffers[node_id].empty()); @@ -228,9 
+219,7 @@ void xbar_router::iSLIP_Advance() { if (_packet.output_deviceID == dest) { out_buffers[_packet.output_deviceID].push(_packet); in_buffers[node_id].pop(); - if (in_buffers[node_id].empty()) { - node_set.erase(node_id); - } + input_nodes.erase(node_id); //can only be used once if (verbose) printf("%d : cycle %llu : send req from %d to %d\n", m_id, cycles, node_id, dest - _n_shader); @@ -252,7 +241,6 @@ void xbar_router::iSLIP_Advance() { reqs++; break; } - it++; } } else { out_buffer_full++; From 8e05bd97ae07db31799287e7beeaf73172de84e2 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 12 Jul 2024 13:07:02 -0400 Subject: [PATCH 4/4] run set_dram_power_stats only when power model enabled --- src/gpgpu-sim/gpu-sim.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 13029e337..793eee1e5 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1905,16 +1905,18 @@ void gpgpu_sim::cycle() { m_memory_partition_unit[i] ->dram_cycle(); // Issue the dram command (scheduler + delay model) // Update performance counters for DRAM - m_memory_partition_unit[i]->set_dram_power_stats( - m_power_stats->pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_act[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); + if (m_config.g_power_simulation_enabled) { + m_memory_partition_unit[i]->set_dram_power_stats( + m_power_stats->pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i], + 
m_power_stats->pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_act[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); + } } }