Skip to content

Commit

Permalink
LDGSTS, LDGDEPBAR and DEPBAR Implementations (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
Connie120 authored Sep 20, 2023
1 parent 53e99da commit a0c12f5
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 2 deletions.
21 changes: 21 additions & 0 deletions src/abstract_hardware_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,13 @@ class warp_inst_t : public inst_t {
m_uid = 0;
m_empty = true;
m_config = NULL;

// Ni:
m_is_ldgsts = false;
m_is_ldgdepbar = false;
m_is_depbar = false;

m_depbar_group_no = 0;
}
warp_inst_t(const core_config *config) {
m_uid = 0;
Expand All @@ -1069,6 +1076,13 @@ class warp_inst_t : public inst_t {
m_is_printf = false;
m_is_cdp = 0;
should_do_atomic = true;

// Ni:
m_is_ldgsts = false;
m_is_ldgdepbar = false;
m_is_depbar = false;

m_depbar_group_no = 0;
}
virtual ~warp_inst_t() {}

Expand Down Expand Up @@ -1251,6 +1265,13 @@ class warp_inst_t : public inst_t {
// Jin: cdp support
public:
int m_is_cdp;

// Ni: add boolean to indicate whether the instruction is ldgsts
bool m_is_ldgsts;
bool m_is_ldgdepbar;
bool m_is_depbar;

unsigned int m_depbar_group_no;
};

void move_warp(warp_inst_t *&dst, warp_inst_t *&src);
Expand Down
129 changes: 127 additions & 2 deletions src/gpgpu-sim/shader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,6 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread,
void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread,
unsigned end_thread, unsigned ctaid,
int cta_size, kernel_info_t &kernel) {
//
address_type start_pc = next_pc(start_thread);
unsigned kernel_id = kernel.get_uid();
if (m_config->model == POST_DOMINATOR) {
Expand Down Expand Up @@ -1046,13 +1045,63 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set,
m_stats->shader_cycle_distro[2 + (*pipe_reg)->active_count()]++;
func_exec_inst(**pipe_reg);

// Add LDGSTS instructions into a buffer
unsigned int ldgdepbar_id = m_warp[warp_id]->m_ldgdepbar_id;
if (next_inst->m_is_ldgsts) {
if (m_warp[warp_id]->m_ldgdepbar_buf.size() == ldgdepbar_id + 1) {
m_warp[warp_id]->m_ldgdepbar_buf[ldgdepbar_id].push_back(*next_inst);
}
else {
assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1);
std::vector<warp_inst_t> l;
l.push_back(*next_inst);
m_warp[warp_id]->m_ldgdepbar_buf.push_back(l);
}
// If the mask of the instruction is all 0, then the address is also 0,
// so that there's no need to check through the writeback
if (next_inst->get_active_mask() == 0) {
(m_warp[warp_id]->m_ldgdepbar_buf.back()).back().pc = -1;
}
}

if (next_inst->op == BARRIER_OP) {
m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg);
m_barriers.warp_reaches_barrier(m_warp[warp_id]->get_cta_id(), warp_id,
const_cast<warp_inst_t *>(next_inst));

} else if (next_inst->op == MEMORY_BARRIER_OP) {
m_warp[warp_id]->set_membar();
} else if (next_inst->m_is_ldgdepbar) { // Add for LDGDEPBAR
m_warp[warp_id]->m_ldgdepbar_id++;
} else if (next_inst->m_is_depbar) { // Add for DEPBAR
// Set to true immediately when a DEPBAR instruction is met
m_warp[warp_id]->m_waiting_ldgsts = true;
m_warp[warp_id]->m_depbar_group = next_inst->m_depbar_group_no; // set in trace_driven.cc

// Record the last group that's possibly being monitored by this DEPBAR instr
m_warp[warp_id]->m_depbar_start_id = m_warp[warp_id]->m_ldgdepbar_id - 1;

// Record the last group that's actually being monitored by this DEPBAR instr
unsigned int end_group = m_warp[warp_id]->m_ldgdepbar_id - m_warp[warp_id]->m_depbar_group;

// Check for the case that the LDGSTSs monitored have finished when encountering the
// DEPBAR instruction
bool done_flag = true;
for (int i = 0; i < end_group; i++) {
for (int j = 0; j < m_warp[warp_id]->m_ldgdepbar_buf[i].size(); j++) {
if (m_warp[warp_id]->m_ldgdepbar_buf[i][j].pc != -1) {
done_flag = false;
goto UpdateDEPBAR;
}
}
}

UpdateDEPBAR:
if (done_flag) {
if (m_warp[warp_id]->m_waiting_ldgsts) {
m_warp[warp_id]->m_waiting_ldgsts = false;
}
}
}

updateSIMTStack(warp_id, *pipe_reg);
Expand Down Expand Up @@ -1796,12 +1845,50 @@ void ldst_unit::get_L1T_sub_stats(struct cache_sub_stats &css) const {
if (m_L1T) m_L1T->get_sub_stats(css);
}

// Add this function to unset depbar
void shader_core_ctx::unset_depbar(const warp_inst_t &inst) {
bool done_flag = true;
unsigned int end_group = m_warp[inst.warp_id()]->m_depbar_start_id == 0 ?
m_warp[inst.warp_id()]->m_ldgdepbar_buf.size() :
(m_warp[inst.warp_id()]->m_depbar_start_id - m_warp[inst.warp_id()]->m_depbar_group + 1);

if (inst.m_is_ldgsts) {
for (int i = 0; i < m_warp[inst.warp_id()]->m_ldgdepbar_buf.size(); i++) {
for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) {
if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc == inst.pc) {
// Handle the case that same pc results in multiple LDGSTS instructions
if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].get_addr(0) == inst.get_addr(0)) {
m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc = -1;
goto DoneWB;
}
}
}
}

DoneWB:
for (int i = 0; i < end_group; i++) {
for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) {
if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc != -1) {
done_flag = false;
goto UpdateDEPBAR;
}
}
}

UpdateDEPBAR:
if (done_flag) {
if (m_warp[inst.warp_id()]->m_waiting_ldgsts) {
m_warp[inst.warp_id()]->m_waiting_ldgsts = false;
}
}
}
}

void shader_core_ctx::warp_inst_complete(const warp_inst_t &inst) {
#if 0
printf("[warp_inst_complete] uid=%u core=%u warp=%u pc=%#x @ time=%llu \n",
inst.get_uid(), m_sid, inst.warp_id(), inst.pc, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle);
#endif

if (inst.op_pipe == SP__OP)
m_stats->m_num_sp_committed[m_sid]++;
else if (inst.op_pipe == SFU__OP)
Expand Down Expand Up @@ -1907,6 +1994,14 @@ mem_stage_stall_type ldst_unit::process_cache_access(
if (inst.is_load()) {
for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--;

// release LDGSTS
if (inst.m_is_ldgsts) {
m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)]--;
if (m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)] == 0) {
m_core->unset_depbar(inst);
}
}
}
if (!write_sent) delete mf;
} else if (status == RESERVATION_FAIL) {
Expand Down Expand Up @@ -2035,6 +2130,14 @@ void ldst_unit::L1_latency_queue_cycle() {
m_core->warp_inst_complete(mf_next->get_inst());
}
}

// release LDGSTS
if (mf_next->get_inst().m_is_ldgsts) {
m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)]--;
if (m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)] == 0) {
m_core->unset_depbar(mf_next->get_inst());
}
}
}

// For write hit in WB policy
Expand Down Expand Up @@ -2571,10 +2674,21 @@ void ldst_unit::writeback() {
insn_completed = true;
}
}
else if (m_next_wb.m_is_ldgsts) { // for LDGSTS instructions where no output register is used
m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)]--;
if (m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)] == 0) {
insn_completed = true;
}
break;
}
}
if (insn_completed) {
m_core->warp_inst_complete(m_next_wb);
if (m_next_wb.m_is_ldgsts) {
m_core->unset_depbar(m_next_wb);
}
}

m_next_wb.clear();
m_last_inst_gpu_sim_cycle = m_core->get_gpu()->gpu_sim_cycle;
m_last_inst_gpu_tot_sim_cycle = m_core->get_gpu()->gpu_tot_sim_cycle;
Expand Down Expand Up @@ -2796,6 +2910,14 @@ void ldst_unit::cycle() {
if (!pending_requests) {
m_core->warp_inst_complete(*m_dispatch_reg);
m_scoreboard->releaseRegisters(m_dispatch_reg);

// release LDGSTS
if (m_dispatch_reg->m_is_ldgsts) {
// m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)]--;
if (m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)] == 0) {
m_core->unset_depbar(*m_dispatch_reg);
}
}
}
m_core->dec_inst_in_pipeline(warp_id);
m_dispatch_reg->clear();
Expand Down Expand Up @@ -3930,6 +4052,8 @@ bool shd_warp_t::waiting() {
// the functional execution of the atomic when it hits DRAM can cause
// the wrong register to be read.
return true;
} else if (m_waiting_ldgsts) { // Waiting for LDGSTS to finish
return true;
}
return false;
}
Expand Down Expand Up @@ -4050,6 +4174,7 @@ int register_bank(int regnum, int wid, unsigned num_banks,

bool opndcoll_rfu_t::writeback(warp_inst_t &inst) {
assert(!inst.empty());

std::list<unsigned> regs = m_shader->get_regs_written(inst);
for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) {
int reg_num = inst.arch_reg.dst[op]; // this math needs to match that used
Expand Down
48 changes: 48 additions & 0 deletions src/gpgpu-sim/shader.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,20 @@ class shd_warp_t {
// Jin: cdp support
m_cdp_latency = 0;
m_cdp_dummy = false;

// Ni: Initialize ldgdepbar_id
m_ldgdepbar_id = 0;
m_depbar_start_id = 0;
m_depbar_group = 0;

// Ni: Set waiting to false
m_waiting_ldgsts = false;

// Ni: Clear m_ldgdepbar_buf
for (int i = 0; i < m_ldgdepbar_buf.size(); i++) {
m_ldgdepbar_buf[i].clear();
}
m_ldgdepbar_buf.clear();
}
void init(address_type start_pc, unsigned cta_id, unsigned wid,
const std::bitset<MAX_WARP_SIZE> &active,
Expand All @@ -140,6 +154,20 @@ class shd_warp_t {
// Jin: cdp support
m_cdp_latency = 0;
m_cdp_dummy = false;

// Ni: Initialize ldgdepbar_id
m_ldgdepbar_id = 0;
m_depbar_start_id = 0;
m_depbar_group = 0;

// Ni: Set waiting to false
m_waiting_ldgsts = false;

// Ni: Clear m_ldgdepbar_buf
for (int i = 0; i < m_ldgdepbar_buf.size(); i++) {
m_ldgdepbar_buf[i].clear();
}
m_ldgdepbar_buf.clear();
}

bool functional_done() const;
Expand Down Expand Up @@ -288,6 +316,14 @@ class shd_warp_t {
public:
unsigned int m_cdp_latency;
bool m_cdp_dummy;

// Ni: LDGDEPBAR barrier support
public:
unsigned int m_ldgdepbar_id; // LDGDEPBAR barrier ID
std::vector<std::vector<warp_inst_t>> m_ldgdepbar_buf; // LDGDEPBAR barrier buffer
unsigned int m_depbar_start_id;
unsigned int m_depbar_group;
bool m_waiting_ldgsts; // Ni: Whether the warp is waiting for the LDGSTS instrs to finish
};

inline unsigned hw_tid_from_wid(unsigned wid, unsigned warp_size, unsigned i) {
Expand Down Expand Up @@ -1314,6 +1350,15 @@ class ldst_unit : public pipelined_simd_unit {
const memory_config *mem_config, class shader_core_stats *stats,
unsigned sid, unsigned tpc);

// Structure that records the pending LDGSTS instructions. It is similar to
// m_pending_writes, but because LDGSTS does not have an output register to
// write to, a separate structure is needed.
/* A multi-level map: unsigned (warp_id) -> unsigned (pc) -> unsigned (addr) -> unsigned (count)
 */
std::map<unsigned /*warp_id*/,
std::map<unsigned /*pc*/,
std::map<unsigned /*addr*/, unsigned /*count*/>>>
m_pending_ldgsts;
// modifiers
virtual void issue(register_set &inst);
bool is_issue_partitioned() { return false; }
Expand Down Expand Up @@ -2069,6 +2114,9 @@ class shader_core_ctx : public core_t {
// modifiers
virtual void warp_exit(unsigned warp_id);

// Ni: Unset ldgdepbar
void unset_depbar(const warp_inst_t &inst);

// accessors
virtual bool warp_waiting_at_barrier(unsigned warp_id) const;
void get_pdom_stack_top_info(unsigned tid, unsigned *pc, unsigned *rpc) const;
Expand Down

0 comments on commit a0c12f5

Please sign in to comment.