From 52c56353fe1bd4bcd419d15e99ea61b2e3b40e89 Mon Sep 17 00:00:00 2001 From: calebmkim <55243755+calebmkim@users.noreply.github.com> Date: Tue, 10 Oct 2023 20:18:20 -0400 Subject: [PATCH] Improve systolic array (#1735) * reduce register usage * even fewer registers * tests hopefully pass * removed extraneous file --- .../systolic-lang/gen_array_component.py | 286 +++++++----------- runt.toml | 2 + tests/frontend/systolic/array-1.expect | 284 +++++------------ 3 files changed, 189 insertions(+), 383 deletions(-) diff --git a/frontends/systolic-lang/gen_array_component.py b/frontends/systolic-lang/gen_array_component.py index 5741c44ce8..ab2ca39b3f 100644 --- a/frontends/systolic-lang/gen_array_component.py +++ b/frontends/systolic-lang/gen_array_component.py @@ -16,8 +16,6 @@ NAME_SCHEME = { # Indexing into the memory "index name": "{prefix}_idx", - "index init": "{prefix}_idx_init", - "index update": "{prefix}_idx_update", # Move data from main memories "memory move": "{prefix}_move", "out write": "{pe}_out_write", @@ -62,19 +60,15 @@ def __str__(self): + str(self.const) ) - def build_group(self, comp: cb.ComponentBuilder) -> str: + def implement_add(self, comp: cb.ComponentBuilder) -> str: """ - Builds a static Calyx group (latency 1) that implements `self` - Note that we avoid creating duplicate groups. - Returns the group name + Implements the `CalyxAdd` by creating an adder that adds the two values """ - group_name = str(self) + "_group" - if comp.try_get_group(group_name) is None: + if comp.try_get_cell(str(self)) is None: add = comp.add(BITWIDTH, str(self)) - with comp.static_group(group_name, 1): + with comp.continuous: add.left = self.port add.right = self.const - return group_name def add_systolic_output_params(comp: cb.ComponentBuilder, row_num, addr_width): @@ -117,13 +111,15 @@ def instantiate_memory(comp: cb.ComponentBuilder, top_or_left, idx, size): this = comp.this() addr0_port = this.port(name + "_addr0") read_data_port = this.port(name + "_read_data") - # Instantiate the indexing register - idx = instantiate_indexor(comp, name, idx_width) + # Get the indexing value, taking into account offset + # For example, for l2, we want to access idx-2 (since we want to wait two + # cycles before we start feeding memories in) + idx_val = get_indexor(comp, idx_width, offset=idx) # Register to save the value from the memory. Defined by [[instantiate_pe]]. target = comp.get_cell(target_reg) group_name = NAME_SCHEME["memory move"].format(prefix=name) with comp.static_group(group_name, 1) as g: - g.asgn(addr0_port, idx.out) + g.asgn(addr0_port, idx_val.out) target.in_ = read_data_port target.write_en = 1 @@ -138,37 +134,23 @@ def instantiate_pe(comp: cb.ComponentBuilder, row: int, col: int): comp.reg(f"left_{row}_{col}", BITWIDTH) -def instantiate_indexor(comp: cb.ComponentBuilder, prefix, width) -> cb.CellBuilder: +def get_indexor(comp: cb.ComponentBuilder, width: int, offset: int) -> cb.CellBuilder: """ - Instantiate an indexor for accessing memory with name `prefix`. - Generates structure to initialize and update the indexor. - - The initializor starts sets the memories to their maximum value - because we expect all indices to be incremented once before - being used. - - Returns (cells, structure) + Gets (instantiates if needed) an indexor for accessing memory with offset + `offset` (as compared to the iteration idx) """ - name = NAME_SCHEME["index name"].format(prefix=prefix) - - reg = comp.reg(name, width) - add = comp.add(width, f"{prefix}_add") - - init_name = NAME_SCHEME["index init"].format(prefix=prefix) - with comp.static_group(init_name, 1): - # Initialize the indexor to 0 - reg.in_ = 0 - reg.write_en = 1 - - upd_name = NAME_SCHEME["index update"].format(prefix=prefix) - with comp.static_group(upd_name, 1): - # Increment the indexor. - add.left = 1 - add.right = reg.out - reg.in_ = add.out - reg.write_en = 1 - - return reg + if comp.try_get_cell(f"idx_minus_{offset}_res") is None: + idx = comp.get_cell("idx") + # idx has width bitwidth + sub = comp.sub(BITWIDTH, f"idx_minus_{offset}") + sub_res = comp.slice(f"idx_minus_{offset}_res", BITWIDTH, width) + with comp.continuous: + sub.left = idx.out + sub.right = offset + sub_res.in_ = sub.out + return sub_res + else: + return comp.get_cell(f"idx_minus_{offset}_res") def instantiate_data_move( @@ -223,10 +205,8 @@ def get_memory_updates(row, col): movers = [] if col == 0: movers.append(NAME_SCHEME["memory move"].format(prefix=f"l{row}")) - movers.append(NAME_SCHEME["index update"].format(prefix=f"l{row}")) if row == 0: movers.append(NAME_SCHEME["memory move"].format(prefix=f"t{col}")) - movers.append(NAME_SCHEME["index update"].format(prefix=f"t{col}")) mover_enables = [py_ast.Enable(name) for name in movers] return mover_enables @@ -266,23 +246,15 @@ def get_pe_invoke(r, c, mul_ready): ) -def init_runtime_vals(comp: cb.ComponentBuilder, depth_port, partial_iter_limit): +def init_iter_limit(comp: cb.ComponentBuilder, depth_port, partial_iter_limit): """ Builds group that instantiates the dynamic/runtime values for the systolic array: its depth and iteration limit/count (since its iteration limit depends on its depth). iteration limit = depth + partial_iter_limit """ - min_depth_4 = comp.reg("min_depth_4", BITWIDTH) - lt_depth_4 = comp.lt(BITWIDTH, "lt_depth_4") iter_limit = comp.reg("iter_limit", BITWIDTH) iter_limit_add = comp.add(BITWIDTH, "iter_limit_add") - with comp.static_group("init_min_depth", 1): - lt_depth_4.left = depth_port - lt_depth_4.right = 4 - min_depth_4.in_ = lt_depth_4.out @ depth_port - min_depth_4.in_ = ~lt_depth_4.out @ 4 - min_depth_4.write_en = 1 with comp.static_group("init_iter_limit", 1): iter_limit_add.left = partial_iter_limit iter_limit_add.right = depth_port @@ -290,7 +262,7 @@ def init_runtime_vals(comp: cb.ComponentBuilder, depth_port, partial_iter_limit) iter_limit.write_en = 1 -def instantiate_while_groups(comp: cb.ComponentBuilder): +def instantiate_idx_groups(comp: cb.ComponentBuilder): """ Builds groups that instantiate idx to 0 and increment idx. Also builds groups that set cond_reg to 1 (runs before the while loop) @@ -298,7 +270,6 @@ def instantiate_while_groups(comp: cb.ComponentBuilder): """ idx = comp.reg("idx", BITWIDTH) add = comp.add(BITWIDTH, "idx_add") - cond_reg = comp.reg("cond_reg", 1) iter_limit = comp.get_cell("iter_limit") lt_iter_limit = comp.lt(BITWIDTH, "lt_iter_limit") @@ -310,14 +281,9 @@ def instantiate_while_groups(comp: cb.ComponentBuilder): add.right = 1 idx.in_ = add.out idx.write_en = 1 - with comp.static_group("init_cond_reg", 1): - cond_reg.in_ = 1 - cond_reg.write_en = 1 - with comp.static_group("write_cond_reg", 1): - lt_iter_limit.left = add.out + with comp.continuous: + lt_iter_limit.left = idx.out lt_iter_limit.right = iter_limit.out - cond_reg.in_ = lt_iter_limit.out - cond_reg.write_en = 1 def instantiate_calyx_adds(comp, nec_ranges) -> list: @@ -326,85 +292,67 @@ def instantiate_calyx_adds(comp, nec_ranges) -> list: specified add. Returns a list of all the group names that we created. """ - calyx_add_groups = set() for lo, hi in nec_ranges: if type(lo) == CalyxAdd: - group_name = lo.build_group(comp) - calyx_add_groups.add(group_name) + lo.implement_add(comp) if type(hi) == CalyxAdd: - group_name = hi.build_group(comp) - calyx_add_groups.add(group_name) - group_list = list(calyx_add_groups) - # sort for testing purposes - group_list.sort() - return group_list + hi.implement_add(comp) -def instantiate_idx_between(comp: cb.ComponentBuilder, lo, hi) -> list: +def check_idx_lower_bound(comp: cb.ComponentBuilder, lo): + """ + Creates assignments to test if idx >= lo """ - Instantiates a static group and register called "idx_between_{lo}_{hi}_reg/group" - that should output whether idx is between [lo, hi). That is, whether lo <= idx < hi. + if type(lo) == CalyxAdd: + lo_value = comp.get_cell(str(lo)).port("out") + else: + lo_value = lo + idx = comp.get_cell("idx") + index_ge = f"index_ge_{lo}" + ge = comp.ge(BITWIDTH, index_ge) + with comp.continuous: + ge.left = idx.out + ge.right = lo_value + - Note: If you're trying to understand why this works, we are checking `idx_add` which - is one higher than idx. This offsets the cycle it takes to update the register. +def check_idx_upper_bound(comp: cb.ComponentBuilder, hi): + """ + Creates assignments to test if idx < hi """ if type(hi) == CalyxAdd: hi_value = comp.get_cell(str(hi)).port("out") else: hi_value = hi - if type(lo) == CalyxAdd: - lo_value = comp.get_cell(str(lo)).port("out") - else: - lo_value = lo - reg_str = NAME_SCHEME["idx between reg"].format(lo=lo, hi=hi) - group_str = NAME_SCHEME["idx between group"].format(lo=lo, hi=hi) + idx = comp.get_cell("idx") index_lt = f"index_lt_{hi}" - index_ge = f"index_ge_{lo}" - reg = comp.reg(reg_str, 1) - idx_add = comp.get_cell("idx_add") - # If no upper bound, then only need to check reg >= lo - lt = ( - comp.get_cell(index_lt) - if comp.try_get_cell(index_lt) is not None - else comp.lt(BITWIDTH, index_lt) - ) + lt = comp.lt(BITWIDTH, index_lt) + with comp.continuous: + lt.left = idx.out + lt.right = hi_value + + +def check_idx_between(comp: cb.ComponentBuilder, lo, hi) -> list: + """ + Creates assignments to check whether idx is between [lo, hi). + That is, whether lo <= idx < hi. + """ + # This is the name of the combinational cell that checks the condition + idx_between_str = f"idx_between_{lo}_{hi}_comb" + lt = comp.get_cell(f"index_lt_{hi}") # if lo == 0, then only need to check if reg < hi if type(lo) == int and lo == 0: - with comp.static_group(group_str, 1): - lt.left = idx_add.out - lt.right = hi_value - reg.in_ = lt.out - reg.write_en = 1 + # In this case, the `wire` cell is the cell checking the condition. + wire = comp.wire(idx_between_str, 1) + with comp.continuous: + wire.in_ = lt.out # need to check if reg >= lo and reg < hi else: - ge = ( - comp.get_cell(index_ge) - if comp.try_get_cell(index_ge) is not None - else comp.ge(BITWIDTH, index_ge) - ) - and_ = comp.and_(1, f"idx_between_{lo}_{hi}_comb") - with comp.static_group(group_str, 1): - ge.left = idx_add.out - ge.right = lo_value - lt.left = idx_add.out - lt.right = hi_value - and_.left = ge.out + ge = comp.get_cell(f"index_ge_{lo}") + # In this case, the `and` cell is the cell checking the condition. + and_ = comp.and_(1, idx_between_str) + with comp.continuous: and_.right = lt.out - reg.in_ = and_.out - reg.write_en = 1 - - -def init_idx_between(comp: cb.ComponentBuilder, lo, hi): - """ - Builds a group to set initial state for register idx_between_{lo}_{hi}_reg. - """ - # if lo == 0, then the idx will initially be in between the interval, so - # need to set idx_between to high - start_hi = 1 if lo == 0 else 0 - idx_between = comp.get_cell(NAME_SCHEME["idx between reg"].format(lo=lo, hi=hi)) - with comp.static_group(NAME_SCHEME["idx between init"].format(lo=lo, hi=hi), 1): - idx_between.in_ = start_hi - idx_between.write_en = 1 + and_.left = ge.out def accum_nec_ranges(nec_ranges, schedule): @@ -450,19 +398,17 @@ def gen_schedules( """ left_length, top_length = config.left_length, config.top_length depth_port = comp.this().depth - min_depth_4_port = comp.get_cell("min_depth_4").port("out") schedules = {} update_sched = np.zeros((left_length, top_length), dtype=object) pe_fill_sched = np.zeros((left_length, top_length), dtype=object) pe_accum_sched = np.zeros((left_length, top_length), dtype=object) pe_move_sched = np.zeros((left_length, top_length), dtype=object) - # will only actually use one of the following two schedules pe_write_sched = np.zeros((left_length, top_length), dtype=object) for row in range(0, left_length): for col in range(0, top_length): pos = row + col update_sched[row][col] = (pos, CalyxAdd(depth_port, pos)) - pe_fill_sched[row][col] = (pos + 1, CalyxAdd(min_depth_4_port, pos + 1)) + pe_fill_sched[row][col] = (pos + 1, pos + 5) pe_accum_sched[row][col] = (pos + 5, CalyxAdd(depth_port, pos + 5)) pe_move_sched[row][col] = (pos + 1, CalyxAdd(depth_port, pos + 1)) pe_write_sched[row][col] = ( @@ -486,7 +432,7 @@ def execute_if_between(comp: cb.ComponentBuilder, start, end, body): """ if not body: return [] - if_cell = comp.get_cell(f"idx_between_{start}_{end}_reg") + if_cell = comp.get_cell(f"idx_between_{start}_{end}_comb") return [ cb.static_if( if_cell.out, @@ -496,47 +442,32 @@ def execute_if_between(comp: cb.ComponentBuilder, start, end, body): def generate_control( - comp: cb.ComponentBuilder, - config: SystolicConfiguration, - schedules, - calyx_add_groups, - nec_ranges, + comp: cb.ComponentBuilder, config: SystolicConfiguration, schedules ): """ Logically, control performs the following actions: 1. Initialize all the memory indexors and idx and idx_between registers at the start - 2. Build a static repeat with a one cycle body that: + 2. Build a static loop with a one cycle body that: a. Updates memory indices if needed/feeds memory into systolic array. b. Invokes the PEs correctly (mul_ready should only be hi if the multiplier's values are ready). c. Move the data needed by each PE - 3. Writes the PE values into external memory + 3. Writes the PE values into output ports of the component when necessary """ control = [] top_length, left_length = config.top_length, config.left_length # Initialize all memories. - init_indices: list[py_ast.Control] = [ - py_ast.Enable(NAME_SCHEME["index init"].format(prefix=f"t{idx}")) - for idx in range(top_length) - ] - init_indices.extend( - [ - py_ast.Enable(NAME_SCHEME["index init"].format(prefix=f"l{idx}")) - for idx in range(left_length) - ] - + [ - py_ast.Enable("init_idx"), - py_ast.Enable("init_min_depth"), - py_ast.Enable("init_cond_reg"), - py_ast.Enable("init_iter_limit"), - ] - + [py_ast.Enable(f"init_idx_between_{lo}_{hi}") for (lo, hi) in nec_ranges] + control.append( + py_ast.StaticParComp( + [ + py_ast.Enable("init_idx"), + py_ast.Enable("init_iter_limit"), + ] + ) ) - control.append(py_ast.StaticParComp(init_indices)) - # source_pos metadata init init_tag = 0 source_map = {} @@ -549,8 +480,7 @@ def counter(): # end source pos init - control_stmts = [] - incr_stmts = [py_ast.Enable("incr_idx"), py_ast.Enable("write_cond_reg")] + while_body_stmts = [py_ast.Enable("incr_idx")] for r in range(left_length): for c in range(top_length): # build 4 if stmts for the 4 schedules that we need to account for @@ -587,7 +517,7 @@ def counter(): pe_control = ( input_mem_updates + pe_fills + pe_moves + pe_accums + output_writes ) - control_stmts.append(py_ast.StaticParComp(pe_control)) + while_body_stmts.append(py_ast.StaticParComp(pe_control)) # providing metadata tag = counter() source_map[ @@ -599,19 +529,10 @@ def counter(): writing: [{schedules['write_sched'][r][c][0]} \ {schedules['write_sched'][r][c][1]})" - # handles the coordination so that `idx_if_between` statements work correctly ` - for start, end in nec_ranges: - # build the control stmts that assign correct values to - # idx_between_{start}_{end}_reg, which is what the if stmts above^ rely on - incr_stmts.append(py_ast.Enable(f"idx_between_{start}_{end}_group")) - for calyx_add_group in calyx_add_groups: - incr_stmts.append(py_ast.Enable(calyx_add_group)) - while_ctrl = [py_ast.StaticParComp(control_stmts), py_ast.StaticParComp(incr_stmts)] - while_body = py_ast.StaticParComp(while_ctrl) + while_body = py_ast.StaticParComp(while_body_stmts) # build the while loop with condition cond_reg. - # num repeats = (top_length - 1) + (left_length - 1) + (top_depth - 1) + 5 + 1 - cond_reg_port = comp.get_cell("cond_reg").port("out") + cond_reg_port = comp.get_cell("lt_iter_limit").port("out") while_loop = cb.while_(cond_reg_port, while_body) control.append(while_loop) @@ -629,7 +550,8 @@ def create_systolic_array(prog: cb.Builder, config: SystolicConfiguration): pe(prog) computational_unit = prog.component(SYSTOLIC_ARRAY_COMP) depth_port = computational_unit.input("depth", BITWIDTH) - init_runtime_vals( + # initialize the iteration limit to top_length + left_length + depth + 4 + init_iter_limit( computational_unit, depth_port, config.top_length + config.left_length + 4 ) @@ -637,7 +559,20 @@ def create_systolic_array(prog: cb.Builder, config: SystolicConfiguration): nec_ranges = set() for sched in schedules.values(): accum_nec_ranges(nec_ranges, sched) - calyx_add_groups = instantiate_calyx_adds(computational_unit, nec_ranges) + instantiate_calyx_adds(computational_unit, nec_ranges) + + # instantiate groups that handles the idx variables + instantiate_idx_groups(computational_unit) + list1, list2 = zip(*nec_ranges) + nec_ranges_beg = set(list1) + nec_ranges_end = set(list2) + for val in nec_ranges_beg: + check_idx_lower_bound(computational_unit, val) + for val in nec_ranges_end: + check_idx_upper_bound(computational_unit, val) + for start, end in nec_ranges: + # create the assignments that help determine if idx is in between + check_idx_between(computational_unit, start, end) for row in range(config.left_length): for col in range(config.top_length): @@ -673,20 +608,7 @@ def create_systolic_array(prog: cb.Builder, config: SystolicConfiguration): # `computational_unit`'s output ports instantiate_output_move(computational_unit, row, col) - # instantiate groups that handle cond_reg and idx variables - instantiate_while_groups(computational_unit) - for start, end in nec_ranges: - # create the groups that create for idx_in_between registers - instantiate_idx_between(computational_unit, start, end) - init_idx_between(computational_unit, start, end) - # Generate the control and set the source map - control, source_map = generate_control( - computational_unit, - config, - schedules, - calyx_add_groups, - nec_ranges, - ) + control, source_map = generate_control(computational_unit, config, schedules) computational_unit.control = control prog.program.meta = source_map diff --git a/runt.toml b/runt.toml index 258ccb7602..7ba72defcb 100644 --- a/runt.toml +++ b/runt.toml @@ -257,6 +257,7 @@ fud e --from systolic --to jq \ --through vcd_json \ -s verilog.data {}.data \ -s calyx.exec './target/debug/calyx' \ + -s calyx.flags "-d well-formed" \ -s jq.file {}.jq \ {} -q """ @@ -274,6 +275,7 @@ fud e --from systolic --to dat \ --through verilog \ -s verilog.data {}.data \ -s calyx.exec './target/debug/calyx' \ + -s calyx.flags "-d well-formed" \ {} -q """ diff --git a/tests/frontend/systolic/array-1.expect b/tests/frontend/systolic/array-1.expect index 9bc38f15d9..1c7b2f7036 100644 --- a/tests/frontend/systolic/array-1.expect +++ b/tests/frontend/systolic/array-1.expect @@ -29,105 +29,99 @@ static<1> component mac_pe(top: 32, left: 32, mul_ready: 1) -> (out: 32) { } component systolic_array_comp(depth: 32, t0_read_data: 32, l0_read_data: 32) -> (t0_addr0: 2, l0_addr0: 2, r0_valid: 1, r0_value: 32, r0_idx: 1) { cells { - min_depth_4 = std_reg(32); - lt_depth_4 = std_lt(32); iter_limit = std_reg(32); iter_limit_add = std_add(32); depth_plus_5 = std_add(32); depth_plus_0 = std_add(32); depth_plus_1 = std_add(32); - min_depth_4_plus_1 = std_add(32); depth_plus_6 = std_add(32); - pe_0_0 = mac_pe(); - top_0_0 = std_reg(32); - left_0_0 = std_reg(32); - t0_idx = std_reg(2); - t0_add = std_add(2); - l0_idx = std_reg(2); - l0_add = std_add(2); idx = std_reg(32); idx_add = std_add(32); - cond_reg = std_reg(1); lt_iter_limit = std_lt(32); - idx_between_5_depth_plus_5_reg = std_reg(1); - index_lt_depth_plus_5 = std_lt(32); + index_ge_0 = std_ge(32); + index_ge_1 = std_ge(32); + index_ge_depth_plus_5 = std_ge(32); index_ge_5 = std_ge(32); - idx_between_5_depth_plus_5_comb = std_and(1); - idx_between_0_depth_plus_0_reg = std_reg(1); index_lt_depth_plus_0 = std_lt(32); - idx_between_1_depth_plus_1_reg = std_reg(1); index_lt_depth_plus_1 = std_lt(32); - index_ge_1 = std_ge(32); - idx_between_1_depth_plus_1_comb = std_and(1); - idx_between_1_min_depth_4_plus_1_reg = std_reg(1); - index_lt_min_depth_4_plus_1 = std_lt(32); - idx_between_1_min_depth_4_plus_1_comb = std_and(1); - idx_between_depth_plus_5_depth_plus_6_reg = std_reg(1); + index_lt_5 = std_lt(32); + index_lt_depth_plus_5 = std_lt(32); index_lt_depth_plus_6 = std_lt(32); - index_ge_depth_plus_5 = std_ge(32); + idx_between_5_depth_plus_5_comb = std_and(1); + idx_between_1_5_comb = std_and(1); + idx_between_0_depth_plus_0_comb = std_wire(1); + idx_between_1_depth_plus_1_comb = std_and(1); idx_between_depth_plus_5_depth_plus_6_comb = std_and(1); + pe_0_0 = mac_pe(); + top_0_0 = std_reg(32); + left_0_0 = std_reg(32); + idx_minus_0 = std_sub(32); + idx_minus_0_res = std_slice(32, 2); } wires { - static<1> group init_min_depth { - lt_depth_4.left = depth; - lt_depth_4.right = 32'd4; - min_depth_4.in = lt_depth_4.out ? depth; - min_depth_4.in = !lt_depth_4.out ? 32'd4; - min_depth_4.write_en = 1'd1; - } static<1> group init_iter_limit { iter_limit_add.left = 32'd6; iter_limit_add.right = depth; iter_limit.in = iter_limit_add.out; iter_limit.write_en = 1'd1; } - static<1> group depth_plus_5_group { - depth_plus_5.left = depth; - depth_plus_5.right = 32'd5; - } - static<1> group depth_plus_0_group { - depth_plus_0.left = depth; - depth_plus_0.right = 32'd0; - } - static<1> group depth_plus_1_group { - depth_plus_1.left = depth; - depth_plus_1.right = 32'd1; - } - static<1> group min_depth_4_plus_1_group { - min_depth_4_plus_1.left = min_depth_4.out; - min_depth_4_plus_1.right = 32'd1; - } - static<1> group depth_plus_6_group { - depth_plus_6.left = depth; - depth_plus_6.right = 32'd6; - } - static<1> group t0_idx_init { - t0_idx.in = 2'd0; - t0_idx.write_en = 1'd1; + depth_plus_5.left = depth; + depth_plus_5.right = 32'd5; + depth_plus_0.left = depth; + depth_plus_0.right = 32'd0; + depth_plus_1.left = depth; + depth_plus_1.right = 32'd1; + depth_plus_6.left = depth; + depth_plus_6.right = 32'd6; + static<1> group init_idx { + idx.in = 32'd0; + idx.write_en = 1'd1; } - static<1> group t0_idx_update { - t0_add.left = 2'd1; - t0_add.right = t0_idx.out; - t0_idx.in = t0_add.out; - t0_idx.write_en = 1'd1; + static<1> group incr_idx { + idx_add.left = idx.out; + idx_add.right = 32'd1; + idx.in = idx_add.out; + idx.write_en = 1'd1; } + lt_iter_limit.left = idx.out; + lt_iter_limit.right = iter_limit.out; + index_ge_0.left = idx.out; + index_ge_0.right = 32'd0; + index_ge_1.left = idx.out; + index_ge_1.right = 32'd1; + index_ge_depth_plus_5.left = idx.out; + index_ge_depth_plus_5.right = depth_plus_5.out; + index_ge_5.left = idx.out; + index_ge_5.right = 32'd5; + index_lt_depth_plus_0.left = idx.out; + index_lt_depth_plus_0.right = depth_plus_0.out; + index_lt_depth_plus_1.left = idx.out; + index_lt_depth_plus_1.right = depth_plus_1.out; + index_lt_5.left = idx.out; + index_lt_5.right = 32'd5; + index_lt_depth_plus_5.left = idx.out; + index_lt_depth_plus_5.right = depth_plus_5.out; + index_lt_depth_plus_6.left = idx.out; + index_lt_depth_plus_6.right = depth_plus_6.out; + idx_between_5_depth_plus_5_comb.right = index_lt_depth_plus_5.out; + idx_between_5_depth_plus_5_comb.left = index_ge_5.out; + idx_between_1_5_comb.right = index_lt_5.out; + idx_between_1_5_comb.left = index_ge_1.out; + idx_between_0_depth_plus_0_comb.in = index_lt_depth_plus_0.out; + idx_between_1_depth_plus_1_comb.right = index_lt_depth_plus_1.out; + idx_between_1_depth_plus_1_comb.left = index_ge_1.out; + idx_between_depth_plus_5_depth_plus_6_comb.right = index_lt_depth_plus_6.out; + idx_between_depth_plus_5_depth_plus_6_comb.left = index_ge_depth_plus_5.out; + idx_minus_0.left = idx.out; + idx_minus_0.right = 32'd0; + idx_minus_0_res.in = idx_minus_0.out; static<1> group t0_move { - t0_addr0 = t0_idx.out; + t0_addr0 = idx_minus_0_res.out; top_0_0.in = t0_read_data; top_0_0.write_en = 1'd1; } - static<1> group l0_idx_init { - l0_idx.in = 2'd0; - l0_idx.write_en = 1'd1; - } - static<1> group l0_idx_update { - l0_add.left = 2'd1; - l0_add.right = l0_idx.out; - l0_idx.in = l0_add.out; - l0_idx.write_en = 1'd1; - } static<1> group l0_move { - l0_addr0 = l0_idx.out; + l0_addr0 = idx_minus_0_res.out; left_0_0.in = l0_read_data; left_0_0.write_en = 1'd1; } @@ -136,151 +130,39 @@ component systolic_array_comp(depth: 32, t0_read_data: 32, l0_read_data: 32) -> r0_value = pe_0_0.out; r0_idx = 1'd0; } - static<1> group init_idx { - idx.in = 32'd0; - idx.write_en = 1'd1; - } - static<1> group incr_idx { - idx_add.left = idx.out; - idx_add.right = 32'd1; - idx.in = idx_add.out; - idx.write_en = 1'd1; - } - static<1> group init_cond_reg { - cond_reg.in = 1'd1; - cond_reg.write_en = 1'd1; - } - static<1> group write_cond_reg { - lt_iter_limit.left = idx_add.out; - lt_iter_limit.right = iter_limit.out; - cond_reg.in = lt_iter_limit.out; - cond_reg.write_en = 1'd1; - } - static<1> group idx_between_5_depth_plus_5_group { - index_ge_5.left = idx_add.out; - index_ge_5.right = 32'd5; - index_lt_depth_plus_5.left = idx_add.out; - index_lt_depth_plus_5.right = depth_plus_5.out; - idx_between_5_depth_plus_5_comb.left = index_ge_5.out; - idx_between_5_depth_plus_5_comb.right = index_lt_depth_plus_5.out; - idx_between_5_depth_plus_5_reg.in = idx_between_5_depth_plus_5_comb.out; - idx_between_5_depth_plus_5_reg.write_en = 1'd1; - } - static<1> group init_idx_between_5_depth_plus_5 { - idx_between_5_depth_plus_5_reg.in = 1'd0; - idx_between_5_depth_plus_5_reg.write_en = 1'd1; - } - static<1> group idx_between_0_depth_plus_0_group { - index_lt_depth_plus_0.left = idx_add.out; - index_lt_depth_plus_0.right = depth_plus_0.out; - idx_between_0_depth_plus_0_reg.in = index_lt_depth_plus_0.out; - idx_between_0_depth_plus_0_reg.write_en = 1'd1; - } - static<1> group init_idx_between_0_depth_plus_0 { - idx_between_0_depth_plus_0_reg.in = 1'd1; - idx_between_0_depth_plus_0_reg.write_en = 1'd1; - } - static<1> group idx_between_1_depth_plus_1_group { - index_ge_1.left = idx_add.out; - index_ge_1.right = 32'd1; - index_lt_depth_plus_1.left = idx_add.out; - index_lt_depth_plus_1.right = depth_plus_1.out; - idx_between_1_depth_plus_1_comb.left = index_ge_1.out; - idx_between_1_depth_plus_1_comb.right = index_lt_depth_plus_1.out; - idx_between_1_depth_plus_1_reg.in = idx_between_1_depth_plus_1_comb.out; - idx_between_1_depth_plus_1_reg.write_en = 1'd1; - } - static<1> group init_idx_between_1_depth_plus_1 { - idx_between_1_depth_plus_1_reg.in = 1'd0; - idx_between_1_depth_plus_1_reg.write_en = 1'd1; - } - static<1> group idx_between_1_min_depth_4_plus_1_group { - index_ge_1.left = idx_add.out; - index_ge_1.right = 32'd1; - index_lt_min_depth_4_plus_1.left = idx_add.out; - index_lt_min_depth_4_plus_1.right = min_depth_4_plus_1.out; - idx_between_1_min_depth_4_plus_1_comb.left = index_ge_1.out; - idx_between_1_min_depth_4_plus_1_comb.right = index_lt_min_depth_4_plus_1.out; - idx_between_1_min_depth_4_plus_1_reg.in = idx_between_1_min_depth_4_plus_1_comb.out; - idx_between_1_min_depth_4_plus_1_reg.write_en = 1'd1; - } - static<1> group init_idx_between_1_min_depth_4_plus_1 { - idx_between_1_min_depth_4_plus_1_reg.in = 1'd0; - idx_between_1_min_depth_4_plus_1_reg.write_en = 1'd1; - } - static<1> group idx_between_depth_plus_5_depth_plus_6_group { - index_ge_depth_plus_5.left = idx_add.out; - index_ge_depth_plus_5.right = depth_plus_5.out; - index_lt_depth_plus_6.left = idx_add.out; - index_lt_depth_plus_6.right = depth_plus_6.out; - idx_between_depth_plus_5_depth_plus_6_comb.left = index_ge_depth_plus_5.out; - idx_between_depth_plus_5_depth_plus_6_comb.right = index_lt_depth_plus_6.out; - idx_between_depth_plus_5_depth_plus_6_reg.in = idx_between_depth_plus_5_depth_plus_6_comb.out; - idx_between_depth_plus_5_depth_plus_6_reg.write_en = 1'd1; - } - static<1> group init_idx_between_depth_plus_5_depth_plus_6 { - idx_between_depth_plus_5_depth_plus_6_reg.in = 1'd0; - idx_between_depth_plus_5_depth_plus_6_reg.write_en = 1'd1; - } } control { seq { static par { - t0_idx_init; - l0_idx_init; init_idx; - init_min_depth; - init_cond_reg; init_iter_limit; - init_idx_between_5_depth_plus_5; - init_idx_between_0_depth_plus_0; - init_idx_between_1_depth_plus_1; - init_idx_between_1_min_depth_4_plus_1; - init_idx_between_depth_plus_5_depth_plus_6; } - while cond_reg.out { + while lt_iter_limit.out { static par { + incr_idx; static par { - static par { - static if idx_between_0_depth_plus_0_reg.out { - static par { - l0_move; - l0_idx_update; - t0_move; - t0_idx_update; - } + static if idx_between_0_depth_plus_0_comb.out { + static par { + l0_move; + t0_move; } - static if idx_between_1_min_depth_4_plus_1_reg.out { - static par { - static invoke pe_0_0(top=top_0_0.out, left=left_0_0.out, mul_ready=1'd0)(); - } + } + static if idx_between_1_5_comb.out { + static par { + static invoke pe_0_0(top=top_0_0.out, left=left_0_0.out, mul_ready=1'd0)(); } - static if idx_between_5_depth_plus_5_reg.out { - static par { - static invoke pe_0_0(top=top_0_0.out, left=left_0_0.out, mul_ready=1'd1)(); - } + } + static if idx_between_5_depth_plus_5_comb.out { + static par { + static invoke pe_0_0(top=top_0_0.out, left=left_0_0.out, mul_ready=1'd1)(); } - static if idx_between_depth_plus_5_depth_plus_6_reg.out { - static par { - pe_0_0_out_write; - } + } + static if idx_between_depth_plus_5_depth_plus_6_comb.out { + static par { + pe_0_0_out_write; } } } - static par { - incr_idx; - write_cond_reg; - idx_between_5_depth_plus_5_group; - idx_between_0_depth_plus_0_group; - idx_between_1_depth_plus_1_group; - idx_between_1_min_depth_4_plus_1_group; - idx_between_depth_plus_5_depth_plus_6_group; - depth_plus_0_group; - depth_plus_1_group; - depth_plus_5_group; - depth_plus_6_group; - min_depth_4_plus_1_group; - } } } } @@ -346,5 +228,5 @@ component main() -> () { } } metadata #{ -0: pe_0_0 filling: [1,min_depth_4_plus_1), accumulating: [5 depth_plus_5), writing: [depth_plus_5 depth_plus_6) +0: pe_0_0 filling: [1,5), accumulating: [5 depth_plus_5), writing: [depth_plus_5 depth_plus_6) }#