From be8bb0ba90ef5130e6f74e858bb53311371b42a7 Mon Sep 17 00:00:00 2001
From: gullahmed1
Date: Wed, 8 Nov 2023 15:32:54 +0500
Subject: [PATCH] Add bitmanipulation support

---
 Adds an opt-in ZBITMANIP parameter (default 0), passed from cv32e40p_top down to
 the decoder and the ALU, that enables decoding and execution of the RISC-V Zba,
 Zbb, Zbc and Zbs instructions.

 Review fixes folded into this revision:
  * alu: clmulh used a streaming concatenation directly as a shift operand, which
    IEEE 1800 does not allow without a cast. The bit-reversed product now goes
    through the dedicated net clmul_result_rev, shared by clmulr and clmulh.
  * decoder: zext.h is accepted only when rs2 == x0; previously every rs2 value
    was decoded as zext.h.
  * decoder: the final illegal-instruction merge is set-only and treats any
    non-zero ZBITMANIP as enabled. It used to be a unique case on the literal
    values 1'b0/1'b1 that also overwrote illegal_insn_o unconditionally.

 Hunk headers and the diffstat were recounted for the added lines. The post-image
 blob ids on the "index" lines of cv32e40p_alu.sv and cv32e40p_decoder.sv are
 stale. Whitespace of context lines was re-indented by hand, so use
 "git apply --ignore-whitespace" if an exact match fails.

 rtl/cv32e40p_alu.sv         | 160 +++++++++++++++++++++++++++++++++++-
 rtl/cv32e40p_core.sv        |   9 ++-
 rtl/cv32e40p_decoder.sv     |  91 ++++++++++++++++++--
 rtl/cv32e40p_ex_stage.sv    |   7 +-
 rtl/cv32e40p_id_stage.sv    |   6 +-
 rtl/cv32e40p_top.sv         |   6 +-
 rtl/include/cv32e40p_pkg.sv |  42 +++++++++-
 7 files changed, 303 insertions(+), 18 deletions(-)

diff --git a/rtl/cv32e40p_alu.sv b/rtl/cv32e40p_alu.sv
index aa900a787..3ba55f44e 100644
--- a/rtl/cv32e40p_alu.sv
+++ b/rtl/cv32e40p_alu.sv
@@ -27,7 +27,9 @@
 
 module cv32e40p_alu
   import cv32e40p_pkg::*;
-(
+#(
+    parameter ZBITMANIP = 0
+) (
     input logic               clk,
     input logic               rst_n,
     input logic               enable_i,
@@ -805,7 +807,12 @@ module cv32e40p_alu
   logic [31:0] bmask_first, bmask_inv;
   logic [31:0] bextins_and;
   logic [31:0] bextins_result, bclr_result, bset_result;
-
+  logic [31:0] result_bitmanip;  // Store result of bitmanip operations
+  logic [31:0] clmul_result;  // Store carry-less multiplication result
+  logic [31:0] clmul_result_rev;  // Bit-reversed clmul_result (feeds clmulr/clmulh)
+  logic [ 5:0] cpop;  // Store no of set bits in operand a
+  logic [ 4:0] ff_one_result;  // Return the position of first one
+  logic        ff_one_all_zeros;  // Return true if all input is zero
 
   // construct bit mask for insert/extract/bclr/bset
   // bmask looks like this 00..0011..1100..00
@@ -823,6 +830,140 @@ module cv32e40p_alu
   assign bclr_result = operand_a_i & bmask_inv;
   assign bset_result = operand_a_i | bmask;
 
+  if (ZBITMANIP) begin : gen_zbc_zbb_results
+
+    // Temporary registers
+    logic [31:0] ff_one_in;
+    logic [31:0][31:0] clmul_temp0;
+    logic [7:0][31:0] clmul_temp1;
+    logic [1:0][31:0] clmul_temp2;
+    logic [31:0] operand_b_rev;
+
+    // Decide the input of cv32e40p_ff_one module based on operator_i
+    assign ff_one_in = (operator_i == ALU_B_CTZ) ? operand_a_i : operand_a_rev;
+
+    // Instantiate cv32e40p_popcnt module, it will return 1's count
+    cv32e40p_popcnt popcnt_i (
+        .in_i    (operand_a_i),
+        .result_o(cpop)
+    );
+
+    // Instantiate Find First One Module
+    cv32e40p_ff_one ff_one_i (
+        .in_i       (ff_one_in),
+        .first_one_o(ff_one_result),
+        .no_ones_o  (ff_one_all_zeros)
+    );
+
+    // Reverse operand_b_i using streaming operator
+    assign operand_b_rev = {<<{operand_b_i}};
+
+    // Create 32 rows like traditional multiplication
+    for (genvar i = 0; i < 32; i++) begin : gen_32_rows
+      assign clmul_temp0[i] = (operator_i == ALU_B_CLMUL) ?
+                              operand_b_i[i] ? operand_a_i << i : '0 :
+                              operand_b_rev[i] ? operand_a_rev << i : '0;
+    end
+
+    // Xor 4 rows 8 times
+    for (genvar i = 0; i < 8; i++) begin : gen_xor_result_8_rows
+      assign clmul_temp1[i] = clmul_temp0[i<<2] ^ clmul_temp0[(i<<2)+1] ^
+                              clmul_temp0[(i<<2)+2] ^ clmul_temp0[(i<<2)+3];
+    end
+
+    // XOR 4 rows twice
+    for (genvar i = 0; i < 2; i++) begin : gen_xor_result_2_rows
+      assign clmul_temp2[i] = clmul_temp1[i<<2] ^ clmul_temp1[(i<<2)+1] ^
+                              clmul_temp1[(i<<2)+2] ^ clmul_temp1[(i<<2)+3];
+    end
+
+    // Xor on last 2 rows
+    assign clmul_result = clmul_temp2[0] ^ clmul_temp2[1];
+
+    // Bit-reverse the product on a net: a streaming concatenation is only legal as an assignment source
+    assign clmul_result_rev = {<<{clmul_result}};
+  end
+
+  always_comb begin
+    if (ZBITMANIP) begin
+      unique case (operator_i)
+
+        // Zba: Address generation Instructions , Shift left rs1 by 1/2/3 + rs2
+        ALU_B_SH1ADD: result_bitmanip = {operand_a_i[30:0], 1'b0} + operand_b_i;
+        ALU_B_SH2ADD: result_bitmanip = {operand_a_i[29:0], 2'b0} + operand_b_i;
+        ALU_B_SH3ADD: result_bitmanip = {operand_a_i[28:0], 3'b0} + operand_b_i;
+
+        // Zbb: Basic Bit-Manipulation
+        // Logical with Negate
+        ALU_B_ANDN: result_bitmanip = operand_a_i & operand_b_neg;
+        ALU_B_ORN:  result_bitmanip = operand_a_i | operand_b_neg;
+        ALU_B_XNOR: result_bitmanip = ~(operand_a_i ^ operand_b_i);
+
+        // Count leading/trailing zero bits
+        ALU_B_CLZ: result_bitmanip = ff_one_all_zeros ? {26'b0, 6'b100000} : {26'b0, ff_one_result};
+        ALU_B_CTZ: result_bitmanip = ff_one_all_zeros ? {26'b0, 6'b100000} : {26'b0, ff_one_result};
+
+        // Count set bits
+        ALU_B_CPOP: result_bitmanip = cpop;
+
+        // Integer Minimum/Maximum
+        ALU_B_MAX:
+        result_bitmanip = ($signed(operand_a_i) < $signed(operand_b_i)) ? operand_b_i : operand_a_i;
+        ALU_B_MAXU: result_bitmanip = (operand_a_i < operand_b_i) ? operand_b_i : operand_a_i;
+        ALU_B_MIN:
+        result_bitmanip = ($signed(operand_a_i) < $signed(operand_b_i)) ? operand_a_i : operand_b_i;
+        ALU_B_MINU: result_bitmanip = (operand_a_i < operand_b_i) ? operand_a_i : operand_b_i;
+
+        // Sign and zero-extension
+        ALU_B_SEXTB: result_bitmanip = {{24{operand_a_i[7]}}, operand_a_i[7:0]};
+        ALU_B_SEXTH: result_bitmanip = {{16{operand_a_i[15]}}, operand_a_i[15:0]};
+        ALU_B_ZEXTH: result_bitmanip = {{16{1'b0}}, operand_a_i[15:0]};
+
+        // Bitwise rotation
+        ALU_B_ROL:
+        result_bitmanip = (operand_a_i << operand_b_i[4:0]) | (operand_a_i >> (32-operand_b_i[4:0]));
+        ALU_B_ROR:
+        result_bitmanip = (operand_a_i >> operand_b_i[4:0]) | (operand_a_i << (32-operand_b_i[4:0]));
+        ALU_B_RORI:
+        result_bitmanip = (operand_a_i >> operand_b_i[4:0]) | (operand_a_i << (32-operand_b_i[4:0]));
+
+        // Bitwise OR-Combine, byte granule
+        ALU_B_ORCB:
+        result_bitmanip = {
+          {8{|operand_a_i[31:24]}},
+          {8{|operand_a_i[23:16]}},
+          {8{|operand_a_i[15:8]}},
+          {8{|operand_a_i[7:0]}}
+        };
+
+        // Byte-reverse register
+        ALU_B_REV8:
+        result_bitmanip = {
+          {operand_a_i[7:0]}, {operand_a_i[15:8]}, {operand_a_i[23:16]}, {operand_a_i[31:24]}
+        };
+
+        // Zbc: Carry-less Multiplication low/reversed/high part
+        ALU_B_CLMUL:  result_bitmanip = clmul_result;
+        ALU_B_CLMULR: result_bitmanip = clmul_result_rev;
+        ALU_B_CLMULH: result_bitmanip = {1'b0, clmul_result_rev[31:1]};
+
+        // Zbs: Single-bit Instructions
+        ALU_B_BCLR:  result_bitmanip = operand_a_i & ~(1'b1 << (operand_b_i & 5'b11111));
+        ALU_B_BCLRI: result_bitmanip = operand_a_i & ~(1'b1 << (operand_b_i & 5'b11111));
+        ALU_B_BEXT:  result_bitmanip = (operand_a_i >> (operand_b_i & 5'b11111)) & 1'b1;
+        ALU_B_BEXTI: result_bitmanip = (operand_a_i >> (operand_b_i & 5'b11111)) & 1'b1;
+        ALU_B_BINV:  result_bitmanip = operand_a_i ^ (1'b1 << (operand_b_i & 5'b11111));
+        ALU_B_BINVI: result_bitmanip = operand_a_i ^ (1'b1 << (operand_b_i & 5'b11111));
+        ALU_B_BSET:  result_bitmanip = operand_a_i | (1'b1 << (operand_b_i & 5'b11111));
+        ALU_B_BSETI: result_bitmanip = operand_a_i | (1'b1 << (operand_b_i & 5'b11111));
+
+        default: result_bitmanip = '0;
+      endcase
+    end else begin
+      result_bitmanip = '0;
+    end
+  end
+
   /////////////////////////////////////////////////////////////////////////////////
   //  ____ _____ _______     _____  ________      ________ _____   _____ ______  //
   // |  _ \_   _|__   __|   |  __ \|  ____\ \    / /  ____|  __ \ / ____|  ____| //
@@ -979,6 +1120,21 @@ module cv32e40p_alu
 
       default: ;  // default case to suppress unique warning
     endcase
+
+    if (ZBITMANIP) begin
+      unique case (operator_i)
+        // Bit-Manip Operations Result
+        ALU_B_SH1ADD, ALU_B_MIN, ALU_B_ROL, ALU_B_ROR, ALU_B_XNOR, ALU_B_MAXU,
+        ALU_B_SH2ADD, ALU_B_ANDN, ALU_B_MAX, ALU_B_ORN, ALU_B_MINU, ALU_B_RORI,
+        ALU_B_SEXTB, ALU_B_SEXTH, ALU_B_ZEXTH, ALU_B_CPOP, ALU_B_CTZ, ALU_B_BCLR,
+        ALU_B_BEXT, ALU_B_BEXTI, ALU_B_BINV, ALU_B_BINVI, ALU_B_BSET, ALU_B_REV8,
+        ALU_B_CLMUL, ALU_B_CLMULH, ALU_B_CLMULR, ALU_B_CLZ, ALU_B_BSETI, ALU_B_ORCB,
+        ALU_B_BCLRI, ALU_B_SH3ADD :
+        result_o = result_bitmanip;
+
+        default: ;
+      endcase
+    end
   end
 
   assign ready_o = div_ready;
diff --git a/rtl/cv32e40p_core.sv b/rtl/cv32e40p_core.sv
index 549869b9b..25ac66345 100644
--- a/rtl/cv32e40p_core.sv
+++ b/rtl/cv32e40p_core.sv
@@ -37,7 +37,8 @@ module cv32e40p_core
     parameter FPU_ADDMUL_LAT = 0,  // Floating-Point ADDition/MULtiplication lane pipeline registers number
     parameter FPU_OTHERS_LAT = 0,  // Floating-Point COMParison/CONVersion lanes pipeline registers number
     parameter ZFINX = 0,  // Float-in-General Purpose registers
-    parameter NUM_MHPMCOUNTERS = 1
+    parameter NUM_MHPMCOUNTERS = 1,
+    parameter ZBITMANIP = 0  // To Enable Bitmanip support
 ) (
     // Clock and Reset
     input logic clk_i,
@@ -523,7 +524,8 @@ module cv32e40p_core
       .APU_WOP_CPU     (APU_WOP_CPU),
       .APU_NDSFLAGS_CPU(APU_NDSFLAGS_CPU),
       .APU_NUSFLAGS_CPU(APU_NUSFLAGS_CPU),
-      .DEBUG_TRIGGER_EN(DEBUG_TRIGGER_EN)
+      .DEBUG_TRIGGER_EN(DEBUG_TRIGGER_EN),
+      .ZBITMANIP       (ZBITMANIP)
   ) id_stage_i (
       .clk          (clk),  // Gated clock
       .clk_ungated_i(clk_i),  // Ungated clock
@@ -744,7 +746,8 @@ module cv32e40p_core
       .APU_NARGS_CPU   (APU_NARGS_CPU),
       .APU_WOP_CPU     (APU_WOP_CPU),
       .APU_NDSFLAGS_CPU(APU_NDSFLAGS_CPU),
-      .APU_NUSFLAGS_CPU(APU_NUSFLAGS_CPU)
+      .APU_NUSFLAGS_CPU(APU_NUSFLAGS_CPU),
+      .ZBITMANIP       (ZBITMANIP)
   ) ex_stage_i (
       // Global signals: Clock and active low asynchronous reset
       .clk  (clk),
diff --git a/rtl/cv32e40p_decoder.sv b/rtl/cv32e40p_decoder.sv
index d03027bae..fe0bceddc 100644
--- a/rtl/cv32e40p_decoder.sv
+++ b/rtl/cv32e40p_decoder.sv
@@ -40,7 +40,8 @@ module cv32e40p_decoder
   parameter PULP_SECURE       = 0,
   parameter USE_PMP           = 0,
   parameter APU_WOP_CPU       = 6,
-  parameter DEBUG_TRIGGER_EN  = 1
+  parameter DEBUG_TRIGGER_EN  = 1,
+  parameter ZBITMANIP         = 0   // To Enable Bitmanip support
 )
 (
   // signals running to/from controller
@@ -185,6 +186,9 @@ module cv32e40p_decoder
   // unittypes for latencies to help us decode for APU
   enum logic[1:0] {ADDMUL, DIVSQRT, NONCOMP, CONV} fp_op_group;
 
+  // Illegal Instr flags for bitmanip
+  logic illegal_instr_bm;
+  logic illegal_instr_non_bm;
 
   /////////////////////////////////////////////
   //   ____                     _            //
@@ -264,6 +268,8 @@ module cv32e40p_decoder
     atop_o                      = 6'b000000;
 
     illegal_insn_o              = 1'b0;
+    illegal_instr_bm            = 1'b0;
+    illegal_instr_non_bm        = 1'b0;
     ebrk_insn_o                 = 1'b0;
     ecall_insn_o                = 1'b0;
     wfi_o                       = 1'b0;
@@ -493,8 +499,31 @@ module cv32e40p_decoder
           3'b111: alu_operator_o = ALU_AND;  // And with Immediate
 
           3'b001: begin
-            alu_operator_o = ALU_SLL;  // Shift Left Logical by Immediate
-            if (instr_rdata_i[31:25] != 7'b0)
+            if (instr_rdata_i[31:25] == 7'b0)
+              alu_operator_o = ALU_SLL;  // Shift Left Logical by Immediate
+
+            //Bit-Manip ALU Operations
+            else if (ZBITMANIP) begin
+              unique case (instr_rdata_i[31:25])
+                7'b011_0000: begin
+                  unique case(instr_rdata_i[24:20])
+                    5'b00100: alu_operator_o = ALU_B_SEXTB;
+                    5'b00101: alu_operator_o = ALU_B_SEXTH;
+                    5'b00010: alu_operator_o = ALU_B_CPOP;
+                    5'b00001: alu_operator_o = ALU_B_CTZ;
+                    5'b00000: alu_operator_o = ALU_B_CLZ;
+                    default: illegal_insn_o = 1'b1;
+                  endcase
+                end
+                7'b010_0100: alu_operator_o = ALU_B_BCLRI;
+                7'b011_0100: alu_operator_o = ALU_B_BINVI;
+                7'b001_0100: alu_operator_o = ALU_B_BSETI;
+                default: begin
+                  illegal_insn_o = 1'b1;
+                end
+              endcase
+            end
+            else
               illegal_insn_o = 1'b1;
           end
 
@@ -503,11 +532,23 @@ module cv32e40p_decoder
               alu_operator_o = ALU_SRL;  // Shift Right Logical by Immediate
             else if (instr_rdata_i[31:25] == 7'b010_0000)
               alu_operator_o = ALU_SRA;  // Shift Right Arithmetically by Immediate
+
+            //Bit-Manip ALU Operations
+            else if (ZBITMANIP) begin
+              if (instr_rdata_i[31:25] == 7'b011_0000)
+                alu_operator_o = ALU_B_RORI;
+              else if (instr_rdata_i[31:20] == 12'b001010000111)
+                alu_operator_o = ALU_B_ORCB;
+              else if (instr_rdata_i[31:20] == 12'b011010011000)
+                alu_operator_o = ALU_B_REV8;
+              else if (instr_rdata_i[31:25] == 7'b010_0100)
+                alu_operator_o = ALU_B_BEXTI;
+              else
+                illegal_insn_o = 1'b1;
+            end
             else
               illegal_insn_o = 1'b1;
           end
-
-
         endcase
       end
 
@@ -992,9 +1033,47 @@ module cv32e40p_decoder
             end
 
             default: begin
-              illegal_insn_o = 1'b1;
+              illegal_instr_non_bm = 1'b1;
             end
           endcase
+
+          if (ZBITMANIP) begin
+            unique case ({instr_rdata_i[30:25], instr_rdata_i[14:12]})
+              // Bit-Manip ALU Operations
+              {6'b01_0000, 3'b010}: alu_operator_o = ALU_B_SH1ADD;
+              {6'b01_0000, 3'b100}: alu_operator_o = ALU_B_SH2ADD;
+              {6'b01_0000, 3'b110}: alu_operator_o = ALU_B_SH3ADD;
+              {6'b10_0000, 3'b111}: alu_operator_o = ALU_B_ANDN;
+              {6'b00_0101, 3'b110}: alu_operator_o = ALU_B_MAX;
+              {6'b00_0101, 3'b100}: alu_operator_o = ALU_B_MIN;
+              {6'b11_0000, 3'b001}: alu_operator_o = ALU_B_ROL;
+              {6'b11_0000, 3'b101}: alu_operator_o = ALU_B_ROR;
+              {6'b10_0000, 3'b100}: alu_operator_o = ALU_B_XNOR;
+              {6'b10_0000, 3'b110}: alu_operator_o = ALU_B_ORN;
+              {6'b00_0101, 3'b111}: alu_operator_o = ALU_B_MAXU;
+              {6'b00_0101, 3'b101}: alu_operator_o = ALU_B_MINU;
+              {6'b00_0100, 3'b100}: begin
+                // zext.h is only valid with rs2 == x0; any other rs2 is Zbkb pack (not implemented)
+                if (instr_rdata_i[24:20] == 5'b00000) alu_operator_o = ALU_B_ZEXTH;
+                else illegal_instr_bm = 1'b1;
+              end
+              {6'b00_0101, 3'b001}: alu_operator_o = ALU_B_CLMUL;
+              {6'b00_0101, 3'b011}: alu_operator_o = ALU_B_CLMULH;
+              {6'b00_0101, 3'b010}: alu_operator_o = ALU_B_CLMULR;
+              {6'b10_0100, 3'b001}: alu_operator_o = ALU_B_BCLR;
+              {6'b10_0100, 3'b101}: alu_operator_o = ALU_B_BEXT;
+              {6'b11_0100, 3'b001}: alu_operator_o = ALU_B_BINV;
+              {6'b01_0100, 3'b001}: alu_operator_o = ALU_B_BSET;
+              default: begin
+                illegal_instr_bm = 1'b1;
+              end
+            endcase
+          end
+
+          // Raise illegal only when neither the base decoder nor (if enabled) the bit-manip
+          // decoder matched. Set-only, so a flag raised earlier is never cleared, and any
+          // non-zero ZBITMANIP counts as enabled, exactly like the if (ZBITMANIP) gates.
+          if (illegal_instr_non_bm && ((ZBITMANIP == 0) || illegal_instr_bm)) illegal_insn_o = 1'b1;
         end
       end
 
diff --git a/rtl/cv32e40p_ex_stage.sv b/rtl/cv32e40p_ex_stage.sv
index f327b8db6..0f3450ee6 100644
--- a/rtl/cv32e40p_ex_stage.sv
+++ b/rtl/cv32e40p_ex_stage.sv
@@ -37,7 +37,8 @@ module cv32e40p_ex_stage
     parameter APU_NARGS_CPU    = 3,
     parameter APU_WOP_CPU      = 6,
     parameter APU_NDSFLAGS_CPU = 15,
-    parameter APU_NUSFLAGS_CPU = 5
+    parameter APU_NUSFLAGS_CPU = 5,
+    parameter ZBITMANIP        = 0
 ) (
     input logic clk,
     input logic rst_n,
@@ -249,7 +250,9 @@ module cv32e40p_ex_stage
   //                        //
   ////////////////////////////
 
-  cv32e40p_alu alu_i (
+  cv32e40p_alu #(
+      .ZBITMANIP(ZBITMANIP)
+  ) alu_i (
       .clk        (clk),
       .rst_n      (rst_n),
       .enable_i   (alu_en_i),
diff --git a/rtl/cv32e40p_id_stage.sv b/rtl/cv32e40p_id_stage.sv
index 62e4d7217..c48677164 100644
--- a/rtl/cv32e40p_id_stage.sv
+++ b/rtl/cv32e40p_id_stage.sv
@@ -47,7 +47,8 @@ module cv32e40p_id_stage
     parameter APU_WOP_CPU = 6,
     parameter APU_NDSFLAGS_CPU = 15,
     parameter APU_NUSFLAGS_CPU = 5,
-    parameter DEBUG_TRIGGER_EN = 1
+    parameter DEBUG_TRIGGER_EN = 1,
+    parameter ZBITMANIP = 0  // To Enable Bitmanip support
 ) (
     input logic clk,  // Gated clock
     input logic clk_ungated_i,  // Ungated clock
@@ -978,7 +979,8 @@ module cv32e40p_id_stage
       .PULP_SECURE     (PULP_SECURE),
       .USE_PMP         (USE_PMP),
       .APU_WOP_CPU     (APU_WOP_CPU),
-      .DEBUG_TRIGGER_EN(DEBUG_TRIGGER_EN)
+      .DEBUG_TRIGGER_EN(DEBUG_TRIGGER_EN),
+      .ZBITMANIP       (ZBITMANIP)
   ) decoder_i (
       // controller related signals
       .deassert_we_i(deassert_we),
diff --git a/rtl/cv32e40p_top.sv b/rtl/cv32e40p_top.sv
index 7ddd2d5a2..305dc094f 100644
--- a/rtl/cv32e40p_top.sv
+++ b/rtl/cv32e40p_top.sv
@@ -18,7 +18,8 @@ module cv32e40p_top #(
     parameter FPU_ADDMUL_LAT = 0,  // Floating-Point ADDition/MULtiplication computing lane pipeline registers number
     parameter FPU_OTHERS_LAT = 0,  // Floating-Point COMParison/CONVersion computing lanes pipeline registers number
     parameter ZFINX = 0,  // Float-in-General Purpose registers
-    parameter NUM_MHPMCOUNTERS = 1
+    parameter NUM_MHPMCOUNTERS = 1,
+    parameter ZBITMANIP = 0  // To Enable Bitmanip support
 ) (
     // Clock and Reset
     input logic clk_i,
@@ -90,7 +91,8 @@ module cv32e40p_top #(
       .FPU_ADDMUL_LAT  (FPU_ADDMUL_LAT),
       .FPU_OTHERS_LAT  (FPU_OTHERS_LAT),
       .ZFINX           (ZFINX),
-      .NUM_MHPMCOUNTERS(NUM_MHPMCOUNTERS)
+      .NUM_MHPMCOUNTERS(NUM_MHPMCOUNTERS),
+      .ZBITMANIP       (ZBITMANIP)
   ) core_i (
       .clk_i (clk_i),
       .rst_ni(rst_ni),
diff --git a/rtl/include/cv32e40p_pkg.sv b/rtl/include/cv32e40p_pkg.sv
index 319e790b6..2cbc422b3 100644
--- a/rtl/include/cv32e40p_pkg.sv
+++ b/rtl/include/cv32e40p_pkg.sv
@@ -156,7 +156,47 @@ package cv32e40p_pkg;
     ALU_SHUF  = 7'b0111010,
     ALU_SHUF2 = 7'b0111011,
     ALU_PCKLO = 7'b0111000,
-    ALU_PCKHI = 7'b0111001
+    ALU_PCKHI = 7'b0111001,
+
+    //Zba: Address generation Instructions
+    ALU_B_SH1ADD = 7'b0001111,
+    ALU_B_SH2ADD = 7'b0001110,
+    ALU_B_SH3ADD = 7'b1110010,
+
+    //Zbb: Basic Bit-Manipulation
+    ALU_B_ANDN  = 7'b1100010,
+    ALU_B_MAX   = 7'b0111100,
+    ALU_B_MIN   = 7'b0111101,
+    ALU_B_ROL   = 7'b1010110,
+    ALU_B_ROR   = 7'b1011110,
+    ALU_B_XNOR  = 7'b1011100,
+    ALU_B_ORN   = 7'b1010100,
+    ALU_B_MAXU  = 7'b1100000,
+    ALU_B_MINU  = 7'b1110110,
+    ALU_B_RORI  = 7'b1110111,
+    ALU_B_ORCB  = 7'b1100001,
+    ALU_B_REV8  = 7'b1100011,
+    ALU_B_SEXTB = 7'b1100100,
+    ALU_B_SEXTH = 7'b1100101,
+    ALU_B_ZEXTH = 7'b1100110,
+    ALU_B_CPOP  = 7'b1100111,
+    ALU_B_CTZ   = 7'b1101001,
+    ALU_B_CLZ   = 7'b1111110,
+
+    //Zbc: Carry-less Multiplication
+    ALU_B_CLMUL  = 7'b1101010,
+    ALU_B_CLMULH = 7'b1101011,
+    ALU_B_CLMULR = 7'b1101100,
+
+    //Zbs: Single-bit Instructions
+    ALU_B_BCLR  = 7'b1101101,
+    ALU_B_BCLRI = 7'b1101110,
+    ALU_B_BEXT  = 7'b1101111,
+    ALU_B_BEXTI = 7'b1110000,
+    ALU_B_BINV  = 7'b1110001,
+    ALU_B_BINVI = 7'b1110011,
+    ALU_B_BSET  = 7'b1110100,
+    ALU_B_BSETI = 7'b1110101
 
   } alu_opcode_e;
 