From b5e3515d8bb691591f01945148f33bd700d50884 Mon Sep 17 00:00:00 2001 From: Paolo Savini Date: Wed, 27 Nov 2024 18:40:16 +0000 Subject: [PATCH] WIP --- include/tcg/tcg-op-gvec-common.h | 6 +- target/riscv/insn_trans/trans_rvv.c.inc | 183 +++++++++++++++--------- tcg/tcg-op-gvec.c | 41 ++++-- 3 files changed, 154 insertions(+), 76 deletions(-) diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h index d4cf01cd1fd52..e0785c845e24d 100644 --- a/include/tcg/tcg-op-gvec-common.h +++ b/include/tcg/tcg-op-gvec-common.h @@ -246,9 +246,11 @@ void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, /* Expand a specific vector operation. */ -void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr, +//void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr, +void tcg_gen_gvec_ld(TCGv_ptr dofs, TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz); -void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr, +//void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr, +void tcg_gen_gvec_st(TCGv_ptr dofs, TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz); void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, uint32_t oprsz, uint32_t maxsz); diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index 4bd799218bd26..da3256c4f5363 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -1103,73 +1103,128 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf, uint32_t log2_esz, gen_helper_ldst_whole *fn, DisasContext *s, bool is_load) { - TCGv_ptr dest; - TCGv_ptr ld_addr = tcg_temp_new_ptr(); - TCGv base_reg; - - /* We might want to use these values from here instead of the helper function - * because the tcg_gen functions use information about the size of the elements - * in the array or the number of max elements in an array. 
Consider that the whole register - * loads/stores unlike other vector loads/stores will always perform the load/store - * on all the max elements intead of the number of active elements in a register only, - * so it's not based on vl (that is a parameter set every time by vsetvl/vsetvli and is - * the number of elements in a vector register on which to perform an operation) - * but on all the elements (max_elems = length of the register in bytes divided by - * the size in bytes of the single element). - * nf here (number of fields) determines how many vector registers are processed at once. - * We might load 2 or 3 or 4 registers at a time for instance. - * Better to start with the simple case nf=1 and then see how to handle the multiple - * destinations when nf=2,3... this might require some extra tcg functions to accept multiple - * destination registers. */ - uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz; -// uint32_t evl = nf * max_elems; -// uint32_t esz = 1 << log2_esz; - - /* vl${NF}re${SEW}.v - * - * NF = 1,2,4,8 - * SEW = 8,16,32,64 (bits) - * - * NF == 1 - * vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0 - * NF == 2 - * vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0 - */ - uint32_t data = FIELD_DP32(0, VDATA, NF, nf); - data = FIELD_DP32(data, VDATA, VM, 1); - dest = tcg_temp_new_ptr(); - - /* a0 */ - //if (get_xl(s) == MXL_RV32) { - base_reg = get_gpr(s, rs1, EXT_NONE); - /* v3 */ - tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd)); - - mark_vs_dirty(s); - - /* - Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd). - - Our load instruction uses register addressing so we need the content of the scalar register operand summed to an offset. - The tcg_gen_gvec_ld function will use a destination register and an address/pointer. 
It will then do a loop of loads based on - the size of the whole operation by adding the iteration index to the pointer. - We need then to generate an add instruction to create the memory address / pointer made of the tcg_env + the content of the base register. - */ - - tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env); - - /* - Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store? - */ - - /* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the - * appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */ - +// TCGv_ptr dest = tcg_temp_new_ptr(); +// TCGv_ptr ld_addr = tcg_temp_new_ptr(); +// TCGv base_reg; +//// TCGv_ptr base_ptr = tcg_temp_new_ptr(); +// +// /* We might want to use these values from here instead of the helper function +// * because the tcg_gen functions use information about the size of the elements +// * in the array or the number of max elements in an array. Consider that the whole register +// * loads/stores unlike other vector loads/stores will always perform the load/store +// * on all the max elements instead of the number of active elements in a register only, +// * so it's not based on vl (that is a parameter set every time by vsetvl/vsetvli and is +// * the number of elements in a vector register on which to perform an operation) +// * but on all the elements (max_elems = length of the register in bytes divided by +// * the size in bytes of the single element). +// * nf here (number of fields) determines how many vector registers are processed at once. +// * We might load 2 or 3 or 4 registers at a time for instance. +// * Better to start with the simple case nf=1 and then see how to handle the multiple +// * destinations when nf=2,3... this might require some extra tcg functions to accept multiple +// * destination registers. 
*/ + uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz; +// int vlmax = vext_get_vlmax(s->cfg_ptr->vlenb, s->sew, s->lmul); +// uint32_t evl = nf * max_elems; +// uint32_t esz = 1 << log2_esz; +// +// /* vl${NF}re${SEW}.v +// * +// * NF = 1,2,4,8 +// * SEW = 8,16,32,64 (bits) +// * +// * NF == 1 +// * vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0 +// * NF == 2 +// * vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0 +// */ +// uint32_t data = FIELD_DP32(0, VDATA, NF, nf); +// data = FIELD_DP32(data, VDATA, VM, 1); +// +// /* a0 */ +// //if (get_xl(s) == MXL_RV32) { +// base_reg = get_gpr(s, rs1, EXT_NONE); +// /* v3 */ +// tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd)); +// +// mark_vs_dirty(s); + + // int midx = get_mem_index(s); + // TCGv_i64 dirty_addr, clean_addr, t0, t1; + TCGv addr; + addr = get_address(s, rs1, 0); + +// dirty_addr = tcg_temp_new_i64(); +// TCGv_i64 t0 = tcg_temp_new_i64(); +// TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i128 t16 = tcg_temp_new_i128(); + + + for (int i=0; i<(max_elems<mem_idx, + MO_LE | MO_128 | MO_ATOM_NONE); +// tcg_gen_extr_i128_i64(t0, t1, t16); + tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8)); +// tcg_gen_st_i64(t0, tcg_env, vreg_ofs(s, vd)); +// tcg_gen_st_i64(t1, tcg_env, vreg_ofs(s, vd) + 8); } else { - tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/); + addr = get_address(s, rs1, i); + tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8)); + tcg_gen_qemu_st_i128(t16, addr, s->mem_idx, + MO_LE | MO_128 | MO_ATOM_NONE); } +} + mark_vs_dirty(s); +//// TCGv_i128 r = tcg_temp_new_i128(); +//// +//// tcg_gen_ld_i128(r, base_reg, 0); +// /* +// Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd). +// +// Our load instruction uses register addressing so we need the content of the scalar register operand summed to an offset. 
+// The tcg_gen_gvec_ld function will use a destination register and an address/pointer. It will then do a loop of loads based on +// the size of the whole operation by adding the iteration index to the pointer. +// We need then to generate an add instruction to create the memory address / pointer made of the tcg_env + the content of the base register. +// */ +// +// TCGv_i32 base_val = tcg_temp_new_i32(); +// tcg_gen_trunc_tl_i32(base_val, base_reg); +// tcg_gen_andi_i32(base_val, base_val, vlmax - 1); +// +//#if HOST_BIG_ENDIAN +// tcg_gen_xori_i32(base_val, base_val, 7 >> s->sew); +//#endif +// /* Convert the index to an offset. */ +//// endian_adjust(base_val, s->sew); +// tcg_gen_shli_i32(base_val, base_val, s->sew); +// +// /* Convert the index to a pointer. */ +// tcg_gen_ext_i32_ptr(ld_addr, base_val); +// tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env); +////// tcg_gen_ext_i32_ptr(ld_addr, 0x1234); +//// tcg_gen_ext_i32_ptr(ld_addr, (TCGv_i32)base_reg); +//// tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env); +////// tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env); +// +// /* +// Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store? +// */ +// +// /* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the +// * appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */ +// +// if (is_load) { +// tcg_gen_gvec_ld(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/); +// //tcg_gen_gvec_ld(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/); +// } else { +// tcg_gen_gvec_st(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/); +// //tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/); +// } // Original call to the C helper function that we want to avoid. 
// fn(dest, base, tcg_env, desc); diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index dc61dd7e2e997..ea4d4c0b49eee 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -1181,25 +1181,40 @@ static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, } } -static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr, +//static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr, +static void expand_vec_ld_r(TCGv_ptr reg, TCGv_ptr ptr, uint32_t oprsz, uint32_t tysz, TCGType type) { TCGv_vec t0 = tcg_temp_new_vec(type); for (uint32_t i = 0; i < oprsz; i += tysz) { tcg_gen_ld_vec(t0, ptr, i); - tcg_gen_st_vec(t0, tcg_env, vofs + i); + tcg_gen_st_vec(t0, reg, i); +// tcg_gen_st_vec(t0, tcg_env, vofs + i); } +// tcg_gen_ld_vec(t0, ptr, 0); +// tcg_gen_st_vec(t0, tcg_env, vofs); tcg_temp_free_vec(t0); } -void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr, +//void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr, +void tcg_gen_gvec_ld(TCGv_ptr reg, TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz) { TCGType type; - check_size_align(oprsz, maxsz, vofs); +// TCGv_ptr v0; +// TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); + + /// v0 = tcg_temp_ebb_new_ptr(); + /// + /// tcg_gen_addi_ptr(v0, tcg_env, vofs); + /// expand_vec_ld_r(v0, ptr, oprsz, maxsz, type); + /// + /// tcg_temp_free_ptr(v0); +// check_size_align(oprsz, maxsz, vofs); type = choose_vector_type(NULL, maxsz, oprsz, 0); - expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type); +// expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type); + expand_vec_ld_r(reg, ptr, oprsz, maxsz, type); if (oprsz < maxsz) { // FIXME: tmp @@ -1207,25 +1222,31 @@ void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr, } } -static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr, +//static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr, +static void expand_vec_st_r(TCGv_ptr reg, TCGv_ptr ptr, uint32_t oprsz, uint32_t tysz, TCGType type) { TCGv_vec t0 = tcg_temp_new_vec(type); for (uint32_t i = 0; i < oprsz; i += tysz) { - 
tcg_gen_ld_vec(t0, tcg_env, vofs + i); + // tcg_gen_ld_vec(t0, tcg_env, vofs + i); + tcg_gen_ld_vec(t0, reg, i); tcg_gen_st_vec(t0, ptr, i); } +// tcg_gen_ld_vec(t0, tcg_env, vofs ); +// tcg_gen_st_vec(t0, ptr, 0); tcg_temp_free_vec(t0); } -void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr, +//void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr, +void tcg_gen_gvec_st(TCGv_ptr reg, TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz) { TCGType type; - check_size_align(oprsz, maxsz, vofs); +// check_size_align(oprsz, maxsz, vofs); type = choose_vector_type(NULL, maxsz, oprsz, 0); - expand_vec_st_r(vofs, ptr, oprsz, maxsz, type); +// expand_vec_st_r(vofs, ptr, oprsz, maxsz, type); + expand_vec_st_r(reg, ptr, oprsz, maxsz, type); if (oprsz < maxsz) { // FIXME: tmp