diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index b9883a5d3238..bdb9c2948ee8 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -1100,26 +1100,34 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
 typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
 
 static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
-                             gen_helper_ldst_whole *fn,
-                             DisasContext *s)
+                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
+                             DisasContext *s, bool is_load)
 {
-    TCGv_ptr dest;
-    TCGv base;
-    TCGv_i32 desc;
-
-    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
-    data = FIELD_DP32(data, VDATA, VM, 1);
-    dest = tcg_temp_new_ptr();
-    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
-                                      s->cfg_ptr->vlenb, data));
-
-    base = get_gpr(s, rs1, EXT_NONE);
-    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+    TCGv addr;
+    addr = get_address(s, rs1, 0);
+    uint32_t size = s->cfg_ptr->vlenb * nf;
 
-    mark_vs_dirty(s);
-
-    fn(dest, base, tcg_env, desc);
+    TCGv_i128 t16 = tcg_temp_new_i128();
+    /*
+     * Load/store a minimum of vlenb bytes per iteration.
+     * When possible, do this atomically.
+     * Update vstart with the number of processed elements.
+     */
+    for (int i = 0; i < size; i += 16) {
+        addr = get_address(s, rs1, i);
+        if (is_load) {
+            tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
+                                 MO_LE | MO_128 | MO_ATOM_WITHIN16);
+            tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+            tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+        } else {
+            tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+            tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
+                                 MO_LE | MO_128 | MO_ATOM_WITHIN16);
+            tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+        }
+    }
 
     finalize_rvv_inst(s);
 
     return true;
 }
@@ -1128,42 +1136,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
  * load and store whole register instructions ignore vtype and vl setting.
  * Thus, we don't need to check vill bit. (Section 7.9)
  */
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                                  \
+#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD)                  \
 static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                   \
 {                                                                           \
     if (require_rvv(s) &&                                                   \
         QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                   \
-        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,                      \
-                                gen_helper_##NAME, s);                      \
+        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
+                                gen_helper_##NAME, s, IS_LOAD);             \
     }                                                                       \
     return false;                                                           \
 }
 
-GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
-GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
-GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
-GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
+GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
 
 /*
  * The vector whole register store instructions are encoded similar to
  * unmasked unit-stride store of elements with EEW=8.
  */
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
+GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
+GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
+GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
+GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
 
 /*
  *** Vector Integer Arithmetic Instructions
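
For reference, here is a sketch of what the reworked macro produces for one load instruction. This expansion of GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true) is illustrative only and is not part of the patch; all names in it come from the diff above.

/*
 * Illustrative expansion of GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true).
 * Not part of the patch.
 */
static bool trans_vl1re8_v(DisasContext *s, arg_vl1re8_v * a)
{
    /* Register-group alignment check; trivially true for ARG_NF = 1. */
    if (require_rvv(s) &&
        QEMU_IS_ALIGNED(a->rd, 1)) {
        /*
         * ctzl(sizeof(int8_t)) == 0, so log2_esz is 0 and each 16-byte
         * iteration in ldst_whole_trans advances vstart by 16 >> 0 = 16
         * byte-sized elements.
         */
        return ldst_whole_trans(a->rd, a->rs1, 1, ctzl(sizeof(int8_t)),
                                gen_helper_vl1re8_v, s, true);
    }
    return false;
}

The vstart bookkeeping works the same way for wider elements. Assuming VLEN=128 (vlenb = 16 bytes) as an example configuration, vl2re16_v has log2_esz = 1 and size = 16 * 2 = 32 bytes, so the loop runs twice and each iteration adds 16 >> 1 = 8 halfword elements to vstart, for 16 elements in total.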