From b5e3515d8bb691591f01945148f33bd700d50884 Mon Sep 17 00:00:00 2001
From: Paolo Savini <paolo.savini@embecosm.com>
Date: Wed, 27 Nov 2024 18:40:16 +0000
Subject: [PATCH] WIP: target/riscv: expand whole-register vector ld/st inline via i128 TCG ops

---
 include/tcg/tcg-op-gvec-common.h        |   6 +-
 target/riscv/insn_trans/trans_rvv.c.inc | 183 +++++++++++++++---------
 tcg/tcg-op-gvec.c                       |  41 ++++--
 3 files changed, 154 insertions(+), 76 deletions(-)

diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
index d4cf01cd1fd52..e0785c845e24d 100644
--- a/include/tcg/tcg-op-gvec-common.h
+++ b/include/tcg/tcg-op-gvec-common.h
@@ -246,9 +246,11 @@ void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
 
 /* Expand a specific vector operation.  */
 
-void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr,
+//void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr,
+void tcg_gen_gvec_ld(TCGv_ptr dofs, TCGv_ptr ptr,
                      uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr,
+//void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr,
+void tcg_gen_gvec_st(TCGv_ptr dofs, TCGv_ptr ptr,
                      uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t oprsz, uint32_t maxsz);
diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index 4bd799218bd26..da3256c4f5363 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -1103,73 +1103,128 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
                              uint32_t log2_esz, gen_helper_ldst_whole *fn,
                              DisasContext *s, bool is_load)
 {
-    TCGv_ptr dest;
-    TCGv_ptr ld_addr = tcg_temp_new_ptr();
-    TCGv base_reg;
-
-    /* We might want to use these values from here instead of the helper function
-     * because the tcg_gen functions use information about the size of the elements
-     * in the array or the number of max elements in an array. Consider that the whole register
-     * loads/stores unlike other vector loads/stores will always perform the load/store
-     * on all the max elements intead of the number of active elements in a register only,
-     * so it's not based on vl (that is a parameter set every time by vsetvl/vsetvli and is
-     * the number of elements in a vector register on which to perform an operation)
-     * but on all the elements (max_elems = length of the register in bytes divided by
-     * the size in bytes of the single element).
-     * nf here (number of fields) determines how many vector registers are processed at once.
-     * We might load 2 or 3 or 4 registers at a time for instance.
-     * Better to start with the simple case nf=1 and then see how to handle the multiple
-     * destinations when nf=2,3... this might require some extra tcg functions to accept multiple
-     * destination registers. */
-    uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz;
-//    uint32_t evl = nf * max_elems;
-//    uint32_t esz = 1 << log2_esz;
-
-    /* vl${NF}re${SEW}.v
-     * 
-     * NF = 1,2,4,8
-     * SEW = 8,16,32,64 (bits)
-     * 
-     * NF == 1
-     * vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0
-     * NF == 2
-     * vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0
-     */
-    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
-    data = FIELD_DP32(data, VDATA, VM, 1);
-    dest = tcg_temp_new_ptr();
-
-    /* a0 */
-    //if (get_xl(s) == MXL_RV32) {
-    base_reg = get_gpr(s, rs1, EXT_NONE);
-    /* v3 */
-    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
-
-    mark_vs_dirty(s);
-
-    /*
-    Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd).
-
-    Our load instruction uses register addressing so we need the content of the scalar register operand summed to an offset.
-    The tcg_gen_gvec_ld function will use a destination register and an address/pointer. It will then do a loop of loads based on
-    the size of the whole operation by adding the iteration index to the pointer.
-    We need then to generate an add instruction to create the memory address / pointer made of the tcg_env + the content of the base register.
-    */
-
-    tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env);
-
-    /*
-    Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store?
-    */
-
-    /* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the
-     * appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */
-
+//    TCGv_ptr dest = tcg_temp_new_ptr();
+//    TCGv_ptr ld_addr = tcg_temp_new_ptr();
+//    TCGv base_reg;
+////    TCGv_ptr base_ptr = tcg_temp_new_ptr();
+//
+//    /* We might want to use these values here instead of in the helper function
+//     * because the tcg_gen functions use information about the size of the elements
+//     * in the array or the maximum number of elements in an array. Note that whole register
+//     * loads/stores, unlike other vector loads/stores, always perform the load/store
+//     * on all the max elements instead of only the active elements in a register,
+//     * so the operation is not based on vl (a parameter set by vsetvl/vsetvli that gives
+//     * the number of elements in a vector register on which to perform an operation)
+//     * but on all the elements (max_elems = length of the register in bytes divided by
+//     * the size in bytes of a single element).
+//     * nf here (number of fields) determines how many vector registers are processed at once.
+//     * We might load 2, 4 or 8 registers at a time, for instance.
+//     * Better to start with the simple case nf=1 and then see how to handle the multiple
+//     * destinations when nf=2,4,8; this might require some extra tcg functions to accept multiple
+//     * destination registers. */
+      uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz;
+//    int vlmax = vext_get_vlmax(s->cfg_ptr->vlenb, s->sew, s->lmul);
+//      uint32_t evl = nf * max_elems;
+//      uint32_t esz = 1 << log2_esz;
+//
+//    /* vl${NF}re${SEW}.v
+//     * 
+//     * NF = 1,2,4,8
+//     * SEW = 8,16,32,64 (bits)
+//     * 
+//     * NF == 1
+//     * vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0
+//     * NF == 2
+//     * vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0
+//     */
+//    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
+//    data = FIELD_DP32(data, VDATA, VM, 1);
+//
+//    /* a0 */
+//    //if (get_xl(s) == MXL_RV32) {
+//    base_reg = get_gpr(s, rs1, EXT_NONE);
+//    /* v3 */
+//    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+//
+//    mark_vs_dirty(s);
+
+   // int midx = get_mem_index(s);
+  //  TCGv_i64 dirty_addr, clean_addr, t0, t1;
+    TCGv addr;
+    addr = get_address(s, rs1, 0);
+
+//    dirty_addr = tcg_temp_new_i64();
+//    TCGv_i64 t0 = tcg_temp_new_i64();
+//    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i128 t16 = tcg_temp_new_i128();
+
+
+    for (int i=0; i<(max_elems<<log2_esz); i+=128) {
     if (is_load) {
-      tcg_gen_gvec_ld(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+ //   tcg_gen_addi_i64(dirty_addr, base_reg, 0);
+    
+//    clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);
+
+    addr = get_address(s, rs1, i);
+    tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
+                         MO_LE | MO_128 | MO_ATOM_NONE);
+//    tcg_gen_extr_i128_i64(t0, t1, t16);
+    tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8));
+//    tcg_gen_st_i64(t0, tcg_env, vreg_ofs(s, vd));
+//    tcg_gen_st_i64(t1, tcg_env, vreg_ofs(s, vd) + 8);
     } else {
-      tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+    addr = get_address(s, rs1, i);
+    tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8));
+    tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
+                         MO_LE | MO_128 | MO_ATOM_NONE);
     }
+}
+    mark_vs_dirty(s);
+////    TCGv_i128 r = tcg_temp_new_i128();
+////
+////    tcg_gen_ld_i128(r, base_reg, 0);
+//    /*
+//    Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd).
+//
+//    Our load instruction uses register addressing, so we need the content of the scalar register operand added to an offset.
+//    The tcg_gen_gvec_ld function takes a destination register and an address/pointer. It then performs a loop of loads based on
+//    the size of the whole operation, adding the iteration index to the pointer.
+//    We therefore need to generate an add instruction to create the memory address/pointer made of tcg_env + the content of the base register.
+//    */
+// 
+//    TCGv_i32 base_val = tcg_temp_new_i32();
+//    tcg_gen_trunc_tl_i32(base_val, base_reg);
+//    tcg_gen_andi_i32(base_val, base_val, vlmax - 1);
+//
+//#if HOST_BIG_ENDIAN
+//    tcg_gen_xori_i32(base_val, base_val, 7 >> s->sew);
+//#endif
+//    /* Convert the index to an offset. */
+////    endian_adjust(base_val, s->sew);
+//    tcg_gen_shli_i32(base_val, base_val, s->sew);
+//
+//    /* Convert the index to a pointer. */
+//    tcg_gen_ext_i32_ptr(ld_addr, base_val);
+//    tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env);
+//////    tcg_gen_ext_i32_ptr(ld_addr, 0x1234);
+////    tcg_gen_ext_i32_ptr(ld_addr, (TCGv_i32)base_reg);
+////    tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env);
+//////    tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env);
+//
+//    /*
+//    Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store?
+//    */
+//
+//    /* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the
+//     * appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */
+//
+//    if (is_load) {
+//      tcg_gen_gvec_ld(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+//      //tcg_gen_gvec_ld(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+//    } else {
+//      tcg_gen_gvec_st(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+//      //tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
+//    }
 
     // Original call to the C helper function that we want to avoid.
     // fn(dest, base, tcg_env, desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index dc61dd7e2e997..ea4d4c0b49eee 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1181,25 +1181,40 @@ static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
     }
 }
 
-static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr,
+//static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr,
+static void expand_vec_ld_r(TCGv_ptr reg, TCGv_ptr ptr,
                             uint32_t oprsz, uint32_t tysz, TCGType type)
 {
     TCGv_vec t0 = tcg_temp_new_vec(type);
     for (uint32_t i = 0; i < oprsz; i += tysz) {
         tcg_gen_ld_vec(t0, ptr, i);
-        tcg_gen_st_vec(t0, tcg_env, vofs + i);
+        tcg_gen_st_vec(t0, reg, i);
+//        tcg_gen_st_vec(t0, tcg_env, vofs + i);
     }
+//        tcg_gen_ld_vec(t0, ptr, 0);
+//        tcg_gen_st_vec(t0, tcg_env, vofs);
     tcg_temp_free_vec(t0);
 }
 
-void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr,
+//void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr,
+void tcg_gen_gvec_ld(TCGv_ptr reg, TCGv_ptr ptr,
                      uint32_t oprsz, uint32_t maxsz)
 {
     TCGType type;
 
-    check_size_align(oprsz, maxsz, vofs);
+//    TCGv_ptr v0;
+//    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
+
+ ///   v0 = tcg_temp_ebb_new_ptr();
+ ///
+ ///   tcg_gen_addi_ptr(v0, tcg_env, vofs);
+ ///   expand_vec_ld_r(v0, ptr, oprsz, maxsz, type);
+ ///
+ ///   tcg_temp_free_ptr(v0);
+//    check_size_align(oprsz, maxsz, vofs);
     type = choose_vector_type(NULL, maxsz, oprsz, 0);
-    expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type);
+//    expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type);
+    expand_vec_ld_r(reg, ptr, oprsz, maxsz, type);
 
     if (oprsz < maxsz) {
 	    // FIXME: tmp
@@ -1207,25 +1222,31 @@ void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr,
     }
 }
 
-static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr,
+//static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr,
+static void expand_vec_st_r(TCGv_ptr reg, TCGv_ptr ptr,
                             uint32_t oprsz, uint32_t tysz, TCGType type)
 {
     TCGv_vec t0 = tcg_temp_new_vec(type);
     for (uint32_t i = 0; i < oprsz; i += tysz) {
-        tcg_gen_ld_vec(t0, tcg_env, vofs + i);
+    //    tcg_gen_ld_vec(t0, tcg_env, vofs + i);
+        tcg_gen_ld_vec(t0, reg, i);
         tcg_gen_st_vec(t0, ptr, i);
     }
+//        tcg_gen_ld_vec(t0, tcg_env, vofs );
+//        tcg_gen_st_vec(t0, ptr, 0);
     tcg_temp_free_vec(t0);
 }
 
-void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr,
+//void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr,
+void tcg_gen_gvec_st(TCGv_ptr reg, TCGv_ptr ptr,
                      uint32_t oprsz, uint32_t maxsz)
 {
     TCGType type;
 
-    check_size_align(oprsz, maxsz, vofs);
+//    check_size_align(oprsz, maxsz, vofs);
     type = choose_vector_type(NULL, maxsz, oprsz, 0);
-    expand_vec_st_r(vofs, ptr, oprsz, maxsz, type);
+//    expand_vec_st_r(vofs, ptr, oprsz, maxsz, type);
+    expand_vec_st_r(reg, ptr, oprsz, maxsz, type);
 
     if (oprsz < maxsz) {
 	// FIXME: tmp