Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
PaoloS02 committed Nov 27, 2024
1 parent 46f902c commit b5e3515
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 76 deletions.
6 changes: 4 additions & 2 deletions include/tcg/tcg-op-gvec-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,11 @@ void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,

/* Expand a specific vector operation. */

/*
 * Vector load/store expanders taking a pointer operand (ptr) plus an
 * operation size (oprsz) and a maximum size (maxsz).
 *
 * NOTE(review): each prototype appears below with two different first
 * parameters (uint32_t dofs and TCGv_ptr dofs); this looks like the old and
 * the new form of the same declaration shown together -- confirm which one
 * is live.  The TCGv_ptr form still uses the name "dofs" even though it is
 * no longer an integer offset.
 */
void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr,
//void tcg_gen_gvec_ld(uint32_t dofs, TCGv_ptr ptr,
void tcg_gen_gvec_ld(TCGv_ptr dofs, TCGv_ptr ptr,
uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr,
//void tcg_gen_gvec_st(uint32_t dofs, TCGv_ptr ptr,
void tcg_gen_gvec_st(TCGv_ptr dofs, TCGv_ptr ptr,
uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz);
Expand Down
183 changes: 119 additions & 64 deletions target/riscv/insn_trans/trans_rvv.c.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1103,73 +1103,128 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
uint32_t log2_esz, gen_helper_ldst_whole *fn,
DisasContext *s, bool is_load)
{
TCGv_ptr dest;
TCGv_ptr ld_addr = tcg_temp_new_ptr();
TCGv base_reg;

/* We might want to use these values from here instead of the helper function
* because the tcg_gen functions use information about the size of the elements
* in the array or the number of max elements in an array. Consider that the whole register
* loads/stores unlike other vector loads/stores will always perform the load/store
* on all the max elements instead of the number of active elements in a register only,
* so it's not based on vl (that is a parameter set every time by vsetvl/vsetvli and is
* the number of elements in a vector register on which to perform an operation)
* but on all the elements (max_elems = length of the register in bytes divided by
* the size in bytes of the single element).
* nf here (number of fields) determines how many vector registers are processed at once.
* We might load 2 or 3 or 4 registers at a time for instance.
* Better to start with the simple case nf=1 and then see how to handle the multiple
* destinations when nf=2,3... this might require some extra tcg functions to accept multiple
* destination registers. */
uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz;
// uint32_t evl = nf * max_elems;
// uint32_t esz = 1 << log2_esz;

/* vl${NF}re${SEW}.v
*
* NF = 1,2,4,8
* SEW = 8,16,32,64 (bits)
*
* NF == 1
* vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0
* NF == 2
* vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0
*/
uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
data = FIELD_DP32(data, VDATA, VM, 1);
dest = tcg_temp_new_ptr();

/* a0 */
//if (get_xl(s) == MXL_RV32) {
base_reg = get_gpr(s, rs1, EXT_NONE);
/* v3 */
tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));

mark_vs_dirty(s);

/*
Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd).
Our load instruction uses register addressing so we need the content of the scalar register operand summed to an offset.
The tcg_gen_gvec_ld function will use a destination register and an address/pointer. It will then do a loop of loads based on
the size of the whole operation by adding the iteration index to the pointer.
We need then to generate an add instruction to create the memory address / pointer made of the tcg_env + the content of the base register.
*/

tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env);

/*
Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store?
*/

/* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the
* appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */

// TCGv_ptr dest = tcg_temp_new_ptr();
// TCGv_ptr ld_addr = tcg_temp_new_ptr();
// TCGv base_reg;
//// TCGv_ptr base_ptr = tcg_temp_new_ptr();
//
// /* We might want to use these values from here instead of the helper function
// * because the tcg_gen functions use information about the size of the elements
// * in the array or the number of max elements in an array. Consider that the whole register
// * loads/stores unlike other vector loads/stores will always perform the load/store
// * on all the max elements instead of the number of active elements in a register only,
// * so it's not based on vl (that is a parameter set every time by vsetvl/vsetvli and is
// * the number of elements in a vector register on which to perform an operation)
// * but on all the elements (max_elems = length of the register in bytes divided by
// * the size in bytes of the single element).
// * nf here (number of fields) determines how many vector registers are processed at once.
// * We might load 2 or 3 or 4 registers at a time for instance.
// * Better to start with the simple case nf=1 and then see how to handle the multiple
// * destinations when nf=2,3... this might require some extra tcg functions to accept multiple
// * destination registers. */
uint32_t max_elems = s->cfg_ptr->vlenb >> log2_esz;
// int vlmax = vext_get_vlmax(s->cfg_ptr->vlenb, s->sew, s->lmul);
// uint32_t evl = nf * max_elems;
// uint32_t esz = 1 << log2_esz;
//
// /* vl${NF}re${SEW}.v
// *
// * NF = 1,2,4,8
// * SEW = 8,16,32,64 (bits)
// *
// * NF == 1
// * vl1re16.v v3, (a0) # Load v3 with VLEN/16 halfwords from address a0
// * NF == 2
// * vl2re16.v v2, (a0) # Load v2,v3 with 2*VLEN/16 halfwords from address a0
// */
// uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
// data = FIELD_DP32(data, VDATA, VM, 1);
//
// /* a0 */
// //if (get_xl(s) == MXL_RV32) {
// base_reg = get_gpr(s, rs1, EXT_NONE);
// /* v3 */
// tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
//
// mark_vs_dirty(s);

// int midx = get_mem_index(s);
// TCGv_i64 dirty_addr, clean_addr, t0, t1;
TCGv addr;
addr = get_address(s, rs1, 0);

// dirty_addr = tcg_temp_new_i64();
// TCGv_i64 t0 = tcg_temp_new_i64();
// TCGv_i64 t1 = tcg_temp_new_i64();
TCGv_i128 t16 = tcg_temp_new_i128();


for (int i=0; i<(max_elems<<log2_esz); i+=128) {
if (is_load) {
tcg_gen_gvec_ld(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
// tcg_gen_addi_i64(dirty_addr, base_reg, 0);

// clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8);

addr = get_address(s, rs1, i);
tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
MO_LE | MO_128 | MO_ATOM_NONE);
// tcg_gen_extr_i128_i64(t0, t1, t16);
tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8));
// tcg_gen_st_i64(t0, tcg_env, vreg_ofs(s, vd));
// tcg_gen_st_i64(t1, tcg_env, vreg_ofs(s, vd) + 8);
} else {
tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
addr = get_address(s, rs1, i);
tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + (i/8));
tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
MO_LE | MO_128 | MO_ATOM_NONE);
}
}
mark_vs_dirty(s);
//// TCGv_i128 r = tcg_temp_new_i128();
////
//// tcg_gen_ld_i128(r, base_reg, 0);
// /*
// Try and use "dest" in the call to tcg_gen_gvec_ld below if things don't work out, instead of passing vreg_ofs(s, vd).
//
// Our load instruction uses register addressing so we need the content of the scalar register operand summed to an offset.
// The tcg_gen_gvec_ld function will use a destination register and an address/pointer. It will then do a loop of loads based on
// the size of the whole operation by adding the iteration index to the pointer.
// We need then to generate an add instruction to create the memory address / pointer made of the tcg_env + the content of the base register.
// */
//
// TCGv_i32 base_val = tcg_temp_new_i32();
// tcg_gen_trunc_tl_i32(base_val, base_reg);
// tcg_gen_andi_i32(base_val, base_val, vlmax - 1);
//
//#if HOST_BIG_ENDIAN
// tcg_gen_xori_i32(base_val, base_val, 7 >> s->sew);
//#endif
// /* Convert the index to an offset. */
//// endian_adjust(base_val, s->sew);
// tcg_gen_shli_i32(base_val, base_val, s->sew);
//
// /* Convert the index to a pointer. */
// tcg_gen_ext_i32_ptr(ld_addr, base_val);
// tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env);
////// tcg_gen_ext_i32_ptr(ld_addr, 0x1234);
//// tcg_gen_ext_i32_ptr(ld_addr, (TCGv_i32)base_reg);
//// tcg_gen_add_ptr(ld_addr, ld_addr, tcg_env);
////// tcg_gen_add_ptr(ld_addr, (TCGv_ptr)base_reg, tcg_env);
//
// /*
// Need to add a function for the load? See other GVecGen operations used around in the targets. Any load/store?
// */
//
// /* The size of the elements (8,16,32,64 bits) doesn't seem to be used to select the
// * appropriate host vector in tcg/tcg-op-gvec.c:choose_vector_type. */
//
// if (is_load) {
// tcg_gen_gvec_ld(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
// //tcg_gen_gvec_ld(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
// } else {
// tcg_gen_gvec_st(dest, ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
// //tcg_gen_gvec_st(vreg_ofs(s, vd), ld_addr, max_elems, max_elems /*FIXME: other parameters needed*/);
// }

// Original call to the C helper function that we want to avoid.
// fn(dest, base, tcg_env, desc);
Expand Down
41 changes: 31 additions & 10 deletions tcg/tcg-op-gvec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1181,51 +1181,72 @@ static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
}
}

/*
 * Copy loop for the vector "load" direction: for each offset i in
 * [0, oprsz), advancing by tysz, load a vector temporary of the given type
 * from ptr at offset i and store it to the destination at the same
 * relative offset.
 *
 * NOTE(review): two signatures and two stores are visible below -- one
 * addressing the destination as tcg_env + vofs + i, the other as reg + i.
 * This looks like the pre- and post-change lines of a diff shown together;
 * confirm which variant is live before relying on this block.
 */
static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr,
//static void expand_vec_ld_r(uint32_t vofs, TCGv_ptr ptr,
static void expand_vec_ld_r(TCGv_ptr reg, TCGv_ptr ptr,
uint32_t oprsz, uint32_t tysz, TCGType type)
{
/* Single vector temporary reused for every chunk. */
TCGv_vec t0 = tcg_temp_new_vec(type);
for (uint32_t i = 0; i < oprsz; i += tysz) {
/* Source side: ptr + i. */
tcg_gen_ld_vec(t0, ptr, i);
/* Destination side, offset-based form (tcg_env + vofs + i). */
tcg_gen_st_vec(t0, tcg_env, vofs + i);
/* Destination side, pointer-based form (reg + i). */
tcg_gen_st_vec(t0, reg, i);
// tcg_gen_st_vec(t0, tcg_env, vofs + i);
}
// tcg_gen_ld_vec(t0, ptr, 0);
// tcg_gen_st_vec(t0, tcg_env, vofs);
tcg_temp_free_vec(t0);
}

/*
 * Expand a vector load: pick a vector type with choose_vector_type() and
 * hand the copy to expand_vec_ld_r().
 *
 * Only oprsz == maxsz is handled; oprsz < maxsz reaches
 * g_assert_not_reached() (marked FIXME below).
 *
 * NOTE(review): two signatures and two expand_vec_ld_r() calls are visible
 * (uint32_t vofs vs. TCGv_ptr reg); this looks like old and new diff lines
 * shown together -- confirm which variant is live.
 */
void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr,
//void tcg_gen_gvec_ld(uint32_t vofs, TCGv_ptr ptr,
void tcg_gen_gvec_ld(TCGv_ptr reg, TCGv_ptr ptr,
uint32_t oprsz, uint32_t maxsz)
{
TCGType type;

/* Size/alignment check belonging to the offset-based (vofs) form. */
check_size_align(oprsz, maxsz, vofs);
// TCGv_ptr v0;
// TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

/// v0 = tcg_temp_ebb_new_ptr();
///
/// tcg_gen_addi_ptr(v0, tcg_env, vofs);
/// expand_vec_ld_r(v0, ptr, oprsz, maxsz, type);
///
/// tcg_temp_free_ptr(v0);
/* The check is commented out in the pointer-based form (no vofs there). */
// check_size_align(oprsz, maxsz, vofs);
type = choose_vector_type(NULL, maxsz, oprsz, 0);
/*
 * NOTE(review): maxsz is passed as the tysz argument, so the copy loop in
 * expand_vec_ld_r() advances by maxsz per iteration rather than by the
 * width of the vector type chosen above -- confirm this is intended.
 */
expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type);
// expand_vec_ld_r(vofs, ptr, oprsz, maxsz, type);
expand_vec_ld_r(reg, ptr, oprsz, maxsz, type);

/* No handling yet for the bytes between oprsz and maxsz. */
if (oprsz < maxsz) {
// FIXME: tmp
g_assert_not_reached();
}
}

/*
 * Copy loop for the vector "store" direction: for each offset i in
 * [0, oprsz), advancing by tysz, load a vector temporary of the given type
 * from the register side and store it to ptr at offset i.
 *
 * NOTE(review): two signatures are visible below, and the register side is
 * addressed as tcg_env + vofs + i only in a commented-out line while the
 * live load uses reg + i.  This looks like the pre- and post-change lines
 * of a diff shown together; confirm which signature is live.
 */
static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr,
//static void expand_vec_st_r(uint32_t vofs, TCGv_ptr ptr,
static void expand_vec_st_r(TCGv_ptr reg, TCGv_ptr ptr,
uint32_t oprsz, uint32_t tysz, TCGType type)
{
/* Single vector temporary reused for every chunk. */
TCGv_vec t0 = tcg_temp_new_vec(type);
for (uint32_t i = 0; i < oprsz; i += tysz) {
// tcg_gen_ld_vec(t0, tcg_env, vofs + i);
/* Register side: reg + i. */
tcg_gen_ld_vec(t0, reg, i);
/* Destination side: ptr + i. */
tcg_gen_st_vec(t0, ptr, i);
}
// tcg_gen_ld_vec(t0, tcg_env, vofs );
// tcg_gen_st_vec(t0, ptr, 0);
tcg_temp_free_vec(t0);
}

void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr,
//void tcg_gen_gvec_st(uint32_t vofs, TCGv_ptr ptr,
void tcg_gen_gvec_st(TCGv_ptr reg, TCGv_ptr ptr,
uint32_t oprsz, uint32_t maxsz)
{
TCGType type;

check_size_align(oprsz, maxsz, vofs);
// check_size_align(oprsz, maxsz, vofs);
type = choose_vector_type(NULL, maxsz, oprsz, 0);
expand_vec_st_r(vofs, ptr, oprsz, maxsz, type);
// expand_vec_st_r(vofs, ptr, oprsz, maxsz, type);
expand_vec_st_r(reg, ptr, oprsz, maxsz, type);

if (oprsz < maxsz) {
// FIXME: tmp
Expand Down

0 comments on commit b5e3515

Please sign in to comment.