diff --git a/block/file-posix.c b/block/file-posix.c index 9f6e6279d987..766bbb6cb538 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -280,6 +280,13 @@ static int raw_normalize_devicepath(const char **filename, Error **errp) } #endif +#if defined(CONFIG_IOS) +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) +{ + return -ENOTSUP; /* not supported on iOS */ +} +#else /* CONFIG_IOS */ + /* * Get logical block size via ioctl. On success store it in @sector_size_p. */ @@ -313,6 +320,8 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) return success ? 0 : -errno; } +#endif + /** * Get physical block size of @fd. * On success, store it in @blk_size and return 0. @@ -1449,12 +1458,24 @@ static bool preadv_present = true; static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* preadv introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return preadv(fd, iov, nr_iov, offset); } static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* pwritev introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return pwritev(fd, iov, nr_iov, offset); } diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 8dbf741ee4ca..97bc3ceac3af 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -676,7 +676,7 @@ size_t qemu_get_host_physmem(void); /** * Platforms which do not support system() return ENOSYS */ -#ifndef HAVE_SYSTEM_FUNCTION +#if !defined(HAVE_SYSTEM_FUNCTION) || defined(CONFIG_IOS) #define system platform_does_not_support_system static inline int platform_does_not_support_system(const char *command) { diff --git a/meson.build b/meson.build index ab6a60d1a87e..5fd46123defe 100644 --- a/meson.build +++ b/meson.build @@ -294,6 +294,7 @@ add_project_arguments('-iquote', '.', if host_machine.system() == 'darwin' add_languages('objc', required: false, native: false) + add_project_link_arguments(['-fvisibility-inlines-hidden', '-Xlinker', '-no_deduplicate'], native: false, language: ['c', 'cpp', 'objc']) endif sparse = find_program('cgcc', required: get_option('sparse')) @@ -455,6 +456,8 @@ if targetos == 'netbsd' endif endif +tcti_gadgets = files() + tcg_arch = host_arch if get_option('tcg').allowed() if host_arch == 'unknown' @@ -483,14 +486,77 @@ if get_option('tcg').allowed() # Tell our compiler how to generate our TCTI gadgets. 
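# Annotation (not part of the patch): the block/file-posix.c hunks above gate
# preadv()/pwritev() behind a runtime __builtin_available() check; on macOS < 11
# (and the matching iOS/watchOS/tvOS releases) the wrappers clear preadv_present
# and return -ENOSYS, so QEMU's existing non-vectored read/write fallback in
# that file is expected to take over. A hedged caller-side sketch of the same
# pattern (the surrounding helper is hypothetical):
#
#     ssize_t n = qemu_preadv(fd, iov, cnt, off);
#     if (n < 0 && !preadv_present) {
#         /* emulate the vectored read on older Darwin releases */
#         if (lseek(fd, off, SEEK_SET) < 0) {
#             return -errno;
#         }
#         n = readv(fd, iov, cnt);
#     }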
gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) - tcti_gadgets = custom_target('tcti-gadgets.c.inc', - output: 'tcti-gadgets.c.inc', - input: gadget_generator, - command: [find_program(gadget_generator), '@OUTPUT@'], - build_by_default: true, - build_always_stale: false) - - genh += tcti_gadgets + tcti_sources = [ + 'tcti_gadgets.h', + 'tcti_misc_gadgets.c', + 'tcti_misc_gadgets.h', + 'tcti_setcond_gadgets.c', + 'tcti_setcond_gadgets.h', + 'tcti_brcond_gadgets.c', + 'tcti_brcond_gadgets.h', + 'tcti_mov_gadgets.c', + 'tcti_mov_gadgets.h', + 'tcti_load_signed_gadgets.c', + 'tcti_load_signed_gadgets.h', + 'tcti_load_unsigned_gadgets.c', + 'tcti_load_unsigned_gadgets.h', + 'tcti_store_gadgets.c', + 'tcti_store_gadgets.h', + 'tcti_arithmetic_gadgets.c', + 'tcti_arithmetic_gadgets.h', + 'tcti_logical_gadgets.c', + 'tcti_logical_gadgets.h', + 'tcti_extension_gadgets.c', + 'tcti_extension_gadgets.h', + 'tcti_bitwise_gadgets.c', + 'tcti_bitwise_gadgets.h', + 'tcti_byteswap_gadgets.c', + 'tcti_byteswap_gadgets.h', + 'tcti_qemu_ld_aligned_signed_le_gadgets.c', + 'tcti_qemu_ld_aligned_signed_le_gadgets.h', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.c', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.h', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.c', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.h', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.c', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.h', + 'tcti_qemu_ld_aligned_be_gadgets.c', + 'tcti_qemu_ld_aligned_be_gadgets.h', + 'tcti_qemu_ld_unaligned_be_gadgets.c', + 'tcti_qemu_ld_unaligned_be_gadgets.h', + 'tcti_qemu_ld_slowpath_be_gadgets.c', + 'tcti_qemu_ld_slowpath_be_gadgets.h', + 'tcti_qemu_st_aligned_le_gadgets.c', + 'tcti_qemu_st_aligned_le_gadgets.h', + 'tcti_qemu_st_unaligned_le_gadgets.c', + 'tcti_qemu_st_unaligned_le_gadgets.h', + 'tcti_qemu_st_slowpath_le_gadgets.c', + 'tcti_qemu_st_slowpath_le_gadgets.h', + 'tcti_qemu_st_aligned_be_gadgets.c', + 'tcti_qemu_st_aligned_be_gadgets.h', + 'tcti_qemu_st_unaligned_be_gadgets.c', + 'tcti_qemu_st_unaligned_be_gadgets.h', + 'tcti_qemu_st_slowpath_be_gadgets.c', + 'tcti_qemu_st_slowpath_be_gadgets.h', + 'tcti_simd_base_gadgets.c', + 'tcti_simd_base_gadgets.h', + 'tcti_simd_arithmetic_gadgets.c', + 'tcti_simd_arithmetic_gadgets.h', + 'tcti_simd_logical_gadgets.c', + 'tcti_simd_logical_gadgets.h', + 'tcti_simd_immediate_gadgets.c', + 'tcti_simd_immediate_gadgets.h', + ] + tcti_gadgets = custom_target('tcti-gadgets.h', + output: tcti_sources, + input: gadget_generator, + command: [find_program(gadget_generator)], + build_by_default: true, + build_always_stale: false) elif host_arch == 'x86_64' tcg_arch = 'i386' elif host_arch == 'ppc64' @@ -3157,6 +3223,11 @@ if get_option('b_lto') endif common_ss.add(pagevary) specific_ss.add(files('page-vary.c')) +specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c')) + +# FIXME: This is being used for now for development quickness, but these realy should be +# added to a gadget-specific shared library (tcti_ss). 
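# Annotation (not part of the patch): `tcti_gadgets = files()` earlier in this
# hunk gives the variable an empty default, so the specific_ss.add() reference
# below stays a valid (empty) source list on hosts where the TCTI custom_target
# branch is never taken -- the usual Meson idiom for optionally generated
# sources, sketched here with hypothetical names:
#
#     gen_src = files()
#     if host_arch == 'aarch64'
#       gen_src = custom_target(...)
#     endif
#     some_ss.add(when: 'CONFIG_FOO', if_true: gen_src)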
+specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') subdir('disas') diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h index f51b7bcb13e7..a0b91bb320f6 100644 --- a/tcg/aarch64-tcti/tcg-target-con-set.h +++ b/tcg/aarch64-tcti/tcg-target-con-set.h @@ -9,13 +9,24 @@ * Each operand should be a sequence of constraint letters as defined by * tcg-target-con-str.h; the constraint combination is inclusive or. */ + +// Simple register functions. +C_O0_I1(r) C_O0_I2(r, r) C_O0_I3(r, r, r) -C_O0_I4(r, r, r, r) +//C_O0_I4(r, r, r, r) C_O1_I1(r, r) -C_O1_I2(r, 0, r) C_O1_I2(r, r, r) -C_O1_I4(r, r, r, r, r) -C_O2_I1(r, r, r) -C_O2_I2(r, r, r, r) -C_O2_I4(r, r, r, r, r, r) +//C_O1_I4(r, r, r, r, r) +//C_O2_I1(r, r, r) +//C_O2_I2(r, r, r, r) +//C_O2_I4(r, r, r, r, r, r) + +// Vector functions. +C_O1_I1(w, w) +C_O1_I1(w, r) +C_O0_I2(w, r) +C_O1_I1(w, wr) +C_O1_I2(w, w, w) +C_O1_I3(w, w, w, w) +C_O1_I2(w, 0, w) \ No newline at end of file diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h index 87c0f19e9c2e..94d06d3e74a5 100644 --- a/tcg/aarch64-tcti/tcg-target-con-str.h +++ b/tcg/aarch64-tcti/tcg-target-con-str.h @@ -8,4 +8,13 @@ * Define constraint letters for register sets: * REGS(letter, register_mask) */ -REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) +REGS('r', TCG_MASK_GP_REGISTERS) +REGS('w', TCG_MASK_VECTOR_REGISTERS) + +/* + * Define constraint letters for constants: + * CONST(letter, TCG_CT_CONST_* bit set) + */ + +// Simple 64-bit immediates. +CONST('I', 0xFFFFFFFFFFFFFFFF) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index af4cc8d664b9..10d6c4ec1b62 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -22,13 +22,16 @@ * THE SOFTWARE. */ + +// Rich disassembly is nice in theory, but it's -slow-. +//#define TCTI_GADGET_RICH_DISASSEMBLY + #define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 #include "tcg/tcg-ldst.h" -// Grab our gadget definitions. -// FIXME: use the system path instead of hardcoding this? -#include "tcti-gadgets.c.inc" +// Grab our gadget headers. +#include "tcti_gadgets.h" /* Marker for missing code. */ #define TODO() \ @@ -47,64 +50,15 @@ # define tcti_assert(cond) ((void)0) #endif -/* Bitfield n...m (in 32 bit value). */ -#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) - -/** - * Macro that defines a look-up tree for named QEMU_LD gadgets. 
- */ -#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ - case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ - case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ - case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ - case MO_LEUQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ - case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ - case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ - case MO_BEUQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } - - - -/** - * Macro that defines a look-up tree for named QEMU_ST gadgets. - */ -#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ - case MO_LEUQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ - case MO_BEUQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } +/******************************** + * TCG Constraints Definitions * + ********************************/ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) { switch (op) { + case INDEX_op_ld8u_i32: case INDEX_op_ld8s_i32: case INDEX_op_ld16u_i32: @@ -138,6 +92,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_bswap32_i32: case INDEX_op_bswap32_i64: case INDEX_op_bswap64_i64: + case INDEX_op_extrl_i64_i32: + case INDEX_op_extrh_i64_i32: return C_O1_I1(r, r); case INDEX_op_st8_i32: @@ -191,6 +147,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_rotr_i64: case INDEX_op_setcond_i32: case INDEX_op_setcond_i64: + case INDEX_op_clz_i32: + case INDEX_op_clz_i64: + case INDEX_op_ctz_i32: + case INDEX_op_ctz_i64: return C_O1_I2(r, r, r); case INDEX_op_brcond_i32: @@ -204,12 +164,65 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_i64: return C_O0_I3(r, r, r); + // + // Vector ops. 
+ // + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_xor_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_aa64_sshl_vec: + return C_O1_I2(w, w, w); + case INDEX_op_not_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return C_O1_I1(w, w); + case INDEX_op_ld_vec: + case INDEX_op_dupm_vec: + return C_O1_I1(w, r); + case INDEX_op_st_vec: + return C_O0_I2(w, r); + case INDEX_op_dup_vec: + return C_O1_I1(w, wr); + case INDEX_op_or_vec: + case INDEX_op_andc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_and_vec: + case INDEX_op_orc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, w); + case INDEX_op_bitsel_vec: + return C_O1_I3(w, w, w, w); + default: g_assert_not_reached(); } } static const int tcg_target_reg_alloc_order[] = { + + // General purpose registers, in preference-of-allocation order. + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, @@ -218,16 +231,15 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - /* - TCG_REG_R14, // AREG0 - TCG_REG_R15, // SP - */ + + // Note: we do not allocate R14 or R15, as they're used for our + // special-purpose values. + + // We'll use the high 16 vector register; avoiding the call-saved lower ones. + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, }; #if MAX_OPC_PARAM_IARGS != 7 @@ -248,7 +260,7 @@ static const int tcg_target_call_oarg_regs[] = { }; #ifdef CONFIG_DEBUG_TCG -static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { +static const char *const tcg_target_reg_names[TCG_TARGET_GP_REGS] = { "r00", "r01", "r02", @@ -268,6 +280,98 @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { }; #endif +/************************* + * TCG Emitter Helpers * + *************************/ + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEUQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEUQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. + */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEUQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEUQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +#define LOOKUP_SPECIAL_CASE_LDST_GADGET(arg, name, mode) \ + switch(TLB_MASK_TABLE_OFS(get_mmuidx(arg))) { \ + case -32: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off32_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off32_i64; \ + break; \ + case -48: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off48_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off48_i64; \ + break; \ + case -64: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off64_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off64_i64; \ + break; \ + case -96: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off96_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off96_i64; \ + break; \ + case -128: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off128_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off128_i64; \ + break;\ + default: \ + gadget = gadget_qemu_ ## name ## _slowpath_ ## mode ## _off0_i64; \ + break; \ + } + + static bool patch_reloc(tcg_insn_unit *code_ptr, int type, intptr_t value, intptr_t addend) { @@ -363,48 +467,51 @@ tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, /* Write gadget pointer. 
*/ -static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +static void tcg_out_gadget(TCGContext *s, const void *gadget) { tcg_out_immediate(s, (tcg_target_ulong)gadget); } /* Write gadget pointer, plus 64b immediate. */ -static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +static void tcg_out_imm64_gadget(TCGContext *s, const void *gadget, tcg_target_ulong immediate) { - tcg_out_nullary_gadget(s, gadget); + tcg_out_gadget(s, gadget); tcg_out64(s, immediate); } /* Write gadget pointer (one register). */ -static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +static void tcg_out_unary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS], unsigned reg0) { - tcg_out_nullary_gadget(s, gadget_base[reg0]); + tcg_out_gadget(s, gadget_base[reg0]); } /* Write gadget pointer (two registers). */ -static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +static void tcg_out_binary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); + tcg_out_gadget(s, gadget_base[reg0][reg1]); } /* Write gadget pointer (three registers). */ -static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1, unsigned reg2) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); + tcg_out_gadget(s, gadget_base[reg0][reg1][reg2]); } +/*************************** + * TCG Scalar Operations * + ***************************/ /** * Version of our LDST generator that defers to more optimized gadgets selectively. */ -static void tcg_out_ldst_gadget_inner(TCGContext *s, - void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], - void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], +static void tcg_out_ldst_gadget_inner(TCGContext *s, + const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], + const void *gadget_pos_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_shifted_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_neg_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], unsigned reg0, unsigned reg1, uint32_t offset) { int64_t extended_offset = (int32_t)offset; @@ -415,7 +522,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { uint64_t shifted_offset = (extended_offset >> 3); bool aligned_to_8B = ((extended_offset & 0b111) == 0); @@ -425,23 +532,23 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // More optimal case: we have a gadget that directly encodes the argument. 
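// Annotation (not part of the patch): every gadget emission above is a single
// 64-bit pointer written into the bytecode stream (tcg_out_gadget() ->
// tcg_out_immediate()), so the selection logic here exists purely to avoid a
// trailing offset immediate.  Roughly: offsets that fit the pre-generated
// immediate tables (TCTI_GADGET_IMMEDIATE_ARRAY_LEN == 64 entries) use
// gadget_pos_imm[reg0][reg1][offset]; 8-byte-aligned offsets up to 8 * 63 use
// gadget_shifted_imm[reg0][reg1][offset >> 3]; small negative offsets use
// gadget_neg_imm; anything else presumably falls back to the generic
// two-register gadget followed by the offset as an inline immediate.  For
// example, a load at offset 0x130 (304) is 8-byte aligned, so it selects the
// shifted-immediate gadget with index 38 instead of emitting a 64-bit 0x130.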
if (have_optimized_gadget) { - tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + tcg_out_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); return; - } + } // Special case: it's frequent to have low-numbered positive offsets that are aligned // to 16B boundaries else if(aligned_to_8B && have_shifted_gadget) { - tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + tcg_out_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); return; } - } + } else { uint64_t negated_offset = -(extended_offset); // More optimal case: we have a gadget that directly encodes the argument. if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { - tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + tcg_out_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); return; } } @@ -473,40 +580,90 @@ static void tcti_out_label(TCGContext *s, TCGLabel *label) } } -/** - * Generate a register-to-register MOV. - */ + +/* Register to register move using ORR (shifted register with no shift). */ +static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm) +{ + switch(ext) { + case TCG_TYPE_I32: + tcg_out_binary_gadget(s, gadget_mov_i32, rd, rm); + break; + + case TCG_TYPE_I64: + tcg_out_binary_gadget(s, gadget_mov_i64, rd, rm); + break; + + default: + g_assert_not_reached(); + + } +} + + static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - tcg_debug_assert(ret != arg); + TCGReg w_ret = (ret - TCG_REG_V16); + TCGReg w_arg = (arg - TCG_REG_V16); - if (type == TCG_TYPE_I32) { - tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); - } else { - tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + if (ret == arg) { + return true; } + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + + // If this is a GP to GP register mov, issue our standard MOV. + if (ret < 32 && arg < 32) { + tcg_out_movr(s, type, ret, arg); + break; + } + // If this is a vector register to GP, issue a UMOV. + else if (ret < 32) { + void *gadget = (type == TCG_TYPE_I32) ? gadget_umov_s0 : gadget_umov_d0; + tcg_out_binary_gadget(s, gadget, ret, w_arg); + break; + } + + // If this is a GP to vector move, insert the vealue using INS. + else if (arg < 32) { + void *gadget = (type == TCG_TYPE_I32) ? gadget_ins_s0 : gadget_ins_d0; + tcg_out_binary_gadget(s, gadget, w_ret, arg); + break; + } + /* FALLTHRU */ + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_d, w_ret, w_arg, w_arg); + break; + + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_q, w_ret, w_arg, w_arg); + break; + default: + g_assert_not_reached(); + } return true; } + static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) { bool is_negative = (arg < 0); // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i32[t0][arg]); return; } - } - else { - } // Emit the mov and its immediate. @@ -521,16 +678,13 @@ static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. 
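// Annotation (not part of the patch): the positive/negative split below mirrors
// the load/store path above -- small non-negative constants (within the
// pre-generated gadget_movi_imm_i32[reg] table) are encoded entirely in the
// gadget pointer, while every other value falls through to a generic movi
// gadget followed by the constant as an inline 64-bit immediate.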
- if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i64[t0][arg]); return; } - } - else { - } // TODO: optimize the negative case, too? @@ -558,7 +712,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long */ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) { - tcg_out_nullary_gadget(s, gadget_call); + tcg_out_gadget(s, gadget_call); tcg_out64(s, (uintptr_t)arg); } @@ -570,9 +724,9 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); } } @@ -598,7 +752,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // to patch our gadget stream with the target address, later. if (s->tb_jmp_insn_offset) { // Emit our gadget. - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); // Place our current instruction into our "relocation table", so it can // be patched once we know where the branch will target... @@ -617,7 +771,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Simple branch. case INDEX_op_br: - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); tcti_out_label(s, arg_label(args[0])); break; @@ -678,41 +832,41 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case INDEX_op_ld8u_i32: case INDEX_op_ld8u_i64: - tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i32: - tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i64: - tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld16u_i32: case INDEX_op_ld16u_i64: - tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i32: - tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i64: - tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld_i32: case INDEX_op_ld32u_i64: - tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); break; case INDEX_op_ld_i64: - tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); break; - + case INDEX_op_ld32s_i64: - tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); break; @@ -721,155 +875,169 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con */ case INDEX_op_st8_i32: case INDEX_op_st8_i64: - tcg_out_ldst_gadget(s, 
gadget_st8, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); break; case INDEX_op_st16_i32: case INDEX_op_st16_i64: - tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); break; case INDEX_op_st_i32: case INDEX_op_st32_i64: - tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); break; case INDEX_op_st_i64: - tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); break; /** * Arithmetic instructions. */ - case INDEX_op_add_i32: - tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); break; case INDEX_op_sub_i32: - tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); break; case INDEX_op_mul_i32: - tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nand_i32: /* Optional (TCG_TARGET_HAS_nand_i32). */ + tcg_out_ternary_gadget(s, gadget_nand_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nor_i32: /* Optional (TCG_TARGET_HAS_nor_i32). */ + tcg_out_ternary_gadget(s, gadget_nor_i32, args[0], args[1], args[2]); break; case INDEX_op_and_i32: - tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); break; case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ - tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); break; case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); break; case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); break; case INDEX_op_or_i32: - tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); break; case INDEX_op_xor_i32: - tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); break; case INDEX_op_shl_i32: - tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); break; case INDEX_op_shr_i32: - tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); break; case INDEX_op_sar_i32: - tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); break; - //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). 
*/ + tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + break; - //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + break; case INDEX_op_add_i64: - tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); break; case INDEX_op_sub_i64: - tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); break; case INDEX_op_mul_i64: - tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); break; case INDEX_op_and_i64: - tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); break; case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ - tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); break; case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); break; case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ + tcg_out_ternary_gadget(s, gadget_nand_i64, args[0], args[1], args[2]); break; - //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ - //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + tcg_out_ternary_gadget(s, gadget_nor_i64, args[0], args[1], args[2]); + break; case INDEX_op_or_i64: - tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); break; case INDEX_op_xor_i64: - tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); break; case INDEX_op_shl_i64: - tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); break; case INDEX_op_shr_i64: - tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); break; case INDEX_op_sar_i64: - tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); break; - //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + break; - //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + break; case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); break; case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); break; case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); break; case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); break; case INDEX_op_brcond_i64: @@ -898,7 +1066,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); - last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. tcti_out_label(s, arg_label(args[3])); @@ -928,6 +1096,14 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]); break; + case INDEX_op_clz_i64: /* Optional (TCG_TARGET_HAS_clz_i64). */ + tcg_out_ternary_gadget(s, gadget_clz_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i64: /* Optional (TCG_TARGET_HAS_ctz_i64). */ + tcg_out_ternary_gadget(s, gadget_ctz_i64, args[0], args[1], args[2]); + break; + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); break; @@ -956,10 +1132,26 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); break; + case INDEX_op_extrl_i64_i32: + tcg_out_binary_gadget(s, gadget_extrl, args[0], args[1]); + break; + + case INDEX_op_extrh_i64_i32: + tcg_out_binary_gadget(s, gadget_extrh, args[0], args[1]); + break; + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); break; + case INDEX_op_clz_i32: /* Optional (TCG_TARGET_HAS_clz_i32). */ + tcg_out_ternary_gadget(s, gadget_clz_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i32: /* Optional (TCG_TARGET_HAS_ctz_i32). */ + tcg_out_ternary_gadget(s, gadget_ctz_i32, args[0], args[1], args[2]); + break; + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */ tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); break; @@ -973,19 +1165,19 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con break; case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); break; case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). 
*/ - tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); break; case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); break; case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); break; case INDEX_op_brcond_i32: @@ -1014,7 +1206,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]); - last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. tcti_out_label(s, arg_label(args[3])); @@ -1031,6 +1223,8 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1038,7 +1232,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con } // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b break; @@ -1052,43 +1246,31 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. - if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. + if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. 
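// Annotation (not part of the patch): args[2] is a MemOpIdx, encoded as
// (memop << 4) | mmu_idx, which is what makes the literal comparisons above
// meaningful.  Decoding the special-cased values:
//     0x02 -> MO_UB,    mmu_idx 2   ("ub",  mode02 gadget family)
//     0x32 -> MO_LEUQ,  mmu_idx 2   ("leq", mode32 gadget family)
//     0x3a -> MO_LEUQ,  mmu_idx 10  ("leq", mode3a gadget family)
// Those combinations get fully pre-specialised gadgets with no trailing
// 64-bit immediate; everything else takes the generic lookup below.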
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; } + // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b } @@ -1105,6 +1287,8 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1113,7 +1297,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded break; @@ -1127,36 +1311,23 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. - if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off64_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off96_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off128_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. + if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. 
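// Annotation (not part of the patch): TLB_MASK_TABLE_OFS(mmu_idx) is the
// (negative) offset of the per-mmu-index CPUTLBDescFast {mask, table} pair
// relative to the env pointer; its value depends on the guest's number of MMU
// modes, so the generator pre-builds one gadget family per offset that can
// occur (-32/-48/-64/-96/-128 here), and any other offset drops to the
// "slowpath" family, which presumably performs a direct helper call rather
// than an inline TLB probe.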
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; @@ -1165,7 +1336,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded } @@ -1183,7 +1354,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con [TCG_MO_LD_ST] = gadget_mb_ld, [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, }; - tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + tcg_out_gadget(s, sync[args[0] & TCG_MO_ALL]); break; } @@ -1203,9 +1374,9 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, intptr_t arg2) { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); } } @@ -1221,19 +1392,629 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) return ct & TCG_CT_CONST; } +/*************************** + * TCG Vector Operations * + ***************************/ + +// +// Helper for emitting DUPI (immediate DUP) instructions. +// +#define tcg_out_dupi_gadget(s, name, q, rd, op, cmode, arg) \ + if (q) { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q1[rd][arg]); \ + } else { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q0[rd][arg]); \ + } + + +// +// Helpers for emitting D/Q variant instructions. +// +#define tcg_out_dq_gadget(s, name, arity, is_q, args...) \ + if (is_q) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _q, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _d, args); \ + } + +#define tcg_out_unary_dq_gadget(s, name, is_q, a) \ + tcg_out_dq_gadget(s, name, unary, is_q, a) +#define tcg_out_binary_dq_gadget(s, name, is_q, a, b) \ + tcg_out_dq_gadget(s, name, binary, is_q, a, b) +#define tcg_out_ternary_dq_gadget(s, name, is_q, a, b, c) \ + tcg_out_dq_gadget(s, name, ternary, is_q, a, b, c) + + +// +// Helper for emitting the gadget appropriate for a vector's size. +// +#define tcg_out_sized_vector_gadget(s, name, arity, vece, args...) 
\ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + case MO_64: \ + if (type == TCG_TYPE_V128) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2d, args); \ + } \ + else { \ + g_assert_not_reached(); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } +#define tcg_out_sized_vector_gadget_no64(s, name, arity, vece, args...) \ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } + + +#define tcg_out_unary_vector_gadget(s, name, vece, a) \ + tcg_out_sized_vector_gadget(s, name, unary, vece, a) +#define tcg_out_binary_vector_gadget(s, name, vece, a, b) \ + tcg_out_sized_vector_gadget(s, name, binary, vece, a, b) +#define tcg_out_ternary_vector_gadget(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget(s, name, ternary, vece, a, b, c) + +#define tcg_out_ternary_vector_gadget_no64(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget_no64(s, name, ternary, vece, a, b, c) + + +#define tcg_out_ternary_vector_gadget_with_scalar(s, name, is_scalar, vece, a, b, c) \ + if (is_scalar) { \ + tcg_out_ternary_gadget(s, gadget_ ## name ## _scalar, w0, w1, w2); \ + } else { \ + tcg_out_ternary_vector_gadget(s, name, vece, w0, w1, w2); \ + } + + +/* Return true if v16 is a valid 16-bit shifted immediate. */ +static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) +{ + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + return false; +} + + +/** Core vector operation emission. */ +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned vece, + const TCGArg args[TCG_MAX_OP_ARGS], const int const_args[TCG_MAX_OP_ARGS]) +{ + TCGType type = vecl + TCG_TYPE_V64; + TCGArg r0, r1, r2, r3, w0, w1, w2, w3; + + // Typing flags for vector operations. + bool is_v128 = (type == TCG_TYPE_V128); + bool is_scalar = !is_v128 && (vece == MO_64); + + // Argument shortcuts. + r0 = args[0]; + r1 = args[1]; + r2 = args[2]; + r3 = args[3]; + + // Offset argument shortcuts; offset to convert register numbers to gadget numberes. + w0 = args[0] - TCG_REG_V16; + w1 = args[1] - TCG_REG_V16; + w2 = args[2] - TCG_REG_V16; + w3 = args[3] - TCG_REG_V16; + + // Argument shortcuts, as signed. 
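// Annotation (not part of the patch): the w0..w3 values computed below rebase
// vector registers so that V16..V31 index the gadget tables as 0..15.  The
// sized macros above then pick the arrangement-specific gadget; for example,
// with type == TCG_TYPE_V128 and vece == MO_32,
//     tcg_out_ternary_vector_gadget(s, add, vece, w0, w1, w2)
// expands to tcg_out_ternary_gadget(s, gadget_add_4s, w0, w1, w2), while a
// TCG_TYPE_V64 operand of the same element size selects gadget_add_2s.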
+ int64_t signed_offset_arg = (int32_t)args[2]; + + switch (opc) { + + // Load memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_ld_vec: + tcg_out_binary_dq_gadget(s, ldr, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Store memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_st_vec: + tcg_out_binary_dq_gadget(s, str, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Duplciate memory to all vector elements. + case INDEX_op_dupm_vec: + // DUPM handles normalization itself; pass arguments raw. + tcg_out_dupm_vec(s, type, vece, r0, r1, r2); + break; + + case INDEX_op_add_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, add, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_mul_vec: // optional + tcg_out_ternary_vector_gadget_no64(s, mul, vece, w0, w1, w2); + break; + + case INDEX_op_neg_vec: // optional + tcg_out_binary_vector_gadget(s, neg, vece, w0, w1); + break; + + case INDEX_op_abs_vec: // optional + tcg_out_binary_vector_gadget(s, abs, vece, w0, w1); + break; + + case INDEX_op_and_vec: // optional + tcg_out_ternary_dq_gadget(s, and, is_v128, w0, w1, w2); + break; + + case INDEX_op_or_vec: + tcg_out_ternary_dq_gadget(s, or, is_v128, w0, w1, w2); + break; + + case INDEX_op_andc_vec: + tcg_out_ternary_dq_gadget(s, andc, is_v128, w0, w1, w2); + break; + + case INDEX_op_orc_vec: // optional + tcg_out_ternary_dq_gadget(s, orc, is_v128, w0, w1, w2); + break; + + case INDEX_op_xor_vec: + tcg_out_ternary_dq_gadget(s, xor, is_v128, w0, w1, w2); + break; + + case INDEX_op_ssadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ssadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sssub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sssub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_usadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, usadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_ussub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ussub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_smax_vec: + tcg_out_ternary_vector_gadget_no64(s, smax, vece, w0, w1, w2); + break; + + case INDEX_op_smin_vec: + tcg_out_ternary_vector_gadget_no64(s, smin, vece, w0, w1, w2); + break; + + case INDEX_op_umax_vec: + tcg_out_ternary_vector_gadget_no64(s, umax, vece, w0, w1, w2); + break; + + case INDEX_op_umin_vec: + tcg_out_ternary_vector_gadget_no64(s, umin, vece, w0, w1, w2); + break; + + case INDEX_op_not_vec: // optional + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w1); + break; + + case INDEX_op_shlv_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, shlv, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_aa64_sshl_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sshl, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_cmp_vec: + switch (args[3]) { + case TCG_COND_EQ: + tcg_out_ternary_vector_gadget_with_scalar(s, cmeq, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_NE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmeq, is_scalar, vece, w0, w1, w2); + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w0); + break; + case TCG_COND_GT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, 
is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w2, w1); + break; + default: + g_assert_not_reached(); + } + break; + + case INDEX_op_bitsel_vec: // optional + { + if (r0 == r3) { + tcg_out_ternary_dq_gadget(s, bit, is_v128, w0, w2, w1); + } else if (r0 == r2) { + tcg_out_ternary_dq_gadget(s, bif, is_v128, w0, w3, w1); + } else { + if (r0 != r1) { + tcg_out_mov(s, type, r0, r1); + } + tcg_out_ternary_dq_gadget(s, bsl, is_v128, w0, w2, w3); + } + break; + } + + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ + case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ + default: + g_assert_not_reached(); + } +} + + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_not_vec: + case INDEX_op_cmp_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + case INDEX_op_shlv_vec: + case INDEX_op_bitsel_vec: + return 1; + case INDEX_op_rotli_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_rotlv_vec: + case INDEX_op_rotrv_vec: + return -1; + case INDEX_op_mul_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: + return vece < MO_64; + + default: + return 0; + } +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + va_list va; + TCGv_vec v0, v1, v2, t1, t2, c1; + TCGArg a2; + + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); + a2 = va_arg(va, TCGArg); + va_end(va); + + switch (opc) { + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + /* Right shifts are negative left shifts for AArch64. */ + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + tcg_gen_neg_vec(vece, t1, v2); + opc = (opc == INDEX_op_shrv_vec + ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec); + vec_gen_3(opc, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotlv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_sub_vec(vece, t1, v2, c1); + /* Right shifts are negative left shifts for AArch64. 
*/ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); + tcg_gen_or_vec(vece, v0, v0, t1); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotrv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_neg_vec(vece, t1, v2); + tcg_gen_sub_vec(vece, t2, c1, v2); + /* Right shifts are negative left shifts for AArch64. */ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2), + tcgv_vec_arg(v1), tcgv_vec_arg(t2)); + tcg_gen_or_vec(vece, v0, t1, t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + default: + g_assert_not_reached(); + } +} + + +/* Generate DUPI (move immediate) vector ops. */ +static bool tcg_out_optimized_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + bool q = (type == TCG_TYPE_V128); + int cmode, imm8, i; + + // If we're copying an 8b immediate, we implicitly have a simple gadget for this, + // since there are only 256 possible values * 16 registers. Emit a MOVI gadget directly. + if (vece == MO_8) { + imm8 = (uint8_t)v64; + tcg_out_dupi_gadget(s, movi, q, rd, 0, e, imm8); + return true; + } + + // Otherwise, if we have a value that's all 0x00 and 0xFF bytes, + // we can use the scalar variant of MOVI (op=1, cmode=e), which handles + // that case directly. + for (i = imm8 = 0; i < 8; i++) { + uint8_t byte = v64 >> (i * 8); + if (byte == 0xff) { + imm8 |= 1 << i; + } else if (byte != 0) { + goto fail_bytes; + } + } + tcg_out_dupi_gadget(s, movi, q, rd, 1, e, imm8); + return true; + fail_bytes: + + // Handle 16B moves. + if (vece == MO_16) { + uint16_t v16 = v64; + + // Check to see if we have a value representable as a MOV imm8, possibly via a shift. + if (is_shimm16(v16, &cmode, &imm8)) { + // Output the correct instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). + if (cmode == 0x8) { + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, movi, q, rd, 0, a, imm8); + } + return true; + } + + // Check to see if we have a value representable as an inverted MOV imm8, possibly via a shift. + if (is_shimm16(~v16, &cmode, &imm8)) { + // Output the correct instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). + if (cmode == 0x8) { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, a, imm8); + } + return true; + } + + // If we can't perform either of the optimizations, we'll need to do this in two steps. + // Normally, we'd emit a single gadget that handles both steps, but that'd result in needing -way- + // too many gadgets. We'll emit two gadgets, instead. + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, v16 & 0xff); + tcg_out_dupi_gadget(s, orr, q, rd, 0, a, v16 >> 8); + return true; + } + + // FIXME: implement 32B move optimizations + + + // Try to create optimized 32B moves. 
+ //else if (vece == MO_32) { + // uint32_t v32 = v64; + // uint32_t n32 = ~v32; + + // if (is_shimm32(v32, &cmode, &imm8) || + // is_soimm32(v32, &cmode, &imm8) || + // is_fimm32(v32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // return; + // } + // if (is_shimm32(n32, &cmode, &imm8) || + // is_soimm32(n32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // return; + // } + + // // + // // Restrict the set of constants to those we can load with + // // two instructions. Others we load from the pool. + // // + // i = is_shimm32_pair(v32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8)); + // return; + // } + // i = is_shimm32_pair(n32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8)); + // return; + // } + //} + + return false; +} + + +/* Emits instructions that can load an immediate into a vector. */ +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + // Convert Rd into a simple gadget number. + rd = rd - (TCG_REG_V16); + + // First, try to create an optimized implementation, if possible. + if (tcg_out_optimized_dupi_vec(s, type, vece, rd, v64)) { + return; + } + + // If we didn't, we'll need to load the full vector from memory. + // Emit it into our bytecode stream as an immediate; which we'll then + // load inside the gadget. + if (type == TCG_TYPE_V128) { + tcg_out_unary_gadget(s, gadget_ldi_q, rd); + tcg_out64(s, v64); + tcg_out64(s, v64); + } else { + tcg_out_unary_gadget(s, gadget_ldi_d, rd); + tcg_out64(s, v64); + } +} + + +/* Emits instructions that can load a register into a vector. */ +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, TCGReg rs) +{ + // Compute the gadget index for the relevant vector register. + TCGReg wd = rd - (TCG_REG_V16); + + // Emit a DUP gadget to handles the operation. + tcg_out_binary_vector_gadget(s, dup, vece, wd, rs); + return true; +} + +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg r, TCGReg base, intptr_t offset) +{ + int64_t extended_offset = (int32_t)offset; + + // Convert the register into a simple register number for our gadgets. + r = r - TCG_REG_V16; + + // Emit a DUPM gadget... + tcg_out_binary_vector_gadget(s, dupm, vece, r, base); + + // ... and emit its int64 immediate offset. + tcg_out64(s, extended_offset); + + return true; +} + + +/******************************** + * TCG Runtime & Platform Def * + *******************************/ + static void tcg_target_init(TCGContext *s) { /* The current code uses uint8_t for tcg operations. */ tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); - /* Registers available for 32 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; - /* Registers available for 64 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; - - /* TODO: Which registers should be set here? */ - tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + // Registers available for each type of operation. 
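+    // TCG_MASK_GP_REGISTERS covers R0-R15 and TCG_MASK_VECTOR_REGISTERS covers V16-V31;
+    // both masks are defined alongside the register enum in tcg-target.h, and match the
+    // registers our gadgets are actually specialized over.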
+ tcg_target_available_regs[TCG_TYPE_I32] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_I64] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V64] = TCG_MASK_VECTOR_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V128] = TCG_MASK_VECTOR_REGISTERS; + + TCGReg unclobbered_registers[] = { + // We don't use registers R16+ in our runtime, so we'll not bother protecting them. + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Per our calling convention. + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + }; + + // Specify which registers are clobbered during call. + tcg_target_call_clobber_regs = -1ull; + for (unsigned i = 0; i < ARRAY_SIZE(unclobbered_registers); ++i) { + tcg_regset_reset_reg(tcg_target_call_clobber_regs, unclobbered_registers[i]); + } + // Specify which local registers we're reserving. + // + // Note that we only have to specify registers that are used in the runtime, + // and so not e.g. the register that contains AREG0, which can never be allocated. s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); @@ -1292,8 +2073,8 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ : [return_value] "=m" (return_value) - : [areg0] "m" (env), - [sp_value] "m" (sp_value), + : [areg0] "m" (env), + [sp_value] "m" (sp_value), [start_tb_ptr] "m" (v_tb_ptr), [pc_mirror] "m" (pc_mirror) @@ -1318,8 +2099,11 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ /* Disassemble TCI bytecode. */ int print_insn_tcti(bfd_vma addr, disassemble_info *info) { + +#ifdef TCTI_GADGET_RICH_DISASSEMBLY Dl_info symbol_info = {}; char symbol_name[48] ; +#endif int status; uint64_t block; @@ -1331,16 +2115,22 @@ int print_insn_tcti(bfd_vma addr, disassemble_info *info) return -1; } +#ifdef TCTI_GADGET_RICH_DISASSEMBLY // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. dladdr((void *)block, &symbol_info); if(symbol_info.dli_sname != 0) { - strlcpy(symbol_name, symbol_info.dli_sname, 47); + strncpy(symbol_name, symbol_info.dli_sname, sizeof(symbol_name)); + symbol_name[sizeof(symbol_name) - 1] = 0; info->fprintf_func(info->stream, "%s", symbol_name); } else { - info->fprintf_func(info->stream, "%016llx", block); + info->fprintf_func(info->stream, "%016lx", block); } +#else + info->fprintf_func(info->stream, "%016lx", block); +#endif + return sizeof(block); } diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index 7eb3bb1c3d94..bf4e7e2772b9 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -56,8 +56,11 @@ // weird psuedo-native bytecode. We'll indicate that we're intepreted. #define TCG_TARGET_INTERPRETER 1 +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + // -// Supported optional instructions. +// Supported optional scalar instructions. // // Divs. @@ -78,23 +81,35 @@ #define TCG_TARGET_HAS_ext16u_i64 1 #define TCG_TARGET_HAS_ext32u_i64 1 -// Logicals. +// Register extractions. +#define TCG_TARGET_HAS_extrl_i64_i32 1 +#define TCG_TARGET_HAS_extrh_i64_i32 1 + +// Negations. #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 +// Logicals. 
#define TCG_TARGET_HAS_andc_i32 1 #define TCG_TARGET_HAS_orc_i32 1 #define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_rot_i32 1 +#define TCG_TARGET_HAS_nand_i32 1 +#define TCG_TARGET_HAS_nor_i32 1 #define TCG_TARGET_HAS_andc_i64 1 #define TCG_TARGET_HAS_eqv_i64 1 #define TCG_TARGET_HAS_orc_i64 1 +#define TCG_TARGET_HAS_rot_i64 1 +#define TCG_TARGET_HAS_nor_i64 1 +#define TCG_TARGET_HAS_nand_i64 1 -// We don't curretly support rotates, since AArch64 lacks ROL. -// We'll fix this later. -#define TCG_TARGET_HAS_rot_i32 0 -#define TCG_TARGET_HAS_rot_i64 0 +// Bitwise operations. +#define TCG_TARGET_HAS_clz_i32 1 +#define TCG_TARGET_HAS_ctz_i32 1 +#define TCG_TARGET_HAS_clz_i64 1 +#define TCG_TARGET_HAS_ctz_i64 1 // Swaps. #define TCG_TARGET_HAS_bswap16_i32 1 @@ -104,53 +119,58 @@ #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_MEMORY_BSWAP 1 -// Specify we'll handle direct jumps. -#define TCG_TARGET_HAS_direct_jump 1 - // -// Potential TODOs. +// Supported optional vector instructions. // -// TODO: implement DEPOSIT as BFI. -#define TCG_TARGET_HAS_deposit_i32 0 -#define TCG_TARGET_HAS_deposit_i64 0 - -// TODO: implement EXTRACT as BFX. -#define TCG_TARGET_HAS_extract_i32 0 -#define TCG_TARGET_HAS_sextract_i32 0 -#define TCG_TARGET_HAS_extract_i64 0 -#define TCG_TARGET_HAS_sextract_i64 0 - -// TODO: it might be worth writing a gadget for this -#define TCG_TARGET_HAS_movcond_i32 0 -#define TCG_TARGET_HAS_movcond_i64 0 +#define TCG_TARGET_HAS_v64 1 +#define TCG_TARGET_HAS_v128 1 +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_abs_vec 1 +#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_rots_vec 0 +#define TCG_TARGET_HAS_rotv_vec 0 +#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 +#define TCG_TARGET_HAS_sat_vec 1 +#define TCG_TARGET_HAS_minmax_vec 1 +#define TCG_TARGET_HAS_bitsel_vec 1 +#define TCG_TARGET_HAS_cmpsel_vec 0 // // Unsupported instructions. // -// ARMv8 doesn't have instructions for NAND/NOR. -#define TCG_TARGET_HAS_nand_i32 0 -#define TCG_TARGET_HAS_nor_i32 0 -#define TCG_TARGET_HAS_nor_i64 0 -#define TCG_TARGET_HAS_nand_i64 0 - -// aarch64's CLZ is implemented without a condition, so it -#define TCG_TARGET_HAS_clz_i32 0 -#define TCG_TARGET_HAS_ctz_i32 0 +// There's no direct instruction with which to count the number of ones, +// so we'll leave this implemented as other instructions. #define TCG_TARGET_HAS_ctpop_i32 0 -#define TCG_TARGET_HAS_clz_i64 0 -#define TCG_TARGET_HAS_ctz_i64 0 #define TCG_TARGET_HAS_ctpop_i64 0 -// We don't have a simple gadget for this, since we're always assuming softmmu. -#define TCG_TARGET_HAS_qemu_st8_i32 0 - -// No AArch64 equivalent.a -#define TCG_TARGET_HAS_extrl_i64_i32 0 -#define TCG_TARGET_HAS_extrh_i64_i32 0 +// We don't currently support gadgets with more than three arguments, +// so we can't yet create movcond, deposit, or extract gadgets. 
+#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 -#define TCG_TARGET_HAS_extract2_i64 0 +// This operation exists specifically to allow us to provide differing register +// constraints for 8-bit loads and stores. We don't need to do so, so we'll leave +// this unimplemented, as we gain nothing by it. +#define TCG_TARGET_HAS_qemu_st8_i32 0 // These should always be zero on our 64B platform. #define TCG_TARGET_HAS_muls2_i64 0 @@ -166,36 +186,55 @@ #define TCG_TARGET_HAS_muls2_i32 0 #define TCG_TARGET_HAS_muluh_i32 0 #define TCG_TARGET_HAS_mulsh_i32 0 +#define TCG_TARGET_HAS_extract2_i64 0 // // Platform metadata. // // Number of registers available. -// It might make sense to up these, since we can also use x16 -> x25? -#define TCG_TARGET_NB_REGS 16 +#define TCG_TARGET_NB_REGS 64 + +// Number of general purpose registers. +#define TCG_TARGET_GP_REGS 16 /* List of registers which are used by TCG. */ typedef enum { - TCG_REG_R0 = 0, - TCG_REG_R1, - TCG_REG_R2, - TCG_REG_R3, - TCG_REG_R4, - TCG_REG_R5, - TCG_REG_R6, - TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - TCG_REG_R14, - TCG_REG_R15, - - TCG_AREG0 = TCG_REG_R14, - TCG_REG_CALL_STACK = TCG_REG_R15, + + // General purpose registers. + // Note that we name every _host_ register here; but don't + // necessarily use them; that's determined by the allocation order + // and the number of registers setting above. These just give us the ability + // to refer to these by name. + TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3, + TCG_REG_R4, TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, + TCG_REG_R8, TCG_REG_R9, TCG_REG_R10, TCG_REG_R11, + TCG_REG_R12, TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Register aliases. + TCG_AREG0 = TCG_REG_R14, + TCG_REG_CALL_STACK = TCG_REG_R15, + + // Mask that refers to the GP registers. + TCG_MASK_GP_REGISTERS = 0xFFFFul, + + // Vector registers. + TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, + + // Mask that refers to the vector registers. + TCG_MASK_VECTOR_REGISTERS = 0xFFFF000000000000ul, + } TCGReg; // Specify the shape of the stack our runtime will use. diff --git a/tcg/aarch64-tcti/tcg-target.opc.h b/tcg/aarch64-tcti/tcg-target.opc.h new file mode 100644 index 000000000000..26bfd9c46093 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.opc.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2019 Linaro + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. + * + * See the COPYING file in the top-level directory for details. + * + * Target-specific opcodes for host vector expansion. These will be + * emitted by tcg_expand_vec_op. 
For those familiar with GCC internals, + * consider these to be UNSPEC with names. + */ + +DEF(aa64_sshl_vec, 1, 2, 0, IMPLVEC) diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index fa0232fefac0..4e127ff8c3be 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -4,17 +4,10 @@ Generates a C-code include file containing 'gadgets' for use by TCTI. """ +import os import sys import itertools -# Get a handle on the file we'll be working with, and redirect print to it. -if len(sys.argv) > 1: - out_file = open(sys.argv[1], "w") - - # Hook our print function, so it always outputs to the relevant file. - core_print = print - print = lambda *a, **k : core_print(*a, **k, file=out_file) - # Epilogue code follows at the end of each gadget, and handles continuing execution. EPILOGUE = ( # Load our next gadget address from our bytecode stream, advancing it. @@ -32,41 +25,113 @@ # Helper that provides each of the AArch64 condition codes of interest. ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] +# The list of vector size codes supported on this platform. +VECTOR_SIZES = ['16b', '8b', '4h', '8h', '2s', '4s', '2d'] + # We'll create a variety of gadgets that assume the MMU's TLB is stored at certain # offsets into its structure. These should match the offsets in tcg-target.c.in. -QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] +QEMU_ALLOWED_MMU_OFFSETS = [ 32, 48, 64, 96, 128 ] # Statistics. gadgets = 0 instructions = 0 -def simple(name, *lines): +# Files to write to. +current_collection = "basic" +output_files = {} + +# Create a top-level header. +top_header = open("tcti_gadgets.h", "w") +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=top_header) + +def _get_output_files(): + """ Gathers the output C and H files for a given gadget-cluster name. """ + + # If we don't have an output file for this already, create it. + return output_files[current_collection] + + +def START_COLLECTION(name): + """ Sets the name of the current collection. """ + + global current_collection + + # If we already have a collection for this, skip it. + if name in output_files: + return + + # Create the relevant output files + new_c_file = open(f"tcti_{name}_gadgets.c", "w") + new_h_file = open(f"tcti_{name}_gadgets.h", "w") + output_files[name] = (new_c_file, new_h_file) + + # Add the file to our gadget collection. + print(f'#include "tcti_{name}_gadgets.h"', file=top_header) + + # Add generated messages to the relevant collection. + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_c_file) + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_h_file) + + # Start our C file with inclusion of the relevant header. + print(f'\n#include "tcti_{name}_gadgets.h"\n', file=new_c_file) + + # Start our H file with a simple pragma-guard, for speed. + print('\n#pragma once\n', file=new_h_file) + + # Finally, set the global active collection. + current_collection = name + + +def simple(name, *lines, export=True): """ Generates a simple gadget that needs no per-register specialization. """ global gadgets, instructions gadgets += 1 + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + # Create our C/ASM framing. 
- #print(f"__attribute__((naked)) static void gadget_{name}(void)") - print(f"__attribute__((naked)) static void gadget_{name}(void);") - print(f"__attribute__((naked)) static void gadget_{name}(void)") - print("{") + if export: + print(f"__attribute__((naked)) void gadget_{name}(void);", file=h_file) + print(f"__attribute__((naked)) void gadget_{name}(void)", file=c_file) + else: + print(f"static __attribute__((naked)) void gadget_{name}(void)", file=c_file) + + print("{", file=c_file) # Add the core gadget - print("\tasm(") + print("\tasm(", file=c_file) for line in lines + EPILOGUE: - print(f"\t\t\"{line} \\n\"") + print(f"\t\t\"{line} \\n\"", file=c_file) instructions += 1 - print("\t);") + print("\t);", file=c_file) # End our framing. - print("}\n") + print("}\n", file=c_file) + def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): """ Generates a collection of gadgtes with register substitutions. """ + def _expand_op1_immediate(num): + """ Gets a uncompressed bitfield argument for a given immediate; for NEON instructions. + + Duplciates each bit eight times; converting 0b0100 to 0x00FF0000. + """ + + # Get the number as a binary string... + binstring = bin(num)[2:] + + # ... expand out the values to hex... + hex_string = binstring.replace('1', 'FF').replace('0', '00') + + # ... and return out the new constant. + return f"0x{hex_string}" + + def substitutions_for_letter(letter, number, line): """ Helper that transforms Wd => w1, implementing gadget substitutions. """ @@ -74,8 +139,16 @@ def substitutions_for_letter(letter, number, line): line = line.replace(f"X{letter}", f"x{number}") line = line.replace(f"W{letter}", f"w{number}") - # ... immediate substitutions. + # ... vector register substitutions... + line = line.replace(f"V{letter}", f"v{number + 16}") + line = line.replace(f"D{letter}", f"d{number + 16}") + line = line.replace(f"Q{letter}", f"q{number + 16}") + + # ... regular immediate substitutions... line = line.replace(f"I{letter}", f"{number}") + + # ... and compressed immediate substitutions. + line = line.replace(f"S{letter}", f"{_expand_op1_immediate(number)}") return line @@ -105,77 +178,94 @@ def substitutions_for_letter(letter, number, line): # ... and emit the gadget. permutation_id = "_arg".join(str(number) for number in permutation) - simple(f"{name}_arg{permutation_id}", *new_lines) + simple(f"{name}_arg{permutation_id}", *new_lines, export=False) def with_dnm(name, *lines): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ("d", "n", "m"), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for m in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_dn_immediate(name, *lines, immediate_range): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for i in immediate_range: - print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_pair(name, substitutions, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, substitutions, *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # N array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for b in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) def math_dnm(name, mnemonic): @@ -183,10 +273,10 @@ def math_dnm(name, mnemonic): with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") -def math_dn(name, mnemonic): +def math_dn(name, mnemonic, source_is_wn=False): """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. 
""" with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") - with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Wn" if source_is_wn else f"{mnemonic} Xd, Xn") def with_nm(name, *lines): @@ -227,34 +317,44 @@ def with_single(name, substitution, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, (substitution,), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) for n in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{n}", end=", ") + print(f"gadget_{name}_arg{n}", end=", ", file=c_file) - print("};") + print("};", file=c_file) def with_d_immediate(name, *lines, immediate_range=range(0)): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # I array for b in immediate_range: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) @@ -265,31 +365,14 @@ def with_d(name, *lines): # Assembly code for saving our machine state before entering the C runtime. C_CALL_PROLOGUE = [ - # Store our machine state. - "str x25, [sp, #-16]!", "stp x14, x15, [sp, #-16]!", - "stp x12, x13, [sp, #-16]!", - "stp x10, x11, [sp, #-16]!", - "stp x8, x9, [sp, #-16]!", - "stp x6, x7, [sp, #-16]!", - "stp x4, x5, [sp, #-16]!", - "stp x2, x3, [sp, #-16]!", - "stp x0, x1, [sp, #-16]!", "stp x28, lr, [sp, #-16]!", ] # Assembly code for restoring our machine state after leaving the C runtime. C_CALL_EPILOGUE = [ - "ldp x28, lr, [sp], #16", - "ldp x0, x1, [sp], #16", - "ldp x2, x3, [sp], #16", - "ldp x4, x5, [sp], #16", - "ldp x6, x7, [sp], #16", - "ldp x8, x9, [sp], #16", - "ldp x10, x11, [sp], #16", - "ldp x12, x13, [sp], #16", + "ldp x28, lr, [sp], #16", "ldp x14, x15, [sp], #16", - "ldr x25, [sp], #16", ] @@ -503,11 +586,73 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) + +def vector_dn(name, *lines): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + line = line.replace(".S", f".{size}") + + # If this size requires a 32b register, replace Wd with Xd. + if size == "2d": + line = line.replace("Wn", "Xn") + + return line + + + # Create a variant for each size, replacing any placeholders. 
+ for size in VECTOR_SIZES: + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dn(f"{name}_{size}", *sized_lines) + + +def vector_dnm(name, *lines, scalar=None, omit_sizes=()): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + return line.replace(".S", f".{size}") + + # Create a variant for each size, replacing any placeholders. + for size in VECTOR_SIZES: + if size in omit_sizes: + continue + + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dnm(f"{name}_{size}", *sized_lines) + + if scalar: + if isinstance(scalar, str): + sized_lines = (scalar,) + with_dnm(f"{name}_scalar", *sized_lines) + + +def vector_math_dnm(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", scalar=f"{operation} Dd, Dn, Dm") + + +def vector_math_dnm_no64(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", omit_sizes=('2d',)) + + +def vector_logic_dn(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. """ + with_dn(f"{name}_d", f"{operation} Vd.8b, Vn.8b") + with_dn(f"{name}_q", f"{operation} Vd.16b, Vn.16b") + + +def vector_logic_dnm(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. """ + with_dnm(f"{name}_d", f"{operation} Vd.8b, Vn.8b, Vm.8b") + with_dnm(f"{name}_q", f"{operation} Vd.16b, Vn.16b, Vm.16b") + + # # Gadget definitions. # -print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") +START_COLLECTION("misc") # Call a C language helper function by address. simple("call", @@ -539,6 +684,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x28, [x28]" ) + # Exit from a translation buffer execution. simple("exit_tb", @@ -550,9 +696,18 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ret" ) +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + + + for condition in ARCH_CONDITION_CODES: + START_COLLECTION("setcond") + # Performs a comparison between two operands. with_dnm(f"setcond_i32_{condition}", "subs Wd, Wn, Wm", @@ -573,23 +728,20 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # branch is funneled throught the same address. # + START_COLLECTION("brcond") + # Branches iff a given comparison is true. with_dnm(f'brcond_i32_{condition}', # Grab our immediate argument. "ldr x27, [x28], #8", - # Perform our comparison and conditional branch. - "subs Wzr, Wn, Wm", - f"b{condition} 1f", - - "0:", # not taken - # Perform our end-of-instruction epilogue. - *EPILOGUE, + # Perform our comparison... + "subs wzr, Wn, Wm", - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) # Branches iff a given comparison is true. @@ -599,19 +751,17 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x27, [x28], #8", # Perform our comparison and conditional branch. - "subs Xzr, Xn, Xm", - f"b{condition} 1f", + "subs xzr, Xn, Xm", - "0:", # not taken - # Perform our end-of-instruction epilogue. 
- *EPILOGUE, - - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) +START_COLLECTION("mov") + + # MOV variants. with_dn("mov_i32", "mov Wd, Wn") with_dn("mov_i64", "mov Xd, Xn") @@ -623,17 +773,24 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) +START_COLLECTION("load_unsigned") + # LOAD variants. # TODO: should the signed variants have X variants for _i64? ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("load_signed") + ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") -ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") -ldst_dn("ld32u", "ldr Wd, [Xn, x27]") ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") -ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("store") # STORE variants. ldst_dn("st8", "strb Wd, [Xn, x27]") @@ -644,6 +801,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # QEMU LD/ST are handled in our C runtime rather than with simple gadgets, # as they're nontrivial. +START_COLLECTION("arithmetic") + # Trivial arithmetic. math_dnm("add" , "add" ) math_dnm("sub" , "sub" ) @@ -657,6 +816,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +START_COLLECTION("logical") + # Trivial logical. math_dn( "not", "mvn") math_dn( "neg", "neg") @@ -669,71 +830,155 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, math_dnm("shl", "lsl") math_dnm("shr", "lsr") math_dnm("sar", "asr") +math_dnm("rotr", "ror") # AArch64 lacks a Rotate Left; so we instead rotate right by a negative. -# TODO: validate this? -#math_dnm("rotr", "ror") -#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") -#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") +with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +with_dnm("rotl_i64", "neg w27, Wm", "ror Xd, Xn, x27") + +# We'll synthesize several instructions that don't exist; since it's still faster +# to run these as gadgets. +with_dnm("nand_i32", "and Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nand_i64", "and Xd, Xn, Xm", "mvn Xd, Xd") +with_dnm("nor_i32", "orr Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nor_i64", "orr Xd, Xn, Xm", "mvn Xd, Xd") + +START_COLLECTION("bitwise") + +# Count leading zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("clz_i32", + + # Perform the core CLZ into w26. + "clz w26, Wn", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("clz_i64", + + # Perform the core CLZ into w26. + "clz x26, Xn", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. 
+ "csel Xd, Xm, x26, eq" +) + + +# Count trailing zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("ctz_i32", + # Reverse our bits before performing our actual clz. + "rbit w26, Wn", + "clz w26, w26", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("ctz_i64", + + # Perform the core CLZ into w26. + "rbit x26, Xn", + "clz x26, x26", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Xd, Xm, x26, eq" +) + + +START_COLLECTION("extension") # Numeric extension. -math_dn("ext8s", "sxtb") +math_dn("ext8s", "sxtb", source_is_wn=True) with_dn("ext8u", "and Xd, Xn, #0xff") -math_dn("ext16s", "sxth") +math_dn("ext16s", "sxth", source_is_wn=True) with_dn("ext16u", "and Wd, Wn, #0xffff") with_dn("ext32s_i64", "sxtw Xd, Wn") -with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") +with_dn("ext32u_i64", "mov Wd, Wn") + +# Numeric extraction. +with_dn("extrl", "mov Wd, Wn") +with_dn("extrh", "lsr Xd, Xn, #32") + +START_COLLECTION("byteswap") # Byte swapping. with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") with_dn("bswap32", "rev Wd, Wn") with_dn("bswap64", "rev Xd, Xn") -# Memory barriers. -simple("mb_all", "dmb ish") -simple("mb_st", "dmb ishst") -simple("mb_ld", "dmb ishld") # Handlers for QEMU_LD, which handles guest <- host loads. for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_ld_{subtype}_unsigned_le") + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", - fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", - fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + START_COLLECTION(f"qemu_ld_{subtype}_signed_le") + + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, 
slowpath_helper="helper_le_ldul_mmu_signed", fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + + # Special variant for the most common modes, as a speedup optimization. + ld_thunk(f"qemu_ld_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + ld_thunk(f"qemu_ld_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], - force_slowpath=is_slowpath, + force_slowpath=is_slowpath, immediate=0x32 ) - - # Special variant for the most common mode, as a speedup optimization. ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_ld_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) @@ -747,11 +992,15 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) + + # Handlers for QEMU_ST, which handles guest -> host stores. for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_st_{subtype}_le") + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], force_slowpath=is_slowpath, @@ -770,11 +1019,21 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) # Special optimization for the most common modes. + st_thunk(f"qemu_st_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + st_thunk(f"qemu_st_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x32 + ) st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_st_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) @@ -784,5 +1043,121 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) +# +# SIMD/Vector ops +# + +# SIMD MOVI instructions. +START_COLLECTION(f"simd_base") + +# Unoptimized/unoptimizable load of a vector64; grabbing an immediate. 
+with_d("ldi_d", "ldr Dd, [x28], #8") +with_d("ldi_q", "ldr Qd, [x28], #16") + +# General purpose reg -> vec rec loads +vector_dn("dup", "dup Vd.S, Wn") + +# move vector -> GP reg +with_dn("umov_s0", "umov Wd, Vn.s[0]") +with_dn("umov_d0", "umov Xd, Vn.d[0]") + +# mov GP reg -> vector +with_dn("ins_s0", "ins Vd.s[0], Wn") +with_dn("ins_d0", "ins Vd.d[0], Xn") + + +# Memory -> vec reg loads. +# The offset of the load is stored in a 64b immediate. + +# Duplicating load. +# TODO: possibly squish the add into the ld1r, if that's valid? +vector_dn("dupm", "ldr x27, [x28], #8", "add x27, x27, Xn", "ld1r {Vd.S}, [x27]") + +# Direct loads. +with_dn("ldr_d", "ldr x27, [x28], #8", "ldr Dd, [Xn, x27]") +with_dn("ldr_q", "ldr x27, [x28], #8", "ldr Qd, [Xn, x27]") + +# vec -> reg stores. +# The offset of the stores is stored in a 64b immediate. +with_dn("str_d", "ldr x27, [x28], #8", "str Dd, [Xn, x27]") +with_dn("str_q", "ldr x27, [x28], #8", "str Qd, [Xn, x27]") + + +START_COLLECTION(f"simd_arithmetic") + +vector_math_dnm("add", "add") +vector_math_dnm("usadd", "uqadd") +vector_math_dnm("ssadd", "sqadd") +vector_math_dnm("sub", "sub") +vector_math_dnm("ussub", "uqsub") +vector_math_dnm("sssub", "sqsub") +vector_math_dnm_no64("mul", "mul") +vector_math_dnm_no64("smax", "smax") +vector_math_dnm_no64("smin", "smin") +vector_math_dnm_no64("umax", "umax") +vector_math_dnm_no64("umin", "umin") + +START_COLLECTION(f"simd_logical") + +vector_logic_dnm("and", "and") +vector_logic_dnm("andc", "bic") +vector_logic_dnm("or", "orr") +vector_logic_dnm("orc", "orn") +vector_logic_dnm("xor", "eor") +vector_logic_dn( "not", "not") +vector_dn("neg", "neg Vd.S, Vn.S") +vector_dn("abs", "abs Vd.S, Vn.S") +vector_logic_dnm( "bit", "bit") +vector_logic_dnm( "bif", "bif") +vector_logic_dnm( "bsl", "bsl") + +vector_math_dnm("shlv", "ushl") +vector_math_dnm("sshl", "sshl") + +vector_dnm("cmeq", "cmeq Vd.S, Vn.S, Vm.S", scalar="cmeq Dd, Dn, Dm") +vector_dnm("cmgt", "cmgt Vd.S, Vn.S, Vm.S", scalar="cmgt Dd, Dn, Dm") +vector_dnm("cmge", "cmge Vd.S, Vn.S, Vm.S", scalar="cmge Dd, Dn, Dm") +vector_dnm("cmhi", "cmhi Vd.S, Vn.S, Vm.S", scalar="cmhi Dd, Dn, Dm") +vector_dnm("cmhs", "cmhs Vd.S, Vn.S, Vm.S", scalar="cmhs Dd, Dn, Dm") + +START_COLLECTION(f"simd_immediate") + +# Simple imm8 movs... +with_d_immediate("movi_cmode_e_op0_q0", "movi Vd.8b, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op0_q1", "movi Vd.16b, #Ii", immediate_range=range(256)) + +# ... all 00/FF movs... +with_d_immediate("movi_cmode_e_op1_q0", "movi Dd, #Si", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op1_q1", "movi Vd.2d, #Si", immediate_range=range(256)) + +# Halfword MOVs. +with_d_immediate("movi_cmode_8_op0_q0", "movi Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q1", "movi Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q0", "mvni Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q1", "mvni Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q0", "movi Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q1", "movi Vd.8h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q0", "mvni Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q1", "mvni Vd.8h, #Ii, lsl #8", immediate_range=range(256)) + +# Halfword ORIs, for building complex MOVs. 
+with_d_immediate("orr_cmode_a_op0_q0", "orr Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("orr_cmode_a_op0_q1", "orr Vd.8h, #Ii, lsl #8", immediate_range=range(256)) + + +# Print a list of output files generated. +output_c_filenames = (f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) +output_h_filenames = (f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) + +print("Sources generated:", file=sys.stderr) +print(f"gadgets = [", file=sys.stderr) +print(" tcti_gadgets.h,", file=sys.stderr) + +for name in output_files.keys(): + print(f" 'tcti_{name}_gadgets.c',", file=sys.stderr) + print(f" 'tcti_{name}_gadgets.h',", file=sys.stderr) + +print(f"]", file=sys.stderr) + # Statistics. -sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions (~{(instructions * 4) // 1024 // 1024} MiB).\n\n") diff --git a/util/osdep.c b/util/osdep.c index 81c46df6f517..8df113c2df5c 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -114,6 +114,12 @@ int qemu_mprotect_none(void *addr, size_t size) #ifdef _WIN32 return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS); #else +# if defined(__APPLE__) && defined(__arm64__) + if (__builtin_available(macOS 11.2, *)) { + /* mprotect() in macOS 11.2 can't switch RWX to NONE */ + return 0; + } +# endif return qemu_mprotect__osdep(addr, size, PROT_NONE); #endif }