diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 28d563454e03..71676bbd2dea 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1129,7 +1129,7 @@ static bool alloc_code_gen_buffer_anon(size_t size, int prot, return true; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) #ifdef CONFIG_POSIX #include "qemu/memfd.h" @@ -1256,7 +1256,7 @@ static bool alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp) static bool alloc_code_gen_buffer_splitwx(size_t size, Error **errp) { -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) # ifdef CONFIG_DARWIN return alloc_code_gen_buffer_splitwx_vmremap(size, errp); # endif @@ -1289,7 +1289,7 @@ static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) prot = PROT_READ | PROT_WRITE | PROT_EXEC; flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) /* The tcg interpreter does not need execute permission. */ prot = PROT_READ | PROT_WRITE; #elif defined(CONFIG_DARWIN) diff --git a/configure b/configure index 0e7dbc56c784..9296d29e4fd6 100755 --- a/configure +++ b/configure @@ -362,6 +362,7 @@ tsan="no" fortify_source="$default_feature" strip_opt="yes" tcg_interpreter="false" +tcg_threaded_interpreter="false" bigendian="no" mingw32="no" gcov="no" @@ -1115,6 +1116,10 @@ for opt do ;; --enable-tcg-interpreter) tcg_interpreter="true" ;; + --disable-tcg-tcti) tcg_threaded_interpreter="false" + ;; + --enable-tcg-tcti) tcg_threaded_interpreter="true" + ;; --disable-cap-ng) cap_ng="disabled" ;; --enable-cap-ng) cap_ng="enabled" @@ -6469,9 +6474,8 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ - $cross_arg \ - "$PWD" "$source_path" + -Dtcg_interpreter=$tcg_interpreter -Dtcg_threaded_interpreter=$tcg_threaded_interpreter\ + -Dshared_lib=$shared_lib $cross_arg "$PWD" "$source_path" if test "$?" 
-ne 0 ; then error_exit "meson setup failed" diff --git a/disas.c b/disas.c index a61f95b580b8..cea0f9019e49 100644 --- a/disas.c +++ b/disas.c @@ -152,6 +152,8 @@ static void initialize_debug_host(CPUDebug *s) #endif #if defined(CONFIG_TCG_INTERPRETER) s->info.print_insn = print_insn_tci; +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) + s->info.print_insn = print_insn_tcti; #elif defined(__i386__) s->info.mach = bfd_mach_i386_i386; s->info.print_insn = print_insn_i386; diff --git a/include/disas/dis-asm.h b/include/disas/dis-asm.h index 13fa1edd411e..ded69ba2ffaa 100644 --- a/include/disas/dis-asm.h +++ b/include/disas/dis-asm.h @@ -411,6 +411,7 @@ typedef struct disassemble_info { typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *); int print_insn_tci(bfd_vma, disassemble_info*); +int print_insn_tcti(bfd_vma, disassemble_info*); int print_insn_big_mips (bfd_vma, disassemble_info*); int print_insn_little_mips (bfd_vma, disassemble_info*); int print_insn_nanomips (bfd_vma, disassemble_info*); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 6b036cae8f65..a8f2295decd2 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -543,7 +543,11 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); #if defined(CONFIG_TCG_INTERPRETER) extern __thread uintptr_t tci_tb_ptr; # define GETPC() tci_tb_ptr +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) +extern __thread uintptr_t tcti_call_return_address; +# define GETPC() tcti_call_return_address #else +/* Note that this is correct for TCTI also; whose gadget behaves like native code. */ # define GETPC() \ ((uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0))) #endif diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 0f0695e90da2..cfcd069bf3f6 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1296,7 +1296,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi) #define TB_EXIT_IDXMAX 1 #define TB_EXIT_REQUESTED 3 -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) uintptr_t tcg_qemu_tb_exec(CPUArchState *env, const void *tb_ptr); #else typedef uintptr_t tcg_prologue_fn(CPUArchState *env, const void *tb_ptr); diff --git a/meson.build b/meson.build index 1524a9be2121..8aea04191215 100644 --- a/meson.build +++ b/meson.build @@ -58,6 +58,7 @@ python = import('python').find_installation() supported_oses = ['windows', 'freebsd', 'netbsd', 'openbsd', 'darwin', 'sunos', 'linux'] supported_cpus = ['ppc', 'ppc64', 's390x', 'riscv32', 'riscv64', 'x86', 'x86_64', 'arm', 'aarch64', 'mips', 'mips64', 'sparc', 'sparc64'] +tcti_supported_cpus = ['aarch64'] cpu = host_machine.cpu_family() targetos = host_machine.system() @@ -248,6 +249,25 @@ if not get_option('tcg').disabled() endif if get_option('tcg_interpreter') tcg_arch = 'tci' + elif get_option('tcg_threaded_interpreter') + if cpu not in tcti_supported_cpus + error('Unsupported CPU @0@ for TCTI, try --enable-tcg-interpreter'.format(cpu)) + else + warning('TCTI is extremely experimental and incomplete! Things might break!') + tcg_arch = '@0@-tcti'.format(cpu) + endif + + # Tell our compiler how to generate our TCTI gadgets. 
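+    # The generated tcti-gadgets.c.inc holds the pre-built gadget bodies and the
+    # per-register lookup tables that tcg/aarch64-tcti/tcg-target.c.inc indexes at
+    # translation time; adding it to genh (below) ensures it is generated before
+    # any TCG sources that include it are compiled.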
+ gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) + tcti_gadgets = custom_target('tcti-gadgets.c.inc', + output: 'tcti-gadgets.c.inc', + input: gadget_generator, + command: [find_program(gadget_generator), '@OUTPUT@'], + build_by_default: true, + build_always_stale: false) + + genh += tcti_gadgets + elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' elif config_host['ARCH'] == 's390x' @@ -1284,6 +1304,8 @@ foreach target : target_dirs config_all += { sym: 'y' } if sym == 'CONFIG_TCG' and tcg_arch == 'tci' config_target += { 'CONFIG_TCG_INTERPRETER': 'y' } + elif sym == 'CONFIG_TCG' and tcg_arch.endswith('tcti') + config_target += { 'CONFIG_TCG_THREADED_INTERPRETER': 'y' } elif sym == 'CONFIG_XEN' and have_xen_pci_passthrough config_target += { 'CONFIG_XEN_PCI_PASSTHROUGH': 'y' } endif @@ -2575,6 +2597,8 @@ summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')} if config_all.has_key('CONFIG_TCG') if get_option('tcg_interpreter') summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'} + elif get_option('tcg_threaded_interpreter') + summary_info += {'TCG backend': 'TCTI (TCG with threaded-dispatch bytecode interpreter, experimental and slow; but faster than TCI)'} else summary_info += {'TCG backend': 'native (@0@)'.format(cpu)} endif diff --git a/meson_options.txt b/meson_options.txt index 6c29ea93300a..5aa68672c2ff 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,8 @@ option('tcg', type: 'feature', value: 'auto', description: 'TCG support') option('tcg_interpreter', type: 'boolean', value: false, description: 'TCG with bytecode interpreter (experimental and slow)') +option('tcg_threaded_interpreter', type: 'boolean', value: false, + description: 'TCG with threaded-dispatch bytecode interpreter (experimental and slow, but less slow than TCI)') option('cfi', type: 'boolean', value: 'false', description: 'Control-Flow Integrity (CFI)') option('cfi_debug', type: 'boolean', value: 'false', diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py index ee072c05025a..b0467ab56545 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -75,18 +75,18 @@ def process_tests(test, targets, suites): print('run-test-%d: $(.test.deps.%d)' % (i,i)) print('\t@$(call .test.run,%d,$(.test.output-format))' % (i,)) - test_suites = test['suite'] or ['default'] - is_slow = any(s.endswith('-slow') for s in test_suites) - for s in test_suites: - # The suite name in the introspection info is "PROJECT:SUITE" - s = s.split(':')[1] - if s.endswith('-slow'): - s = s[:-5] - if is_slow: - suites[s].slow_tests.append(i) - else: - suites[s].tests.append(i) - suites[s].executables.add(executable) + #test_suites = test['suite'] or ['default'] + #is_slow = any(s.endswith('-slow') for s in test_suites) + #for s in test_suites: + # # The suite name in the introspection info is "PROJECT:SUITE" + # s = s.split(':')[1] + # if s.endswith('-slow'): + # s = s[:-5] + # if is_slow: + # suites[s].slow_tests.append(i) + # else: + # suites[s].tests.append(i) + # suites[s].executables.add(executable) def emit_prolog(suites, prefix): all_tap = ' '.join(('%s-report-%s.tap' % (prefix, k) for k in suites.keys())) diff --git a/tcg/aarch64-tcti/README.md b/tcg/aarch64-tcti/README.md new file mode 100644 index 000000000000..eb848e5a9e57 --- /dev/null +++ b/tcg/aarch64-tcti/README.md @@ -0,0 +1,1026 @@ +# QEMU Tiny-Code Threaded Interpreter (AArch64) + +A TCG backend that chains together JOP/ROP-ish gadgets to massively reduce interpreter overhead 
vs TCI.
+Platform-dependent, but usable when JIT isn't available, e.g. on platforms that lack WX mappings. The general idea is to squish the addresses of a gadget sequence into a "queue", and then write each gadget so it ends in a "dequeue-jump".
+
+Execution occurs by jumping into the first gadget, and letting it just play back some linear-overhead native code sequences for a while.
+
+Since TCG-TCI is optimized for sets of 16 GP registers and aarch64 has 30, we could easily keep JIT/QEMU and guest state separate, and since 16\*16 is reasonably small we could actually have a set of reasonable gadgets for each combination of operands.
+
+## Register Convention
+
+| Regs | Use |
+| :------ | :-------------------- |
+| x1-x15 | Guest Registers |
+| x24 | TCTI temporary |
+| x25 | saved IP during call |
+| x26 | TCTI temporary |
+| x27 | TCTI temporary |
+| x28 | Thread-stream pointer |
+| x30 | Link register |
+| SP | Stack Pointer, host |
+| PC | Program Counter, host |
+
+In pseudocode:
+
+| Symbol | Meaning |
+| :----- | :---------------------------------- |
+| Rd | stand-in for destination register |
+| Rn | stand-in for first source register |
+| Rm | stand-in for second source register |
+
+## Gadget Structure
+
+### End of gadget
+
+Each gadget ends by advancing our bytecode pointer, and then executing from the new location.
+
+```asm
+# Load our next gadget address from our bytecode stream, advancing it, and jump to the next gadget.
+
+ldr x27, [x28], #8
+br x27
+```
+
+## Calling into QEMU's C codebase
+
+When calling into C, we lose control over which registers are used. Accordingly, we'll need to save
+registers relevant to TCTI:
+
+```asm
+str x25, [sp, #-16]!
+stp x14, x15, [sp, #-16]!
+stp x12, x13, [sp, #-16]!
+stp x10, x11, [sp, #-16]!
+stp x8, x9, [sp, #-16]!
+stp x6, x7, [sp, #-16]!
+stp x4, x5, [sp, #-16]!
+stp x2, x3, [sp, #-16]!
+stp x0, x1, [sp, #-16]!
+stp x28, lr, [sp, #-16]!
+```
+
+Upon returning to the gadget stream, we'll then restore them.
+
+```asm
+ldp x28, lr, [sp], #16
+ldp x0, x1, [sp], #16
+ldp x2, x3, [sp], #16
+ldp x4, x5, [sp], #16
+ldp x6, x7, [sp], #16
+ldp x8, x9, [sp], #16
+ldp x10, x11, [sp], #16
+ldp x12, x13, [sp], #16
+ldp x14, x15, [sp], #16
+ldr x25, [sp], #16
+```
+
+## TCG Operations
+
+Each operation needs an implementation for every platform, and probably a set of gadgets for each possible set of operands.
+
+With 16 addressable registers, that means:
+
+1 operand =\> 16 gadgets
+2 operands =\> 256 gadgets
+3 operands =\> 4096 gadgets
+
+### call
+
+Calls a helper function by address.
+
+**IR Format**: `call <ptr>`
+**Gadget type:** single
+
+```asm
+    # Get our C runtime function's location as a pointer-sized immediate...
+    "ldr x27, [x28], #8",
+
+    # Store our TB return address for our helper. This is necessary so the GETPC()
+    # macro works correctly as used in helper functions.
+    "str x28, [x25]",
+
+    # Prepare ourselves to call into our C runtime...
+    *C_CALL_PROLOGUE,
+
+    # ... perform the call itself ...
+    "blr x27",
+
+    # Save the result of our call for later.
+    "mov x27, x0",
+
+    # ... and restore our environment.
+    *C_CALL_EPILOGUE,
+
+    # Restore our return value.
+    "mov x0, x27"
+```
+
+### br
+
+Branches to a given immediate address.
+
+**IR Format**: `br <label>`
+**Gadget type:** single
+
+```asm
+# Use our immediate argument as our new bytecode-pointer location.
+ldr x28, [x28]
+```
+
+### setcond_i32
+
+Performs a comparison between two 32-bit operands.
+
+**IR Format**: `setcond32 <cond>, Rd, Rn, Rm`
+**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960)
+
+```asm
+subs Wd, Wn, Wm
+cset Wd, <cond>
+```
+
+| QEMU Cond | AArch64 Cond |
+| :-------- | :----------- |
+| EQ | EQ |
+| NE | NE |
+| LT | LT |
+| GE | GE |
+| LE | LE |
+| GT | GT |
+| LTU | LO |
+| GEU | HS |
+| LEU | LS |
+| GTU | HI |
+
+### setcond_i64
+
+Performs a comparison between two 64-bit operands.
+
+**IR Format**: `setcond64 <cond>, Rd, Rn, Rm`
+**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960)
+
+```asm
+subs Xd, Xn, Xm
+cset Xd, <cond>
+```
+
+Comparison chart is the same as the `_i32` variant.
+
+### brcond_i32
+
+Compares two 32-bit numbers, and branches if the comparison is true.
+
+**IR Format**: `brcond Rn, Rm, <label>`
+**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560)
+
+```asm
+# Perform our comparison and conditional branch.
+subs wzr, Wn, Wm
+b.<cond> taken
+
+    # Consume the branch target, without using it.
+    add x28, x28, #8
+
+    # Perform our end-of-instruction epilogue.
+
+taken:
+
+    # Update our bytecode pointer to take the label.
+    ldr x28, [x28]
+```
+
+Comparison chart is the same as in `setcond_i32`.
+
+### brcond_i64
+
+Compares two 64-bit numbers, and branches if the comparison is true.
+
+**IR Format**: `brcond Rn, Rm, <label>`
+**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560)
+
+```asm
+# Perform our comparison and conditional branch.
+subs xzr, Xn, Xm
+b.<cond> taken
+
+    # Consume the branch target, without using it.
+    add x28, x28, #8
+
+    # Perform our end-of-instruction epilogue.
+
+taken:
+
+    # Update our bytecode pointer to take the label.
+    ldr x28, [x28]
+```
+
+Comparison chart is the same as in `setcond_i32`.
+
+### mov_i32
+
+Moves a value from a register to another register.
+
+**IR Format**: `mov Rd, Rn`
+**Gadget type:** gadget per `Rd` + `Rn` combo (256)
+
+```asm
+mov Wd, Wn
+```
+
+### mov_i64
+
+Moves a value from a register to another register.
+
+**IR Format**: `mov Rd, Rn`
+**Gadget type:** gadget per `Rd` + `Rn` combo (256)
+
+```asm
+mov Xd, Xn
+```
+
+### tci_movi_i32
+
+Moves a 32b immediate into a register.
+
+**IR Format**: `mov Rd, #imm32`
+**Gadget type:** gadget per `Rd` (16)
+
+```asm
+ldr w27, [x28], #4
+mov Wd, w27
+```
+
+### tci_movi_i64
+
+Moves a 64b immediate into a register.
+
+**IR Format**: `mov Rd, #imm64`
+**Gadget type:** gadget per `Rd` (16)
+
+```asm
+ldr x27, [x28], #8
+mov Xd, x27
+```
+
+### ld8u_i32 / ld8u_i64
+
+Load byte from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrb Xd, [Xn, x27]
+```
+
+### ld8s_i32 / ld8s_i64
+
+Load byte from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsb Xd, [Xn, x27]
+```
+
+### ld16u_i32 / ld16u_i64
+
+Load 16b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrh Wd, [Xn, x27]
+```
+
+### ld16s_i32 / ld16s_i64
+
+Load 16b from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsh Xd, [Xn, x27]
+```
+
+### ld32u_i32 / ld32u_i64
+
+Load 32b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldr Wd, [Xn, x27]
+```
+
+### ld32s_i64
+
+Load 32b from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsw Xd, [Xn, x27]
+```
+
+### ld_i64
+
+Load 64b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldr Xd, [Xn, x27]
+```
+
+### st8_i32 / st8_i64
+
+Stores byte from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+strb Wd, [Xn, x27]
+```
+
+### st16_i32 / st16_i64
+
+Stores 16b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+strh Wd, [Xn, x27]
+```
+
+### st_i32 / st32_i64
+
+Stores 32b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+str Wd, [Xn, x27]
+```
+
+### st_i64
+
+Stores 64b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+str Xd, [Xn, x27]
+```
+
+### qemu_ld_i32
+
+Loads 32b from _guest_ memory to register.
+
+**IR Format**: `ld Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+### qemu_ld_i64
+
+Loads 64b from _guest_ memory to register.
+
+**IR Format**: `ld Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+### qemu_st_i32
+
+Stores 32b from a register to _guest_ memory.
+
+**IR Format**: `st Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl
+
+### qemu_st_i64
+
+Stores 64b from a register to _guest_ memory.
+
+**IR Format**: `st Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+#### Note
+
+See note on `qemu_ld_i32`.
+
+### add_i32
+
+Adds two 32-bit numbers.
+
+**IR Format**: `add Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+add Wd, Wn, Wm
+```
+
+### add_i64
+
+Adds two 64-bit numbers.
+
+**IR Format**: `add Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+add Xd, Xn, Xm
+```
+
+### sub_i32
+
+Subtracts two 32-bit numbers.
+
+**IR Format**: `sub Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sub Wd, Wn, Wm
+```
+
+### sub_i64
+
+Subtracts two 64-bit numbers.
+
+**IR Format**: `sub Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sub Xd, Xn, Xm
+```
+
+### mul_i32
+
+Multiplies two 32-bit numbers.
+
+**IR Format**: `mul Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+mul Wd, Wn, Wm
+```
+
+### mul_i64
+
+Multiplies two 64-bit numbers.
+
+**IR Format**: `mul Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+mul Xd, Xn, Xm
+```
+
+### div_i32
+
+Divides two 32-bit numbers; considering them signed.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv Wd, Wn, Wm
+```
+
+### div_i64
+
+Divides two 64-bit numbers; considering them signed.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv Xd, Xn, Xm
+```
+
+### divu_i32
+
+Divides two 32-bit numbers; considering them unsigned.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv Wd, Wn, Wm
+```
+
+### divu_i64
+
+Divides two 64-bit numbers; considering them unsigned.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv Xd, Xn, Xm
+```
+
+### rem_i32
+
+Computes the division remainder (modulus) of two 32-bit numbers; considering them signed.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv w27, Wn, Wm
+msub Wd, w27, Wm, Wn
+```
+
+### rem_i64
+
+Computes the division remainder (modulus) of two 64-bit numbers; considering them signed.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv x27, Xn, Xm
+msub Xd, x27, Xm, Xn
+```
+
+### remu_i32
+
+Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv w27, Wn, Wm
+msub Wd, w27, Wm, Wn
+```
+
+### remu_i64
+
+Computes the division remainder (modulus) of two 64-bit numbers; considering them unsigned.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv x27, Xn, Xm
+msub Xd, x27, Xm, Xn
+```
+
+### not_i32
+
+Logically inverts a 32-bit number.
+
+**IR Format**: `not Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+mvn Wd, Wn
+```
+
+### not_i64
+
+Logically inverts a 64-bit number.
+
+**IR Format**: `not Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+mvn Xd, Xn
+```
+
+### neg_i32
+
+Arithmetically inverts (two's complement) a 32-bit number.
+
+**IR Format**: `neg Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+neg Wd, Wn
+```
+
+### neg_i64
+
+Arithmetically inverts (two's complement) a 64-bit number.
+
+**IR Format**: `neg Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+neg Xd, Xn
+```
+
+### and_i32
+
+Logically ANDs two 32-bit numbers.
+
+**IR Format**: `and Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+and Wd, Wn, Wm
+```
+
+### and_i64
+
+Logically ANDs two 64-bit numbers.
+
+**IR Format**: `and Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+and Xd, Xn, Xm
+```
+
+### or_i32
+
+Logically ORs two 32-bit numbers.
+
+**IR Format**: `or Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+orr Wd, Wn, Wm
+```
+
+### or_i64
+
+Logically ORs two 64-bit numbers.
+
+**IR Format**: `or Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+orr Xd, Xn, Xm
+```
+
+### xor_i32
+
+Logically XORs two 32-bit numbers.
+
+**IR Format**: `xor Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+eor Wd, Wn, Wm
+```
+
+### xor_i64
+
+Logically XORs two 64-bit numbers.
+
+**IR Format**: `xor Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+eor Xd, Xn, Xm
+```
+
+### shl_i32
+
+Logically shifts a 32-bit number left.
+
+**IR Format**: `shl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsl Wd, Wn, Wm
+```
+
+### shl_i64
+
+Logically shifts a 64-bit number left.
+
+**IR Format**: `shl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsl Xd, Xn, Xm
+```
+
+### shr_i32
+
+Logically shifts a 32-bit number right.
+
+**IR Format**: `shr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsr Wd, Wn, Wm
+```
+
+### shr_i64
+
+Logically shifts a 64-bit number right.
+
+**IR Format**: `shr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsr Xd, Xn, Xm
+```
+
+### sar_i32
+
+Arithmetically shifts a 32-bit number right.
+
+**IR Format**: `sar Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+asr Wd, Wn, Wm
+```
+
+### sar_i64
+
+Arithmetically shifts a 64-bit number right.
+
+**IR Format**: `sar Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+asr Xd, Xn, Xm
+```
+
+### rotl_i32
+
+Rotates a 32-bit number left.
+
+**IR Format**: `rotl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+rol Wd, Wn, Wm
+```
+
+### rotl_i64
+
+Rotates a 64-bit number left.
+
+**IR Format**: `rotl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+rol Xd, Xn, Xm
+```
+
+### rotr_i32
+
+Rotates a 32-bit number right.
+
+**IR Format**: `rotr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+ror Wd, Wn, Wm
+```
+
+### rotr_i64
+
+Rotates a 64-bit number right.
+
+**IR Format**: `rotr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+ror Xd, Xn, Xm
+```
+
+### deposit_i32
+
+Optional; not currently implementing.
+
+### deposit_i64
+
+Optional; not currently implementing.
+
+### ext8s_i32
+
+Sign extends the lower 8b of a register into a 32b destination.
+
+**IR Format**: `ext8s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtb Wd, Wn
+```
+
+### ext8s_i64
+
+Sign extends the lower 8b of a register into a 64b destination.
+
+**IR Format**: `ext8s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtb Xd, Wn
+```
+
+### ext8u_i32
+
+Zero extends the lower 8b of a register into a 32b destination.
+
+**IR Format**: `ext8u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xff
+```
+
+### ext8u_i64
+
+Zero extends the lower 8b of a register into a 64b destination.
+
+**IR Format**: `ext8u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xff
+```
+
+### ext16s_i32
+
+Sign extends the lower 16b of a register into a 32b destination.
+
+**IR Format**: `ext16s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxth Wd, Wn
+```
+
+### ext16s_i64
+
+Sign extends the lower 16b of a register into a 64b destination.
+
+**IR Format**: `ext16s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxth Xd, Wn
+```
+
+### ext16u_i32
+
+Zero extends the lower 16b of a register into a 32b destination.
+
+**IR Format**: `ext16u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Wd, Wn, #0xffff
+```
+
+### ext16u_i64
+
+Zero extends the lower 16b of a register into a 64b destination.
+
+**IR Format**: `ext16u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Wd, Wn, #0xffff
+```
+
+### ext32s_i64
+
+Sign extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtw Xd, Wn
+```
+
+### ext32u_i64
+
+Zero extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xffffffff
+```
+
+### ext_i32_i64
+
+Sign extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtw Xd, Wn
+```
+
+### extu_i32_i64
+
+Zero extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xffffffff
+```
+
+### bswap16_i32
+
+Byte-swaps a 16b quantity.
+
+**IR Format**: `bswap16 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev w27, Wn
+lsr Wd, w27, #16
+```
+
+### bswap16_i64
+
+Byte-swaps a 16b quantity.
+
+**IR Format**: `bswap16 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev w27, Wn
+lsr Wd, w27, #16
+```
+
+### bswap32_i32
+
+Byte-swaps a 32b quantity.
+
+**IR Format**: `bswap32 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Wd, Wn
+```
+
+### bswap32_i64
+
+Byte-swaps a 32b quantity.
+
+**IR Format**: `bswap32 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Wd, Wn
+```
+
+### bswap64_i64
+
+Byte-swaps a 64b quantity.
+
+**IR Format**: `bswap64 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Xd, Xn
+```
+
+### exit_tb
+
+Exits the translation block. Has no gadget; instead, the address of the translation block epilogue is inserted.
+
+### mb
+
+Memory barrier.
+
+**IR Format**: `mb <type>`
+**Gadget type:** gadget per type
+
+```asm
+# !!! TODO
+```
+
+#### Note
+
+We still need to work out how QEMU MB types map to AArch64 ones. This might take nuance.
diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h
new file mode 100644
index 000000000000..f51b7bcb13e7
--- /dev/null
+++ b/tcg/aarch64-tcti/tcg-target-con-set.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * TCI target-specific constraint sets.
+ * Copyright (c) 2021 Linaro
+ */
+
+/*
+ * C_On_Im(...) defines a constraint set with <n> outputs and <m> inputs.
+ * Each operand should be a sequence of constraint letters as defined by
+ * tcg-target-con-str.h; the constraint combination is inclusive or.
+ */
+C_O0_I2(r, r)
+C_O0_I3(r, r, r)
+C_O0_I4(r, r, r, r)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, r)
+C_O1_I2(r, r, r)
+C_O1_I4(r, r, r, r, r)
+C_O2_I1(r, r, r)
+C_O2_I2(r, r, r, r)
+C_O2_I4(r, r, r, r, r, r)
diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h
new file mode 100644
index 000000000000..87c0f19e9c2e
--- /dev/null
+++ b/tcg/aarch64-tcti/tcg-target-con-str.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define TCI target-specific operand constraints.
+ * Copyright (c) 2021 Linaro + */ + +/* + * Define constraint letters for register sets: + * REGS(letter, register_mask) + */ +REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc new file mode 100644 index 000000000000..d7bb67a92140 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -0,0 +1,1347 @@ +/* + * Tiny Code Threaded Intepreter for QEMU + * + * Copyright (c) 2021 Kate Temkin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 + +// Grab our gadget definitions. +// FIXME: use the system path instead of hardcoding this? +#include "tcti-gadgets.c.inc" + +/* Marker for missing code. */ +#define TODO() \ + do { \ + fprintf(stderr, "TODO %s:%u: %s()\n", \ + __FILE__, __LINE__, __func__); \ + tcg_abort(); \ + } while (0) + + +/* Enable TCTI assertions only when debugging TCG (and without NDEBUG defined). + * Without assertions, the interpreter runs much faster. */ +#if defined(CONFIG_DEBUG_TCG) +# define tcti_assert(cond) assert(cond) +#else +# define tcti_assert(cond) ((void)0) +#endif + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
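+ *
+ * 'variable' receives the gadget pointer selected by the MemOp packed into
+ * 'arg' (size, signedness and endianness), while 'suffix' names which generated
+ * pool of gadgets to draw from (the aligned/unaligned and TLB-offset variants
+ * produced by tcti-gadget-gen.py). The _HANDLER form below picks the aligned
+ * pool when the access's alignment requirement covers its size, and the
+ * unaligned pool otherwise.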
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. + */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) +{ + switch (op) { + case INDEX_op_ld8u_i32: + case INDEX_op_ld8s_i32: + case INDEX_op_ld16u_i32: + case INDEX_op_ld16s_i32: + case INDEX_op_ld_i32: + case INDEX_op_ld8u_i64: + case INDEX_op_ld8s_i64: + case INDEX_op_ld16u_i64: + case INDEX_op_ld16s_i64: + case INDEX_op_ld32u_i64: + case INDEX_op_ld32s_i64: + case INDEX_op_ld_i64: + case INDEX_op_not_i32: + case INDEX_op_not_i64: + case INDEX_op_neg_i32: + case INDEX_op_neg_i64: + case INDEX_op_ext8s_i32: + case INDEX_op_ext8s_i64: + case INDEX_op_ext16s_i32: + case INDEX_op_ext16s_i64: + case INDEX_op_ext8u_i32: + case INDEX_op_ext8u_i64: + case INDEX_op_ext16u_i32: + case INDEX_op_ext16u_i64: + case INDEX_op_ext32s_i64: + case INDEX_op_ext32u_i64: + case INDEX_op_ext_i32_i64: + case INDEX_op_extu_i32_i64: + case INDEX_op_bswap16_i32: + case INDEX_op_bswap16_i64: + case INDEX_op_bswap32_i32: + case INDEX_op_bswap32_i64: + case INDEX_op_bswap64_i64: + return C_O1_I1(r, r); + + case INDEX_op_st8_i32: + case INDEX_op_st16_i32: + case INDEX_op_st_i32: + case INDEX_op_st8_i64: + case INDEX_op_st16_i64: + case INDEX_op_st32_i64: + case INDEX_op_st_i64: + return C_O0_I2(r, r); + + case INDEX_op_div_i32: + case INDEX_op_div_i64: + case INDEX_op_divu_i32: + case INDEX_op_divu_i64: + case INDEX_op_rem_i32: + case INDEX_op_rem_i64: + case INDEX_op_remu_i32: + case INDEX_op_remu_i64: + case INDEX_op_add_i32: + case 
INDEX_op_add_i64: + case INDEX_op_sub_i32: + case INDEX_op_sub_i64: + case INDEX_op_mul_i32: + case INDEX_op_mul_i64: + case INDEX_op_and_i32: + case INDEX_op_and_i64: + case INDEX_op_andc_i32: + case INDEX_op_andc_i64: + case INDEX_op_eqv_i32: + case INDEX_op_eqv_i64: + case INDEX_op_nand_i32: + case INDEX_op_nand_i64: + case INDEX_op_nor_i32: + case INDEX_op_nor_i64: + case INDEX_op_or_i32: + case INDEX_op_or_i64: + case INDEX_op_orc_i32: + case INDEX_op_orc_i64: + case INDEX_op_xor_i32: + case INDEX_op_xor_i64: + case INDEX_op_shl_i32: + case INDEX_op_shl_i64: + case INDEX_op_shr_i32: + case INDEX_op_shr_i64: + case INDEX_op_sar_i32: + case INDEX_op_sar_i64: + case INDEX_op_rotl_i32: + case INDEX_op_rotl_i64: + case INDEX_op_rotr_i32: + case INDEX_op_rotr_i64: + case INDEX_op_setcond_i32: + case INDEX_op_setcond_i64: + return C_O1_I2(r, r, r); + + case INDEX_op_brcond_i32: + case INDEX_op_brcond_i64: + return C_O0_I2(r, r); + + case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_i64: + return C_O1_I2(r, r, r); + case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_i64: + return C_O0_I3(r, r, r); + + default: + g_assert_not_reached(); + } +} + +static const int tcg_target_reg_alloc_order[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + /* + TCG_REG_R14, // AREG0 + TCG_REG_R15, // SP + */ +}; + +#if MAX_OPC_PARAM_IARGS != 6 +# error Fix needed, number of supported input arguments changed! +#endif + +static const int tcg_target_call_iarg_regs[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, +}; + +static const int tcg_target_call_oarg_regs[] = { + TCG_REG_R0, +}; + +#ifdef CONFIG_DEBUG_TCG +static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { + "r00", + "r01", + "r02", + "r03", + "r04", + "r05", + "r06", + "r07", + "r08", + "r09", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15", +}; +#endif + +static bool patch_reloc(tcg_insn_unit *code_ptr, int type, + intptr_t value, intptr_t addend) +{ + /* tcg_out_reloc always uses the same type, addend. */ + tcg_debug_assert(type == sizeof(tcg_target_long)); + tcg_debug_assert(addend == 0); + tcg_debug_assert(value != 0); + if (TCG_TARGET_REG_BITS == 32) { + tcg_patch32(code_ptr, value); + } else { + tcg_patch64(code_ptr, value); + } + return true; +} + +#if defined(CONFIG_DEBUG_TCG_INTERPRETER) +/* Show current bytecode. Used by tcg interpreter. */ +void tci_disas(uint8_t opc) +{ + const TCGOpDef *def = &tcg_op_defs[opc]; + fprintf(stderr, "TCG %s %u, %u, %u\n", + def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs); +} +#endif + +/* Write value (native size). */ +static void tcg_out_immediate(TCGContext *s, tcg_target_ulong v) +{ + if (TCG_TARGET_REG_BITS == 32) { + //tcg_out32(s, v); + tcg_out64(s, v); + } else { + tcg_out64(s, v); + } +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, uintptr_t jmp_rw, uintptr_t addr) +{ + /* Get a pointer to our immediate, which exists after a single pointer. */ + uintptr_t immediate_addr = jmp_rw; + + /* Patch it to be match our target address. */ + qatomic_set((uint64_t *)immediate_addr, addr); +} + + +/** + * TCTI Thunk Helpers + */ + +#ifdef CONFIG_SOFTMMU + +// TODO: relocate these prototypes? 
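+// Signed-load wrappers around the stock softmmu helpers, which return
+// zero-extended values: each shim calls the unsigned helper and sign-extends the
+// result to tcg_target_ulong, presumably so the sign-extending load gadgets
+// (the MO_SB/MO_SW/MO_SL cases) have a single C entry point whose return value
+// can be used directly.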
+tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); + +tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int8_t)helper_ret_ldub_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_le_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_le_ldul_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_be_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr); +} + +#else +#error TCTI currently only supports use of the soft MMU. +#endif + + +/** + * TCTI Emmiter Helpers + */ + + +/* Write gadget pointer. */ +static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +{ + tcg_out_immediate(s, (tcg_target_ulong)gadget); +} + +/* Write gadget pointer, plus 64b immediate. */ +static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +{ + tcg_out_nullary_gadget(s, gadget); + tcg_out64(s, immediate); +} + + +/* Write gadget pointer (one register). */ +static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0]); +} + + +/* Write gadget pointer (two registers). */ +static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); +} + + +/* Write gadget pointer (three registers). */ +static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); +} + + +/** + * Version of our LDST generator that defers to more optimized gadgets selectively. + */ +static void tcg_out_ldst_gadget_inner(TCGContext *s, + void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + unsigned reg0, unsigned reg1, uint32_t offset) +{ + int64_t extended_offset = (int32_t)offset; + bool is_negative = (extended_offset < 0); + + // Optimal case: we have a gadget that handles our specific offset, so we don't need to encode + // an immediate. This saves us a bunch of speed. 
:) + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + uint64_t shifted_offset = (extended_offset >> 3); + bool aligned_to_8B = ((extended_offset & 0b111) == 0); + + bool have_optimized_gadget = (extended_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + bool have_shifted_gadget = (shifted_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + + // More optimal case: we have a gadget that directly encodes the argument. + if (have_optimized_gadget) { + tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + return; + } + + // Special case: it's frequent to have low-numbered positive offsets that are aligned + // to 16B boundaries + else if(aligned_to_8B && have_shifted_gadget) { + tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + return; + } + } + else { + uint64_t negated_offset = -(extended_offset); + + // More optimal case: we have a gadget that directly encodes the argument. + if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { + tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + return; + } + } + + // Less optimal case: we don't have a gadget specifically for this. Emit the general case immediate. + tcg_out_binary_gadget(s, gadget_base, reg0, reg1); + tcg_out64(s, extended_offset); //tcg_out32(s, offset); +} + +/* Shorthand for the above, that prevents us from having to specify the name three times. */ +#define tcg_out_ldst_gadget(s, name, a, b, c) \ + tcg_out_ldst_gadget_inner(s, name, \ + name ## _imm, \ + name ## _sh8_imm, \ + name ## _neg_imm, \ + a, b, c) + + + +/* Write label. */ +static void tcti_out_label(TCGContext *s, TCGLabel *label) +{ + if (label->has_value) { + tcg_out64(s, label->u.value); + tcg_debug_assert(label->u.value); + } else { + tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), label, 0); + s->code_ptr += sizeof(tcg_target_ulong); + } +} + +/** + * Generate a register-to-register MOV. + */ +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) +{ + tcg_debug_assert(ret != arg); + + if (type == TCG_TYPE_I32) { + tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); + } else { + tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + } + + + return true; +} + + +static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + bool is_negative = (arg < 0); + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + return; + } + } + else { + + } + + // Emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i32, t0); + tcg_out64(s, arg); // TODO: make 32b? +} + + +static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + uint8_t is_negative = arg < 0; + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + return; + } + } + else { + + } + + // TODO: optimize the negative case, too? + + // Less optimal case: emit the mov and its immediate. 
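+    // Fallback encoding in the bytecode stream: the per-register movi gadget's
+    // address followed by the raw 64-bit value. The gadget pulls that immediate
+    // in through a temporary and advances the stream pointer (x28) past it, as
+    // sketched in the README's tci_movi_i64 entry.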
+ tcg_out_unary_gadget(s, gadget_movi_i64, t0); + tcg_out64(s, arg); +} + + +/** + * Generate an immediate-to-register MOV. + */ +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long arg) +{ + if (type == TCG_TYPE_I32) { + tcg_out_movi_i32(s, t0, arg); + } else { + tcg_out_movi_i64(s, t0, arg); + } +} + +/** + * Generate a CALL. + */ +static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) +{ + tcg_out_nullary_gadget(s, gadget_call); + tcg_out64(s, (uintptr_t)arg); +} + +/** + * Generates LD instructions. + */ +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, + intptr_t arg2) +{ + + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + } +} + + +/** + * Generate every other operation. + */ +//static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +{ + switch (opc) { + + // Exit translation, and return back to QEMU. + case INDEX_op_exit_tb: + // Emit a simple gadget with a known return code. + tcg_out_imm64_gadget(s, gadget_exit_tb, args[0]); + break; + + // Jump to a translation block. + case INDEX_op_goto_tb: + + // If we're using a direct jump, we'll emit a "relocation" that can be usd + // to patch our gadget stream with the target address, later. + if (s->tb_jmp_insn_offset) { + // Emit our gadget. + tcg_out_nullary_gadget(s, gadget_br); + + // Place our current instruction into our "relocation table", so it can + // be patched once we know where the branch will target... + s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); + + // ... and emit our relocation. + tcg_out64(s, args[0]); + + + } else { + /* Indirect jump method. */ + TODO(); + } + set_jmp_reset_offset(s, args[0]); + break; + + // Simple branch. + case INDEX_op_br: + tcg_out_nullary_gadget(s, gadget_br); + tcti_out_label(s, arg_label(args[0])); + break; + + + // Set condition flag. + // a0 = Rd, a1 = Rn, a2 = Rm + case INDEX_op_setcond_i32: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i32_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i32_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i32_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i32_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i32_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i32_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i32_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i32_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i32_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i32_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + case INDEX_op_setcond_i64: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. 
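+        // As in the _i32 case above, the condition is baked into the gadget
+        // rather than read at runtime; the unsigned comparisons map onto the
+        // AArch64 LO/HS/LS/HI condition codes (see the README's setcond table).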
+ switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + /** + * Load instructions. + */ + + case INDEX_op_ld8u_i32: + case INDEX_op_ld8u_i64: + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i32: + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i64: + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16u_i32: + case INDEX_op_ld16u_i64: + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i32: + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i64: + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i32: + case INDEX_op_ld32u_i64: + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i64: + tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld32s_i64: + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + break; + + + /** + * Store instructions. + */ + case INDEX_op_st8_i32: + case INDEX_op_st8_i64: + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); + break; + + case INDEX_op_st16_i32: + case INDEX_op_st16_i64: + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i32: + case INDEX_op_st32_i64: + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i64: + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + break; + + /** + * Arithmetic instructions. + */ + + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i32: + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i32: + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i32: + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_or_i32: + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i32: + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i32: + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i32: + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i32: + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + + case INDEX_op_add_i64: + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i64: + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i64: + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i64: + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ + //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + + case INDEX_op_or_i64: + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i64: + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i64: + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i64: + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i64: + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + // break; + + case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i64: + { + static uint8_t last_brcond_i64 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[2]) { + case TCG_COND_EQ: gadget = gadget_brcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_brcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_brcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_brcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_brcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_brcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_brcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_brcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_brcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_brcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + // We'll select the which branch to used based on a cycling counter. + // This means we'll pick one of 16 identical brconds. Spreading this out + // helps the processor's branch prediction be less "squished", as not every + // branch is going throuh the same instruction. + tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + + // Branch target immediate. + tcti_out_label(s, arg_label(args[3])); + break; + } + + + case INDEX_op_bswap16_i32: /* Optional (TCG_TARGET_HAS_bswap16_i32). */ + case INDEX_op_bswap16_i64: /* Optional (TCG_TARGET_HAS_bswap16_i64). */ + tcg_out_binary_gadget(s, gadget_bswap16, args[0], args[1]); + break; + + case INDEX_op_bswap32_i32: /* Optional (TCG_TARGET_HAS_bswap32_i32). */ + case INDEX_op_bswap32_i64: /* Optional (TCG_TARGET_HAS_bswap32_i64). */ + tcg_out_binary_gadget(s, gadget_bswap32, args[0], args[1]); + break; + + case INDEX_op_bswap64_i64: /* Optional (TCG_TARGET_HAS_bswap64_i64). */ + tcg_out_binary_gadget(s, gadget_bswap64, args[0], args[1]); + break; + + case INDEX_op_not_i64: /* Optional (TCG_TARGET_HAS_not_i64). */ + tcg_out_binary_gadget(s, gadget_not_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i64: /* Optional (TCG_TARGET_HAS_neg_i64). */ + tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]); + break; + + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ + tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); + break; + + case INDEX_op_ext8u_i32: /* Optional (TCG_TARGET_HAS_ext8u_i32). */ + case INDEX_op_ext8u_i64: /* Optional (TCG_TARGET_HAS_ext8u_i64). */ + tcg_out_binary_gadget(s, gadget_ext8u, args[0], args[1]); + break; + + case INDEX_op_ext16s_i64: /* Optional (TCG_TARGET_HAS_ext16s_i64). */ + tcg_out_binary_gadget(s, gadget_ext16s_i64, args[0], args[1]); + break; + + case INDEX_op_ext16u_i32: /* Optional (TCG_TARGET_HAS_ext16u_i32). */ + case INDEX_op_ext16u_i64: /* Optional (TCG_TARGET_HAS_ext16u_i64). */ + tcg_out_binary_gadget(s, gadget_ext16u, args[0], args[1]); + break; + + case INDEX_op_ext32s_i64: /* Optional (TCG_TARGET_HAS_ext32s_i64). */ + case INDEX_op_ext_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32s_i64, args[0], args[1]); + break; + + case INDEX_op_ext32u_i64: /* Optional (TCG_TARGET_HAS_ext32u_i64). */ + case INDEX_op_extu_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ + tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); + break; + + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). 
*/ + tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); + break; + + case INDEX_op_ext8s_i32: /* Optional (TCG_TARGET_HAS_ext8s_i32). */ + tcg_out_binary_gadget(s, gadget_ext8s_i32, args[0], args[1]); + break; + + case INDEX_op_ext16s_i32: /* Optional (TCG_TARGET_HAS_ext16s_i32). */ + tcg_out_binary_gadget(s, gadget_ext16s_i32, args[0], args[1]); + break; + + case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i32: + { + static uint8_t last_brcond_i32 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[2]) { + case TCG_COND_EQ: gadget = gadget_brcond_i32_eq; break; + case TCG_COND_NE: gadget = gadget_brcond_i32_ne; break; + case TCG_COND_LT: gadget = gadget_brcond_i32_lt; break; + case TCG_COND_GE: gadget = gadget_brcond_i32_ge; break; + case TCG_COND_LE: gadget = gadget_brcond_i32_le; break; + case TCG_COND_GT: gadget = gadget_brcond_i32_gt; break; + case TCG_COND_LTU: gadget = gadget_brcond_i32_lo; break; + case TCG_COND_GEU: gadget = gadget_brcond_i32_hs; break; + case TCG_COND_LEU: gadget = gadget_brcond_i32_ls; break; + case TCG_COND_GTU: gadget = gadget_brcond_i32_hi; break; + default: + g_assert_not_reached(); + } + + // We'll select the which branch to used based on a cycling counter. + // This means we'll pick one of 16 identical brconds. Spreading this out + // helps the processor's branch prediction be less "squished", as not every + // branch is going throuh the same instruction. + tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]); + last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS; + + // Branch target immediate. + tcti_out_label(s, arg_label(args[3])); + + break; + } + + case INDEX_op_qemu_ld_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + break; + } + + case INDEX_op_qemu_ld_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? 
+ gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. + else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + } + + break; + } + + case INDEX_op_qemu_st_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + break; + } + + case INDEX_op_qemu_st_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off64_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off96_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off128_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. 
+ else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + } + + break; + } + + // Memory barriers. + case INDEX_op_mb: + { + static void* sync[] = { + [0 ... TCG_MO_ALL] = gadget_mb_all, + [TCG_MO_ST_ST] = gadget_mb_st, + [TCG_MO_LD_LD] = gadget_mb_ld, + [TCG_MO_LD_ST] = gadget_mb_ld, + [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, + }; + tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + + break; + } + + case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ + case INDEX_op_mov_i64: + case INDEX_op_call: /* Always emitted via tcg_out_call. */ + default: + tcg_abort(); + } +} + +/** + * Generate immediate stores. + */ +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, + intptr_t arg2) +{ + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + } +} + +static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, + TCGReg base, intptr_t ofs) +{ + return false; +} + +/* Test if a constant matches the constraint. */ +static int tcg_target_const_match(tcg_target_long val, TCGType type, + const TCGArgConstraint *arg_ct) +{ + /* No need to return 0 or 1, 0 or != 0 is good enough. */ + return arg_ct->ct & TCG_CT_CONST; +} + +static void tcg_target_init(TCGContext *s) +{ + /* The current code uses uint8_t for tcg operations. */ + tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); + + /* Registers available for 32 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; + /* Registers available for 64 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; + + /* TODO: Which registers should be set here? */ + tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + + s->reserved_regs = 0; + tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); + + /* We use negative offsets from "sp" so that we can distinguish + stores that might pretend to be call arguments. */ + tcg_set_frame(s, TCG_REG_CALL_STACK, -CPU_TEMP_BUF_NLONGS * sizeof(long), CPU_TEMP_BUF_NLONGS * sizeof(long)); +} + +/* Generate global QEMU prologue and epilogue code. */ +static inline void tcg_target_qemu_prologue(TCGContext *s) +{ + // No prologue; as we're interpreted. +} + + +/** + * TCTI 'interpreter' bootstrap. + */ + +// Store the current return address during helper calls. +__thread uintptr_t tcti_call_return_address; + +/* Dispatch the bytecode stream contained in our translation buffer. */ +uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ptr) +{ + // Create our per-CPU temporary storage. + long tcg_temps[CPU_TEMP_BUF_NLONGS]; + + uint64_t return_value = 0; + uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS); + uintptr_t pc_mirror = (uintptr_t)&tcti_call_return_address; + + // Ensure our target configuration hasn't changed. 
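The inline assembly that follows is the entire "interpreter" entry point: it never decodes bytecode, it just calls the first gadget, and every gadget ends by loading and branching to the next gadget pointer. As a rough illustration only (not part of the patch; the real gadgets are naked functions that chain with "ldr x27, [x28], #8; br x27" and never return to a C loop, and all names below are hypothetical), the dispatch model amounts to:

    /* Illustrative model of threaded dispatch -- a sketch, not patch code. */
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*gadget_fn)(void);

    static uint64_t reg0;                        /* stands in for one TCTI register */

    static void gadget_movi_42(void) { reg0 = 42; }
    static void gadget_add_1(void)   { reg0 += 1; }

    int main(void)
    {
        /* A "translation block" is just a flat array of gadget addresses. */
        const gadget_fn block[] = { gadget_movi_42, gadget_add_1, NULL };
        const gadget_fn *ip = block;             /* plays the role of x28 */

        for (;;) {
            gadget_fn next = *ip++;              /* ldr x27, [x28], #8 */
            if (next == NULL) {                  /* roughly: exit_tb */
                break;
            }
            next();                              /* br x27 */
        }
        printf("reg0 = %llu\n", (unsigned long long)reg0);   /* prints reg0 = 43 */
        return 0;
    }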
+ tcti_assert(TCG_AREG0 == TCG_REG_R14); + tcti_assert(TCG_REG_CALL_STACK == TCG_REG_R15); + + asm( + // Our threaded-dispatch prologue needs to set up things for our machine to run. + // This means: + // - Set up TCG_AREG0 (R14) to point to our architectural state. + // - Set up TCG_REG_CALL_STACK (R15) to point to our temporary buffer. + // - Point x28 (our bytecode "instruction pointer") to the relevant stream address. + "ldr x14, %[areg0]\n" + "ldr x15, %[sp_value]\n" + "ldr x25, %[pc_mirror]\n" + "ldr x28, %[start_tb_ptr]\n" + + // To start our code, we'll -call- the gadget at the first bytecode pointer. + // Note that we call/branch-with-link, here; so our TB_EXIT gadget can RET in order + // to return to this point when things are complete. + "ldr x27, [x28], #8\n" + "blr x27\n" + + // Finally, we'll copy out our final return value. + "str x0, %[return_value]\n" + + : [return_value] "=m" (return_value) + + : [areg0] "m" (env), + [sp_value] "m" (sp_value), + [start_tb_ptr] "m" (v_tb_ptr), + [pc_mirror] "m" (pc_mirror) + + // We touch _every_ one of the lower registers, as we use these to execute directly. + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + + // We also use x25 for our return-address mirror, x26/x27 for temporary values, and x28 as our bytecode pointer. + "x25", "x26", "x27", "x28", "cc", "memory" + ); + + return return_value; +} + + +/** + * Disassembly output support. + */ +#include <dlfcn.h> + + +/* Disassemble TCTI bytecode. */ +int print_insn_tcti(bfd_vma addr, disassemble_info *info) +{ + Dl_info symbol_info = {}; + char symbol_name[48]; + + int status; + uint64_t block; + + // Read the relevant pointer. + status = info->read_memory_func(addr, (void *)&block, sizeof(block), info); + if (status != 0) { + info->memory_error_func(status, addr, info); + return -1; + } + + // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. + dladdr((void *)block, &symbol_info); + + if(symbol_info.dli_sname != 0) { + strlcpy(symbol_name, symbol_info.dli_sname, 47); + info->fprintf_func(info->stream, "%s", symbol_name); + } else { + info->fprintf_func(info->stream, "%016llx", block); + } + + return sizeof(block); +} + + diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h new file mode 100644 index 000000000000..fa2ae5c40a3e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.h @@ -0,0 +1,220 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2009, 2011 Stefan Weil + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * This code implements a TCG which does not generate machine code for some + * real target machine but which generates virtual machine code for an + * interpreter. Interpreted pseudo code is slow, but it works on any host. + * + * Some remarks might help in understanding the code: + * + * "target" or "TCG target" is the machine which runs the generated code. + * This is different to the usual meaning in QEMU where "target" is the + * emulated machine. So normally QEMU host is identical to TCG target. + * Here the TCG target is a virtual machine, but this virtual machine must + * use the same word size like the real machine. + * Therefore, we need both 32 and 64 bit virtual machines (interpreter). + */ + +#ifndef TCG_TARGET_H +#define TCG_TARGET_H + +#if UINTPTR_MAX == UINT32_MAX +# error We only support AArch64 running in 64B mode. +#elif UINTPTR_MAX == UINT64_MAX +# define TCG_TARGET_REG_BITS 64 +#else +# error Unknown pointer size for tcti target +#endif + +#define TCG_TARGET_INSN_UNIT_SIZE 1 +#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32 + +// We're an interpreted target; even if we're JIT-compiling to our interpreter's +// weird psuedo-native bytecode. We'll indicate that we're intepreted. +#define TCG_TARGET_INTERPRETER 1 + +// +// Supported optional instructions. +// + +// Divs. +#define TCG_TARGET_HAS_div_i32 1 +#define TCG_TARGET_HAS_rem_i32 1 +#define TCG_TARGET_HAS_div_i64 1 +#define TCG_TARGET_HAS_rem_i64 1 + +// Extends. +#define TCG_TARGET_HAS_ext8s_i32 1 +#define TCG_TARGET_HAS_ext16s_i32 1 +#define TCG_TARGET_HAS_ext8u_i32 1 +#define TCG_TARGET_HAS_ext16u_i32 1 +#define TCG_TARGET_HAS_ext8s_i64 1 +#define TCG_TARGET_HAS_ext16s_i64 1 +#define TCG_TARGET_HAS_ext32s_i64 1 +#define TCG_TARGET_HAS_ext8u_i64 1 +#define TCG_TARGET_HAS_ext16u_i64 1 +#define TCG_TARGET_HAS_ext32u_i64 1 + +// Logicals. +#define TCG_TARGET_HAS_neg_i32 1 +#define TCG_TARGET_HAS_not_i32 1 +#define TCG_TARGET_HAS_neg_i64 1 +#define TCG_TARGET_HAS_not_i64 1 + +#define TCG_TARGET_HAS_andc_i32 1 +#define TCG_TARGET_HAS_orc_i32 1 +#define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_andc_i64 1 +#define TCG_TARGET_HAS_eqv_i64 1 +#define TCG_TARGET_HAS_orc_i64 1 + +// We don't curretly support rotates, since AArch64 lacks ROL. +// We'll fix this later. +#define TCG_TARGET_HAS_rot_i32 0 +#define TCG_TARGET_HAS_rot_i64 0 + +// Swaps. +#define TCG_TARGET_HAS_bswap16_i32 1 +#define TCG_TARGET_HAS_bswap32_i32 1 +#define TCG_TARGET_HAS_bswap16_i64 1 +#define TCG_TARGET_HAS_bswap32_i64 1 +#define TCG_TARGET_HAS_bswap64_i64 1 +#define TCG_TARGET_HAS_MEMORY_BSWAP 1 + +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + +// +// Potential TODOs. +// + +// TODO: implement DEPOSIT as BFI. +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 + +// TODO: implement EXTRACT as BFX. +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 + +// TODO: it might be worth writing a gadget for this +#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 + +// +// Unsupported instructions. +// + +// ARMv8 doesn't have instructions for NAND/NOR. 
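Leaving optional ops like these at 0 is functionally safe: when a backend doesn't advertise an op, TCG's middle-end expands it from ops the backend does provide (NAND/NOR become AND/OR plus NOT, and rotates become shift/OR pairs). For the rotates disabled above, the middle-end expansion and the possible future ROR-based gadget correspond to the identities below; this is a sketch for illustration, not code from the patch:

    #include <assert.h>
    #include <stdint.h>

    /* Expansion TCG falls back to when TCG_TARGET_HAS_rot_i32 is 0: shifts plus an OR. */
    static uint32_t rotl32_via_shifts(uint32_t x, unsigned n)
    {
        n &= 31;
        return n ? (x << n) | (x >> (32 - n)) : x;
    }

    /* What a future ROR-based gadget could do instead: rotate right by the negated
       amount, as in the commented-out "neg w27, Wm; ror Wd, Wn, w27" sequence later
       in the gadget generator. */
    static uint32_t rotl32_via_rotr(uint32_t x, unsigned n)
    {
        unsigned r = (32 - (n & 31)) & 31;
        return r ? (x >> r) | (x << (32 - r)) : x;
    }

    int main(void)
    {
        assert(rotl32_via_shifts(0x80000001u, 4) == 0x18u);
        assert(rotl32_via_rotr(0x80000001u, 4) == 0x18u);
        return 0;
    }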
+#define TCG_TARGET_HAS_nand_i32 0 +#define TCG_TARGET_HAS_nor_i32 0 +#define TCG_TARGET_HAS_nor_i64 0 +#define TCG_TARGET_HAS_nand_i64 0 + +// AArch64's CLZ has no conditional form, so a single gadget can't provide the +// zero-input fallback value TCG's clz/ctz ops expect; leave these to the middle-end for now. +#define TCG_TARGET_HAS_clz_i32 0 +#define TCG_TARGET_HAS_ctz_i32 0 +#define TCG_TARGET_HAS_ctpop_i32 0 +#define TCG_TARGET_HAS_clz_i64 0 +#define TCG_TARGET_HAS_ctz_i64 0 +#define TCG_TARGET_HAS_ctpop_i64 0 + + +// GOTO_PTR is too complex to emit a simple gadget for. +// We'll let C handle it, since the overhead is similar. +#define TCG_TARGET_HAS_goto_ptr 0 + +// We don't have a simple gadget for this, since we're always assuming softmmu. +#define TCG_TARGET_HAS_qemu_st8_i32 0 + +// No AArch64 equivalent. +#define TCG_TARGET_HAS_extrl_i64_i32 0 +#define TCG_TARGET_HAS_extrh_i64_i32 0 + +#define TCG_TARGET_HAS_extract2_i64 0 + +// These should always be zero on our 64-bit platform. +#define TCG_TARGET_HAS_muls2_i64 0 +#define TCG_TARGET_HAS_add2_i32 0 +#define TCG_TARGET_HAS_sub2_i32 0 +#define TCG_TARGET_HAS_mulu2_i32 0 +#define TCG_TARGET_HAS_add2_i64 0 +#define TCG_TARGET_HAS_sub2_i64 0 +#define TCG_TARGET_HAS_mulu2_i64 0 +#define TCG_TARGET_HAS_muluh_i64 0 +#define TCG_TARGET_HAS_mulsh_i64 0 +#define TCG_TARGET_HAS_extract2_i32 0 +#define TCG_TARGET_HAS_muls2_i32 0 +#define TCG_TARGET_HAS_muluh_i32 0 +#define TCG_TARGET_HAS_mulsh_i32 0 + +// +// Platform metadata. +// + +// Number of registers available. +// It might make sense to raise this, since we could also use x16 -> x25. +#define TCG_TARGET_NB_REGS 16 + +/* List of registers which are used by TCG. */ +typedef enum { + TCG_REG_R0 = 0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + TCG_REG_R14, + TCG_REG_R15, + + TCG_AREG0 = TCG_REG_R14, + TCG_REG_CALL_STACK = TCG_REG_R15, +} TCGReg; + +// Specify the shape of the stack our runtime will use. +#define TCG_TARGET_CALL_STACK_OFFSET 0 +#define TCG_TARGET_STACK_ALIGN 16 + +// We're interpreted, so we'll use our own code to run TB_EXEC. +#define HAVE_TCG_QEMU_TB_EXEC + +// We'll need to enforce memory ordering with barriers. +#define TCG_TARGET_DEFAULT_MO (0) + +void tci_disas(uint8_t opc); + +void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + +#endif /* TCG_TARGET_H */ diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py new file mode 100755 index 000000000000..1296f6d0c2d7 --- /dev/null +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -0,0 +1,788 @@ +#!/usr/bin/env python3 +""" Gadget-code generator for QEMU TCTI on AArch64. + +Generates a C-code include file containing 'gadgets' for use by TCTI. +""" + +import sys +import itertools + +# Get a handle on the file we'll be working with, and redirect print to it. +if len(sys.argv) > 1: + out_file = open(sys.argv[1], "w") + + # Hook our print function, so it always outputs to the relevant file. + core_print = print + print = lambda *a, **k : core_print(*a, **k, file=out_file) + +# Epilogue code follows at the end of each gadget, and handles continuing execution. +EPILOGUE = ( + # Load our next gadget address from our bytecode stream, advancing it. + "ldr x27, [x28], #8", + + # Jump to the next gadget. + "br x27" +) + +# The number of general-purpose registers we're affording the TCG. This must match +# the configuration in the TCTI target.
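For orientation, the C that the helpers defined below (simple(), with_dnm(), and friends) emit for a single specialization looks roughly like the following; this particular instance is reconstructed as an illustration of the output shape, not copied from real generator output, and it only assembles on AArch64:

    /* One specialization of a three-register gadget, registers baked in. */
    __attribute__((naked)) void gadget_add_i64_arg0_arg1_arg2(void);
    __attribute__((naked)) void gadget_add_i64_arg0_arg1_arg2(void)
    {
        asm(
            "add x0, x1, x2 \n"      /* the operation itself */
            "ldr x27, [x28], #8 \n"  /* EPILOGUE: fetch the next gadget pointer */
            "br x27 \n"              /* EPILOGUE: tail-jump straight to it */
        );
    }

    /*
     * The generator also emits a lookup table per operation, which the backend
     * indexes by the three TCG register numbers when emitting bytecode:
     *
     *     void *gadget_add_i64[16][16][16] = { ... };
     */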
+TCG_REGISTER_COUNT = 16 +TCG_REGISTER_NUMBERS = list(range(TCG_REGISTER_COUNT)) + +# Helper that provides each of the AArch64 condition codes of interest. +ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] + +# We'll create a variety of gadgets that assume the MMU's TLB is stored at certain +# offsets into its structure. These should match the offsets in tcg-target.c.in. +QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] + +# Statistics. +gadgets = 0 +instructions = 0 + +def simple(name, *lines): + """ Generates a simple gadget that needs no per-register specialization. """ + + global gadgets, instructions + + gadgets += 1 + + # Create our C/ASM framing. + #print(f"__attribute__((naked)) static void gadget_{name}(void)") + print(f"__attribute__((naked)) void gadget_{name}(void);") + print(f"__attribute__((naked)) void gadget_{name}(void)") + print("{") + + # Add the core gadget + print("\tasm(") + for line in lines + EPILOGUE: + print(f"\t\t\"{line} \\n\"") + instructions += 1 + print("\t);") + + # End our framing. + print("}\n") + + +def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): + """ Generates a collection of gadgtes with register substitutions. """ + + def substitutions_for_letter(letter, number, line): + """ Helper that transforms Wd => w1, implementing gadget substitutions. """ + + # Register substitutions... + line = line.replace(f"X{letter}", f"x{number}") + line = line.replace(f"W{letter}", f"w{number}") + + # ... immediate substitutions. + line = line.replace(f"I{letter}", f"{number}") + return line + + + # Build a list of all the various stages we'll iterate over... + immediate_parameters = list(immediate_range) + parameters = ([TCG_REGISTER_NUMBERS] * len(substitutions)) + + # ... adding immediates, if need be. + if immediate_parameters: + parameters.append(immediate_parameters) + substitutions = substitutions + ['i'] + + # Generate a list of register-combinations we'll support. + permutations = itertools.product(*parameters) + + # For each permutation... + for permutation in permutations: + new_lines = lines + + # Replace each placeholder element with its proper value... + for index, element in enumerate(permutation): + letter = substitutions[index] + number = element + + # Create new gadgets for the releavnt line... + new_lines = [substitutions_for_letter(letter, number, line) for line in new_lines] + + # ... and emit the gadget. + permutation_id = "_arg".join(str(number) for number in permutation) + simple(f"{name}_arg{permutation_id}", *new_lines) + + +def with_dnm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ + with_register_substitutions(name, ("d", "n", "m"), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for m in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_dn_immediate(name, *lines, immediate_range): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. 
""" + with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for i in immediate_range: + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_pair(name, substitutions, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, substitutions, *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # N array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for b in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + +def math_dnm(name, mnemonic): + """ Equivalent to `with_dnm`, but creates a _i32 and _i64 variant. For simple math. """ + with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") + with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") + +def math_dn(name, mnemonic): + """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. """ + with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + + +def with_nm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xn, and Xm, and equivalents. """ + with_pair(name, ('n', 'm',), *lines) + + +def with_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. """ + with_pair(name, ('d', 'n',), *lines) + + +def ldst_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. + + This variant is optimized for loads and stores, and optimizes common offset cases. + """ + + # + # Simple case: create our gadgets. + # + with_dn(name, "ldr x27, [x28], #8", *lines) + + # + # Optimization case: create variants of our gadgets with our offsets replaced with common immediates. + # + immediate_lines_pos = [line.replace("x27", "#Ii") for line in lines] + with_dn_immediate(f"{name}_imm", *immediate_lines_pos, immediate_range=range(64)) + + immediate_lines_aligned = [line.replace("x27", "#(Ii << 3)") for line in lines] + with_dn_immediate(f"{name}_sh8_imm", *immediate_lines_aligned, immediate_range=range(64)) + + immediate_lines_neg = [line.replace("x27", "#-Ii") for line in lines] + with_dn_immediate(f"{name}_neg_imm", *immediate_lines_neg, immediate_range=range(64)) + + +def with_single(name, substitution, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, (substitution,), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + for n in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{n}", end=", ") + + print("};") + + +def with_d_immediate(name, *lines, immediate_range=range(0)): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. 
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # I array + for b in immediate_range: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + + +def with_d(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd. """ + with_single(name, 'd', *lines) + + +# Assembly code for saving our machine state before entering the C runtime. +C_CALL_PROLOGUE = [ + # Store our machine state. + "str x25, [sp, #-16]!", + "stp x14, x15, [sp, #-16]!", + "stp x12, x13, [sp, #-16]!", + "stp x10, x11, [sp, #-16]!", + "stp x8, x9, [sp, #-16]!", + "stp x6, x7, [sp, #-16]!", + "stp x4, x5, [sp, #-16]!", + "stp x2, x3, [sp, #-16]!", + "stp x0, x1, [sp, #-16]!", + "stp x28, lr, [sp, #-16]!", +] + +# Assembly code for restoring our machine state after leaving the C runtime. +C_CALL_EPILOGUE = [ + "ldp x28, lr, [sp], #16", + "ldp x0, x1, [sp], #16", + "ldp x2, x3, [sp], #16", + "ldp x4, x5, [sp], #16", + "ldp x6, x7, [sp], #16", + "ldp x8, x9, [sp], #16", + "ldp x10, x11, [sp], #16", + "ldp x12, x13, [sp], #16", + "ldp x14, x15, [sp], #16", + "ldr x25, [sp], #16", +] + + +def create_tlb_fastpath(is_aligned, is_write, offset, miss_label="0"): + """ Creates a set of instructions that perform a soft-MMU TLB lookup. + + This is used for `qemu_ld`/qemu_st` instructions; to emit a prologue that + hopefully helps us skip a slow call into the C runtime when a Guest Virtual + -> Host Virtual mapping is in the softmmu's TLB. + + This "fast-path" prelude behaves as follows: + - If a TLB entry is found for the address stored in Xn, then x27 + is stored to an "addend" that can be added to the guest virtual addres + to get the host virtual address (the address in our local memory space). + - If a TLB entry isn't found, it branches to the "miss_label" (by default, 0:), + so address lookup can be handled by the fastpath. + + Clobbers x24, and x26; provides output in x27. + """ + + fast_path = [ + # Load env_tlb(env)->f[mmu_idx].{mask,table} into {x26,x27}. + f"ldp x26, x27, [x14, #-{offset}]", + + # Extract the TLB index from the address into X26. + "and x26, x26, Xn, lsr #7", # Xn = addr regsiter + + # Add the tlb_table pointer, creating the CPUTLBEntry address into X27. + "add x27, x27, x26", + + # Load the tlb comparator into X26, and the fast path addend into X27. + "ldr x26, [x27, #8]" if is_write else "ldr x26, [x27]", + "ldr x27, [x27, #0x18]", + + ] + + if is_aligned: + fast_path.extend([ + # Store the page mask part of the address into X24. + "and x24, Xn, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + else: + fast_path.extend([ + # If we're not aligned, add in our alignment value to ensure we don't + # don't straddle the end of a page. + "add x24, Xn, #7", + + # Store the page mask part of the address into X24. + "and x24, x24, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + + return fast_path + + + +def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. 
""" + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. + if fastpath and not force_slowpath: + fastpath_ops = [ + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=False, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + # Otherwise, we'll save arguments for our slow path. + else: + fastpath_ops = [] + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + "mov x27, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Placed in x27 below.] + # - Move our operation info into x2, from an immediate32. + # - Move the next bytecode pointer into x3, from x28. + "mov x0, x14", + "mov x1, x27", + f"mov x2, #{immediate}" if (immediate is not None) else "ldr x2, [x28], #8", + "mov x3, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Temporarily store our result in a register that won't get trashed. + "mov x27, x0", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript... + *postscript, + + # ... and place our results in the target register. + "mov Wd, w27" if is_32b else "mov Xd, x27" + ) + + +def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. """ + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. + if fastpath and not force_slowpath: + fastpath_ops = [ + + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=True, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + else: + fastpath_ops = [] + + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + # Move our arguments into registers that we're not actively using. 
+ # This ensures that they won't be trounced by our calling convention + # if this is reading values from x0-x4. + "mov w27, Wd" if is_32b else "mov x27, Xd", + "mov x26, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Moved into x26 above]. + # - Move our target value into x2. [Moved into x27 above]. + # - Move our operation info into x3, from an immediate32. + # - Move the next bytecode pointer into x4, from x28. + "mov x0, x14", + "mov x1, x26", + "mov x2, x27", + f"mov x3, #{immediate}" if (immediate is not None) else "ldr x3, [x28], #8", + "mov x4, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript. + *postscript + ) + + +# +# Gadget definitions. +# + +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") + +# Call a C language helper function by address. +simple("call", + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. + "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +) + +# Branch to a given immediate address. +simple("br", + # Use our immediate argument as our new bytecode-pointer location. + "ldr x28, [x28]" +) + +# Exit from a translation buffer execution. +simple("exit_tb", + + # We have a single immediate argument, which contains our return code. + # Place it into x0, as one would a return code. + "ldr x0, [x28], #8", + + # And finally, return back to the code that invoked our gadget stream. + "ret" +) + + +for condition in ARCH_CONDITION_CODES: + + # Performs a comparison between two operands. + with_dnm(f"setcond_i32_{condition}", + "subs Wd, Wn, Wm", + f"cset Wd, {condition}" + ) + with_dnm(f"setcond_i64_{condition}", + "subs Xd, Xn, Xm", + f"cset Xd, {condition}" + ) + + # + # NOTE: we use _dnm for the conditional branches, even though we don't + # actually do anything different based on the d argument. This gemerates + # effectively 16 identical `brcond` gadgets for each condition; which we + # use in the backend to spread out the actual branch sources we use. + # + # This is a slight mercy for the branch predictor, as not every conditional + # branch is funneled throught the same address. + # + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i32_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Wzr, Wn, Wm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i64_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Xzr, Xn, Xm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. 
+ "mov x28, x27" + ) + + +# MOV variants. +with_dn("mov_i32", "mov Wd, Wn") +with_dn("mov_i64", "mov Xd, Xn") +with_d("movi_i32", "ldr Wd, [x28], #8") +with_d("movi_i64", "ldr Xd, [x28], #8") + +# Create MOV variants that have common constants built in to the gadget. +# This optimization helps costly reads from memories for simple operations. +with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) +with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) + +# LOAD variants. +# TODO: should the signed variants have X variants for _i64? +ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") +ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") +ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +# STORE variants. +ldst_dn("st8", "strb Wd, [Xn, x27]") +ldst_dn("st16", "strh Wd, [Xn, x27]") +ldst_dn("st_i32", "str Wd, [Xn, x27]") +ldst_dn("st_i64", "str Xd, [Xn, x27]") + +# QEMU LD/ST are handled in our C runtime rather than with simple gadgets, +# as they're nontrivial. + +# Trivial arithmetic. +math_dnm("add" , "add" ) +math_dnm("sub" , "sub" ) +math_dnm("mul" , "mul" ) +math_dnm("div" , "sdiv") +math_dnm("divu", "udiv") + +# Division remainder +with_dnm("rem_i32", "sdiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("rem_i64", "sdiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") + +# Trivial logical. +math_dn( "not", "mvn") +math_dn( "neg", "neg") +math_dnm("and", "and") +math_dnm("andc", "bic") +math_dnm("or", "orr") +math_dnm("orc", "orn") +math_dnm("xor", "eor") +math_dnm("eqv", "eon") +math_dnm("shl", "lsl") +math_dnm("shr", "lsr") +math_dnm("sar", "asr") + +# AArch64 lacks a Rotate Left; so we instead rotate right by a negative. +# TODO: validate this? +#math_dnm("rotr", "ror") +#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") + +# Numeric extension. +math_dn("ext8s", "sxtb") +with_dn("ext8u", "and Xd, Xn, #0xff") +math_dn("ext16s", "sxth") +with_dn("ext16u", "and Wd, Wn, #0xffff") +with_dn("ext32s_i64", "sxtw Xd, Wn") +with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") + +# Byte swapping. +with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") +with_dn("bswap32", "rev Wd, Wn") +with_dn("bswap64", "rev Xd, Xn") + +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + +# Handlers for QEMU_LD, which handles guest <- host loads. 
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", + fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", + fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", + fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special variant for the most common mode, as a speedup optimization. + ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besw_{subtype}", None, None, "helper_be_lduw_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beul_{subtype}", None, None, "helper_be_ldul_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besl_{subtype}", None, None, "helper_be_ldul_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beq_{subtype}", None, None, "helper_be_ldq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Handlers for QEMU_ST, which handles guest -> host stores. 
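Both the load thunks above and the store thunks below bake in a special case for an operation index of 0x3a. That index is a TCGMemOpIdx; assuming the (memop << 4) | mmu_idx packing this QEMU tree uses, 0x3a decodes to a little-endian 64-bit access in MMU index 10, which is why the generator names these gadgets "mode3a" and treats them as the common case. A sketch of the decode, for illustration only:

    #include <stdio.h>

    int main(void)
    {
        unsigned oi = 0x3a;
        unsigned memop   = oi >> 4;   /* 0x3: MO_64, little-endian (MO_LE is 0 on LE hosts) */
        unsigned mmu_idx = oi & 0xf;  /* 0xa: the guest MMU index */

        printf("memop=%#x mmu_idx=%u\n", memop, mmu_idx);
        return 0;
    }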
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stw_mmu", + fastpath_32b=["strh Wd, [Xn, x27]"], fastpath_64b=["strh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stl_mmu", + fastpath_32b=["str Wd, [Xn, x27]"], fastpath_64b=["str Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special optimization for the most common modes. + st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beul_{subtype}", None, None, "helper_be_stl_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beq_{subtype}", None, None, "helper_be_stq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Statistics. +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") diff --git a/tcg/tcg.c b/tcg/tcg.c index 5b0750685102..ec832d92d0e6 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -165,7 +165,7 @@ TCGv_env cpu_env = 0; const void *tcg_code_gen_epilogue; uintptr_t tcg_splitwx_diff; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_prologue_fn *tcg_qemu_tb_exec; #endif @@ -1227,7 +1227,7 @@ void tcg_prologue_init(TCGContext *s) region.start = buf0; region.end = buf0 + total_size; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_qemu_tb_exec = (tcg_prologue_fn *)tcg_splitwx_to_rx(buf0); #endif @@ -1253,7 +1253,7 @@ void tcg_prologue_init(TCGContext *s) #endif buf1 = s->code_ptr; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(buf0), (uintptr_t)buf0, tcg_ptr_byte_diff(buf1, buf0)); #endif @@ -1981,7 +1981,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) #endif #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* We have 64-bit values in one register, but need to pass as two separate parameters. Split them. 
*/ int orig_sizemask = sizemask; @@ -2031,7 +2031,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) pi = 0; if (ret != NULL) { #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) if (orig_sizemask & 1) { /* The 32-bit ABI is going to return the 64-bit value in the %o0/%o1 register pair. Prepare for this by using @@ -2109,7 +2109,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) tcg_debug_assert(pi <= ARRAY_SIZE(op->args)); #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* Free all of the parts we allocated above. */ for (i = real_args = 0; i < orig_nargs; ++i) { int is_64bit = orig_sizemask & (1 << (i+1)*2); @@ -4789,7 +4789,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) return -2; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* flush instruction cache */ flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf), (uintptr_t)s->code_buf,