diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 28d563454e03..71676bbd2dea 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1129,7 +1129,7 @@ static bool alloc_code_gen_buffer_anon(size_t size, int prot, return true; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) #ifdef CONFIG_POSIX #include "qemu/memfd.h" @@ -1256,7 +1256,7 @@ static bool alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp) static bool alloc_code_gen_buffer_splitwx(size_t size, Error **errp) { -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) # ifdef CONFIG_DARWIN return alloc_code_gen_buffer_splitwx_vmremap(size, errp); # endif @@ -1289,7 +1289,7 @@ static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) prot = PROT_READ | PROT_WRITE | PROT_EXEC; flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) /* The tcg interpreter does not need execute permission. */ prot = PROT_READ | PROT_WRITE; #elif defined(CONFIG_DARWIN) diff --git a/configure b/configure index 0e7dbc56c784..9296d29e4fd6 100755 --- a/configure +++ b/configure @@ -362,6 +362,7 @@ tsan="no" fortify_source="$default_feature" strip_opt="yes" tcg_interpreter="false" +tcg_threaded_interpreter="false" bigendian="no" mingw32="no" gcov="no" @@ -1115,6 +1116,10 @@ for opt do ;; --enable-tcg-interpreter) tcg_interpreter="true" ;; + --disable-tcg-tcti) tcg_threaded_interpreter="false" + ;; + --enable-tcg-tcti) tcg_threaded_interpreter="true" + ;; --disable-cap-ng) cap_ng="disabled" ;; --enable-cap-ng) cap_ng="enabled" @@ -6469,9 +6474,8 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ - $cross_arg \ - "$PWD" "$source_path" + -Dtcg_interpreter=$tcg_interpreter -Dtcg_threaded_interpreter=$tcg_threaded_interpreter\ + -Dshared_lib=$shared_lib $cross_arg "$PWD" "$source_path" if test "$?" 
-ne 0 ; then error_exit "meson setup failed" diff --git a/disas.c b/disas.c index a61f95b580b8..cea0f9019e49 100644 --- a/disas.c +++ b/disas.c @@ -152,6 +152,8 @@ static void initialize_debug_host(CPUDebug *s) #endif #if defined(CONFIG_TCG_INTERPRETER) s->info.print_insn = print_insn_tci; +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) + s->info.print_insn = print_insn_tcti; #elif defined(__i386__) s->info.mach = bfd_mach_i386_i386; s->info.print_insn = print_insn_i386; diff --git a/include/disas/dis-asm.h b/include/disas/dis-asm.h index 13fa1edd411e..ded69ba2ffaa 100644 --- a/include/disas/dis-asm.h +++ b/include/disas/dis-asm.h @@ -411,6 +411,7 @@ typedef struct disassemble_info { typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *); int print_insn_tci(bfd_vma, disassemble_info*); +int print_insn_tcti(bfd_vma, disassemble_info*); int print_insn_big_mips (bfd_vma, disassemble_info*); int print_insn_little_mips (bfd_vma, disassemble_info*); int print_insn_nanomips (bfd_vma, disassemble_info*); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 6b036cae8f65..a8f2295decd2 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -543,7 +543,11 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); #if defined(CONFIG_TCG_INTERPRETER) extern __thread uintptr_t tci_tb_ptr; # define GETPC() tci_tb_ptr +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) +extern __thread uintptr_t tcti_call_return_address; +# define GETPC() tcti_call_return_address #else +/* Note that this is correct for TCTI also; whose gadget behaves like native code. */ # define GETPC() \ ((uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0))) #endif diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 0f0695e90da2..cfcd069bf3f6 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1296,7 +1296,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi) #define TB_EXIT_IDXMAX 1 #define TB_EXIT_REQUESTED 3 -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) uintptr_t tcg_qemu_tb_exec(CPUArchState *env, const void *tb_ptr); #else typedef uintptr_t tcg_prologue_fn(CPUArchState *env, const void *tb_ptr); diff --git a/meson.build b/meson.build index 1524a9be2121..8aea04191215 100644 --- a/meson.build +++ b/meson.build @@ -58,6 +58,7 @@ python = import('python').find_installation() supported_oses = ['windows', 'freebsd', 'netbsd', 'openbsd', 'darwin', 'sunos', 'linux'] supported_cpus = ['ppc', 'ppc64', 's390x', 'riscv32', 'riscv64', 'x86', 'x86_64', 'arm', 'aarch64', 'mips', 'mips64', 'sparc', 'sparc64'] +tcti_supported_cpus = ['aarch64'] cpu = host_machine.cpu_family() targetos = host_machine.system() @@ -248,6 +249,25 @@ if not get_option('tcg').disabled() endif if get_option('tcg_interpreter') tcg_arch = 'tci' + elif get_option('tcg_threaded_interpreter') + if cpu not in tcti_supported_cpus + error('Unsupported CPU @0@ for TCTI, try --enable-tcg-interpreter'.format(cpu)) + else + warning('TCTI is extremely experimental and incomplete! Things might break!') + tcg_arch = '@0@-tcti'.format(cpu) + endif + + # Tell our compiler how to generate our TCTI gadgets. 
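+    # The generated tcti-gadgets.c.inc holds the pre-built gadget bodies and the
+    # per-register lookup tables that tcg/aarch64-tcti/tcg-target.c.inc indexes at
+    # translation time; adding it to genh (below) ensures it is generated before
+    # any TCG sources that include it are compiled.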
+ gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) + tcti_gadgets = custom_target('tcti-gadgets.c.inc', + output: 'tcti-gadgets.c.inc', + input: gadget_generator, + command: [find_program(gadget_generator), '@OUTPUT@'], + build_by_default: true, + build_always_stale: false) + + genh += tcti_gadgets + elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' elif config_host['ARCH'] == 's390x' @@ -1284,6 +1304,8 @@ foreach target : target_dirs config_all += { sym: 'y' } if sym == 'CONFIG_TCG' and tcg_arch == 'tci' config_target += { 'CONFIG_TCG_INTERPRETER': 'y' } + elif sym == 'CONFIG_TCG' and tcg_arch.endswith('tcti') + config_target += { 'CONFIG_TCG_THREADED_INTERPRETER': 'y' } elif sym == 'CONFIG_XEN' and have_xen_pci_passthrough config_target += { 'CONFIG_XEN_PCI_PASSTHROUGH': 'y' } endif @@ -2575,6 +2597,8 @@ summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')} if config_all.has_key('CONFIG_TCG') if get_option('tcg_interpreter') summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'} + elif get_option('tcg_threaded_interpreter') + summary_info += {'TCG backend': 'TCTI (TCG with threaded-dispatch bytecode interpreter, experimental and slow; but faster than TCI)'} else summary_info += {'TCG backend': 'native (@0@)'.format(cpu)} endif diff --git a/meson_options.txt b/meson_options.txt index 6c29ea93300a..5aa68672c2ff 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,8 @@ option('tcg', type: 'feature', value: 'auto', description: 'TCG support') option('tcg_interpreter', type: 'boolean', value: false, description: 'TCG with bytecode interpreter (experimental and slow)') +option('tcg_threaded_interpreter', type: 'boolean', value: false, + description: 'TCG with threaded-dispatch bytecode interpreter (experimental and slow, but less slow than TCI)') option('cfi', type: 'boolean', value: 'false', description: 'Control-Flow Integrity (CFI)') option('cfi_debug', type: 'boolean', value: 'false', diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py index ee072c05025a..b0467ab56545 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -75,18 +75,18 @@ def process_tests(test, targets, suites): print('run-test-%d: $(.test.deps.%d)' % (i,i)) print('\t@$(call .test.run,%d,$(.test.output-format))' % (i,)) - test_suites = test['suite'] or ['default'] - is_slow = any(s.endswith('-slow') for s in test_suites) - for s in test_suites: - # The suite name in the introspection info is "PROJECT:SUITE" - s = s.split(':')[1] - if s.endswith('-slow'): - s = s[:-5] - if is_slow: - suites[s].slow_tests.append(i) - else: - suites[s].tests.append(i) - suites[s].executables.add(executable) + #test_suites = test['suite'] or ['default'] + #is_slow = any(s.endswith('-slow') for s in test_suites) + #for s in test_suites: + # # The suite name in the introspection info is "PROJECT:SUITE" + # s = s.split(':')[1] + # if s.endswith('-slow'): + # s = s[:-5] + # if is_slow: + # suites[s].slow_tests.append(i) + # else: + # suites[s].tests.append(i) + # suites[s].executables.add(executable) def emit_prolog(suites, prefix): all_tap = ' '.join(('%s-report-%s.tap' % (prefix, k) for k in suites.keys())) diff --git a/tcg/aarch64-tcti/README.md b/tcg/aarch64-tcti/README.md new file mode 100644 index 000000000000..eb848e5a9e57 --- /dev/null +++ b/tcg/aarch64-tcti/README.md @@ -0,0 +1,1026 @@ +# QEMU Tiny-Code Threaded Interpreter (AArch64) + +A TCG backend that chains together JOP/ROP-ish gadgets to massively reduce interpreter overhead 
vs TCI.
+Platform-dependent, but usable when JIT isn't available, e.g. on platforms that lack WX mappings. The general idea is to squish the addresses of a gadget sequence into a "queue", and then write each gadget so it ends in a "dequeue-jump".
+
+Execution occurs by jumping into the first gadget, and letting it just play back some linear-overhead native code sequences for a while.
+
+Since TCG-TCI is optimized for sets of 16 GP registers and aarch64 has 30, we could easily keep JIT/QEMU and guest state separate, and since 16\*16 is reasonably small we could actually have a set of reasonable gadgets for each combination of operands.
+
+## Register Convention
+
+| Regs | Use |
+| :------ | :-------------------- |
+| x1-x15 | Guest Registers |
+| x24 | TCTI temporary |
+| x25 | saved IP during call |
+| x26 | TCTI temporary |
+| x27 | TCTI temporary |
+| x28 | Thread-stream pointer |
+| x30 | Link register |
+| SP | Stack Pointer, host |
+| PC | Program Counter, host |
+
+In pseudocode:
+
+| Symbol | Meaning |
+| :----- | :---------------------------------- |
+| Rd | stand-in for destination register |
+| Rn | stand-in for first source register |
+| Rm | stand-in for second source register |
+
+## Gadget Structure
+
+### End of gadget
+
+Each gadget ends by advancing our bytecode pointer, and then executing from the new location.
+
+```asm
+# Load our next gadget address from our bytecode stream, advancing it, and jump to the next gadget.
+
+ldr x27, [x28], #8
+br x27
+```
+
+## Calling into QEMU's C codebase
+
+When calling into C, we lose control over which registers are used. Accordingly, we'll need to save
+registers relevant to TCTI:
+
+```asm
+str x25, [sp, #-16]!
+stp x14, x15, [sp, #-16]!
+stp x12, x13, [sp, #-16]!
+stp x10, x11, [sp, #-16]!
+stp x8, x9, [sp, #-16]!
+stp x6, x7, [sp, #-16]!
+stp x4, x5, [sp, #-16]!
+stp x2, x3, [sp, #-16]!
+stp x0, x1, [sp, #-16]!
+stp x28, lr, [sp, #-16]!
+```
+
+Upon returning to the gadget stream, we'll then restore them.
+
+```asm
+ldp x28, lr, [sp], #16
+ldp x0, x1, [sp], #16
+ldp x2, x3, [sp], #16
+ldp x4, x5, [sp], #16
+ldp x6, x7, [sp], #16
+ldp x8, x9, [sp], #16
+ldp x10, x11, [sp], #16
+ldp x12, x13, [sp], #16
+ldp x14, x15, [sp], #16
+ldr x25, [sp], #16
+```
+
+## TCG Operations
+
+Each operation needs an implementation for every platform, and probably a set of gadgets for each possible set of operands.
+
+With 16 addressable registers, that means:
+
+1 operand =\> 16 gadgets
+2 operands =\> 256 gadgets
+3 operands =\> 4096 gadgets
+
+### call
+
+Calls a helper function by address.
+
+**IR Format**: `call <ptr>`
+**Gadget type:** single
+
+```asm
+    # Get our C runtime function's location as a pointer-sized immediate...
+    "ldr x27, [x28], #8",
+
+    # Store our TB return address for our helper. This is necessary so the GETPC()
+    # macro works correctly as used in helper functions.
+    "str x28, [x25]",
+
+    # Prepare ourselves to call into our C runtime...
+    *C_CALL_PROLOGUE,
+
+    # ... perform the call itself ...
+    "blr x27",
+
+    # Save the result of our call for later.
+    "mov x27, x0",
+
+    # ... and restore our environment.
+    *C_CALL_EPILOGUE,
+
+    # Restore our return value.
+    "mov x0, x27"
+```
+
+### br
+
+Branches to a given immediate address.
+
+**IR Format**: `br <label>`
+**Gadget type:** single
+
+```asm
+# Use our immediate argument as our new bytecode-pointer location.
+ldr x28, [x28]
+```
+
+### setcond_i32
+
+Performs a comparison between two 32-bit operands.
+
+**IR Format**: `setcond32 <cond>, Rd, Rn, Rm`
+**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960)
+
+```asm
+subs Wd, Wn, Wm
+cset Wd, <cond>
+```
+
+| QEMU Cond | AArch64 Cond |
+| :-------- | :----------- |
+| EQ | EQ |
+| NE | NE |
+| LT | LT |
+| GE | GE |
+| LE | LE |
+| GT | GT |
+| LTU | LO |
+| GEU | HS |
+| LEU | LS |
+| GTU | HI |
+
+### setcond_i64
+
+Performs a comparison between two 64-bit operands.
+
+**IR Format**: `setcond64 <cond>, Rd, Rn, Rm`
+**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960)
+
+```asm
+subs Xd, Xn, Xm
+cset Xd, <cond>
+```
+
+Comparison chart is the same as the `_i32` variant.
+
+### brcond_i32
+
+Compares two 32-bit numbers, and branches if the comparison is true.
+
+**IR Format**: `brcond Rn, Rm, <label>`
+**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560)
+
+```asm
+# Perform our comparison and conditional branch.
+subs wzr, Wn, Wm
+b.<cond> taken
+
+    # Consume the branch target, without using it.
+    add x28, x28, #8
+
+    # Perform our end-of-instruction epilogue.
+
+taken:
+
+    # Update our bytecode pointer to take the label.
+    ldr x28, [x28]
+```
+
+Comparison chart is the same as in `setcond_i32`.
+
+### brcond_i64
+
+Compares two 64-bit numbers, and branches if the comparison is true.
+
+**IR Format**: `brcond Rn, Rm, <label>`
+**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560)
+
+```asm
+# Perform our comparison and conditional branch.
+subs xzr, Xn, Xm
+b.<cond> taken
+
+    # Consume the branch target, without using it.
+    add x28, x28, #8
+
+    # Perform our end-of-instruction epilogue.
+
+taken:
+
+    # Update our bytecode pointer to take the label.
+    ldr x28, [x28]
+```
+
+Comparison chart is the same as in `setcond_i32`.
+
+### mov_i32
+
+Moves a value from a register to another register.
+
+**IR Format**: `mov Rd, Rn`
+**Gadget type:** gadget per `Rd` + `Rn` combo (256)
+
+```asm
+mov Wd, Wn
+```
+
+### mov_i64
+
+Moves a value from a register to another register.
+
+**IR Format**: `mov Rd, Rn`
+**Gadget type:** gadget per `Rd` + `Rn` combo (256)
+
+```asm
+mov Xd, Xn
+```
+
+### tci_movi_i32
+
+Moves a 32b immediate into a register.
+
+**IR Format**: `mov Rd, #imm32`
+**Gadget type:** gadget per `Rd` (16)
+
+```asm
+ldr w27, [x28], #4
+mov Wd, w27
+```
+
+### tci_movi_i64
+
+Moves a 64b immediate into a register.
+
+**IR Format**: `mov Rd, #imm64`
+**Gadget type:** gadget per `Rd` (16)
+
+```asm
+ldr x27, [x28], #8
+mov Xd, x27
+```
+
+### ld8u_i32 / ld8u_i64
+
+Load byte from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrb Xd, [Xn, x27]
+```
+
+### ld8s_i32 / ld8s_i64
+
+Load byte from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsb Xd, [Xn, x27]
+```
+
+### ld16u_i32 / ld16u_i64
+
+Load 16b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrh Wd, [Xn, x27]
+```
+
+### ld16s_i32 / ld16s_i64
+
+Load 16b from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsh Xd, [Xn, x27]
+```
+
+### ld32u_i32 / ld32u_i64
+
+Load 32b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldr Wd, [Xn, x27]
+```
+
+### ld32s_i64
+
+Load 32b from host memory to register; sign extending.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldrsw Xd, [Xn, x27]
+```
+
+### ld_i64
+
+Load 64b from host memory to register.
+
+**IR Format**: `ldr Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+ldr Xd, [Xn, x27]
+```
+
+### st8_i32 / st8_i64
+
+Stores byte from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+strb Wd, [Xn, x27]
+```
+
+### st16_i32 / st16_i64
+
+Stores 16b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+strh Wd, [Xn, x27]
+```
+
+### st_i32 / st32_i64
+
+Stores 32b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+str Wd, [Xn, x27]
+```
+
+### st_i64
+
+Stores 64b from register to host memory.
+
+**IR Format**: `str Rd, Rn, <offset>`
+**Gadget type:** gadget per `Rd` & `Rn` (256)
+
+```asm
+ldrsw x27, [x28], #4
+str Xd, [Xn, x27]
+```
+
+### qemu_ld_i32
+
+Loads 32b from _guest_ memory to register.
+
+**IR Format**: `ld Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+### qemu_ld_i64
+
+Loads 64b from _guest_ memory to register.
+
+**IR Format**: `ld Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+### qemu_st_i32
+
+Stores 32b from a register to _guest_ memory.
+
+**IR Format**: `st Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl
+
+### qemu_st_i64
+
+Stores 64b from a register to _guest_ memory.
+
+**IR Format**: `st Rd, <addr>, <oi>`
+**Gadget type:** thunk per `Rd` into C impl?
+
+#### Note
+
+See note on `qemu_ld_i32`.
+
+### add_i32
+
+Adds two 32-bit numbers.
+
+**IR Format**: `add Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+add Wd, Wn, Wm
+```
+
+### add_i64
+
+Adds two 64-bit numbers.
+
+**IR Format**: `add Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+add Xd, Xn, Xm
+```
+
+### sub_i32
+
+Subtracts two 32-bit numbers.
+
+**IR Format**: `sub Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sub Wd, Wn, Wm
+```
+
+### sub_i64
+
+Subtracts two 64-bit numbers.
+
+**IR Format**: `sub Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sub Xd, Xn, Xm
+```
+
+### mul_i32
+
+Multiplies two 32-bit numbers.
+
+**IR Format**: `mul Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+mul Wd, Wn, Wm
+```
+
+### mul_i64
+
+Multiplies two 64-bit numbers.
+
+**IR Format**: `mul Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+mul Xd, Xn, Xm
+```
+
+### div_i32
+
+Divides two 32-bit numbers; considering them signed.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv Wd, Wn, Wm
+```
+
+### div_i64
+
+Divides two 64-bit numbers; considering them signed.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv Xd, Xn, Xm
+```
+
+### divu_i32
+
+Divides two 32-bit numbers; considering them unsigned.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv Wd, Wn, Wm
+```
+
+### divu_i64
+
+Divides two 64-bit numbers; considering them unsigned.
+
+**IR Format**: `div Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv Xd, Xn, Xm
+```
+
+### rem_i32
+
+Computes the division remainder (modulus) of two 32-bit numbers; considering them signed.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv w27, Wn, Wm
+msub Wd, w27, Wm, Wn
+```
+
+### rem_i64
+
+Computes the division remainder (modulus) of two 64-bit numbers; considering them signed.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+sdiv x27, Xn, Xm
+msub Xd, x27, Xm, Xn
+```
+
+### remu_i32
+
+Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv w27, Wn, Wm
+msub Wd, w27, Wm, Wn
+```
+
+### remu_i64
+
+Computes the division remainder (modulus) of two 64-bit numbers; considering them unsigned.
+
+**IR Format**: `rem Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+udiv x27, Xn, Xm
+msub Xd, x27, Xm, Xn
+```
+
+### not_i32
+
+Logically inverts a 32-bit number.
+
+**IR Format**: `not Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+mvn Wd, Wn
+```
+
+### not_i64
+
+Logically inverts a 64-bit number.
+
+**IR Format**: `not Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+mvn Xd, Xn
+```
+
+### neg_i32
+
+Arithmetically inverts (two's complement) a 32-bit number.
+
+**IR Format**: `neg Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+neg Wd, Wn
+```
+
+### neg_i64
+
+Arithmetically inverts (two's complement) a 64-bit number.
+
+**IR Format**: `neg Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+neg Xd, Xn
+```
+
+### and_i32
+
+Logically ANDs two 32-bit numbers.
+
+**IR Format**: `and Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+and Wd, Wn, Wm
+```
+
+### and_i64
+
+Logically ANDs two 64-bit numbers.
+
+**IR Format**: `and Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+and Xd, Xn, Xm
+```
+
+### or_i32
+
+Logically ORs two 32-bit numbers.
+
+**IR Format**: `or Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+orr Wd, Wn, Wm
+```
+
+### or_i64
+
+Logically ORs two 64-bit numbers.
+
+**IR Format**: `or Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+orr Xd, Xn, Xm
+```
+
+### xor_i32
+
+Logically XORs two 32-bit numbers.
+
+**IR Format**: `xor Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+eor Wd, Wn, Wm
+```
+
+### xor_i64
+
+Logically XORs two 64-bit numbers.
+
+**IR Format**: `xor Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+eor Xd, Xn, Xm
+```
+
+### shl_i32
+
+Logically shifts a 32-bit number left.
+
+**IR Format**: `shl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsl Wd, Wn, Wm
+```
+
+### shl_i64
+
+Logically shifts a 64-bit number left.
+
+**IR Format**: `shl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsl Xd, Xn, Xm
+```
+
+### shr_i32
+
+Logically shifts a 32-bit number right.
+
+**IR Format**: `shr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsr Wd, Wn, Wm
+```
+
+### shr_i64
+
+Logically shifts a 64-bit number right.
+
+**IR Format**: `shr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+lsr Xd, Xn, Xm
+```
+
+### sar_i32
+
+Arithmetically shifts a 32-bit number right.
+
+**IR Format**: `sar Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+asr Wd, Wn, Wm
+```
+
+### sar_i64
+
+Arithmetically shifts a 64-bit number right.
+
+**IR Format**: `sar Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+asr Xd, Xn, Xm
+```
+
+### rotl_i32
+
+Rotates a 32-bit number left.
+
+**IR Format**: `rotl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+rol Wd, Wn, Wm
+```
+
+### rotl_i64
+
+Rotates a 64-bit number left.
+
+**IR Format**: `rotl Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+rol Xd, Xn, Xm
+```
+
+### rotr_i32
+
+Rotates a 32-bit number right.
+
+**IR Format**: `rotr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+ror Wd, Wn, Wm
+```
+
+### rotr_i64
+
+Rotates a 64-bit number right.
+
+**IR Format**: `rotr Rd, Rn, Rm`
+**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096)
+
+```asm
+ror Xd, Xn, Xm
+```
+
+### deposit_i32
+
+Optional; not currently implementing.
+
+### deposit_i64
+
+Optional; not currently implementing.
+
+### ext8s_i32
+
+Sign extends the lower 8b of a register into a 32b destination.
+
+**IR Format**: `ext8s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtb Wd, Wn
+```
+
+### ext8s_i64
+
+Sign extends the lower 8b of a register into a 64b destination.
+
+**IR Format**: `ext8s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtb Xd, Wn
+```
+
+### ext8u_i32
+
+Zero extends the lower 8b of a register into a 32b destination.
+
+**IR Format**: `ext8u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xff
+```
+
+### ext8u_i64
+
+Zero extends the lower 8b of a register into a 64b destination.
+
+**IR Format**: `ext8u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xff
+```
+
+### ext16s_i32
+
+Sign extends the lower 16b of a register into a 32b destination.
+
+**IR Format**: `ext16s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxth Wd, Wn
+```
+
+### ext16s_i64
+
+Sign extends the lower 16b of a register into a 64b destination.
+
+**IR Format**: `ext16s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxth Xd, Wn
+```
+
+### ext16u_i32
+
+Zero extends the lower 16b of a register into a 32b destination.
+
+**IR Format**: `ext16u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Wd, Wn, #0xffff
+```
+
+### ext16u_i64
+
+Zero extends the lower 16b of a register into a 64b destination.
+
+**IR Format**: `ext16u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Wd, Wn, #0xffff
+```
+
+### ext32s_i64
+
+Sign extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtw Xd, Wn
+```
+
+### ext32u_i64
+
+Zero extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xffffffff
+```
+
+### ext_i32_i64
+
+Sign extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32s Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+sxtw Xd, Wn
+```
+
+### extu_i32_i64
+
+Zero extends the lower 32b of a register into a 64b destination.
+
+**IR Format**: `ext32u Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+and Xd, Xn, #0xffffffff
+```
+
+### bswap16_i32
+
+Byte-swaps a 16b quantity.
+
+**IR Format**: `bswap16 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev w27, Wn
+lsr Wd, w27, #16
+```
+
+### bswap16_i64
+
+Byte-swaps a 16b quantity.
+
+**IR Format**: `bswap16 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev w27, Wn
+lsr Wd, w27, #16
+```
+
+### bswap32_i32
+
+Byte-swaps a 32b quantity.
+
+**IR Format**: `bswap32 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Wd, Wn
+```
+
+### bswap32_i64
+
+Byte-swaps a 32b quantity.
+
+**IR Format**: `bswap32 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Wd, Wn
+```
+
+### bswap64_i64
+
+Byte-swaps a 64b quantity.
+
+**IR Format**: `bswap64 Rd, Rn`
+**Gadget type:** gadget per `Rd`, `Rn` (256)
+
+```asm
+rev Xd, Xn
+```
+
+### exit_tb
+
+Exits the translation block. Has no gadget; instead, the address of the translation block epilogue is inserted.
+
+### mb
+
+Memory barrier.
+
+**IR Format**: `mb <type>`
+**Gadget type:** gadget per type
+
+```asm
+# !!! TODO
+```
+
+#### Note
+
+We still need to work out how QEMU MB types map to AArch64 ones. This might take nuance.
diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h
new file mode 100644
index 000000000000..f51b7bcb13e7
--- /dev/null
+++ b/tcg/aarch64-tcti/tcg-target-con-set.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * TCI target-specific constraint sets.
+ * Copyright (c) 2021 Linaro
+ */
+
+/*
+ * C_On_Im(...) defines a constraint set with <n> outputs and <m> inputs.
+ * Each operand should be a sequence of constraint letters as defined by
+ * tcg-target-con-str.h; the constraint combination is inclusive or.
+ */
+C_O0_I2(r, r)
+C_O0_I3(r, r, r)
+C_O0_I4(r, r, r, r)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, r)
+C_O1_I2(r, r, r)
+C_O1_I4(r, r, r, r, r)
+C_O2_I1(r, r, r)
+C_O2_I2(r, r, r, r)
+C_O2_I4(r, r, r, r, r, r)
diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h
new file mode 100644
index 000000000000..87c0f19e9c2e
--- /dev/null
+++ b/tcg/aarch64-tcti/tcg-target-con-str.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define TCI target-specific operand constraints.
+ * Copyright (c) 2021 Linaro + */ + +/* + * Define constraint letters for register sets: + * REGS(letter, register_mask) + */ +REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc new file mode 100644 index 000000000000..d7bb67a92140 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -0,0 +1,1347 @@ +/* + * Tiny Code Threaded Intepreter for QEMU + * + * Copyright (c) 2021 Kate Temkin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 + +// Grab our gadget definitions. +// FIXME: use the system path instead of hardcoding this? +#include "tcti-gadgets.c.inc" + +/* Marker for missing code. */ +#define TODO() \ + do { \ + fprintf(stderr, "TODO %s:%u: %s()\n", \ + __FILE__, __LINE__, __func__); \ + tcg_abort(); \ + } while (0) + + +/* Enable TCTI assertions only when debugging TCG (and without NDEBUG defined). + * Without assertions, the interpreter runs much faster. */ +#if defined(CONFIG_DEBUG_TCG) +# define tcti_assert(cond) assert(cond) +#else +# define tcti_assert(cond) ((void)0) +#endif + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
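+ *
+ * 'variable' receives the gadget pointer selected by the MemOp packed into
+ * 'arg' (size, signedness and endianness), while 'suffix' names which generated
+ * pool of gadgets to draw from (the aligned/unaligned and TLB-offset variants
+ * produced by tcti-gadget-gen.py). The _HANDLER form below picks the aligned
+ * pool when the access's alignment requirement covers its size, and the
+ * unaligned pool otherwise.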
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. + */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) +{ + switch (op) { + case INDEX_op_ld8u_i32: + case INDEX_op_ld8s_i32: + case INDEX_op_ld16u_i32: + case INDEX_op_ld16s_i32: + case INDEX_op_ld_i32: + case INDEX_op_ld8u_i64: + case INDEX_op_ld8s_i64: + case INDEX_op_ld16u_i64: + case INDEX_op_ld16s_i64: + case INDEX_op_ld32u_i64: + case INDEX_op_ld32s_i64: + case INDEX_op_ld_i64: + case INDEX_op_not_i32: + case INDEX_op_not_i64: + case INDEX_op_neg_i32: + case INDEX_op_neg_i64: + case INDEX_op_ext8s_i32: + case INDEX_op_ext8s_i64: + case INDEX_op_ext16s_i32: + case INDEX_op_ext16s_i64: + case INDEX_op_ext8u_i32: + case INDEX_op_ext8u_i64: + case INDEX_op_ext16u_i32: + case INDEX_op_ext16u_i64: + case INDEX_op_ext32s_i64: + case INDEX_op_ext32u_i64: + case INDEX_op_ext_i32_i64: + case INDEX_op_extu_i32_i64: + case INDEX_op_bswap16_i32: + case INDEX_op_bswap16_i64: + case INDEX_op_bswap32_i32: + case INDEX_op_bswap32_i64: + case INDEX_op_bswap64_i64: + return C_O1_I1(r, r); + + case INDEX_op_st8_i32: + case INDEX_op_st16_i32: + case INDEX_op_st_i32: + case INDEX_op_st8_i64: + case INDEX_op_st16_i64: + case INDEX_op_st32_i64: + case INDEX_op_st_i64: + return C_O0_I2(r, r); + + case INDEX_op_div_i32: + case INDEX_op_div_i64: + case INDEX_op_divu_i32: + case INDEX_op_divu_i64: + case INDEX_op_rem_i32: + case INDEX_op_rem_i64: + case INDEX_op_remu_i32: + case INDEX_op_remu_i64: + case INDEX_op_add_i32: + case 
INDEX_op_add_i64: + case INDEX_op_sub_i32: + case INDEX_op_sub_i64: + case INDEX_op_mul_i32: + case INDEX_op_mul_i64: + case INDEX_op_and_i32: + case INDEX_op_and_i64: + case INDEX_op_andc_i32: + case INDEX_op_andc_i64: + case INDEX_op_eqv_i32: + case INDEX_op_eqv_i64: + case INDEX_op_nand_i32: + case INDEX_op_nand_i64: + case INDEX_op_nor_i32: + case INDEX_op_nor_i64: + case INDEX_op_or_i32: + case INDEX_op_or_i64: + case INDEX_op_orc_i32: + case INDEX_op_orc_i64: + case INDEX_op_xor_i32: + case INDEX_op_xor_i64: + case INDEX_op_shl_i32: + case INDEX_op_shl_i64: + case INDEX_op_shr_i32: + case INDEX_op_shr_i64: + case INDEX_op_sar_i32: + case INDEX_op_sar_i64: + case INDEX_op_rotl_i32: + case INDEX_op_rotl_i64: + case INDEX_op_rotr_i32: + case INDEX_op_rotr_i64: + case INDEX_op_setcond_i32: + case INDEX_op_setcond_i64: + return C_O1_I2(r, r, r); + + case INDEX_op_brcond_i32: + case INDEX_op_brcond_i64: + return C_O0_I2(r, r); + + case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_i64: + return C_O1_I2(r, r, r); + case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_i64: + return C_O0_I3(r, r, r); + + default: + g_assert_not_reached(); + } +} + +static const int tcg_target_reg_alloc_order[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + /* + TCG_REG_R14, // AREG0 + TCG_REG_R15, // SP + */ +}; + +#if MAX_OPC_PARAM_IARGS != 6 +# error Fix needed, number of supported input arguments changed! +#endif + +static const int tcg_target_call_iarg_regs[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, +}; + +static const int tcg_target_call_oarg_regs[] = { + TCG_REG_R0, +}; + +#ifdef CONFIG_DEBUG_TCG +static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { + "r00", + "r01", + "r02", + "r03", + "r04", + "r05", + "r06", + "r07", + "r08", + "r09", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15", +}; +#endif + +static bool patch_reloc(tcg_insn_unit *code_ptr, int type, + intptr_t value, intptr_t addend) +{ + /* tcg_out_reloc always uses the same type, addend. */ + tcg_debug_assert(type == sizeof(tcg_target_long)); + tcg_debug_assert(addend == 0); + tcg_debug_assert(value != 0); + if (TCG_TARGET_REG_BITS == 32) { + tcg_patch32(code_ptr, value); + } else { + tcg_patch64(code_ptr, value); + } + return true; +} + +#if defined(CONFIG_DEBUG_TCG_INTERPRETER) +/* Show current bytecode. Used by tcg interpreter. */ +void tci_disas(uint8_t opc) +{ + const TCGOpDef *def = &tcg_op_defs[opc]; + fprintf(stderr, "TCG %s %u, %u, %u\n", + def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs); +} +#endif + +/* Write value (native size). */ +static void tcg_out_immediate(TCGContext *s, tcg_target_ulong v) +{ + if (TCG_TARGET_REG_BITS == 32) { + //tcg_out32(s, v); + tcg_out64(s, v); + } else { + tcg_out64(s, v); + } +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, uintptr_t jmp_rw, uintptr_t addr) +{ + /* Get a pointer to our immediate, which exists after a single pointer. */ + uintptr_t immediate_addr = jmp_rw; + + /* Patch it to be match our target address. */ + qatomic_set((uint64_t *)immediate_addr, addr); +} + + +/** + * TCTI Thunk Helpers + */ + +#ifdef CONFIG_SOFTMMU + +// TODO: relocate these prototypes? 
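+// Signed-load wrappers around the stock softmmu helpers, which return
+// zero-extended values: each shim calls the unsigned helper and sign-extends the
+// result to tcg_target_ulong, presumably so the sign-extending load gadgets
+// (the MO_SB/MO_SW/MO_SL cases) have a single C entry point whose return value
+// can be used directly.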
+tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); + +tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int8_t)helper_ret_ldub_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_le_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_le_ldul_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_be_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr); +} + +#else +#error TCTI currently only supports use of the soft MMU. +#endif + + +/** + * TCTI Emmiter Helpers + */ + + +/* Write gadget pointer. */ +static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +{ + tcg_out_immediate(s, (tcg_target_ulong)gadget); +} + +/* Write gadget pointer, plus 64b immediate. */ +static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +{ + tcg_out_nullary_gadget(s, gadget); + tcg_out64(s, immediate); +} + + +/* Write gadget pointer (one register). */ +static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0]); +} + + +/* Write gadget pointer (two registers). */ +static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); +} + + +/* Write gadget pointer (three registers). */ +static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); +} + + +/** + * Version of our LDST generator that defers to more optimized gadgets selectively. + */ +static void tcg_out_ldst_gadget_inner(TCGContext *s, + void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + unsigned reg0, unsigned reg1, uint32_t offset) +{ + int64_t extended_offset = (int32_t)offset; + bool is_negative = (extended_offset < 0); + + // Optimal case: we have a gadget that handles our specific offset, so we don't need to encode + // an immediate. This saves us a bunch of speed. 
:) + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + uint64_t shifted_offset = (extended_offset >> 3); + bool aligned_to_8B = ((extended_offset & 0b111) == 0); + + bool have_optimized_gadget = (extended_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + bool have_shifted_gadget = (shifted_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + + // More optimal case: we have a gadget that directly encodes the argument. + if (have_optimized_gadget) { + tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + return; + } + + // Special case: it's frequent to have low-numbered positive offsets that are aligned + // to 16B boundaries + else if(aligned_to_8B && have_shifted_gadget) { + tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + return; + } + } + else { + uint64_t negated_offset = -(extended_offset); + + // More optimal case: we have a gadget that directly encodes the argument. + if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { + tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + return; + } + } + + // Less optimal case: we don't have a gadget specifically for this. Emit the general case immediate. + tcg_out_binary_gadget(s, gadget_base, reg0, reg1); + tcg_out64(s, extended_offset); //tcg_out32(s, offset); +} + +/* Shorthand for the above, that prevents us from having to specify the name three times. */ +#define tcg_out_ldst_gadget(s, name, a, b, c) \ + tcg_out_ldst_gadget_inner(s, name, \ + name ## _imm, \ + name ## _sh8_imm, \ + name ## _neg_imm, \ + a, b, c) + + + +/* Write label. */ +static void tcti_out_label(TCGContext *s, TCGLabel *label) +{ + if (label->has_value) { + tcg_out64(s, label->u.value); + tcg_debug_assert(label->u.value); + } else { + tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), label, 0); + s->code_ptr += sizeof(tcg_target_ulong); + } +} + +/** + * Generate a register-to-register MOV. + */ +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) +{ + tcg_debug_assert(ret != arg); + + if (type == TCG_TYPE_I32) { + tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); + } else { + tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + } + + + return true; +} + + +static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + bool is_negative = (arg < 0); + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + return; + } + } + else { + + } + + // Emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i32, t0); + tcg_out64(s, arg); // TODO: make 32b? +} + + +static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + uint8_t is_negative = arg < 0; + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + return; + } + } + else { + + } + + // TODO: optimize the negative case, too? + + // Less optimal case: emit the mov and its immediate. 
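+    // Fallback encoding in the bytecode stream: the per-register movi gadget's
+    // address followed by the raw 64-bit value. The gadget pulls that immediate
+    // in through a temporary and advances the stream pointer (x28) past it, as
+    // sketched in the README's tci_movi_i64 entry.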
+ tcg_out_unary_gadget(s, gadget_movi_i64, t0); + tcg_out64(s, arg); +} + + +/** + * Generate an immediate-to-register MOV. + */ +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long arg) +{ + if (type == TCG_TYPE_I32) { + tcg_out_movi_i32(s, t0, arg); + } else { + tcg_out_movi_i64(s, t0, arg); + } +} + +/** + * Generate a CALL. + */ +static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) +{ + tcg_out_nullary_gadget(s, gadget_call); + tcg_out64(s, (uintptr_t)arg); +} + +/** + * Generates LD instructions. + */ +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, + intptr_t arg2) +{ + + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + } +} + + +/** + * Generate every other operation. + */ +//static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +{ + switch (opc) { + + // Exit translation, and return back to QEMU. + case INDEX_op_exit_tb: + // Emit a simple gadget with a known return code. + tcg_out_imm64_gadget(s, gadget_exit_tb, args[0]); + break; + + // Jump to a translation block. + case INDEX_op_goto_tb: + + // If we're using a direct jump, we'll emit a "relocation" that can be usd + // to patch our gadget stream with the target address, later. + if (s->tb_jmp_insn_offset) { + // Emit our gadget. + tcg_out_nullary_gadget(s, gadget_br); + + // Place our current instruction into our "relocation table", so it can + // be patched once we know where the branch will target... + s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); + + // ... and emit our relocation. + tcg_out64(s, args[0]); + + + } else { + /* Indirect jump method. */ + TODO(); + } + set_jmp_reset_offset(s, args[0]); + break; + + // Simple branch. + case INDEX_op_br: + tcg_out_nullary_gadget(s, gadget_br); + tcti_out_label(s, arg_label(args[0])); + break; + + + // Set condition flag. + // a0 = Rd, a1 = Rn, a2 = Rm + case INDEX_op_setcond_i32: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i32_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i32_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i32_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i32_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i32_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i32_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i32_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i32_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i32_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i32_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + case INDEX_op_setcond_i64: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. 
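+        // As in the _i32 case above, the condition is baked into the gadget
+        // rather than read at runtime; the unsigned comparisons map onto the
+        // AArch64 LO/HS/LS/HI condition codes (see the README's setcond table).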
+ switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + /** + * Load instructions. + */ + + case INDEX_op_ld8u_i32: + case INDEX_op_ld8u_i64: + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i32: + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i64: + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16u_i32: + case INDEX_op_ld16u_i64: + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i32: + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i64: + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i32: + case INDEX_op_ld32u_i64: + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i64: + tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld32s_i64: + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + break; + + + /** + * Store instructions. + */ + case INDEX_op_st8_i32: + case INDEX_op_st8_i64: + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); + break; + + case INDEX_op_st16_i32: + case INDEX_op_st16_i64: + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i32: + case INDEX_op_st32_i64: + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i64: + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + break; + + /** + * Arithmetic instructions. + */ + + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i32: + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i32: + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i32: + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_or_i32: + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i32: + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i32: + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i32: + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i32: + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + + case INDEX_op_add_i64: + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i64: + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i64: + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i64: + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ + //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + + case INDEX_op_or_i64: + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i64: + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i64: + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i64: + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i64: + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + // break; + + case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i64: + { + static uint8_t last_brcond_i64 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[2]) { + case TCG_COND_EQ: gadget = gadget_brcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_brcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_brcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_brcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_brcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_brcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_brcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_brcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_brcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_brcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + // We'll select the which branch to used based on a cycling counter. + // This means we'll pick one of 16 identical brconds. Spreading this out + // helps the processor's branch prediction be less "squished", as not every + // branch is going throuh the same instruction. + tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + + // Branch target immediate. + tcti_out_label(s, arg_label(args[3])); + break; + } + + + case INDEX_op_bswap16_i32: /* Optional (TCG_TARGET_HAS_bswap16_i32). */ + case INDEX_op_bswap16_i64: /* Optional (TCG_TARGET_HAS_bswap16_i64). */ + tcg_out_binary_gadget(s, gadget_bswap16, args[0], args[1]); + break; + + case INDEX_op_bswap32_i32: /* Optional (TCG_TARGET_HAS_bswap32_i32). */ + case INDEX_op_bswap32_i64: /* Optional (TCG_TARGET_HAS_bswap32_i64). */ + tcg_out_binary_gadget(s, gadget_bswap32, args[0], args[1]); + break; + + case INDEX_op_bswap64_i64: /* Optional (TCG_TARGET_HAS_bswap64_i64). */ + tcg_out_binary_gadget(s, gadget_bswap64, args[0], args[1]); + break; + + case INDEX_op_not_i64: /* Optional (TCG_TARGET_HAS_not_i64). */ + tcg_out_binary_gadget(s, gadget_not_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i64: /* Optional (TCG_TARGET_HAS_neg_i64). */ + tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]); + break; + + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ + tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); + break; + + case INDEX_op_ext8u_i32: /* Optional (TCG_TARGET_HAS_ext8u_i32). */ + case INDEX_op_ext8u_i64: /* Optional (TCG_TARGET_HAS_ext8u_i64). */ + tcg_out_binary_gadget(s, gadget_ext8u, args[0], args[1]); + break; + + case INDEX_op_ext16s_i64: /* Optional (TCG_TARGET_HAS_ext16s_i64). */ + tcg_out_binary_gadget(s, gadget_ext16s_i64, args[0], args[1]); + break; + + case INDEX_op_ext16u_i32: /* Optional (TCG_TARGET_HAS_ext16u_i32). */ + case INDEX_op_ext16u_i64: /* Optional (TCG_TARGET_HAS_ext16u_i64). */ + tcg_out_binary_gadget(s, gadget_ext16u, args[0], args[1]); + break; + + case INDEX_op_ext32s_i64: /* Optional (TCG_TARGET_HAS_ext32s_i64). */ + case INDEX_op_ext_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32s_i64, args[0], args[1]); + break; + + case INDEX_op_ext32u_i64: /* Optional (TCG_TARGET_HAS_ext32u_i64). */ + case INDEX_op_extu_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ + tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); + break; + + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). 
*/ + tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); + break; + + case INDEX_op_ext8s_i32: /* Optional (TCG_TARGET_HAS_ext8s_i32). */ + tcg_out_binary_gadget(s, gadget_ext8s_i32, args[0], args[1]); + break; + + case INDEX_op_ext16s_i32: /* Optional (TCG_TARGET_HAS_ext16s_i32). */ + tcg_out_binary_gadget(s, gadget_ext16s_i32, args[0], args[1]); + break; + + case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i32: + { + static uint8_t last_brcond_i32 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[2]) { + case TCG_COND_EQ: gadget = gadget_brcond_i32_eq; break; + case TCG_COND_NE: gadget = gadget_brcond_i32_ne; break; + case TCG_COND_LT: gadget = gadget_brcond_i32_lt; break; + case TCG_COND_GE: gadget = gadget_brcond_i32_ge; break; + case TCG_COND_LE: gadget = gadget_brcond_i32_le; break; + case TCG_COND_GT: gadget = gadget_brcond_i32_gt; break; + case TCG_COND_LTU: gadget = gadget_brcond_i32_lo; break; + case TCG_COND_GEU: gadget = gadget_brcond_i32_hs; break; + case TCG_COND_LEU: gadget = gadget_brcond_i32_ls; break; + case TCG_COND_GTU: gadget = gadget_brcond_i32_hi; break; + default: + g_assert_not_reached(); + } + + // We'll select the which branch to used based on a cycling counter. + // This means we'll pick one of 16 identical brconds. Spreading this out + // helps the processor's branch prediction be less "squished", as not every + // branch is going throuh the same instruction. + tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]); + last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS; + + // Branch target immediate. + tcti_out_label(s, arg_label(args[3])); + + break; + } + + case INDEX_op_qemu_ld_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + break; + } + + case INDEX_op_qemu_ld_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? 
+ gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. + else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + } + + break; + } + + case INDEX_op_qemu_st_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + break; + } + + case INDEX_op_qemu_st_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off64_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off96_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off128_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. 
+ else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + } + + break; + } + + // Memory barriers. + case INDEX_op_mb: + { + static void* sync[] = { + [0 ... TCG_MO_ALL] = gadget_mb_all, + [TCG_MO_ST_ST] = gadget_mb_st, + [TCG_MO_LD_LD] = gadget_mb_ld, + [TCG_MO_LD_ST] = gadget_mb_ld, + [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, + }; + tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + + break; + } + + case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ + case INDEX_op_mov_i64: + case INDEX_op_call: /* Always emitted via tcg_out_call. */ + default: + tcg_abort(); + } +} + +/** + * Generate immediate stores. + */ +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, + intptr_t arg2) +{ + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + } +} + +static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, + TCGReg base, intptr_t ofs) +{ + return false; +} + +/* Test if a constant matches the constraint. */ +static int tcg_target_const_match(tcg_target_long val, TCGType type, + const TCGArgConstraint *arg_ct) +{ + /* No need to return 0 or 1, 0 or != 0 is good enough. */ + return arg_ct->ct & TCG_CT_CONST; +} + +static void tcg_target_init(TCGContext *s) +{ + /* The current code uses uint8_t for tcg operations. */ + tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); + + /* Registers available for 32 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; + /* Registers available for 64 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; + + /* TODO: Which registers should be set here? */ + tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + + s->reserved_regs = 0; + tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); + + /* We use negative offsets from "sp" so that we can distinguish + stores that might pretend to be call arguments. */ + tcg_set_frame(s, TCG_REG_CALL_STACK, -CPU_TEMP_BUF_NLONGS * sizeof(long), CPU_TEMP_BUF_NLONGS * sizeof(long)); +} + +/* Generate global QEMU prologue and epilogue code. */ +static inline void tcg_target_qemu_prologue(TCGContext *s) +{ + // No prologue; as we're interpreted. +} + + +/** + * TCTI 'interpreter' bootstrap. + */ + +// Store the current return address during helper calls. +__thread uintptr_t tcti_call_return_address; + +/* Dispatch the bytecode stream contained in our translation buffer. */ +uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ptr) +{ + // Create our per-CPU temporary storage. + long tcg_temps[CPU_TEMP_BUF_NLONGS]; + + uint64_t return_value = 0; + uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS); + uintptr_t pc_mirror = (uintptr_t)&tcti_call_return_address; + + // Ensure our target configuration hasn't changed. 
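The inline assembly that follows is the entire "interpreter" entry point: it never decodes bytecode, it just calls the first gadget, and every gadget ends by loading and branching to the next gadget pointer. As a rough illustration only (not part of the patch; the real gadgets are naked functions that chain with "ldr x27, [x28], #8; br x27" and never return to a C loop, and all names below are hypothetical), the dispatch model amounts to:

    /* Illustrative model of threaded dispatch -- a sketch, not patch code. */
    #include <stdint.h>
    #include <stdio.h>

    typedef void (*gadget_fn)(void);

    static uint64_t reg0;                        /* stands in for one TCTI register */

    static void gadget_movi_42(void) { reg0 = 42; }
    static void gadget_add_1(void)   { reg0 += 1; }

    int main(void)
    {
        /* A "translation block" is just a flat array of gadget addresses. */
        const gadget_fn block[] = { gadget_movi_42, gadget_add_1, NULL };
        const gadget_fn *ip = block;             /* plays the role of x28 */

        for (;;) {
            gadget_fn next = *ip++;              /* ldr x27, [x28], #8 */
            if (next == NULL) {                  /* roughly: exit_tb */
                break;
            }
            next();                              /* br x27 */
        }
        printf("reg0 = %llu\n", (unsigned long long)reg0);   /* prints reg0 = 43 */
        return 0;
    }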
+ tcti_assert(TCG_AREG0 == TCG_REG_R14); + tcti_assert(TCG_REG_CALL_STACK == TCG_REG_R15); + + asm( + // Our threaded-dispatch prologue needs to set up things for our machine to run. + // This means: + // - Set up TCG_AREG0 (R14) to point to our architectural state. + // - Set up TCG_REG_CALL_STACK (R15) to point to our temporary buffer. + // - Point x28 (our bytecode "instruction pointer") to the relevant stream address. + "ldr x14, %[areg0]\n" + "ldr x15, %[sp_value]\n" + "ldr x25, %[pc_mirror]\n" + "ldr x28, %[start_tb_ptr]\n" + + // To start our code, we'll -call- the gadget at the first bytecode pointer. + // Note that we call/branch-with-link, here; so our TB_EXIT gadget can RET in order + // to return to this point when things are complete. + "ldr x27, [x28], #8\n" + "blr x27\n" + + // Finally, we'll copy out our final return value. + "str x0, %[return_value]\n" + + : [return_value] "=m" (return_value) + + : [areg0] "m" (env), + [sp_value] "m" (sp_value), + [start_tb_ptr] "m" (v_tb_ptr), + [pc_mirror] "m" (pc_mirror) + + // We touch _every_ one of the lower registers, as we use these to execute directly. + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + + // We also use x25 for our return-address mirror, x26/x27 for temporary values, and x28 as our bytecode pointer. + "x25", "x26", "x27", "x28", "cc", "memory" + ); + + return return_value; +} + + +/** + * Disassembly output support. + */ +#include <dlfcn.h> + + +/* Disassemble TCTI bytecode. */ +int print_insn_tcti(bfd_vma addr, disassemble_info *info) +{ + Dl_info symbol_info = {}; + char symbol_name[48]; + + int status; + uint64_t block; + + // Read the relevant pointer. + status = info->read_memory_func(addr, (void *)&block, sizeof(block), info); + if (status != 0) { + info->memory_error_func(status, addr, info); + return -1; + } + + // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. + dladdr((void *)block, &symbol_info); + + if(symbol_info.dli_sname != 0) { + strlcpy(symbol_name, symbol_info.dli_sname, 47); + info->fprintf_func(info->stream, "%s", symbol_name); + } else { + info->fprintf_func(info->stream, "%016llx", block); + } + + return sizeof(block); +} + + diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h new file mode 100644 index 000000000000..fa2ae5c40a3e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.h @@ -0,0 +1,220 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2009, 2011 Stefan Weil + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * This code implements a TCG which does not generate machine code for some + * real target machine but which generates virtual machine code for an + * interpreter. Interpreted pseudo code is slow, but it works on any host. + * + * Some remarks might help in understanding the code: + * + * "target" or "TCG target" is the machine which runs the generated code. + * This is different to the usual meaning in QEMU where "target" is the + * emulated machine. So normally QEMU host is identical to TCG target. + * Here the TCG target is a virtual machine, but this virtual machine must + * use the same word size like the real machine. + * Therefore, we need both 32 and 64 bit virtual machines (interpreter). + */ + +#ifndef TCG_TARGET_H +#define TCG_TARGET_H + +#if UINTPTR_MAX == UINT32_MAX +# error We only support AArch64 running in 64B mode. +#elif UINTPTR_MAX == UINT64_MAX +# define TCG_TARGET_REG_BITS 64 +#else +# error Unknown pointer size for tcti target +#endif + +#define TCG_TARGET_INSN_UNIT_SIZE 1 +#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32 + +// We're an interpreted target; even if we're JIT-compiling to our interpreter's +// weird psuedo-native bytecode. We'll indicate that we're intepreted. +#define TCG_TARGET_INTERPRETER 1 + +// +// Supported optional instructions. +// + +// Divs. +#define TCG_TARGET_HAS_div_i32 1 +#define TCG_TARGET_HAS_rem_i32 1 +#define TCG_TARGET_HAS_div_i64 1 +#define TCG_TARGET_HAS_rem_i64 1 + +// Extends. +#define TCG_TARGET_HAS_ext8s_i32 1 +#define TCG_TARGET_HAS_ext16s_i32 1 +#define TCG_TARGET_HAS_ext8u_i32 1 +#define TCG_TARGET_HAS_ext16u_i32 1 +#define TCG_TARGET_HAS_ext8s_i64 1 +#define TCG_TARGET_HAS_ext16s_i64 1 +#define TCG_TARGET_HAS_ext32s_i64 1 +#define TCG_TARGET_HAS_ext8u_i64 1 +#define TCG_TARGET_HAS_ext16u_i64 1 +#define TCG_TARGET_HAS_ext32u_i64 1 + +// Logicals. +#define TCG_TARGET_HAS_neg_i32 1 +#define TCG_TARGET_HAS_not_i32 1 +#define TCG_TARGET_HAS_neg_i64 1 +#define TCG_TARGET_HAS_not_i64 1 + +#define TCG_TARGET_HAS_andc_i32 1 +#define TCG_TARGET_HAS_orc_i32 1 +#define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_andc_i64 1 +#define TCG_TARGET_HAS_eqv_i64 1 +#define TCG_TARGET_HAS_orc_i64 1 + +// We don't curretly support rotates, since AArch64 lacks ROL. +// We'll fix this later. +#define TCG_TARGET_HAS_rot_i32 0 +#define TCG_TARGET_HAS_rot_i64 0 + +// Swaps. +#define TCG_TARGET_HAS_bswap16_i32 1 +#define TCG_TARGET_HAS_bswap32_i32 1 +#define TCG_TARGET_HAS_bswap16_i64 1 +#define TCG_TARGET_HAS_bswap32_i64 1 +#define TCG_TARGET_HAS_bswap64_i64 1 +#define TCG_TARGET_HAS_MEMORY_BSWAP 1 + +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + +// +// Potential TODOs. +// + +// TODO: implement DEPOSIT as BFI. +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 + +// TODO: implement EXTRACT as BFX. +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 + +// TODO: it might be worth writing a gadget for this +#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 + +// +// Unsupported instructions. +// + +// ARMv8 doesn't have instructions for NAND/NOR. 
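Leaving optional ops like these at 0 is functionally safe: when a backend doesn't advertise an op, TCG's middle-end expands it from ops the backend does provide (NAND/NOR become AND/OR plus NOT, and rotates become shift/OR pairs). For the rotates disabled above, the middle-end expansion and the possible future ROR-based gadget correspond to the identities below; this is a sketch for illustration, not code from the patch:

    #include <assert.h>
    #include <stdint.h>

    /* Expansion TCG falls back to when TCG_TARGET_HAS_rot_i32 is 0: shifts plus an OR. */
    static uint32_t rotl32_via_shifts(uint32_t x, unsigned n)
    {
        n &= 31;
        return n ? (x << n) | (x >> (32 - n)) : x;
    }

    /* What a future ROR-based gadget could do instead: rotate right by the negated
       amount, as in the commented-out "neg w27, Wm; ror Wd, Wn, w27" sequence later
       in the gadget generator. */
    static uint32_t rotl32_via_rotr(uint32_t x, unsigned n)
    {
        unsigned r = (32 - (n & 31)) & 31;
        return r ? (x >> r) | (x << (32 - r)) : x;
    }

    int main(void)
    {
        assert(rotl32_via_shifts(0x80000001u, 4) == 0x18u);
        assert(rotl32_via_rotr(0x80000001u, 4) == 0x18u);
        return 0;
    }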
+#define TCG_TARGET_HAS_nand_i32 0 +#define TCG_TARGET_HAS_nor_i32 0 +#define TCG_TARGET_HAS_nor_i64 0 +#define TCG_TARGET_HAS_nand_i64 0 + +// AArch64's CLZ has no conditional form, so a single gadget can't provide the +// zero-input fallback value TCG's clz/ctz ops expect; leave these to the middle-end for now. +#define TCG_TARGET_HAS_clz_i32 0 +#define TCG_TARGET_HAS_ctz_i32 0 +#define TCG_TARGET_HAS_ctpop_i32 0 +#define TCG_TARGET_HAS_clz_i64 0 +#define TCG_TARGET_HAS_ctz_i64 0 +#define TCG_TARGET_HAS_ctpop_i64 0 + + +// GOTO_PTR is too complex to emit a simple gadget for. +// We'll let C handle it, since the overhead is similar. +#define TCG_TARGET_HAS_goto_ptr 0 + +// We don't have a simple gadget for this, since we're always assuming softmmu. +#define TCG_TARGET_HAS_qemu_st8_i32 0 + +// No AArch64 equivalent. +#define TCG_TARGET_HAS_extrl_i64_i32 0 +#define TCG_TARGET_HAS_extrh_i64_i32 0 + +#define TCG_TARGET_HAS_extract2_i64 0 + +// These should always be zero on our 64-bit platform. +#define TCG_TARGET_HAS_muls2_i64 0 +#define TCG_TARGET_HAS_add2_i32 0 +#define TCG_TARGET_HAS_sub2_i32 0 +#define TCG_TARGET_HAS_mulu2_i32 0 +#define TCG_TARGET_HAS_add2_i64 0 +#define TCG_TARGET_HAS_sub2_i64 0 +#define TCG_TARGET_HAS_mulu2_i64 0 +#define TCG_TARGET_HAS_muluh_i64 0 +#define TCG_TARGET_HAS_mulsh_i64 0 +#define TCG_TARGET_HAS_extract2_i32 0 +#define TCG_TARGET_HAS_muls2_i32 0 +#define TCG_TARGET_HAS_muluh_i32 0 +#define TCG_TARGET_HAS_mulsh_i32 0 + +// +// Platform metadata. +// + +// Number of registers available. +// It might make sense to raise this, since we could also use x16 -> x25. +#define TCG_TARGET_NB_REGS 16 + +/* List of registers which are used by TCG. */ +typedef enum { + TCG_REG_R0 = 0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + TCG_REG_R14, + TCG_REG_R15, + + TCG_AREG0 = TCG_REG_R14, + TCG_REG_CALL_STACK = TCG_REG_R15, +} TCGReg; + +// Specify the shape of the stack our runtime will use. +#define TCG_TARGET_CALL_STACK_OFFSET 0 +#define TCG_TARGET_STACK_ALIGN 16 + +// We're interpreted, so we'll use our own code to run TB_EXEC. +#define HAVE_TCG_QEMU_TB_EXEC + +// We'll need to enforce memory ordering with barriers. +#define TCG_TARGET_DEFAULT_MO (0) + +void tci_disas(uint8_t opc); + +void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + +#endif /* TCG_TARGET_H */ diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py new file mode 100755 index 000000000000..1296f6d0c2d7 --- /dev/null +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -0,0 +1,788 @@ +#!/usr/bin/env python3 +""" Gadget-code generator for QEMU TCTI on AArch64. + +Generates a C-code include file containing 'gadgets' for use by TCTI. +""" + +import sys +import itertools + +# Get a handle on the file we'll be working with, and redirect print to it. +if len(sys.argv) > 1: + out_file = open(sys.argv[1], "w") + + # Hook our print function, so it always outputs to the relevant file. + core_print = print + print = lambda *a, **k : core_print(*a, **k, file=out_file) + +# Epilogue code follows at the end of each gadget, and handles continuing execution. +EPILOGUE = ( + # Load our next gadget address from our bytecode stream, advancing it. + "ldr x27, [x28], #8", + + # Jump to the next gadget. + "br x27" +) + +# The number of general-purpose registers we're affording the TCG. This must match +# the configuration in the TCTI target.
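For orientation, the C that the helpers defined below (simple(), with_dnm(), and friends) emit for a single specialization looks roughly like the following; this particular instance is reconstructed as an illustration of the output shape, not copied from real generator output, and it only assembles on AArch64:

    /* One specialization of a three-register gadget, registers baked in. */
    __attribute__((naked)) void gadget_add_i64_arg0_arg1_arg2(void);
    __attribute__((naked)) void gadget_add_i64_arg0_arg1_arg2(void)
    {
        asm(
            "add x0, x1, x2 \n"      /* the operation itself */
            "ldr x27, [x28], #8 \n"  /* EPILOGUE: fetch the next gadget pointer */
            "br x27 \n"              /* EPILOGUE: tail-jump straight to it */
        );
    }

    /*
     * The generator also emits a lookup table per operation, which the backend
     * indexes by the three TCG register numbers when emitting bytecode:
     *
     *     void *gadget_add_i64[16][16][16] = { ... };
     */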
+TCG_REGISTER_COUNT = 16 +TCG_REGISTER_NUMBERS = list(range(TCG_REGISTER_COUNT)) + +# Helper that provides each of the AArch64 condition codes of interest. +ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] + +# We'll create a variety of gadgets that assume the MMU's TLB is stored at certain +# offsets into its structure. These should match the offsets in tcg-target.c.in. +QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] + +# Statistics. +gadgets = 0 +instructions = 0 + +def simple(name, *lines): + """ Generates a simple gadget that needs no per-register specialization. """ + + global gadgets, instructions + + gadgets += 1 + + # Create our C/ASM framing. + #print(f"__attribute__((naked)) static void gadget_{name}(void)") + print(f"__attribute__((naked)) void gadget_{name}(void);") + print(f"__attribute__((naked)) void gadget_{name}(void)") + print("{") + + # Add the core gadget + print("\tasm(") + for line in lines + EPILOGUE: + print(f"\t\t\"{line} \\n\"") + instructions += 1 + print("\t);") + + # End our framing. + print("}\n") + + +def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): + """ Generates a collection of gadgtes with register substitutions. """ + + def substitutions_for_letter(letter, number, line): + """ Helper that transforms Wd => w1, implementing gadget substitutions. """ + + # Register substitutions... + line = line.replace(f"X{letter}", f"x{number}") + line = line.replace(f"W{letter}", f"w{number}") + + # ... immediate substitutions. + line = line.replace(f"I{letter}", f"{number}") + return line + + + # Build a list of all the various stages we'll iterate over... + immediate_parameters = list(immediate_range) + parameters = ([TCG_REGISTER_NUMBERS] * len(substitutions)) + + # ... adding immediates, if need be. + if immediate_parameters: + parameters.append(immediate_parameters) + substitutions = substitutions + ['i'] + + # Generate a list of register-combinations we'll support. + permutations = itertools.product(*parameters) + + # For each permutation... + for permutation in permutations: + new_lines = lines + + # Replace each placeholder element with its proper value... + for index, element in enumerate(permutation): + letter = substitutions[index] + number = element + + # Create new gadgets for the releavnt line... + new_lines = [substitutions_for_letter(letter, number, line) for line in new_lines] + + # ... and emit the gadget. + permutation_id = "_arg".join(str(number) for number in permutation) + simple(f"{name}_arg{permutation_id}", *new_lines) + + +def with_dnm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ + with_register_substitutions(name, ("d", "n", "m"), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for m in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_dn_immediate(name, *lines, immediate_range): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. 
""" + with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for i in immediate_range: + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_pair(name, substitutions, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, substitutions, *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # N array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for b in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + +def math_dnm(name, mnemonic): + """ Equivalent to `with_dnm`, but creates a _i32 and _i64 variant. For simple math. """ + with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") + with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") + +def math_dn(name, mnemonic): + """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. """ + with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + + +def with_nm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xn, and Xm, and equivalents. """ + with_pair(name, ('n', 'm',), *lines) + + +def with_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. """ + with_pair(name, ('d', 'n',), *lines) + + +def ldst_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. + + This variant is optimized for loads and stores, and optimizes common offset cases. + """ + + # + # Simple case: create our gadgets. + # + with_dn(name, "ldr x27, [x28], #8", *lines) + + # + # Optimization case: create variants of our gadgets with our offsets replaced with common immediates. + # + immediate_lines_pos = [line.replace("x27", "#Ii") for line in lines] + with_dn_immediate(f"{name}_imm", *immediate_lines_pos, immediate_range=range(64)) + + immediate_lines_aligned = [line.replace("x27", "#(Ii << 3)") for line in lines] + with_dn_immediate(f"{name}_sh8_imm", *immediate_lines_aligned, immediate_range=range(64)) + + immediate_lines_neg = [line.replace("x27", "#-Ii") for line in lines] + with_dn_immediate(f"{name}_neg_imm", *immediate_lines_neg, immediate_range=range(64)) + + +def with_single(name, substitution, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, (substitution,), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + for n in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{n}", end=", ") + + print("};") + + +def with_d_immediate(name, *lines, immediate_range=range(0)): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. 
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # I array + for b in immediate_range: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + + +def with_d(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd. """ + with_single(name, 'd', *lines) + + +# Assembly code for saving our machine state before entering the C runtime. +C_CALL_PROLOGUE = [ + # Store our machine state. + "str x25, [sp, #-16]!", + "stp x14, x15, [sp, #-16]!", + "stp x12, x13, [sp, #-16]!", + "stp x10, x11, [sp, #-16]!", + "stp x8, x9, [sp, #-16]!", + "stp x6, x7, [sp, #-16]!", + "stp x4, x5, [sp, #-16]!", + "stp x2, x3, [sp, #-16]!", + "stp x0, x1, [sp, #-16]!", + "stp x28, lr, [sp, #-16]!", +] + +# Assembly code for restoring our machine state after leaving the C runtime. +C_CALL_EPILOGUE = [ + "ldp x28, lr, [sp], #16", + "ldp x0, x1, [sp], #16", + "ldp x2, x3, [sp], #16", + "ldp x4, x5, [sp], #16", + "ldp x6, x7, [sp], #16", + "ldp x8, x9, [sp], #16", + "ldp x10, x11, [sp], #16", + "ldp x12, x13, [sp], #16", + "ldp x14, x15, [sp], #16", + "ldr x25, [sp], #16", +] + + +def create_tlb_fastpath(is_aligned, is_write, offset, miss_label="0"): + """ Creates a set of instructions that perform a soft-MMU TLB lookup. + + This is used for `qemu_ld`/qemu_st` instructions; to emit a prologue that + hopefully helps us skip a slow call into the C runtime when a Guest Virtual + -> Host Virtual mapping is in the softmmu's TLB. + + This "fast-path" prelude behaves as follows: + - If a TLB entry is found for the address stored in Xn, then x27 + is stored to an "addend" that can be added to the guest virtual addres + to get the host virtual address (the address in our local memory space). + - If a TLB entry isn't found, it branches to the "miss_label" (by default, 0:), + so address lookup can be handled by the fastpath. + + Clobbers x24, and x26; provides output in x27. + """ + + fast_path = [ + # Load env_tlb(env)->f[mmu_idx].{mask,table} into {x26,x27}. + f"ldp x26, x27, [x14, #-{offset}]", + + # Extract the TLB index from the address into X26. + "and x26, x26, Xn, lsr #7", # Xn = addr regsiter + + # Add the tlb_table pointer, creating the CPUTLBEntry address into X27. + "add x27, x27, x26", + + # Load the tlb comparator into X26, and the fast path addend into X27. + "ldr x26, [x27, #8]" if is_write else "ldr x26, [x27]", + "ldr x27, [x27, #0x18]", + + ] + + if is_aligned: + fast_path.extend([ + # Store the page mask part of the address into X24. + "and x24, Xn, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + else: + fast_path.extend([ + # If we're not aligned, add in our alignment value to ensure we don't + # don't straddle the end of a page. + "add x24, Xn, #7", + + # Store the page mask part of the address into X24. + "and x24, x24, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + + return fast_path + + + +def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. 
""" + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. + if fastpath and not force_slowpath: + fastpath_ops = [ + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=False, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + # Otherwise, we'll save arguments for our slow path. + else: + fastpath_ops = [] + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + "mov x27, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Placed in x27 below.] + # - Move our operation info into x2, from an immediate32. + # - Move the next bytecode pointer into x3, from x28. + "mov x0, x14", + "mov x1, x27", + f"mov x2, #{immediate}" if (immediate is not None) else "ldr x2, [x28], #8", + "mov x3, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Temporarily store our result in a register that won't get trashed. + "mov x27, x0", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript... + *postscript, + + # ... and place our results in the target register. + "mov Wd, w27" if is_32b else "mov Xd, x27" + ) + + +def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. """ + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. + if fastpath and not force_slowpath: + fastpath_ops = [ + + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=True, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + else: + fastpath_ops = [] + + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + # Move our arguments into registers that we're not actively using. 
+ # This ensures that they won't be trounced by our calling convention + # if this is reading values from x0-x4. + "mov w27, Wd" if is_32b else "mov x27, Xd", + "mov x26, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Moved into x26 above]. + # - Move our target value into x2. [Moved into x27 above]. + # - Move our operation info into x3, from an immediate32. + # - Move the next bytecode pointer into x4, from x28. + "mov x0, x14", + "mov x1, x26", + "mov x2, x27", + f"mov x3, #{immediate}" if (immediate is not None) else "ldr x3, [x28], #8", + "mov x4, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript. + *postscript + ) + + +# +# Gadget definitions. +# + +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") + +# Call a C language helper function by address. +simple("call", + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. + "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +) + +# Branch to a given immediate address. +simple("br", + # Use our immediate argument as our new bytecode-pointer location. + "ldr x28, [x28]" +) + +# Exit from a translation buffer execution. +simple("exit_tb", + + # We have a single immediate argument, which contains our return code. + # Place it into x0, as one would a return code. + "ldr x0, [x28], #8", + + # And finally, return back to the code that invoked our gadget stream. + "ret" +) + + +for condition in ARCH_CONDITION_CODES: + + # Performs a comparison between two operands. + with_dnm(f"setcond_i32_{condition}", + "subs Wd, Wn, Wm", + f"cset Wd, {condition}" + ) + with_dnm(f"setcond_i64_{condition}", + "subs Xd, Xn, Xm", + f"cset Xd, {condition}" + ) + + # + # NOTE: we use _dnm for the conditional branches, even though we don't + # actually do anything different based on the d argument. This gemerates + # effectively 16 identical `brcond` gadgets for each condition; which we + # use in the backend to spread out the actual branch sources we use. + # + # This is a slight mercy for the branch predictor, as not every conditional + # branch is funneled throught the same address. + # + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i32_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Wzr, Wn, Wm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i64_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Xzr, Xn, Xm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. 
+ "mov x28, x27" + ) + + +# MOV variants. +with_dn("mov_i32", "mov Wd, Wn") +with_dn("mov_i64", "mov Xd, Xn") +with_d("movi_i32", "ldr Wd, [x28], #8") +with_d("movi_i64", "ldr Xd, [x28], #8") + +# Create MOV variants that have common constants built in to the gadget. +# This optimization helps costly reads from memories for simple operations. +with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) +with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) + +# LOAD variants. +# TODO: should the signed variants have X variants for _i64? +ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") +ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") +ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +# STORE variants. +ldst_dn("st8", "strb Wd, [Xn, x27]") +ldst_dn("st16", "strh Wd, [Xn, x27]") +ldst_dn("st_i32", "str Wd, [Xn, x27]") +ldst_dn("st_i64", "str Xd, [Xn, x27]") + +# QEMU LD/ST are handled in our C runtime rather than with simple gadgets, +# as they're nontrivial. + +# Trivial arithmetic. +math_dnm("add" , "add" ) +math_dnm("sub" , "sub" ) +math_dnm("mul" , "mul" ) +math_dnm("div" , "sdiv") +math_dnm("divu", "udiv") + +# Division remainder +with_dnm("rem_i32", "sdiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("rem_i64", "sdiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") + +# Trivial logical. +math_dn( "not", "mvn") +math_dn( "neg", "neg") +math_dnm("and", "and") +math_dnm("andc", "bic") +math_dnm("or", "orr") +math_dnm("orc", "orn") +math_dnm("xor", "eor") +math_dnm("eqv", "eon") +math_dnm("shl", "lsl") +math_dnm("shr", "lsr") +math_dnm("sar", "asr") + +# AArch64 lacks a Rotate Left; so we instead rotate right by a negative. +# TODO: validate this? +#math_dnm("rotr", "ror") +#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") + +# Numeric extension. +math_dn("ext8s", "sxtb") +with_dn("ext8u", "and Xd, Xn, #0xff") +math_dn("ext16s", "sxth") +with_dn("ext16u", "and Wd, Wn, #0xffff") +with_dn("ext32s_i64", "sxtw Xd, Wn") +with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") + +# Byte swapping. +with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") +with_dn("bswap32", "rev Wd, Wn") +with_dn("bswap64", "rev Xd, Xn") + +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + +# Handlers for QEMU_LD, which handles guest <- host loads. 
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", + fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", + fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", + fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special variant for the most common mode, as a speedup optimization. + ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besw_{subtype}", None, None, "helper_be_lduw_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beul_{subtype}", None, None, "helper_be_ldul_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besl_{subtype}", None, None, "helper_be_ldul_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beq_{subtype}", None, None, "helper_be_ldq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Handlers for QEMU_ST, which handles guest -> host stores. 
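Both the load thunks above and the store thunks below bake in a special case for an operation index of 0x3a. That index is a TCGMemOpIdx; assuming the (memop << 4) | mmu_idx packing this QEMU tree uses, 0x3a decodes to a little-endian 64-bit access in MMU index 10, which is why the generator names these gadgets "mode3a" and treats them as the common case. A sketch of the decode, for illustration only:

    #include <stdio.h>

    int main(void)
    {
        unsigned oi = 0x3a;
        unsigned memop   = oi >> 4;   /* 0x3: MO_64, little-endian (MO_LE is 0 on LE hosts) */
        unsigned mmu_idx = oi & 0xf;  /* 0xa: the guest MMU index */

        printf("memop=%#x mmu_idx=%u\n", memop, mmu_idx);
        return 0;
    }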
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stw_mmu", + fastpath_32b=["strh Wd, [Xn, x27]"], fastpath_64b=["strh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stl_mmu", + fastpath_32b=["str Wd, [Xn, x27]"], fastpath_64b=["str Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special optimization for the most common modes. + st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beul_{subtype}", None, None, "helper_be_stl_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beq_{subtype}", None, None, "helper_be_stq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Statistics. +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") diff --git a/tcg/tcg.c b/tcg/tcg.c index 5b0750685102..ec832d92d0e6 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -165,7 +165,7 @@ TCGv_env cpu_env = 0; const void *tcg_code_gen_epilogue; uintptr_t tcg_splitwx_diff; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_prologue_fn *tcg_qemu_tb_exec; #endif @@ -1227,7 +1227,7 @@ void tcg_prologue_init(TCGContext *s) region.start = buf0; region.end = buf0 + total_size; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_qemu_tb_exec = (tcg_prologue_fn *)tcg_splitwx_to_rx(buf0); #endif @@ -1253,7 +1253,7 @@ void tcg_prologue_init(TCGContext *s) #endif buf1 = s->code_ptr; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(buf0), (uintptr_t)buf0, tcg_ptr_byte_diff(buf1, buf0)); #endif @@ -1981,7 +1981,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) #endif #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* We have 64-bit values in one register, but need to pass as two separate parameters. Split them. 
*/ int orig_sizemask = sizemask; @@ -2031,7 +2031,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) pi = 0; if (ret != NULL) { #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) if (orig_sizemask & 1) { /* The 32-bit ABI is going to return the 64-bit value in the %o0/%o1 register pair. Prepare for this by using @@ -2109,7 +2109,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) tcg_debug_assert(pi <= ARRAY_SIZE(op->args)); #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* Free all of the parts we allocated above. */ for (i = real_args = 0; i < orig_nargs; ++i) { int is_64bit = orig_sizemask & (1 << (i+1)*2); @@ -4789,7 +4789,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) return -2; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* flush instruction cache */ flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf), (uintptr_t)s->code_buf,