diff --git a/block/file-posix.c b/block/file-posix.c index 9f6e6279d987..766bbb6cb538 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -280,6 +280,13 @@ static int raw_normalize_devicepath(const char **filename, Error **errp) } #endif +#if defined(CONFIG_IOS) +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) +{ + return -ENOTSUP; /* not supported on iOS */ +} +#else /* CONFIG_IOS */ + /* * Get logical block size via ioctl. On success store it in @sector_size_p. */ @@ -313,6 +320,8 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) return success ? 0 : -errno; } +#endif + /** * Get physical block size of @fd. * On success, store it in @blk_size and return 0. @@ -1449,12 +1458,24 @@ static bool preadv_present = true; static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* preadv introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return preadv(fd, iov, nr_iov, offset); } static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* pwritev introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return pwritev(fd, iov, nr_iov, offset); } diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 8dbf741ee4ca..97bc3ceac3af 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -676,7 +676,7 @@ size_t qemu_get_host_physmem(void); /** * Platforms which do not support system() return ENOSYS */ -#ifndef HAVE_SYSTEM_FUNCTION +#if !defined(HAVE_SYSTEM_FUNCTION) || defined(CONFIG_IOS) #define system platform_does_not_support_system static inline int platform_does_not_support_system(const char *command) { diff --git a/meson.build b/meson.build index ab6a60d1a87e..5fd46123defe 100644 --- a/meson.build +++ b/meson.build @@ -294,6 +294,7 @@ add_project_arguments('-iquote', '.', if host_machine.system() == 'darwin' add_languages('objc', required: false, native: false) + add_project_link_arguments(['-fvisibility-inlines-hidden', '-Xlinker', '-no_deduplicate'], native: false, language: ['c', 'cpp', 'objc']) endif sparse = find_program('cgcc', required: get_option('sparse')) @@ -455,6 +456,8 @@ if targetos == 'netbsd' endif endif +tcti_gadgets = files() + tcg_arch = host_arch if get_option('tcg').allowed() if host_arch == 'unknown' @@ -483,14 +486,77 @@ if get_option('tcg').allowed() # Tell our compiler how to generate our TCTI gadgets. 
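# Annotation (not part of the patch): the block/file-posix.c hunks above gate
# preadv()/pwritev() behind a runtime __builtin_available() check; on macOS < 11
# (and the matching iOS/watchOS/tvOS releases) the wrappers clear preadv_present
# and return -ENOSYS, so QEMU's existing non-vectored read/write fallback in
# that file is expected to take over. A hedged caller-side sketch of the same
# pattern (the surrounding helper is hypothetical):
#
#     ssize_t n = qemu_preadv(fd, iov, cnt, off);
#     if (n < 0 && !preadv_present) {
#         /* emulate the vectored read on older Darwin releases */
#         if (lseek(fd, off, SEEK_SET) < 0) {
#             return -errno;
#         }
#         n = readv(fd, iov, cnt);
#     }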
gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) - tcti_gadgets = custom_target('tcti-gadgets.c.inc', - output: 'tcti-gadgets.c.inc', - input: gadget_generator, - command: [find_program(gadget_generator), '@OUTPUT@'], - build_by_default: true, - build_always_stale: false) - - genh += tcti_gadgets + tcti_sources = [ + 'tcti_gadgets.h', + 'tcti_misc_gadgets.c', + 'tcti_misc_gadgets.h', + 'tcti_setcond_gadgets.c', + 'tcti_setcond_gadgets.h', + 'tcti_brcond_gadgets.c', + 'tcti_brcond_gadgets.h', + 'tcti_mov_gadgets.c', + 'tcti_mov_gadgets.h', + 'tcti_load_signed_gadgets.c', + 'tcti_load_signed_gadgets.h', + 'tcti_load_unsigned_gadgets.c', + 'tcti_load_unsigned_gadgets.h', + 'tcti_store_gadgets.c', + 'tcti_store_gadgets.h', + 'tcti_arithmetic_gadgets.c', + 'tcti_arithmetic_gadgets.h', + 'tcti_logical_gadgets.c', + 'tcti_logical_gadgets.h', + 'tcti_extension_gadgets.c', + 'tcti_extension_gadgets.h', + 'tcti_bitwise_gadgets.c', + 'tcti_bitwise_gadgets.h', + 'tcti_byteswap_gadgets.c', + 'tcti_byteswap_gadgets.h', + 'tcti_qemu_ld_aligned_signed_le_gadgets.c', + 'tcti_qemu_ld_aligned_signed_le_gadgets.h', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.c', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.h', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.c', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.h', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.c', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.h', + 'tcti_qemu_ld_aligned_be_gadgets.c', + 'tcti_qemu_ld_aligned_be_gadgets.h', + 'tcti_qemu_ld_unaligned_be_gadgets.c', + 'tcti_qemu_ld_unaligned_be_gadgets.h', + 'tcti_qemu_ld_slowpath_be_gadgets.c', + 'tcti_qemu_ld_slowpath_be_gadgets.h', + 'tcti_qemu_st_aligned_le_gadgets.c', + 'tcti_qemu_st_aligned_le_gadgets.h', + 'tcti_qemu_st_unaligned_le_gadgets.c', + 'tcti_qemu_st_unaligned_le_gadgets.h', + 'tcti_qemu_st_slowpath_le_gadgets.c', + 'tcti_qemu_st_slowpath_le_gadgets.h', + 'tcti_qemu_st_aligned_be_gadgets.c', + 'tcti_qemu_st_aligned_be_gadgets.h', + 'tcti_qemu_st_unaligned_be_gadgets.c', + 'tcti_qemu_st_unaligned_be_gadgets.h', + 'tcti_qemu_st_slowpath_be_gadgets.c', + 'tcti_qemu_st_slowpath_be_gadgets.h', + 'tcti_simd_base_gadgets.c', + 'tcti_simd_base_gadgets.h', + 'tcti_simd_arithmetic_gadgets.c', + 'tcti_simd_arithmetic_gadgets.h', + 'tcti_simd_logical_gadgets.c', + 'tcti_simd_logical_gadgets.h', + 'tcti_simd_immediate_gadgets.c', + 'tcti_simd_immediate_gadgets.h', + ] + tcti_gadgets = custom_target('tcti-gadgets.h', + output: tcti_sources, + input: gadget_generator, + command: [find_program(gadget_generator)], + build_by_default: true, + build_always_stale: false) elif host_arch == 'x86_64' tcg_arch = 'i386' elif host_arch == 'ppc64' @@ -3157,6 +3223,11 @@ if get_option('b_lto') endif common_ss.add(pagevary) specific_ss.add(files('page-vary.c')) +specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c')) + +# FIXME: This is being used for now for development quickness, but these realy should be +# added to a gadget-specific shared library (tcti_ss). 
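# Annotation (not part of the patch): `tcti_gadgets = files()` earlier in this
# hunk gives the variable an empty default, so the specific_ss.add() reference
# below stays a valid (empty) source list on hosts where the TCTI custom_target
# branch is never taken -- the usual Meson idiom for optionally generated
# sources, sketched here with hypothetical names:
#
#     gen_src = files()
#     if host_arch == 'aarch64'
#       gen_src = custom_target(...)
#     endif
#     some_ss.add(when: 'CONFIG_FOO', if_true: gen_src)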
+specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') subdir('disas') diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h index f51b7bcb13e7..a0b91bb320f6 100644 --- a/tcg/aarch64-tcti/tcg-target-con-set.h +++ b/tcg/aarch64-tcti/tcg-target-con-set.h @@ -9,13 +9,24 @@ * Each operand should be a sequence of constraint letters as defined by * tcg-target-con-str.h; the constraint combination is inclusive or. */ + +// Simple register functions. +C_O0_I1(r) C_O0_I2(r, r) C_O0_I3(r, r, r) -C_O0_I4(r, r, r, r) +//C_O0_I4(r, r, r, r) C_O1_I1(r, r) -C_O1_I2(r, 0, r) C_O1_I2(r, r, r) -C_O1_I4(r, r, r, r, r) -C_O2_I1(r, r, r) -C_O2_I2(r, r, r, r) -C_O2_I4(r, r, r, r, r, r) +//C_O1_I4(r, r, r, r, r) +//C_O2_I1(r, r, r) +//C_O2_I2(r, r, r, r) +//C_O2_I4(r, r, r, r, r, r) + +// Vector functions. +C_O1_I1(w, w) +C_O1_I1(w, r) +C_O0_I2(w, r) +C_O1_I1(w, wr) +C_O1_I2(w, w, w) +C_O1_I3(w, w, w, w) +C_O1_I2(w, 0, w) \ No newline at end of file diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h index 87c0f19e9c2e..94d06d3e74a5 100644 --- a/tcg/aarch64-tcti/tcg-target-con-str.h +++ b/tcg/aarch64-tcti/tcg-target-con-str.h @@ -8,4 +8,13 @@ * Define constraint letters for register sets: * REGS(letter, register_mask) */ -REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) +REGS('r', TCG_MASK_GP_REGISTERS) +REGS('w', TCG_MASK_VECTOR_REGISTERS) + +/* + * Define constraint letters for constants: + * CONST(letter, TCG_CT_CONST_* bit set) + */ + +// Simple 64-bit immediates. +CONST('I', 0xFFFFFFFFFFFFFFFF) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index af4cc8d664b9..10d6c4ec1b62 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -22,13 +22,16 @@ * THE SOFTWARE. */ + +// Rich disassembly is nice in theory, but it's -slow-. +//#define TCTI_GADGET_RICH_DISASSEMBLY + #define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 #include "tcg/tcg-ldst.h" -// Grab our gadget definitions. -// FIXME: use the system path instead of hardcoding this? -#include "tcti-gadgets.c.inc" +// Grab our gadget headers. +#include "tcti_gadgets.h" /* Marker for missing code. */ #define TODO() \ @@ -47,64 +50,15 @@ # define tcti_assert(cond) ((void)0) #endif -/* Bitfield n...m (in 32 bit value). */ -#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) - -/** - * Macro that defines a look-up tree for named QEMU_LD gadgets. 
- */ -#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ - case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ - case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ - case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ - case MO_LEUQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ - case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ - case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ - case MO_BEUQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } - - - -/** - * Macro that defines a look-up tree for named QEMU_ST gadgets. - */ -#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ - case MO_LEUQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ - case MO_BEUQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } +/******************************** + * TCG Constraints Definitions * + ********************************/ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) { switch (op) { + case INDEX_op_ld8u_i32: case INDEX_op_ld8s_i32: case INDEX_op_ld16u_i32: @@ -138,6 +92,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_bswap32_i32: case INDEX_op_bswap32_i64: case INDEX_op_bswap64_i64: + case INDEX_op_extrl_i64_i32: + case INDEX_op_extrh_i64_i32: return C_O1_I1(r, r); case INDEX_op_st8_i32: @@ -191,6 +147,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_rotr_i64: case INDEX_op_setcond_i32: case INDEX_op_setcond_i64: + case INDEX_op_clz_i32: + case INDEX_op_clz_i64: + case INDEX_op_ctz_i32: + case INDEX_op_ctz_i64: return C_O1_I2(r, r, r); case INDEX_op_brcond_i32: @@ -204,12 +164,65 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_i64: return C_O0_I3(r, r, r); + // + // Vector ops. 
+ // + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_xor_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_aa64_sshl_vec: + return C_O1_I2(w, w, w); + case INDEX_op_not_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return C_O1_I1(w, w); + case INDEX_op_ld_vec: + case INDEX_op_dupm_vec: + return C_O1_I1(w, r); + case INDEX_op_st_vec: + return C_O0_I2(w, r); + case INDEX_op_dup_vec: + return C_O1_I1(w, wr); + case INDEX_op_or_vec: + case INDEX_op_andc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_and_vec: + case INDEX_op_orc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, w); + case INDEX_op_bitsel_vec: + return C_O1_I3(w, w, w, w); + default: g_assert_not_reached(); } } static const int tcg_target_reg_alloc_order[] = { + + // General purpose registers, in preference-of-allocation order. + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, @@ -218,16 +231,15 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - /* - TCG_REG_R14, // AREG0 - TCG_REG_R15, // SP - */ + + // Note: we do not allocate R14 or R15, as they're used for our + // special-purpose values. + + // We'll use the high 16 vector register; avoiding the call-saved lower ones. + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, }; #if MAX_OPC_PARAM_IARGS != 7 @@ -248,7 +260,7 @@ static const int tcg_target_call_oarg_regs[] = { }; #ifdef CONFIG_DEBUG_TCG -static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { +static const char *const tcg_target_reg_names[TCG_TARGET_GP_REGS] = { "r00", "r01", "r02", @@ -268,6 +280,98 @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { }; #endif +/************************* + * TCG Emitter Helpers * + *************************/ + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEUQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEUQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. + */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEUQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEUQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +#define LOOKUP_SPECIAL_CASE_LDST_GADGET(arg, name, mode) \ + switch(TLB_MASK_TABLE_OFS(get_mmuidx(arg))) { \ + case -32: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off32_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off32_i64; \ + break; \ + case -48: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off48_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off48_i64; \ + break; \ + case -64: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off64_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off64_i64; \ + break; \ + case -96: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off96_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off96_i64; \ + break; \ + case -128: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off128_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off128_i64; \ + break;\ + default: \ + gadget = gadget_qemu_ ## name ## _slowpath_ ## mode ## _off0_i64; \ + break; \ + } + + static bool patch_reloc(tcg_insn_unit *code_ptr, int type, intptr_t value, intptr_t addend) { @@ -363,48 +467,51 @@ tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, /* Write gadget pointer. 
*/ -static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +static void tcg_out_gadget(TCGContext *s, const void *gadget) { tcg_out_immediate(s, (tcg_target_ulong)gadget); } /* Write gadget pointer, plus 64b immediate. */ -static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +static void tcg_out_imm64_gadget(TCGContext *s, const void *gadget, tcg_target_ulong immediate) { - tcg_out_nullary_gadget(s, gadget); + tcg_out_gadget(s, gadget); tcg_out64(s, immediate); } /* Write gadget pointer (one register). */ -static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +static void tcg_out_unary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS], unsigned reg0) { - tcg_out_nullary_gadget(s, gadget_base[reg0]); + tcg_out_gadget(s, gadget_base[reg0]); } /* Write gadget pointer (two registers). */ -static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +static void tcg_out_binary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); + tcg_out_gadget(s, gadget_base[reg0][reg1]); } /* Write gadget pointer (three registers). */ -static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1, unsigned reg2) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); + tcg_out_gadget(s, gadget_base[reg0][reg1][reg2]); } +/*************************** + * TCG Scalar Operations * + ***************************/ /** * Version of our LDST generator that defers to more optimized gadgets selectively. */ -static void tcg_out_ldst_gadget_inner(TCGContext *s, - void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], - void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], +static void tcg_out_ldst_gadget_inner(TCGContext *s, + const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], + const void *gadget_pos_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_shifted_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_neg_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], unsigned reg0, unsigned reg1, uint32_t offset) { int64_t extended_offset = (int32_t)offset; @@ -415,7 +522,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { uint64_t shifted_offset = (extended_offset >> 3); bool aligned_to_8B = ((extended_offset & 0b111) == 0); @@ -425,23 +532,23 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // More optimal case: we have a gadget that directly encodes the argument. 
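// Annotation (not part of the patch): every gadget emission above is a single
// 64-bit pointer written into the bytecode stream (tcg_out_gadget() ->
// tcg_out_immediate()), so the selection logic here exists purely to avoid a
// trailing offset immediate.  Roughly: offsets that fit the pre-generated
// immediate tables (TCTI_GADGET_IMMEDIATE_ARRAY_LEN == 64 entries) use
// gadget_pos_imm[reg0][reg1][offset]; 8-byte-aligned offsets up to 8 * 63 use
// gadget_shifted_imm[reg0][reg1][offset >> 3]; small negative offsets use
// gadget_neg_imm; anything else presumably falls back to the generic
// two-register gadget followed by the offset as an inline immediate.  For
// example, a load at offset 0x130 (304) is 8-byte aligned, so it selects the
// shifted-immediate gadget with index 38 instead of emitting a 64-bit 0x130.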
if (have_optimized_gadget) { - tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + tcg_out_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); return; - } + } // Special case: it's frequent to have low-numbered positive offsets that are aligned // to 16B boundaries else if(aligned_to_8B && have_shifted_gadget) { - tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + tcg_out_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); return; } - } + } else { uint64_t negated_offset = -(extended_offset); // More optimal case: we have a gadget that directly encodes the argument. if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { - tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + tcg_out_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); return; } } @@ -473,40 +580,90 @@ static void tcti_out_label(TCGContext *s, TCGLabel *label) } } -/** - * Generate a register-to-register MOV. - */ + +/* Register to register move using ORR (shifted register with no shift). */ +static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm) +{ + switch(ext) { + case TCG_TYPE_I32: + tcg_out_binary_gadget(s, gadget_mov_i32, rd, rm); + break; + + case TCG_TYPE_I64: + tcg_out_binary_gadget(s, gadget_mov_i64, rd, rm); + break; + + default: + g_assert_not_reached(); + + } +} + + static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - tcg_debug_assert(ret != arg); + TCGReg w_ret = (ret - TCG_REG_V16); + TCGReg w_arg = (arg - TCG_REG_V16); - if (type == TCG_TYPE_I32) { - tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); - } else { - tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + if (ret == arg) { + return true; } + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + + // If this is a GP to GP register mov, issue our standard MOV. + if (ret < 32 && arg < 32) { + tcg_out_movr(s, type, ret, arg); + break; + } + // If this is a vector register to GP, issue a UMOV. + else if (ret < 32) { + void *gadget = (type == TCG_TYPE_I32) ? gadget_umov_s0 : gadget_umov_d0; + tcg_out_binary_gadget(s, gadget, ret, w_arg); + break; + } + + // If this is a GP to vector move, insert the vealue using INS. + else if (arg < 32) { + void *gadget = (type == TCG_TYPE_I32) ? gadget_ins_s0 : gadget_ins_d0; + tcg_out_binary_gadget(s, gadget, w_ret, arg); + break; + } + /* FALLTHRU */ + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_d, w_ret, w_arg, w_arg); + break; + + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_q, w_ret, w_arg, w_arg); + break; + default: + g_assert_not_reached(); + } return true; } + static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) { bool is_negative = (arg < 0); // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i32[t0][arg]); return; } - } - else { - } // Emit the mov and its immediate. @@ -521,16 +678,13 @@ static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. 
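// Annotation (not part of the patch): the positive/negative split below mirrors
// the load/store path above -- small non-negative constants (within the
// pre-generated gadget_movi_imm_i32[reg] table) are encoded entirely in the
// gadget pointer, while every other value falls through to a generic movi
// gadget followed by the constant as an inline 64-bit immediate.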
- if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i64[t0][arg]); return; } - } - else { - } // TODO: optimize the negative case, too? @@ -558,7 +712,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long */ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) { - tcg_out_nullary_gadget(s, gadget_call); + tcg_out_gadget(s, gadget_call); tcg_out64(s, (uintptr_t)arg); } @@ -570,9 +724,9 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); } } @@ -598,7 +752,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // to patch our gadget stream with the target address, later. if (s->tb_jmp_insn_offset) { // Emit our gadget. - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); // Place our current instruction into our "relocation table", so it can // be patched once we know where the branch will target... @@ -617,7 +771,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Simple branch. case INDEX_op_br: - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); tcti_out_label(s, arg_label(args[0])); break; @@ -678,41 +832,41 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case INDEX_op_ld8u_i32: case INDEX_op_ld8u_i64: - tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i32: - tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i64: - tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld16u_i32: case INDEX_op_ld16u_i64: - tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i32: - tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i64: - tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld_i32: case INDEX_op_ld32u_i64: - tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); break; case INDEX_op_ld_i64: - tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); break; - + case INDEX_op_ld32s_i64: - tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); break; @@ -721,155 +875,169 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con */ case INDEX_op_st8_i32: case INDEX_op_st8_i64: - tcg_out_ldst_gadget(s, 
gadget_st8, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); break; case INDEX_op_st16_i32: case INDEX_op_st16_i64: - tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); break; case INDEX_op_st_i32: case INDEX_op_st32_i64: - tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); break; case INDEX_op_st_i64: - tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); break; /** * Arithmetic instructions. */ - case INDEX_op_add_i32: - tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); break; case INDEX_op_sub_i32: - tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); break; case INDEX_op_mul_i32: - tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nand_i32: /* Optional (TCG_TARGET_HAS_nand_i32). */ + tcg_out_ternary_gadget(s, gadget_nand_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nor_i32: /* Optional (TCG_TARGET_HAS_nor_i32). */ + tcg_out_ternary_gadget(s, gadget_nor_i32, args[0], args[1], args[2]); break; case INDEX_op_and_i32: - tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); break; case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ - tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); break; case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); break; case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); break; case INDEX_op_or_i32: - tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); break; case INDEX_op_xor_i32: - tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); break; case INDEX_op_shl_i32: - tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); break; case INDEX_op_shr_i32: - tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); break; case INDEX_op_sar_i32: - tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); break; - //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). 
*/ + tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + break; - //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + break; case INDEX_op_add_i64: - tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); break; case INDEX_op_sub_i64: - tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); break; case INDEX_op_mul_i64: - tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); break; case INDEX_op_and_i64: - tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); break; case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ - tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); break; case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); break; case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ + tcg_out_ternary_gadget(s, gadget_nand_i64, args[0], args[1], args[2]); break; - //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ - //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + tcg_out_ternary_gadget(s, gadget_nor_i64, args[0], args[1], args[2]); + break; case INDEX_op_or_i64: - tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); break; case INDEX_op_xor_i64: - tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); break; case INDEX_op_shl_i64: - tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); break; case INDEX_op_shr_i64: - tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); break; case INDEX_op_sar_i64: - tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); break; - //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + break; - //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + break; case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); break; case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); break; case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); break; case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); break; case INDEX_op_brcond_i64: @@ -898,7 +1066,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); - last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. tcti_out_label(s, arg_label(args[3])); @@ -928,6 +1096,14 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]); break; + case INDEX_op_clz_i64: /* Optional (TCG_TARGET_HAS_clz_i64). */ + tcg_out_ternary_gadget(s, gadget_clz_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i64: /* Optional (TCG_TARGET_HAS_ctz_i64). */ + tcg_out_ternary_gadget(s, gadget_ctz_i64, args[0], args[1], args[2]); + break; + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); break; @@ -956,10 +1132,26 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); break; + case INDEX_op_extrl_i64_i32: + tcg_out_binary_gadget(s, gadget_extrl, args[0], args[1]); + break; + + case INDEX_op_extrh_i64_i32: + tcg_out_binary_gadget(s, gadget_extrh, args[0], args[1]); + break; + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); break; + case INDEX_op_clz_i32: /* Optional (TCG_TARGET_HAS_clz_i32). */ + tcg_out_ternary_gadget(s, gadget_clz_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i32: /* Optional (TCG_TARGET_HAS_ctz_i32). */ + tcg_out_ternary_gadget(s, gadget_ctz_i32, args[0], args[1], args[2]); + break; + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */ tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); break; @@ -973,19 +1165,19 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con break; case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); break; case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). 
*/ - tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); break; case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); break; case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); break; case INDEX_op_brcond_i32: @@ -1014,7 +1206,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]); - last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. tcti_out_label(s, arg_label(args[3])); @@ -1031,6 +1223,8 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1038,7 +1232,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con } // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b break; @@ -1052,43 +1246,31 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. - if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. + if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. 
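// Annotation (not part of the patch): args[2] is a MemOpIdx, encoded as
// (memop << 4) | mmu_idx, which is what makes the literal comparisons above
// meaningful.  Decoding the special-cased values:
//     0x02 -> MO_UB,    mmu_idx 2   ("ub",  mode02 gadget family)
//     0x32 -> MO_LEUQ,  mmu_idx 2   ("leq", mode32 gadget family)
//     0x3a -> MO_LEUQ,  mmu_idx 10  ("leq", mode3a gadget family)
// Those combinations get fully pre-specialised gadgets with no trailing
// 64-bit immediate; everything else takes the generic lookup below.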
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; } + // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b } @@ -1105,6 +1287,8 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1113,7 +1297,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded break; @@ -1127,36 +1311,23 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. - if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off64_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off96_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off128_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. + if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. 
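// Annotation (not part of the patch): TLB_MASK_TABLE_OFS(mmu_idx) is the
// (negative) offset of the per-mmu-index CPUTLBDescFast {mask, table} pair
// relative to the env pointer; its value depends on the guest's number of MMU
// modes, so the generator pre-builds one gadget family per offset that can
// occur (-32/-48/-64/-96/-128 here), and any other offset drops to the
// "slowpath" family, which presumably performs a direct helper call rather
// than an inline TLB probe.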
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; @@ -1165,7 +1336,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded } @@ -1183,7 +1354,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con [TCG_MO_LD_ST] = gadget_mb_ld, [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, }; - tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + tcg_out_gadget(s, sync[args[0] & TCG_MO_ALL]); break; } @@ -1203,9 +1374,9 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, intptr_t arg2) { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); } } @@ -1221,19 +1392,629 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) return ct & TCG_CT_CONST; } +/*************************** + * TCG Vector Operations * + ***************************/ + +// +// Helper for emitting DUPI (immediate DUP) instructions. +// +#define tcg_out_dupi_gadget(s, name, q, rd, op, cmode, arg) \ + if (q) { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q1[rd][arg]); \ + } else { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q0[rd][arg]); \ + } + + +// +// Helpers for emitting D/Q variant instructions. +// +#define tcg_out_dq_gadget(s, name, arity, is_q, args...) \ + if (is_q) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _q, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _d, args); \ + } + +#define tcg_out_unary_dq_gadget(s, name, is_q, a) \ + tcg_out_dq_gadget(s, name, unary, is_q, a) +#define tcg_out_binary_dq_gadget(s, name, is_q, a, b) \ + tcg_out_dq_gadget(s, name, binary, is_q, a, b) +#define tcg_out_ternary_dq_gadget(s, name, is_q, a, b, c) \ + tcg_out_dq_gadget(s, name, ternary, is_q, a, b, c) + + +// +// Helper for emitting the gadget appropriate for a vector's size. +// +#define tcg_out_sized_vector_gadget(s, name, arity, vece, args...) 
\ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + case MO_64: \ + if (type == TCG_TYPE_V128) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2d, args); \ + } \ + else { \ + g_assert_not_reached(); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } +#define tcg_out_sized_vector_gadget_no64(s, name, arity, vece, args...) \ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } + + +#define tcg_out_unary_vector_gadget(s, name, vece, a) \ + tcg_out_sized_vector_gadget(s, name, unary, vece, a) +#define tcg_out_binary_vector_gadget(s, name, vece, a, b) \ + tcg_out_sized_vector_gadget(s, name, binary, vece, a, b) +#define tcg_out_ternary_vector_gadget(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget(s, name, ternary, vece, a, b, c) + +#define tcg_out_ternary_vector_gadget_no64(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget_no64(s, name, ternary, vece, a, b, c) + + +#define tcg_out_ternary_vector_gadget_with_scalar(s, name, is_scalar, vece, a, b, c) \ + if (is_scalar) { \ + tcg_out_ternary_gadget(s, gadget_ ## name ## _scalar, w0, w1, w2); \ + } else { \ + tcg_out_ternary_vector_gadget(s, name, vece, w0, w1, w2); \ + } + + +/* Return true if v16 is a valid 16-bit shifted immediate. */ +static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) +{ + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + return false; +} + + +/** Core vector operation emission. */ +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned vece, + const TCGArg args[TCG_MAX_OP_ARGS], const int const_args[TCG_MAX_OP_ARGS]) +{ + TCGType type = vecl + TCG_TYPE_V64; + TCGArg r0, r1, r2, r3, w0, w1, w2, w3; + + // Typing flags for vector operations. + bool is_v128 = (type == TCG_TYPE_V128); + bool is_scalar = !is_v128 && (vece == MO_64); + + // Argument shortcuts. + r0 = args[0]; + r1 = args[1]; + r2 = args[2]; + r3 = args[3]; + + // Offset argument shortcuts; offset to convert register numbers to gadget numberes. + w0 = args[0] - TCG_REG_V16; + w1 = args[1] - TCG_REG_V16; + w2 = args[2] - TCG_REG_V16; + w3 = args[3] - TCG_REG_V16; + + // Argument shortcuts, as signed. 
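// Annotation (not part of the patch): the w0..w3 values computed below rebase
// vector registers so that V16..V31 index the gadget tables as 0..15.  The
// sized macros above then pick the arrangement-specific gadget; for example,
// with type == TCG_TYPE_V128 and vece == MO_32,
//     tcg_out_ternary_vector_gadget(s, add, vece, w0, w1, w2)
// expands to tcg_out_ternary_gadget(s, gadget_add_4s, w0, w1, w2), while a
// TCG_TYPE_V64 operand of the same element size selects gadget_add_2s.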
+ int64_t signed_offset_arg = (int32_t)args[2]; + + switch (opc) { + + // Load memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_ld_vec: + tcg_out_binary_dq_gadget(s, ldr, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Store memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_st_vec: + tcg_out_binary_dq_gadget(s, str, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Duplciate memory to all vector elements. + case INDEX_op_dupm_vec: + // DUPM handles normalization itself; pass arguments raw. + tcg_out_dupm_vec(s, type, vece, r0, r1, r2); + break; + + case INDEX_op_add_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, add, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_mul_vec: // optional + tcg_out_ternary_vector_gadget_no64(s, mul, vece, w0, w1, w2); + break; + + case INDEX_op_neg_vec: // optional + tcg_out_binary_vector_gadget(s, neg, vece, w0, w1); + break; + + case INDEX_op_abs_vec: // optional + tcg_out_binary_vector_gadget(s, abs, vece, w0, w1); + break; + + case INDEX_op_and_vec: // optional + tcg_out_ternary_dq_gadget(s, and, is_v128, w0, w1, w2); + break; + + case INDEX_op_or_vec: + tcg_out_ternary_dq_gadget(s, or, is_v128, w0, w1, w2); + break; + + case INDEX_op_andc_vec: + tcg_out_ternary_dq_gadget(s, andc, is_v128, w0, w1, w2); + break; + + case INDEX_op_orc_vec: // optional + tcg_out_ternary_dq_gadget(s, orc, is_v128, w0, w1, w2); + break; + + case INDEX_op_xor_vec: + tcg_out_ternary_dq_gadget(s, xor, is_v128, w0, w1, w2); + break; + + case INDEX_op_ssadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ssadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sssub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sssub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_usadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, usadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_ussub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ussub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_smax_vec: + tcg_out_ternary_vector_gadget_no64(s, smax, vece, w0, w1, w2); + break; + + case INDEX_op_smin_vec: + tcg_out_ternary_vector_gadget_no64(s, smin, vece, w0, w1, w2); + break; + + case INDEX_op_umax_vec: + tcg_out_ternary_vector_gadget_no64(s, umax, vece, w0, w1, w2); + break; + + case INDEX_op_umin_vec: + tcg_out_ternary_vector_gadget_no64(s, umin, vece, w0, w1, w2); + break; + + case INDEX_op_not_vec: // optional + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w1); + break; + + case INDEX_op_shlv_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, shlv, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_aa64_sshl_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sshl, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_cmp_vec: + switch (args[3]) { + case TCG_COND_EQ: + tcg_out_ternary_vector_gadget_with_scalar(s, cmeq, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_NE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmeq, is_scalar, vece, w0, w1, w2); + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w0); + break; + case TCG_COND_GT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, 
is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w2, w1); + break; + default: + g_assert_not_reached(); + } + break; + + case INDEX_op_bitsel_vec: // optional + { + if (r0 == r3) { + tcg_out_ternary_dq_gadget(s, bit, is_v128, w0, w2, w1); + } else if (r0 == r2) { + tcg_out_ternary_dq_gadget(s, bif, is_v128, w0, w3, w1); + } else { + if (r0 != r1) { + tcg_out_mov(s, type, r0, r1); + } + tcg_out_ternary_dq_gadget(s, bsl, is_v128, w0, w2, w3); + } + break; + } + + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ + case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ + default: + g_assert_not_reached(); + } +} + + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_not_vec: + case INDEX_op_cmp_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + case INDEX_op_shlv_vec: + case INDEX_op_bitsel_vec: + return 1; + case INDEX_op_rotli_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_rotlv_vec: + case INDEX_op_rotrv_vec: + return -1; + case INDEX_op_mul_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: + return vece < MO_64; + + default: + return 0; + } +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + va_list va; + TCGv_vec v0, v1, v2, t1, t2, c1; + TCGArg a2; + + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); + a2 = va_arg(va, TCGArg); + va_end(va); + + switch (opc) { + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + /* Right shifts are negative left shifts for AArch64. */ + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + tcg_gen_neg_vec(vece, t1, v2); + opc = (opc == INDEX_op_shrv_vec + ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec); + vec_gen_3(opc, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotlv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_sub_vec(vece, t1, v2, c1); + /* Right shifts are negative left shifts for AArch64. 
*/ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); + tcg_gen_or_vec(vece, v0, v0, t1); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotrv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_neg_vec(vece, t1, v2); + tcg_gen_sub_vec(vece, t2, c1, v2); + /* Right shifts are negative left shifts for AArch64. */ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2), + tcgv_vec_arg(v1), tcgv_vec_arg(t2)); + tcg_gen_or_vec(vece, v0, t1, t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + default: + g_assert_not_reached(); + } +} + + +/* Generate DUPI (move immediate) vector ops. */ +static bool tcg_out_optimized_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + bool q = (type == TCG_TYPE_V128); + int cmode, imm8, i; + + // If we're copying an 8b immediate, we implicitly have a simple gadget for this, + // since there are only 256 possible values * 16 registers. Emit a MOVI gadget directly. + if (vece == MO_8) { + imm8 = (uint8_t)v64; + tcg_out_dupi_gadget(s, movi, q, rd, 0, e, imm8); + return true; + } + + // Otherwise, if we have a value that's all 0x00 and 0xFF bytes, + // we can use the scalar variant of MOVI (op=1, cmode=e), which handles + // that case directly. + for (i = imm8 = 0; i < 8; i++) { + uint8_t byte = v64 >> (i * 8); + if (byte == 0xff) { + imm8 |= 1 << i; + } else if (byte != 0) { + goto fail_bytes; + } + } + tcg_out_dupi_gadget(s, movi, q, rd, 1, e, imm8); + return true; + fail_bytes: + + // Handle 16B moves. + if (vece == MO_16) { + uint16_t v16 = v64; + + // Check to see if we have a value representable as a MOV imm8, possibly via a shift. + if (is_shimm16(v16, &cmode, &imm8)) { + // Output the correct instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). + if (cmode == 0x8) { + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, movi, q, rd, 0, a, imm8); + } + return true; + } + + // Check to see if we have a value representable as an inverted MOV imm8, possibly via a shift. + if (is_shimm16(~v16, &cmode, &imm8)) { + // Output the correct instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). + if (cmode == 0x8) { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, a, imm8); + } + return true; + } + + // If we can't perform either of the optimizations, we'll need to do this in two steps. + // Normally, we'd emit a single gadget that handles both steps, but that'd result in needing -way- + // too many gadgets. We'll emit two gadgets, instead. + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, v16 & 0xff); + tcg_out_dupi_gadget(s, orr, q, rd, 0, a, v16 >> 8); + return true; + } + + // FIXME: implement 32B move optimizations + + + // Try to create optimized 32B moves. 
+ //else if (vece == MO_32) { + // uint32_t v32 = v64; + // uint32_t n32 = ~v32; + + // if (is_shimm32(v32, &cmode, &imm8) || + // is_soimm32(v32, &cmode, &imm8) || + // is_fimm32(v32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // return; + // } + // if (is_shimm32(n32, &cmode, &imm8) || + // is_soimm32(n32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // return; + // } + + // // + // // Restrict the set of constants to those we can load with + // // two instructions. Others we load from the pool. + // // + // i = is_shimm32_pair(v32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8)); + // return; + // } + // i = is_shimm32_pair(n32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8)); + // return; + // } + //} + + return false; +} + + +/* Emits instructions that can load an immediate into a vector. */ +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + // Convert Rd into a simple gadget number. + rd = rd - (TCG_REG_V16); + + // First, try to create an optimized implementation, if possible. + if (tcg_out_optimized_dupi_vec(s, type, vece, rd, v64)) { + return; + } + + // If we didn't, we'll need to load the full vector from memory. + // Emit it into our bytecode stream as an immediate; which we'll then + // load inside the gadget. + if (type == TCG_TYPE_V128) { + tcg_out_unary_gadget(s, gadget_ldi_q, rd); + tcg_out64(s, v64); + tcg_out64(s, v64); + } else { + tcg_out_unary_gadget(s, gadget_ldi_d, rd); + tcg_out64(s, v64); + } +} + + +/* Emits instructions that can load a register into a vector. */ +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, TCGReg rs) +{ + // Compute the gadget index for the relevant vector register. + TCGReg wd = rd - (TCG_REG_V16); + + // Emit a DUP gadget to handles the operation. + tcg_out_binary_vector_gadget(s, dup, vece, wd, rs); + return true; +} + +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg r, TCGReg base, intptr_t offset) +{ + int64_t extended_offset = (int32_t)offset; + + // Convert the register into a simple register number for our gadgets. + r = r - TCG_REG_V16; + + // Emit a DUPM gadget... + tcg_out_binary_vector_gadget(s, dupm, vece, r, base); + + // ... and emit its int64 immediate offset. + tcg_out64(s, extended_offset); + + return true; +} + + +/******************************** + * TCG Runtime & Platform Def * + *******************************/ + static void tcg_target_init(TCGContext *s) { /* The current code uses uint8_t for tcg operations. */ tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); - /* Registers available for 32 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; - /* Registers available for 64 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; - - /* TODO: Which registers should be set here? */ - tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + // Registers available for each type of operation. 
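+    // TCG_MASK_GP_REGISTERS covers R0-R15 and TCG_MASK_VECTOR_REGISTERS covers V16-V31;
+    // both masks are defined alongside the register enum in tcg-target.h, and match the
+    // registers our gadgets are actually specialized over.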
+ tcg_target_available_regs[TCG_TYPE_I32] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_I64] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V64] = TCG_MASK_VECTOR_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V128] = TCG_MASK_VECTOR_REGISTERS; + + TCGReg unclobbered_registers[] = { + // We don't use registers R16+ in our runtime, so we'll not bother protecting them. + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Per our calling convention. + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + }; + + // Specify which registers are clobbered during call. + tcg_target_call_clobber_regs = -1ull; + for (unsigned i = 0; i < ARRAY_SIZE(unclobbered_registers); ++i) { + tcg_regset_reset_reg(tcg_target_call_clobber_regs, unclobbered_registers[i]); + } + // Specify which local registers we're reserving. + // + // Note that we only have to specify registers that are used in the runtime, + // and so not e.g. the register that contains AREG0, which can never be allocated. s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); @@ -1292,8 +2073,8 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ : [return_value] "=m" (return_value) - : [areg0] "m" (env), - [sp_value] "m" (sp_value), + : [areg0] "m" (env), + [sp_value] "m" (sp_value), [start_tb_ptr] "m" (v_tb_ptr), [pc_mirror] "m" (pc_mirror) @@ -1318,8 +2099,11 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ /* Disassemble TCI bytecode. */ int print_insn_tcti(bfd_vma addr, disassemble_info *info) { + +#ifdef TCTI_GADGET_RICH_DISASSEMBLY Dl_info symbol_info = {}; char symbol_name[48] ; +#endif int status; uint64_t block; @@ -1331,16 +2115,22 @@ int print_insn_tcti(bfd_vma addr, disassemble_info *info) return -1; } +#ifdef TCTI_GADGET_RICH_DISASSEMBLY // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. dladdr((void *)block, &symbol_info); if(symbol_info.dli_sname != 0) { - strlcpy(symbol_name, symbol_info.dli_sname, 47); + strncpy(symbol_name, symbol_info.dli_sname, sizeof(symbol_name)); + symbol_name[sizeof(symbol_name) - 1] = 0; info->fprintf_func(info->stream, "%s", symbol_name); } else { - info->fprintf_func(info->stream, "%016llx", block); + info->fprintf_func(info->stream, "%016lx", block); } +#else + info->fprintf_func(info->stream, "%016lx", block); +#endif + return sizeof(block); } diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index 7eb3bb1c3d94..bf4e7e2772b9 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -56,8 +56,11 @@ // weird psuedo-native bytecode. We'll indicate that we're intepreted. #define TCG_TARGET_INTERPRETER 1 +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + // -// Supported optional instructions. +// Supported optional scalar instructions. // // Divs. @@ -78,23 +81,35 @@ #define TCG_TARGET_HAS_ext16u_i64 1 #define TCG_TARGET_HAS_ext32u_i64 1 -// Logicals. +// Register extractions. +#define TCG_TARGET_HAS_extrl_i64_i32 1 +#define TCG_TARGET_HAS_extrh_i64_i32 1 + +// Negations. #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 +// Logicals. 
#define TCG_TARGET_HAS_andc_i32 1 #define TCG_TARGET_HAS_orc_i32 1 #define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_rot_i32 1 +#define TCG_TARGET_HAS_nand_i32 1 +#define TCG_TARGET_HAS_nor_i32 1 #define TCG_TARGET_HAS_andc_i64 1 #define TCG_TARGET_HAS_eqv_i64 1 #define TCG_TARGET_HAS_orc_i64 1 +#define TCG_TARGET_HAS_rot_i64 1 +#define TCG_TARGET_HAS_nor_i64 1 +#define TCG_TARGET_HAS_nand_i64 1 -// We don't curretly support rotates, since AArch64 lacks ROL. -// We'll fix this later. -#define TCG_TARGET_HAS_rot_i32 0 -#define TCG_TARGET_HAS_rot_i64 0 +// Bitwise operations. +#define TCG_TARGET_HAS_clz_i32 1 +#define TCG_TARGET_HAS_ctz_i32 1 +#define TCG_TARGET_HAS_clz_i64 1 +#define TCG_TARGET_HAS_ctz_i64 1 // Swaps. #define TCG_TARGET_HAS_bswap16_i32 1 @@ -104,53 +119,58 @@ #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_MEMORY_BSWAP 1 -// Specify we'll handle direct jumps. -#define TCG_TARGET_HAS_direct_jump 1 - // -// Potential TODOs. +// Supported optional vector instructions. // -// TODO: implement DEPOSIT as BFI. -#define TCG_TARGET_HAS_deposit_i32 0 -#define TCG_TARGET_HAS_deposit_i64 0 - -// TODO: implement EXTRACT as BFX. -#define TCG_TARGET_HAS_extract_i32 0 -#define TCG_TARGET_HAS_sextract_i32 0 -#define TCG_TARGET_HAS_extract_i64 0 -#define TCG_TARGET_HAS_sextract_i64 0 - -// TODO: it might be worth writing a gadget for this -#define TCG_TARGET_HAS_movcond_i32 0 -#define TCG_TARGET_HAS_movcond_i64 0 +#define TCG_TARGET_HAS_v64 1 +#define TCG_TARGET_HAS_v128 1 +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_abs_vec 1 +#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_rots_vec 0 +#define TCG_TARGET_HAS_rotv_vec 0 +#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 +#define TCG_TARGET_HAS_sat_vec 1 +#define TCG_TARGET_HAS_minmax_vec 1 +#define TCG_TARGET_HAS_bitsel_vec 1 +#define TCG_TARGET_HAS_cmpsel_vec 0 // // Unsupported instructions. // -// ARMv8 doesn't have instructions for NAND/NOR. -#define TCG_TARGET_HAS_nand_i32 0 -#define TCG_TARGET_HAS_nor_i32 0 -#define TCG_TARGET_HAS_nor_i64 0 -#define TCG_TARGET_HAS_nand_i64 0 - -// aarch64's CLZ is implemented without a condition, so it -#define TCG_TARGET_HAS_clz_i32 0 -#define TCG_TARGET_HAS_ctz_i32 0 +// There's no direct instruction with which to count the number of ones, +// so we'll leave this implemented as other instructions. #define TCG_TARGET_HAS_ctpop_i32 0 -#define TCG_TARGET_HAS_clz_i64 0 -#define TCG_TARGET_HAS_ctz_i64 0 #define TCG_TARGET_HAS_ctpop_i64 0 -// We don't have a simple gadget for this, since we're always assuming softmmu. -#define TCG_TARGET_HAS_qemu_st8_i32 0 - -// No AArch64 equivalent.a -#define TCG_TARGET_HAS_extrl_i64_i32 0 -#define TCG_TARGET_HAS_extrh_i64_i32 0 +// We don't currently support gadgets with more than three arguments, +// so we can't yet create movcond, deposit, or extract gadgets. 
+#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 -#define TCG_TARGET_HAS_extract2_i64 0 +// This operation exists specifically to allow us to provide differing register +// constraints for 8-bit loads and stores. We don't need to do so, so we'll leave +// this unimplemented, as we gain nothing by it. +#define TCG_TARGET_HAS_qemu_st8_i32 0 // These should always be zero on our 64B platform. #define TCG_TARGET_HAS_muls2_i64 0 @@ -166,36 +186,55 @@ #define TCG_TARGET_HAS_muls2_i32 0 #define TCG_TARGET_HAS_muluh_i32 0 #define TCG_TARGET_HAS_mulsh_i32 0 +#define TCG_TARGET_HAS_extract2_i64 0 // // Platform metadata. // // Number of registers available. -// It might make sense to up these, since we can also use x16 -> x25? -#define TCG_TARGET_NB_REGS 16 +#define TCG_TARGET_NB_REGS 64 + +// Number of general purpose registers. +#define TCG_TARGET_GP_REGS 16 /* List of registers which are used by TCG. */ typedef enum { - TCG_REG_R0 = 0, - TCG_REG_R1, - TCG_REG_R2, - TCG_REG_R3, - TCG_REG_R4, - TCG_REG_R5, - TCG_REG_R6, - TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - TCG_REG_R14, - TCG_REG_R15, - - TCG_AREG0 = TCG_REG_R14, - TCG_REG_CALL_STACK = TCG_REG_R15, + + // General purpose registers. + // Note that we name every _host_ register here; but don't + // necessarily use them; that's determined by the allocation order + // and the number of registers setting above. These just give us the ability + // to refer to these by name. + TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3, + TCG_REG_R4, TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, + TCG_REG_R8, TCG_REG_R9, TCG_REG_R10, TCG_REG_R11, + TCG_REG_R12, TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Register aliases. + TCG_AREG0 = TCG_REG_R14, + TCG_REG_CALL_STACK = TCG_REG_R15, + + // Mask that refers to the GP registers. + TCG_MASK_GP_REGISTERS = 0xFFFFul, + + // Vector registers. + TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, + + // Mask that refers to the vector registers. + TCG_MASK_VECTOR_REGISTERS = 0xFFFF000000000000ul, + } TCGReg; // Specify the shape of the stack our runtime will use. diff --git a/tcg/aarch64-tcti/tcg-target.opc.h b/tcg/aarch64-tcti/tcg-target.opc.h new file mode 100644 index 000000000000..26bfd9c46093 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.opc.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2019 Linaro + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. + * + * See the COPYING file in the top-level directory for details. + * + * Target-specific opcodes for host vector expansion. These will be + * emitted by tcg_expand_vec_op. 
For those familiar with GCC internals, + * consider these to be UNSPEC with names. + */ + +DEF(aa64_sshl_vec, 1, 2, 0, IMPLVEC) diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index fa0232fefac0..4e127ff8c3be 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -4,17 +4,10 @@ Generates a C-code include file containing 'gadgets' for use by TCTI. """ +import os import sys import itertools -# Get a handle on the file we'll be working with, and redirect print to it. -if len(sys.argv) > 1: - out_file = open(sys.argv[1], "w") - - # Hook our print function, so it always outputs to the relevant file. - core_print = print - print = lambda *a, **k : core_print(*a, **k, file=out_file) - # Epilogue code follows at the end of each gadget, and handles continuing execution. EPILOGUE = ( # Load our next gadget address from our bytecode stream, advancing it. @@ -32,41 +25,113 @@ # Helper that provides each of the AArch64 condition codes of interest. ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] +# The list of vector size codes supported on this platform. +VECTOR_SIZES = ['16b', '8b', '4h', '8h', '2s', '4s', '2d'] + # We'll create a variety of gadgets that assume the MMU's TLB is stored at certain # offsets into its structure. These should match the offsets in tcg-target.c.in. -QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] +QEMU_ALLOWED_MMU_OFFSETS = [ 32, 48, 64, 96, 128 ] # Statistics. gadgets = 0 instructions = 0 -def simple(name, *lines): +# Files to write to. +current_collection = "basic" +output_files = {} + +# Create a top-level header. +top_header = open("tcti_gadgets.h", "w") +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=top_header) + +def _get_output_files(): + """ Gathers the output C and H files for a given gadget-cluster name. """ + + # If we don't have an output file for this already, create it. + return output_files[current_collection] + + +def START_COLLECTION(name): + """ Sets the name of the current collection. """ + + global current_collection + + # If we already have a collection for this, skip it. + if name in output_files: + return + + # Create the relevant output files + new_c_file = open(f"tcti_{name}_gadgets.c", "w") + new_h_file = open(f"tcti_{name}_gadgets.h", "w") + output_files[name] = (new_c_file, new_h_file) + + # Add the file to our gadget collection. + print(f'#include "tcti_{name}_gadgets.h"', file=top_header) + + # Add generated messages to the relevant collection. + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_c_file) + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_h_file) + + # Start our C file with inclusion of the relevant header. + print(f'\n#include "tcti_{name}_gadgets.h"\n', file=new_c_file) + + # Start our H file with a simple pragma-guard, for speed. + print('\n#pragma once\n', file=new_h_file) + + # Finally, set the global active collection. + current_collection = name + + +def simple(name, *lines, export=True): """ Generates a simple gadget that needs no per-register specialization. """ global gadgets, instructions gadgets += 1 + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + # Create our C/ASM framing. 
- #print(f"__attribute__((naked)) static void gadget_{name}(void)") - print(f"__attribute__((naked)) static void gadget_{name}(void);") - print(f"__attribute__((naked)) static void gadget_{name}(void)") - print("{") + if export: + print(f"__attribute__((naked)) void gadget_{name}(void);", file=h_file) + print(f"__attribute__((naked)) void gadget_{name}(void)", file=c_file) + else: + print(f"static __attribute__((naked)) void gadget_{name}(void)", file=c_file) + + print("{", file=c_file) # Add the core gadget - print("\tasm(") + print("\tasm(", file=c_file) for line in lines + EPILOGUE: - print(f"\t\t\"{line} \\n\"") + print(f"\t\t\"{line} \\n\"", file=c_file) instructions += 1 - print("\t);") + print("\t);", file=c_file) # End our framing. - print("}\n") + print("}\n", file=c_file) + def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): """ Generates a collection of gadgtes with register substitutions. """ + def _expand_op1_immediate(num): + """ Gets a uncompressed bitfield argument for a given immediate; for NEON instructions. + + Duplciates each bit eight times; converting 0b0100 to 0x00FF0000. + """ + + # Get the number as a binary string... + binstring = bin(num)[2:] + + # ... expand out the values to hex... + hex_string = binstring.replace('1', 'FF').replace('0', '00') + + # ... and return out the new constant. + return f"0x{hex_string}" + + def substitutions_for_letter(letter, number, line): """ Helper that transforms Wd => w1, implementing gadget substitutions. """ @@ -74,8 +139,16 @@ def substitutions_for_letter(letter, number, line): line = line.replace(f"X{letter}", f"x{number}") line = line.replace(f"W{letter}", f"w{number}") - # ... immediate substitutions. + # ... vector register substitutions... + line = line.replace(f"V{letter}", f"v{number + 16}") + line = line.replace(f"D{letter}", f"d{number + 16}") + line = line.replace(f"Q{letter}", f"q{number + 16}") + + # ... regular immediate substitutions... line = line.replace(f"I{letter}", f"{number}") + + # ... and compressed immediate substitutions. + line = line.replace(f"S{letter}", f"{_expand_op1_immediate(number)}") return line @@ -105,77 +178,94 @@ def substitutions_for_letter(letter, number, line): # ... and emit the gadget. permutation_id = "_arg".join(str(number) for number in permutation) - simple(f"{name}_arg{permutation_id}", *new_lines) + simple(f"{name}_arg{permutation_id}", *new_lines, export=False) def with_dnm(name, *lines): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ("d", "n", "m"), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for m in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_dn_immediate(name, *lines, immediate_range): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for i in immediate_range: - print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_pair(name, substitutions, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, substitutions, *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # N array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for b in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) def math_dnm(name, mnemonic): @@ -183,10 +273,10 @@ def math_dnm(name, mnemonic): with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") -def math_dn(name, mnemonic): +def math_dn(name, mnemonic, source_is_wn=False): """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. 
""" with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") - with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Wn" if source_is_wn else f"{mnemonic} Xd, Xn") def with_nm(name, *lines): @@ -227,34 +317,44 @@ def with_single(name, substitution, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, (substitution,), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) for n in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{n}", end=", ") + print(f"gadget_{name}_arg{n}", end=", ", file=c_file) - print("};") + print("};", file=c_file) def with_d_immediate(name, *lines, immediate_range=range(0)): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"static void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # I array for b in immediate_range: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) @@ -265,31 +365,14 @@ def with_d(name, *lines): # Assembly code for saving our machine state before entering the C runtime. C_CALL_PROLOGUE = [ - # Store our machine state. - "str x25, [sp, #-16]!", "stp x14, x15, [sp, #-16]!", - "stp x12, x13, [sp, #-16]!", - "stp x10, x11, [sp, #-16]!", - "stp x8, x9, [sp, #-16]!", - "stp x6, x7, [sp, #-16]!", - "stp x4, x5, [sp, #-16]!", - "stp x2, x3, [sp, #-16]!", - "stp x0, x1, [sp, #-16]!", "stp x28, lr, [sp, #-16]!", ] # Assembly code for restoring our machine state after leaving the C runtime. C_CALL_EPILOGUE = [ - "ldp x28, lr, [sp], #16", - "ldp x0, x1, [sp], #16", - "ldp x2, x3, [sp], #16", - "ldp x4, x5, [sp], #16", - "ldp x6, x7, [sp], #16", - "ldp x8, x9, [sp], #16", - "ldp x10, x11, [sp], #16", - "ldp x12, x13, [sp], #16", + "ldp x28, lr, [sp], #16", "ldp x14, x15, [sp], #16", - "ldr x25, [sp], #16", ] @@ -503,11 +586,73 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) + +def vector_dn(name, *lines): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + line = line.replace(".S", f".{size}") + + # If this size requires a 32b register, replace Wd with Xd. + if size == "2d": + line = line.replace("Wn", "Xn") + + return line + + + # Create a variant for each size, replacing any placeholders. 
+ for size in VECTOR_SIZES: + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dn(f"{name}_{size}", *sized_lines) + + +def vector_dnm(name, *lines, scalar=None, omit_sizes=()): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + return line.replace(".S", f".{size}") + + # Create a variant for each size, replacing any placeholders. + for size in VECTOR_SIZES: + if size in omit_sizes: + continue + + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dnm(f"{name}_{size}", *sized_lines) + + if scalar: + if isinstance(scalar, str): + sized_lines = (scalar,) + with_dnm(f"{name}_scalar", *sized_lines) + + +def vector_math_dnm(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", scalar=f"{operation} Dd, Dn, Dm") + + +def vector_math_dnm_no64(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", omit_sizes=('2d',)) + + +def vector_logic_dn(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. """ + with_dn(f"{name}_d", f"{operation} Vd.8b, Vn.8b") + with_dn(f"{name}_q", f"{operation} Vd.16b, Vn.16b") + + +def vector_logic_dnm(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. """ + with_dnm(f"{name}_d", f"{operation} Vd.8b, Vn.8b, Vm.8b") + with_dnm(f"{name}_q", f"{operation} Vd.16b, Vn.16b, Vm.16b") + + # # Gadget definitions. # -print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") +START_COLLECTION("misc") # Call a C language helper function by address. simple("call", @@ -539,6 +684,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x28, [x28]" ) + # Exit from a translation buffer execution. simple("exit_tb", @@ -550,9 +696,18 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ret" ) +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + + + for condition in ARCH_CONDITION_CODES: + START_COLLECTION("setcond") + # Performs a comparison between two operands. with_dnm(f"setcond_i32_{condition}", "subs Wd, Wn, Wm", @@ -573,23 +728,20 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # branch is funneled throught the same address. # + START_COLLECTION("brcond") + # Branches iff a given comparison is true. with_dnm(f'brcond_i32_{condition}', # Grab our immediate argument. "ldr x27, [x28], #8", - # Perform our comparison and conditional branch. - "subs Wzr, Wn, Wm", - f"b{condition} 1f", - - "0:", # not taken - # Perform our end-of-instruction epilogue. - *EPILOGUE, + # Perform our comparison... + "subs wzr, Wn, Wm", - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) # Branches iff a given comparison is true. @@ -599,19 +751,17 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x27, [x28], #8", # Perform our comparison and conditional branch. - "subs Xzr, Xn, Xm", - f"b{condition} 1f", + "subs xzr, Xn, Xm", - "0:", # not taken - # Perform our end-of-instruction epilogue. 
- *EPILOGUE, - - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) +START_COLLECTION("mov") + + # MOV variants. with_dn("mov_i32", "mov Wd, Wn") with_dn("mov_i64", "mov Xd, Xn") @@ -623,17 +773,24 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) +START_COLLECTION("load_unsigned") + # LOAD variants. # TODO: should the signed variants have X variants for _i64? ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("load_signed") + ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") -ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") -ldst_dn("ld32u", "ldr Wd, [Xn, x27]") ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") -ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("store") # STORE variants. ldst_dn("st8", "strb Wd, [Xn, x27]") @@ -644,6 +801,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # QEMU LD/ST are handled in our C runtime rather than with simple gadgets, # as they're nontrivial. +START_COLLECTION("arithmetic") + # Trivial arithmetic. math_dnm("add" , "add" ) math_dnm("sub" , "sub" ) @@ -657,6 +816,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +START_COLLECTION("logical") + # Trivial logical. math_dn( "not", "mvn") math_dn( "neg", "neg") @@ -669,71 +830,155 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, math_dnm("shl", "lsl") math_dnm("shr", "lsr") math_dnm("sar", "asr") +math_dnm("rotr", "ror") # AArch64 lacks a Rotate Left; so we instead rotate right by a negative. -# TODO: validate this? -#math_dnm("rotr", "ror") -#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") -#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") +with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +with_dnm("rotl_i64", "neg w27, Wm", "ror Xd, Xn, x27") + +# We'll synthesize several instructions that don't exist; since it's still faster +# to run these as gadgets. +with_dnm("nand_i32", "and Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nand_i64", "and Xd, Xn, Xm", "mvn Xd, Xd") +with_dnm("nor_i32", "orr Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nor_i64", "orr Xd, Xn, Xm", "mvn Xd, Xd") + +START_COLLECTION("bitwise") + +# Count leading zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("clz_i32", + + # Perform the core CLZ into w26. + "clz w26, Wn", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("clz_i64", + + # Perform the core CLZ into w26. + "clz x26, Xn", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. 
+ "csel Xd, Xm, x26, eq" +) + + +# Count trailing zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("ctz_i32", + # Reverse our bits before performing our actual clz. + "rbit w26, Wn", + "clz w26, w26", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("ctz_i64", + + # Perform the core CLZ into w26. + "rbit x26, Xn", + "clz x26, x26", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Xd, Xm, x26, eq" +) + + +START_COLLECTION("extension") # Numeric extension. -math_dn("ext8s", "sxtb") +math_dn("ext8s", "sxtb", source_is_wn=True) with_dn("ext8u", "and Xd, Xn, #0xff") -math_dn("ext16s", "sxth") +math_dn("ext16s", "sxth", source_is_wn=True) with_dn("ext16u", "and Wd, Wn, #0xffff") with_dn("ext32s_i64", "sxtw Xd, Wn") -with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") +with_dn("ext32u_i64", "mov Wd, Wn") + +# Numeric extraction. +with_dn("extrl", "mov Wd, Wn") +with_dn("extrh", "lsr Xd, Xn, #32") + +START_COLLECTION("byteswap") # Byte swapping. with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") with_dn("bswap32", "rev Wd, Wn") with_dn("bswap64", "rev Xd, Xn") -# Memory barriers. -simple("mb_all", "dmb ish") -simple("mb_st", "dmb ishst") -simple("mb_ld", "dmb ishld") # Handlers for QEMU_LD, which handles guest <- host loads. for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_ld_{subtype}_unsigned_le") + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", - fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", - fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + START_COLLECTION(f"qemu_ld_{subtype}_signed_le") + + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, 
slowpath_helper="helper_le_ldul_mmu_signed", fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + + # Special variant for the most common modes, as a speedup optimization. + ld_thunk(f"qemu_ld_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + ld_thunk(f"qemu_ld_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], - force_slowpath=is_slowpath, + force_slowpath=is_slowpath, immediate=0x32 ) - - # Special variant for the most common mode, as a speedup optimization. ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_ld_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) @@ -747,11 +992,15 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) + + # Handlers for QEMU_ST, which handles guest -> host stores. for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_st_{subtype}_le") + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], force_slowpath=is_slowpath, @@ -770,11 +1019,21 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) # Special optimization for the most common modes. + st_thunk(f"qemu_st_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + st_thunk(f"qemu_st_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x32 + ) st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_st_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) @@ -784,5 +1043,121 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) +# +# SIMD/Vector ops +# + +# SIMD MOVI instructions. +START_COLLECTION(f"simd_base") + +# Unoptimized/unoptimizable load of a vector64; grabbing an immediate. 
+with_d("ldi_d", "ldr Dd, [x28], #8") +with_d("ldi_q", "ldr Qd, [x28], #16") + +# General purpose reg -> vec rec loads +vector_dn("dup", "dup Vd.S, Wn") + +# move vector -> GP reg +with_dn("umov_s0", "umov Wd, Vn.s[0]") +with_dn("umov_d0", "umov Xd, Vn.d[0]") + +# mov GP reg -> vector +with_dn("ins_s0", "ins Vd.s[0], Wn") +with_dn("ins_d0", "ins Vd.d[0], Xn") + + +# Memory -> vec reg loads. +# The offset of the load is stored in a 64b immediate. + +# Duplicating load. +# TODO: possibly squish the add into the ld1r, if that's valid? +vector_dn("dupm", "ldr x27, [x28], #8", "add x27, x27, Xn", "ld1r {Vd.S}, [x27]") + +# Direct loads. +with_dn("ldr_d", "ldr x27, [x28], #8", "ldr Dd, [Xn, x27]") +with_dn("ldr_q", "ldr x27, [x28], #8", "ldr Qd, [Xn, x27]") + +# vec -> reg stores. +# The offset of the stores is stored in a 64b immediate. +with_dn("str_d", "ldr x27, [x28], #8", "str Dd, [Xn, x27]") +with_dn("str_q", "ldr x27, [x28], #8", "str Qd, [Xn, x27]") + + +START_COLLECTION(f"simd_arithmetic") + +vector_math_dnm("add", "add") +vector_math_dnm("usadd", "uqadd") +vector_math_dnm("ssadd", "sqadd") +vector_math_dnm("sub", "sub") +vector_math_dnm("ussub", "uqsub") +vector_math_dnm("sssub", "sqsub") +vector_math_dnm_no64("mul", "mul") +vector_math_dnm_no64("smax", "smax") +vector_math_dnm_no64("smin", "smin") +vector_math_dnm_no64("umax", "umax") +vector_math_dnm_no64("umin", "umin") + +START_COLLECTION(f"simd_logical") + +vector_logic_dnm("and", "and") +vector_logic_dnm("andc", "bic") +vector_logic_dnm("or", "orr") +vector_logic_dnm("orc", "orn") +vector_logic_dnm("xor", "eor") +vector_logic_dn( "not", "not") +vector_dn("neg", "neg Vd.S, Vn.S") +vector_dn("abs", "abs Vd.S, Vn.S") +vector_logic_dnm( "bit", "bit") +vector_logic_dnm( "bif", "bif") +vector_logic_dnm( "bsl", "bsl") + +vector_math_dnm("shlv", "ushl") +vector_math_dnm("sshl", "sshl") + +vector_dnm("cmeq", "cmeq Vd.S, Vn.S, Vm.S", scalar="cmeq Dd, Dn, Dm") +vector_dnm("cmgt", "cmgt Vd.S, Vn.S, Vm.S", scalar="cmgt Dd, Dn, Dm") +vector_dnm("cmge", "cmge Vd.S, Vn.S, Vm.S", scalar="cmge Dd, Dn, Dm") +vector_dnm("cmhi", "cmhi Vd.S, Vn.S, Vm.S", scalar="cmhi Dd, Dn, Dm") +vector_dnm("cmhs", "cmhs Vd.S, Vn.S, Vm.S", scalar="cmhs Dd, Dn, Dm") + +START_COLLECTION(f"simd_immediate") + +# Simple imm8 movs... +with_d_immediate("movi_cmode_e_op0_q0", "movi Vd.8b, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op0_q1", "movi Vd.16b, #Ii", immediate_range=range(256)) + +# ... all 00/FF movs... +with_d_immediate("movi_cmode_e_op1_q0", "movi Dd, #Si", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op1_q1", "movi Vd.2d, #Si", immediate_range=range(256)) + +# Halfword MOVs. +with_d_immediate("movi_cmode_8_op0_q0", "movi Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q1", "movi Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q0", "mvni Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q1", "mvni Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q0", "movi Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q1", "movi Vd.8h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q0", "mvni Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q1", "mvni Vd.8h, #Ii, lsl #8", immediate_range=range(256)) + +# Halfword ORIs, for building complex MOVs. 
+with_d_immediate("orr_cmode_a_op0_q0", "orr Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("orr_cmode_a_op0_q1", "orr Vd.8h, #Ii, lsl #8", immediate_range=range(256)) + + +# Print a list of output files generated. +output_c_filenames = (f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) +output_h_filenames = (f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) + +print("Sources generated:", file=sys.stderr) +print(f"gadgets = [", file=sys.stderr) +print(" tcti_gadgets.h,", file=sys.stderr) + +for name in output_files.keys(): + print(f" 'tcti_{name}_gadgets.c',", file=sys.stderr) + print(f" 'tcti_{name}_gadgets.h',", file=sys.stderr) + +print(f"]", file=sys.stderr) + # Statistics. -sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions (~{(instructions * 4) // 1024 // 1024} MiB).\n\n") diff --git a/util/osdep.c b/util/osdep.c index 81c46df6f517..8df113c2df5c 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -114,6 +114,12 @@ int qemu_mprotect_none(void *addr, size_t size) #ifdef _WIN32 return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS); #else +# if defined(__APPLE__) && defined(__arm64__) + if (__builtin_available(macOS 11.2, *)) { + /* mprotect() in macOS 11.2 can't switch RWX to NONE */ + return 0; + } +# endif return qemu_mprotect__osdep(addr, size, PROT_NONE); #endif }