From 0272e162b73b402fb8d69a172a85463a821047be Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sat, 10 Oct 2020 19:52:38 -0700 Subject: [PATCH 01/36] block: feature detection for host block support On Darwin (iOS), there are no system level APIs for directly accessing host block devices. We detect this at configure time. --- block/file-posix.c | 33 ++++++++++++++++++++++----------- meson.build | 6 +++++- qapi/block-core.json | 10 +++++++--- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 05079b40caee..d1ab3180ff16 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -42,6 +42,8 @@ #include "scsi/constants.h" #if defined(__APPLE__) && (__MACH__) +#include +#if defined(HAVE_HOST_BLOCK_DEVICE) #include #include #include @@ -52,6 +54,7 @@ //#include #include #include +#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */ #endif #ifdef __sun__ @@ -181,7 +184,17 @@ typedef struct BDRVRawReopenState { bool check_cache_dropped; } BDRVRawReopenState; -static int fd_open(BlockDriverState *bs); +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) { + return 0; + } + return -EIO; +} + static int64_t raw_getlength(BlockDriverState *bs); typedef struct RawPosixAIOData { @@ -3032,6 +3045,7 @@ static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs) return stats; } +#if defined(HAVE_HOST_BLOCK_DEVICE) static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) { BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); @@ -3041,6 +3055,7 @@ static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) return stats; } +#endif /* HAVE_HOST_BLOCK_DEVICE */ static QemuOptsList raw_create_opts = { .name = "raw-create-opts", @@ -3265,6 +3280,8 @@ BlockDriver bdrv_file = { /***********************************************/ /* host device */ +#if defined(HAVE_HOST_BLOCK_DEVICE) 
+ #if defined(__APPLE__) && defined(__MACH__) static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize, int flags); @@ -3557,16 +3574,6 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) } #endif /* linux */ -static int fd_open(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - - /* this is just to ensure s->fd is sane (its called by io ops) */ - if (s->fd >= 0) - return 0; - return -EIO; -} - static coroutine_fn int hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { @@ -3890,6 +3897,8 @@ static BlockDriver bdrv_host_cdrom = { }; #endif /* __FreeBSD__ */ +#endif /* HAVE_HOST_BLOCK_DEVICE */ + static void bdrv_file_init(void) { /* @@ -3897,6 +3906,7 @@ static void bdrv_file_init(void) * registered last will get probed first. */ bdrv_register(&bdrv_file); +#if defined(HAVE_HOST_BLOCK_DEVICE) bdrv_register(&bdrv_host_device); #ifdef __linux__ bdrv_register(&bdrv_host_cdrom); @@ -3904,6 +3914,7 @@ static void bdrv_file_init(void) #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) bdrv_register(&bdrv_host_cdrom); #endif +#endif /* HAVE_HOST_BLOCK_DEVICE */ } block_init(bdrv_file_init); diff --git a/meson.build b/meson.build index 81d760d6e84f..0e53876f69c1 100644 --- a/meson.build +++ b/meson.build @@ -181,7 +181,7 @@ if targetos == 'windows' include_directories: include_directories('.')) elif targetos == 'darwin' coref = dependency('appleframeworks', modules: 'CoreFoundation') - iokit = dependency('appleframeworks', modules: 'IOKit') + iokit = dependency('appleframeworks', modules: 'IOKit', required: false) elif targetos == 'sunos' socket = [cc.find_library('socket'), cc.find_library('nsl'), @@ -1056,6 +1056,9 @@ if get_option('cfi') add_global_link_arguments(cfi_flags, native: false, language: ['c', 'cpp', 'objc']) endif +have_host_block_device = (targetos != 'darwin' or + cc.has_header('IOKit/storage/IOMedia.h')) + ################# # config-host.h # ################# @@ 
-1149,6 +1152,7 @@ config_host_data.set('HAVE_PTY_H', cc.has_header('pty.h')) config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) +config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) diff --git a/qapi/block-core.json b/qapi/block-core.json index 9f555d5c1d83..0c2cd9e68926 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -959,7 +959,8 @@ 'discriminator': 'driver', 'data': { 'file': 'BlockStatsSpecificFile', - 'host_device': 'BlockStatsSpecificFile', + 'host_device': { 'type': 'BlockStatsSpecificFile', + 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, 'nvme': 'BlockStatsSpecificNvme' } } ## @@ -2863,7 +2864,9 @@ { 'enum': 'BlockdevDriver', 'data': [ 'blkdebug', 'blklogwrites', 'blkreplay', 'blkverify', 'bochs', 'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', - 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', + 'gluster', 'host_cdrom', + {'name': 'host_device', 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, + 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, @@ -4066,7 +4069,8 @@ 'ftps': 'BlockdevOptionsCurlFtps', 'gluster': 'BlockdevOptionsGluster', 'host_cdrom': 'BlockdevOptionsFile', - 'host_device':'BlockdevOptionsFile', + 'host_device': { 'type': 'BlockdevOptionsFile', + 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, 'http': 'BlockdevOptionsCurlHttp', 'https': 'BlockdevOptionsCurlHttps', 'iscsi': 'BlockdevOptionsIscsi', From e0a033d9d474319b252a29da42eb397bacb1c4a1 Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Thu, 21 Jan 2021 16:12:00 -0800 Subject: [PATCH 02/36] block: 
check for sys/disk.h Some BSD platforms do not have this header. --- block.c | 2 +- meson.build | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index a1f3cecd7552..b2705ad225c0 100644 --- a/block.c +++ b/block.c @@ -54,7 +54,7 @@ #ifdef CONFIG_BSD #include #include -#ifndef __DragonFly__ +#if defined(HAVE_SYS_DISK_H) #include #endif #endif diff --git a/meson.build b/meson.build index 0e53876f69c1..ba0db9fa1f81 100644 --- a/meson.build +++ b/meson.build @@ -1153,6 +1153,7 @@ config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) +config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h')) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) From f3140144225066c74daf51b7c5faf6e2a1d78fb4 Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sun, 7 Mar 2021 16:52:56 -0800 Subject: [PATCH 03/36] block: detect DKIOCGETBLOCKCOUNT/SIZE before use iOS hosts do not have these defined so we fallback to the default behaviour. 
Co-authored-by: Warner Losh --- block/file-posix.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index d1ab3180ff16..9b6d7ddda3d5 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2326,8 +2326,10 @@ static int64_t raw_getlength(BlockDriverState *bs) again: #endif if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { + size = 0; #ifdef DIOCGMEDIASIZE if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) + size = 0; #elif defined(DIOCGPART) { struct partinfo pi; @@ -2336,9 +2338,7 @@ static int64_t raw_getlength(BlockDriverState *bs) else size = 0; } - if (size == 0) -#endif -#if defined(__APPLE__) && defined(__MACH__) +#elif defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE) { uint64_t sectors = 0; uint32_t sector_size = 0; @@ -2346,19 +2346,15 @@ static int64_t raw_getlength(BlockDriverState *bs) if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { size = sectors * sector_size; - } else { - size = lseek(fd, 0LL, SEEK_END); - if (size < 0) { - return -errno; - } } } -#else - size = lseek(fd, 0LL, SEEK_END); +#endif + if (size == 0) { + size = lseek(fd, 0LL, SEEK_END); + } if (size < 0) { return -errno; } -#endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) switch(s->type) { case FTYPE_CD: From 69819144f1909230ec140f55c9c48baa08a1601a Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Thu, 21 Jan 2021 16:31:09 -0800 Subject: [PATCH 04/36] slirp: feature detection for smbd Replace Windows specific macro with a more generic feature detection macro. Allows slirp smb feature to be disabled manually as well. 
--- configure | 26 +++++++++++++++++++++++--- meson.build | 2 +- net/slirp.c | 16 ++++++++-------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/configure b/configure index 34fccaa2bae6..8335a3e6a0d8 100755 --- a/configure +++ b/configure @@ -465,6 +465,7 @@ fuse_lseek="auto" multiprocess="auto" malloc_trim="auto" +slirp_smbd="auto" # parse CC options second for opt do @@ -834,8 +835,6 @@ do fi done -: ${smbd=${SMBD-/usr/sbin/smbd}} - # Default objcc to clang if available, otherwise use CC if has clang; then objcc=clang @@ -1560,6 +1559,10 @@ for opt do ;; --disable-multiprocess) multiprocess="disabled" ;; + --enable-slirp-smbd) slirp_smbd=yes + ;; + --disable-slirp-smbd) slirp_smbd=no + ;; *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1913,6 +1916,7 @@ disabled with --disable-FEATURE, default is enabled if available fuse FUSE block device export fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports multiprocess Out of process device emulation support + slirp-smbd use smbd (at path --smbd=*) in slirp networking NOTE: The object files are built at the place where configure is launched EOF @@ -5252,6 +5256,19 @@ case "$slirp" in ;; esac +# Check for slirp smbd dupport +: ${smbd=${SMBD-/usr/sbin/smbd}} +if test "$slirp_smbd" != "no" ; then + if test "$mingw32" = "yes" ; then + if test "$slirp_smbd" = "yes" ; then + error_exit "Host smbd not supported on this platform." 
+ fi + slirp_smbd=no + else + slirp_smbd=yes + fi +fi + ########################################## # check for usable __NR_keyctl syscall @@ -5527,7 +5544,10 @@ fi if test "$guest_agent" = "yes" ; then echo "CONFIG_GUEST_AGENT=y" >> $config_host_mak fi -echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak +if test "$slirp_smbd" = "yes" ; then + echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak + echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak +fi if test "$vde" = "yes" ; then echo "CONFIG_VDE=y" >> $config_host_mak echo "VDE_LIBS=$vde_libs" >> $config_host_mak diff --git a/meson.build b/meson.build index ba0db9fa1f81..cad70a8fc501 100644 --- a/meson.build +++ b/meson.build @@ -2424,7 +2424,7 @@ summary_info += {'genisoimage': config_host['GENISOIMAGE']} if targetos == 'windows' and config_host.has_key('CONFIG_GUEST_AGENT') summary_info += {'wixl': wixl.found() ? wixl.full_path() : false} endif -if slirp_opt != 'disabled' +if slirp_opt != 'disabled' and 'CONFIG_SLIRP_SMBD' in config_host summary_info += {'smbd': config_host['CONFIG_SMBD_COMMAND']} endif summary(summary_info, bool_yn: true, section: 'Host binaries') diff --git a/net/slirp.c b/net/slirp.c index be914c0be058..b3ded2aac155 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -27,7 +27,7 @@ #include "net/slirp.h" -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) #include #include #endif @@ -90,7 +90,7 @@ typedef struct SlirpState { Slirp *slirp; Notifier poll_notifier; Notifier exit_notifier; -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) gchar *smb_dir; #endif GSList *fwd; @@ -103,7 +103,7 @@ static QTAILQ_HEAD(, SlirpState) slirp_stacks = static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp); static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp); -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) static int slirp_smb(SlirpState *s, const char *exported_dir, struct in_addr vserver_addr, Error **errp); static void slirp_smb_cleanup(SlirpState *s); @@ -367,7 
+367,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, struct in6_addr ip6_prefix; struct in6_addr ip6_host; struct in6_addr ip6_dns; -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) struct in_addr smbsrv = { .s_addr = 0 }; #endif NetClientState *nc; @@ -477,7 +477,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, return -1; } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) if (vsmbserver && !inet_aton(vsmbserver, &smbsrv)) { error_setg(errp, "Failed to parse SMB address"); return -1; @@ -592,7 +592,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, } } } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) if (smb_export) { if (slirp_smb(s, smb_export, smbsrv, errp) < 0) { goto error; @@ -784,7 +784,7 @@ void hmp_hostfwd_add(Monitor *mon, const QDict *qdict) } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) /* automatic user mode samba server configuration */ static void slirp_smb_cleanup(SlirpState *s) @@ -899,7 +899,7 @@ static int slirp_smb(SlirpState* s, const char *exported_dir, return 0; } -#endif /* !defined(_WIN32) */ +#endif /* defined(CONFIG_SLIRP_SMBD) */ static int guestfwd_can_read(void *opaque) { From bcdcc525e45e75d02fc55adbca9434e753c85f68 Mon Sep 17 00:00:00 2001 From: osy <50960678+osy@users.noreply.github.com> Date: Sun, 7 Mar 2021 17:24:50 -0800 Subject: [PATCH 05/36] meson: option to build as shared library On iOS, we cannot fork() new processes, so the best way to load QEMU into an app is through a shared library. We add a new configure option `--enable-shared-lib` that will build the bulk of QEMU into a shared lib. The usual executables will then link to the library. 
--- configure | 14 ++++++++++++-- meson.build | 40 ++++++++++++++++++++++++++++++++++------ meson_options.txt | 2 ++ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 8335a3e6a0d8..351c3e012d08 100755 --- a/configure +++ b/configure @@ -463,6 +463,7 @@ gettext="auto" fuse="auto" fuse_lseek="auto" multiprocess="auto" +shared_lib="false" malloc_trim="auto" slirp_smbd="auto" @@ -1563,6 +1564,10 @@ for opt do ;; --disable-slirp-smbd) slirp_smbd=no ;; + --enable-shared-lib) shared_lib=true + ;; + --disable-shared-lib) shared_lib=false + ;; *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1790,6 +1795,7 @@ Advanced options (experts only): enable plugins via shared library loading --disable-containers don't use containers for cross-building --gdb=GDB-path gdb to use for gdbstub tests [$gdb_bin] + --enable-shared-lib build QEMU as a shared library Optional features, enabled with --enable-FEATURE and disabled with --disable-FEATURE, default is enabled if available @@ -6364,7 +6370,11 @@ echo "ranlib = [$(meson_quote $ranlib)]" >> $cross if has $sdl2_config; then echo "sdl2-config = [$(meson_quote $sdl2_config)]" >> $cross fi -echo "strip = [$(meson_quote $strip)]" >> $cross +if test "$shared_lib" = "true"; then + echo "strip = [$(meson_quote $strip), '-x']" >> $cross +else + echo "strip = [$(meson_quote $strip)]" >> $cross +fi echo "windres = [$(meson_quote $windres)]" >> $cross if test "$cross_compile" = "yes"; then cross_arg="--cross-file config-meson.cross" @@ -6446,7 +6456,7 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter \ + -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ $cross_arg \ "$PWD" "$source_path" diff --git a/meson.build 
b/meson.build index cad70a8fc501..40ce117aa118 100644 --- a/meson.build +++ b/meson.build @@ -2185,14 +2185,31 @@ foreach target : target_dirs arch_srcs += target_specific.sources() arch_deps += target_specific.dependencies() - lib = static_library('qemu-' + target, + if get_option('shared_lib') + build_lib_args = { + 'target_type': 'shared_library', + 'install': true, + 'dependencies': arch_deps + deps, + 'link_language': link_language, + 'link_depends': [block_syms, qemu_syms], + 'link_args': link_args + cc.get_supported_link_arguments(['-Wl,-U,_qemu_main']) + } + else + build_lib_args = { + 'target_type': 'static_library', + 'install': false, + 'dependencies': arch_deps, + 'name_suffix': 'fa' + } + endif + + lib = build_target('qemu-' + target, sources: arch_srcs + genh, - dependencies: arch_deps, objects: objects, include_directories: target_inc, c_args: c_args, build_by_default: false, - name_suffix: 'fa') + kwargs: build_lib_args) if target.endswith('-softmmu') execs = [{ @@ -2226,6 +2243,17 @@ foreach target : target_dirs 'dependencies': [] }] endif + if get_option('shared_lib') + build_exe_args = { + 'link_with': lib, + 'link_args': link_args + cc.get_supported_link_arguments(['-Wl,--exclude-libs,ALL']) + } + else + build_exe_args = { + 'objects': lib.extract_all_objects(recursive: true), + 'link_args': link_args + } + endif foreach exe: execs exe_name = exe['name'] exe_sign = 'CONFIG_HVF' in config_target @@ -2237,11 +2265,10 @@ foreach target : target_dirs install: true, c_args: c_args, dependencies: arch_deps + deps + exe['dependencies'], - objects: lib.extract_all_objects(recursive: true), link_language: link_language, link_depends: [block_syms, qemu_syms] + exe.get('link_depends', []), - link_args: link_args, - gui_app: exe['gui']) + gui_app: exe['gui'], + kwargs: build_exe_args) if exe_sign emulators += {exe['name'] : custom_target(exe['name'], @@ -2408,6 +2435,7 @@ endif summary_info += {'Doc directory': get_option('docdir')} summary_info += {'Build 
directory': meson.current_build_dir()} summary_info += {'Source path': meson.current_source_dir()} +summary_info += {'build shared lib': get_option('shared_lib')} summary_info += {'GIT submodules': config_host['GIT_SUBMODULES']} summary(summary_info, bool_yn: true, section: 'Directories') diff --git a/meson_options.txt b/meson_options.txt index 9734019995a0..4594d42769d6 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -6,6 +6,8 @@ option('qemu_firmwarepath', type : 'string', value : '', description: 'search PATH for firmware files') option('sphinx_build', type : 'string', value : '', description: 'Use specified sphinx-build [$sphinx_build] for building document (default to be empty)') +option('shared_lib', type : 'boolean', value : false, + description: 'build QEMU as a shared library') option('default_devices', type : 'boolean', value : true, description: 'Include a default selection of devices in emulators') From 4ef61a5fd79e3635a09dda3cedd797dd43438e77 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 8 Jun 2018 13:19:58 +0200 Subject: [PATCH 06/36] hw/display: add virtio-ramfb device Like virtio-vga, but using ramfb instead of legacy vga. Useful for booting from OVMF into Windows ARM which expects a linear FB. 
--- hw/display/meson.build | 1 + hw/display/virtio-ramfb.c | 188 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 hw/display/virtio-ramfb.c diff --git a/hw/display/meson.build b/hw/display/meson.build index 9d79e3951d9e..14f5fa39f4c1 100644 --- a/hw/display/meson.build +++ b/hw/display/meson.build @@ -60,6 +60,7 @@ if config_all_devices.has_key('CONFIG_VIRTIO_GPU') virtio_gpu_ss.add(when: ['CONFIG_VIRTIO_GPU', 'CONFIG_VIRGL'], if_true: [files('virtio-gpu-3d.c'), pixman, virgl]) virtio_gpu_ss.add(when: 'CONFIG_VHOST_USER_GPU', if_true: files('vhost-user-gpu.c')) + virtio_gpu_ss.add(when: 'CONFIG_FW_CFG_DMA', if_true: files('virtio-ramfb.c')) hw_display_modules += {'virtio-gpu': virtio_gpu_ss} endif diff --git a/hw/display/virtio-ramfb.c b/hw/display/virtio-ramfb.c new file mode 100644 index 000000000000..d08bb90a14d4 --- /dev/null +++ b/hw/display/virtio-ramfb.c @@ -0,0 +1,188 @@ +#include "qemu/osdep.h" +#include "hw/pci/pci.h" +#include "ui/console.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-gpu-pci.h" +#include "qapi/error.h" +#include "hw/display/ramfb.h" +#include "qom/object.h" + +/* + * virtio-ramfb-base: This extends VirtioPCIProxy. 
+ */ +#define TYPE_VIRTIO_RAMFB_BASE "virtio-ramfb-base" +OBJECT_DECLARE_TYPE(VirtIORAMFBBase, VirtIORAMFBBaseClass, + VIRTIO_RAMFB_BASE) + +struct VirtIORAMFBBase { + VirtIOPCIProxy parent_obj; + + VirtIOGPUBase *vgpu; + RAMFBState *ramfb; +}; + +struct VirtIORAMFBBaseClass { + VirtioPCIClass parent_class; + + DeviceReset parent_reset; +}; + +static void virtio_ramfb_invalidate_display(void *opaque) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->enable) { + g->hw_ops->invalidate(g); + } +} + +static void virtio_ramfb_update_display(void *opaque) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->enable) { + g->hw_ops->gfx_update(g); + } else { + ramfb_display_update(g->scanout[0].con, vramfb->ramfb); + } +} + +static int virtio_ramfb_ui_info(void *opaque, uint32_t idx, QemuUIInfo *info) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->hw_ops->ui_info) { + return g->hw_ops->ui_info(g, idx, info); + } + return -1; +} + +static void virtio_ramfb_gl_block(void *opaque, bool block) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->hw_ops->gl_block) { + g->hw_ops->gl_block(g, block); + } +} + +static const GraphicHwOps virtio_ramfb_ops = { + .invalidate = virtio_ramfb_invalidate_display, + .gfx_update = virtio_ramfb_update_display, + .ui_info = virtio_ramfb_ui_info, + .gl_block = virtio_ramfb_gl_block, +}; + +static const VMStateDescription vmstate_virtio_ramfb = { + .name = "virtio-ramfb", + .version_id = 2, + .minimum_version_id = 2, + .fields = (VMStateField[]) { + /* no pci stuff here, saving the virtio device will handle that */ + /* FIXME */ + VMSTATE_END_OF_LIST() + } +}; + +/* RAMFB device wrapper around PCI device around virtio GPU */ +static void virtio_ramfb_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIORAMFBBase *vramfb = VIRTIO_RAMFB_BASE(vpci_dev); + VirtIOGPUBase *g = vramfb->vgpu; + int i; + + /* init 
virtio bits */ + virtio_pci_force_virtio_1(vpci_dev); + if (!qdev_realize(DEVICE(g), BUS(&vpci_dev->bus), errp)) { + return; + } + + /* init ramfb */ + vramfb->ramfb = ramfb_setup(errp); + graphic_console_set_hwops(g->scanout[0].con, &virtio_ramfb_ops, vramfb); + + for (i = 0; i < g->conf.max_outputs; i++) { + object_property_set_link(OBJECT(g->scanout[i].con), "device", + OBJECT(vpci_dev), &error_abort); + } +} + +static void virtio_ramfb_reset(DeviceState *dev) +{ + VirtIORAMFBBaseClass *klass = VIRTIO_RAMFB_BASE_GET_CLASS(dev); + + /* reset virtio-gpu */ + klass->parent_reset(dev); +} + +static Property virtio_ramfb_base_properties[] = { + DEFINE_VIRTIO_GPU_PCI_PROPERTIES(VirtIOPCIProxy), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_ramfb_base_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + VirtIORAMFBBaseClass *v = VIRTIO_RAMFB_BASE_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + device_class_set_props(dc, virtio_ramfb_base_properties); + dc->vmsd = &vmstate_virtio_ramfb; + dc->hotpluggable = false; + device_class_set_parent_reset(dc, virtio_ramfb_reset, + &v->parent_reset); + + k->realize = virtio_ramfb_realize; + pcidev_k->class_id = PCI_CLASS_DISPLAY_OTHER; +} + +static TypeInfo virtio_ramfb_base_info = { + .name = TYPE_VIRTIO_RAMFB_BASE, + .parent = TYPE_VIRTIO_PCI, + .instance_size = sizeof(VirtIORAMFBBase), + .class_size = sizeof(VirtIORAMFBBaseClass), + .class_init = virtio_ramfb_base_class_init, + .abstract = true, +}; + +#define TYPE_VIRTIO_RAMFB "virtio-ramfb" + +typedef struct VirtIORAMFB VirtIORAMFB; +DECLARE_INSTANCE_CHECKER(VirtIORAMFB, VIRTIO_RAMFB, + TYPE_VIRTIO_RAMFB) + +struct VirtIORAMFB { + VirtIORAMFBBase parent_obj; + + VirtIOGPU vdev; +}; + +static void virtio_ramfb_inst_initfn(Object *obj) +{ + VirtIORAMFB *dev = VIRTIO_RAMFB(obj); + + 
virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_GPU); + VIRTIO_RAMFB_BASE(dev)->vgpu = VIRTIO_GPU_BASE(&dev->vdev); +} + +static VirtioPCIDeviceTypeInfo virtio_ramfb_info = { + .generic_name = TYPE_VIRTIO_RAMFB, + .parent = TYPE_VIRTIO_RAMFB_BASE, + .instance_size = sizeof(VirtIORAMFB), + .instance_init = virtio_ramfb_inst_initfn, +}; + +static void virtio_ramfb_register_types(void) +{ + type_register_static(&virtio_ramfb_base_info); + virtio_pci_types_register(&virtio_ramfb_info); +} + +type_init(virtio_ramfb_register_types) From ff51693eefc8a412e78620a9269c5ca5aed3559d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 25 Jan 2021 11:34:26 +0400 Subject: [PATCH 07/36] slirp: update to git master MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git cherry-diff: Commits on bacb71f1c3ed5f40e393afd8be81bedfba13a401 branch that is not on 8f43a99191afb47ca3f3c6972f6306209f367ece branch + 1021b0dc38d39f1dc95a296fe3e05a24a087cdc6 disable_dns option + 0f94ceec752592e4ac632a24e3c64a97dd09bf4c limit vnameserver_addr to port 53 + b57bafa852ef16b133907a13678ec69e9531f177 libslirp.h: fix SlirpConfig v3 documentation + 1abf18b2b5edb462797629ed47ad4515a195686e Update CHANGELOG + ff4ecf9b6c6542b24b4ac6ea178be9d44e159f79 Release v4.3.0 + 21f1d933050a40d62612c6274c32de60b811d9ea changelog: post-release + 376187c4b14c795763d472214812826eebe7e9c2 Release v4.3.1 + 73336e08902a7e826f7d960453df037380266186 changelog: post-release + 5c1c9d43be61571608e9b14615045b67b830daf5 udp, udp6, icmp: handle TTL value + 73ed49ab71998d4288e71e954ef6214b70f23d79 icmp, icmp6: Add icmp_forward_error and icmp6_forward_error + 7a4840a57ec7dbc37cca1ab96f058a9610b26950 udp, udp6, icmp, icmp6: Enable forwarding errors on Linux + e9b2bc19ae652a2907f247e621b2e4773bdd2aab TCPIPHDR_DELTA: Fix potential negative value + 39f9a363eec082f04513413046321abd04163148 .gitlab-ci: add a Coverity stage + 
1b0093b973cfa0dc041522e5d4e6f576b2df642e sosendoob: better document what urgc is used for + 5b9ad89ebbb8afa50162c9156fabd5fc56291088 Add G_GNUC_PRINTF to local function slirp_vsnprintf + 8a808aa493980e212b4d5f5465330905c8294e59 meson: remove meson-dist script + 0b669b5fbe4d3c25a682a67f1059d8633c963b3d meson: support compiling as subproject + 9f82a47b81f2864422b82c1e40e51a2ed9c6ac32 Add DNS resolving for iOS + c0eac03e8ce1b9a743231f2fe21e7cb579fc9339 Remove the QEMU-special make build-system + 1bfd4d9368f9fa2e4f0731e1266bec05bbc83a80 socket: consume empty packets + 92413be68914f8cae2f5bad4bf3ab8491dcbc5d7 Release v4.4.0 + 07e8cfac69766081871ab620d9f16a630543d302 changelog: post-release + 4c4e035813313d02b63fdeb920d56fb2fdc0a5b1 Remove some needless (void)casts + eee9db9d115d91aa82f33685c4e76d656db92976 fork_exec_child_setup: improve signal handling + 216f434a018b3af182a4f31bbe5a00daee170343 Fix unused variables Signed-off-by: Marc-AndrĂ© Lureau Message-Id: <20210125073427.3970606-2-marcandre.lureau@redhat.com> --- slirp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slirp b/slirp index 8f43a99191af..5dce846e3ee8 160000 --- a/slirp +++ b/slirp @@ -1 +1 @@ -Subproject commit 8f43a99191afb47ca3f3c6972f6306209f367ece +Subproject commit 5dce846e3ee82d93462bc637bb0db2fd49f0fc5a From da1d34255cc4f5ec142286b34dda5b9b231cf1ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 25 Jan 2021 11:34:27 +0400 Subject: [PATCH 08/36] build-sys: make libslirp a meson subproject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the manual build. 
Signed-off-by: Marc-AndrĂ© Lureau Message-Id: <20210125073427.3970606-3-marcandre.lureau@redhat.com> --- .gitmodules | 6 ++--- configure | 2 +- meson.build | 63 +++----------------------------------------- slirp | 1 - subprojects/libslirp | 1 + 5 files changed, 9 insertions(+), 64 deletions(-) delete mode 160000 slirp create mode 160000 subprojects/libslirp diff --git a/.gitmodules b/.gitmodules index 08b1b48a09f4..c28831c50ab4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -49,9 +49,9 @@ [submodule "roms/edk2"] path = roms/edk2 url = https://gitlab.com/qemu-project/edk2.git -[submodule "slirp"] - path = slirp - url = https://gitlab.com/qemu-project/libslirp.git +[submodule "subprojects/libslirp"] + path = subprojects/libslirp + url = https://git.qemu.org/git/libslirp.git [submodule "roms/opensbi"] path = roms/opensbi url = https://gitlab.com/qemu-project/opensbi.git diff --git a/configure b/configure index 351c3e012d08..a2736ecf161b 100755 --- a/configure +++ b/configure @@ -5257,7 +5257,7 @@ case "$slirp" in auto | enabled | internal) # Simpler to always update submodule, even if not needed. 
if test "$git_submodules_action" != "ignore"; then - git_submodules="${git_submodules} slirp" + git_submodules="${git_submodules} subprojects/libslirp" fi ;; esac diff --git a/meson.build b/meson.build index 40ce117aa118..892314dcbf6c 100644 --- a/meson.build +++ b/meson.build @@ -1504,7 +1504,7 @@ slirp_opt = 'disabled' if have_system slirp_opt = get_option('slirp') if slirp_opt in ['enabled', 'auto', 'system'] - have_internal = fs.exists(meson.current_source_dir() / 'slirp/meson.build') + have_internal = fs.exists(meson.current_source_dir() / 'subprojects/libslirp/meson.build') slirp = dependency('slirp', kwargs: static_kwargs, method: 'pkg-config', required: slirp_opt == 'system' or @@ -1518,64 +1518,9 @@ if have_system endif endif if slirp_opt == 'internal' - slirp_deps = [] - if targetos == 'windows' - slirp_deps = cc.find_library('iphlpapi') - endif - slirp_conf = configuration_data() - slirp_conf.set('SLIRP_MAJOR_VERSION', meson.project_version().split('.')[0]) - slirp_conf.set('SLIRP_MINOR_VERSION', meson.project_version().split('.')[1]) - slirp_conf.set('SLIRP_MICRO_VERSION', meson.project_version().split('.')[2]) - slirp_conf.set_quoted('SLIRP_VERSION_STRING', meson.project_version()) - slirp_cargs = ['-DG_LOG_DOMAIN="Slirp"'] - slirp_files = [ - 'slirp/src/arp_table.c', - 'slirp/src/bootp.c', - 'slirp/src/cksum.c', - 'slirp/src/dhcpv6.c', - 'slirp/src/dnssearch.c', - 'slirp/src/if.c', - 'slirp/src/ip6_icmp.c', - 'slirp/src/ip6_input.c', - 'slirp/src/ip6_output.c', - 'slirp/src/ip_icmp.c', - 'slirp/src/ip_input.c', - 'slirp/src/ip_output.c', - 'slirp/src/mbuf.c', - 'slirp/src/misc.c', - 'slirp/src/ncsi.c', - 'slirp/src/ndp_table.c', - 'slirp/src/sbuf.c', - 'slirp/src/slirp.c', - 'slirp/src/socket.c', - 'slirp/src/state.c', - 'slirp/src/stream.c', - 'slirp/src/tcp_input.c', - 'slirp/src/tcp_output.c', - 'slirp/src/tcp_subr.c', - 'slirp/src/tcp_timer.c', - 'slirp/src/tftp.c', - 'slirp/src/udp.c', - 'slirp/src/udp6.c', - 'slirp/src/util.c', - 
'slirp/src/version.c', - 'slirp/src/vmstate.c', - ] - - configure_file( - input : 'slirp/src/libslirp-version.h.in', - output : 'libslirp-version.h', - configuration: slirp_conf) - - slirp_inc = include_directories('slirp', 'slirp/src') - libslirp = static_library('slirp', - build_by_default: false, - sources: slirp_files, - c_args: slirp_cargs, - include_directories: slirp_inc) - slirp = declare_dependency(link_with: libslirp, - dependencies: slirp_deps, - include_directories: slirp_inc) + libslirp = subproject('libslirp', + default_options: ['default_library=static']) + slirp = libslirp.get_variable('libslirp_dep') endif endif diff --git a/slirp b/slirp deleted file mode 160000 index 5dce846e3ee8..000000000000 --- a/slirp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5dce846e3ee82d93462bc637bb0db2fd49f0fc5a diff --git a/subprojects/libslirp b/subprojects/libslirp new file mode 160000 index 000000000000..bacb71f1c3ed --- /dev/null +++ b/subprojects/libslirp @@ -0,0 +1 @@ +Subproject commit bacb71f1c3ed5f40e393afd8be81bedfba13a401 From cf8a85ad1822ea6153eff5bda9c14b96d1c0bd0d Mon Sep 17 00:00:00 2001 From: Roman Bolshakov Date: Wed, 10 Feb 2021 13:55:27 +0300 Subject: [PATCH 09/36] util/osdep: Avoid mprotect() RWX->NONE on Big Sur 11.2 There's a change in mprotect() behaviour [1] in the latest macOS on M1 and it's not yet clear if it's going to be fixed by Apple. For now we can avoid unsupported mprotect() calls. QEMU and qtests work fine without it. 1. 
https://gist.github.com/hikalium/75ae822466ee4da13cbbe486498a191f Buglink: https://bugs.launchpad.net/qemu/+bug/1914849 Apple-Feedback: FB8994773 Signed-off-by: Roman Bolshakov Message-Id: <20210210105527.74943-1-r.bolshakov@yadro.com> --- util/osdep.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/util/osdep.c b/util/osdep.c index 66d01b9160fb..1edd7b1caf9c 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -111,6 +111,12 @@ int qemu_mprotect_none(void *addr, size_t size) #ifdef _WIN32 return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS); #else +# if defined(__APPLE__) && defined(__arm64__) + if (__builtin_available(macOS 11.2, *)) { + /* mprotect() in macOS 11.2 can't switch RWX to NONE */ + return 0; + } +# endif return qemu_mprotect__osdep(addr, size, PROT_NONE); #endif } From 37aa4a189406effcd6d19844f33f068f5a109d6c Mon Sep 17 00:00:00 2001 From: osy <50960678+osy@users.noreply.github.com> Date: Mon, 4 Jan 2021 14:04:27 -0800 Subject: [PATCH 10/36] tcg: custom APRR implementation --- accel/tcg/cpu-exec.c | 1 + accel/tcg/translate-all.c | 1 + include/qemu/osdep.h | 28 ---------- include/tcg/tcg-apple-jit.h | 100 ++++++++++++++++++++++++++++++++++++ tcg/tcg.c | 1 + util/osdep.c | 4 ++ 6 files changed, 107 insertions(+), 28 deletions(-) create mode 100644 include/tcg/tcg-apple-jit.h diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 16e4fe3ccd87..6b2c66468368 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -26,6 +26,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg/tcg.h" +#include "tcg/tcg-apple-jit.h" #include "qemu/atomic.h" #include "qemu/compiler.h" #include "sysemu/qtest.h" diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index bbd919a39328..e03d48ae7abc 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -27,6 +27,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg/tcg.h" +#include "tcg/tcg-apple-jit.h" #if defined(CONFIG_USER_ONLY) #include 
"qemu.h" #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index ba15be9c569c..5bd1a6776915 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -119,10 +119,6 @@ extern int daemon(int, int); #include "sysemu/os-posix.h" #endif -#ifdef __APPLE__ -#include -#endif - #include "glib-compat.h" #include "qemu/typedefs.h" @@ -686,30 +682,6 @@ char *qemu_get_host_name(Error **errp); */ size_t qemu_get_host_physmem(void); -/* - * Toggle write/execute on the pages marked MAP_JIT - * for the current thread. - */ -#if defined(MAC_OS_VERSION_11_0) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 -static inline void qemu_thread_jit_execute(void) -{ - if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(true); - } -} - -static inline void qemu_thread_jit_write(void) -{ - if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); - } -} -#else -static inline void qemu_thread_jit_write(void) {} -static inline void qemu_thread_jit_execute(void) {} -#endif - /** * Platforms which do not support system() return ENOSYS */ diff --git a/include/tcg/tcg-apple-jit.h b/include/tcg/tcg-apple-jit.h new file mode 100644 index 000000000000..7f25dab809a3 --- /dev/null +++ b/include/tcg/tcg-apple-jit.h @@ -0,0 +1,100 @@ +/* + * Apple Silicon functions for JIT handling + * + * Copyright (c) 2020 osy + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef TCG_APPLE_JIT_H +#define TCG_APPLE_JIT_H + +/* + * APRR handling + * Credits to: https://siguza.github.io/APRR/ + * Reversed from /usr/lib/system/libsystem_pthread.dylib + */ + +#if defined(__aarch64__) && defined(CONFIG_DARWIN) + +#define _COMM_PAGE_START_ADDRESS (0x0000000FFFFFC000ULL) /* In TTBR0 */ +#define _COMM_PAGE_APRR_SUPPORT (_COMM_PAGE_START_ADDRESS + 0x10C) +#define _COMM_PAGE_APPR_WRITE_ENABLE (_COMM_PAGE_START_ADDRESS + 0x110) +#define _COMM_PAGE_APRR_WRITE_DISABLE (_COMM_PAGE_START_ADDRESS + 0x118) + +static __attribute__((__always_inline__)) bool jit_write_protect_supported(void) +{ + /* Access shared kernel page at fixed memory location. */ + uint8_t aprr_support = *(volatile uint8_t *)_COMM_PAGE_APRR_SUPPORT; + return aprr_support > 0; +} + +/* write protect enable = write disable */ +static __attribute__((__always_inline__)) void jit_write_protect(int enabled) +{ + /* Access shared kernel page at fixed memory location. */ + uint8_t aprr_support = *(volatile uint8_t *)_COMM_PAGE_APRR_SUPPORT; + if (aprr_support == 0 || aprr_support > 3) { + return; + } else if (aprr_support == 1) { + __asm__ __volatile__ ( + "mov x0, %0\n" + "ldr x0, [x0]\n" + "msr S3_4_c15_c2_7, x0\n" + "isb sy\n" + :: "r" (enabled ? _COMM_PAGE_APRR_WRITE_DISABLE + : _COMM_PAGE_APPR_WRITE_ENABLE) + : "memory", "x0" + ); + } else { + __asm__ __volatile__ ( + "mov x0, %0\n" + "ldr x0, [x0]\n" + "msr S3_6_c15_c1_5, x0\n" + "isb sy\n" + :: "r" (enabled ? 
_COMM_PAGE_APRR_WRITE_DISABLE + : _COMM_PAGE_APPR_WRITE_ENABLE) + : "memory", "x0" + ); + } +} + +#else /* defined(__aarch64__) && defined(CONFIG_DARWIN) */ + +static __attribute__((__always_inline__)) bool jit_write_protect_supported(void) +{ + return false; +} + +static __attribute__((__always_inline__)) void jit_write_protect(int enabled) +{ +} + +#endif + +static inline void qemu_thread_jit_execute(void) +{ + if (jit_write_protect_supported()) { + jit_write_protect(true); + } +} + +static inline void qemu_thread_jit_write(void) +{ + if (jit_write_protect_supported()) { + jit_write_protect(false); + } +} + +#endif /* define TCG_APPLE_JIT_H */ diff --git a/tcg/tcg.c b/tcg/tcg.c index 63a12b197bff..457a40837feb 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -49,6 +49,7 @@ #include "hw/boards.h" #endif +#include "tcg/tcg-apple-jit.h" #include "tcg/tcg-op.h" #if UINTPTR_MAX == UINT32_MAX diff --git a/util/osdep.c b/util/osdep.c index 1edd7b1caf9c..cb20608292ef 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -39,6 +39,10 @@ extern int madvise(char *, size_t, int); #include "qemu/error-report.h" #include "monitor/monitor.h" +#ifdef CONFIG_DARWIN +#include "tcg/tcg-apple-jit.h" +#endif + static bool fips_enabled = false; static const char *hw_version = QEMU_HW_VERSION; From 4373fc4514e2c793d121fe9f0b021d6fefb163df Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sun, 3 Jan 2021 08:58:11 -0800 Subject: [PATCH 11/36] coroutine: add libucontext as external library iOS does not support ucontext natively for aarch64 and the sigaltstack is also unsupported (even worse, it fails silently, see: https://openradar.appspot.com/13002712 ) As a workaround we include a library implementation of ucontext and add it as a build option. 
--- .gitmodules | 3 +++ configure | 23 ++++++++++++++++++++--- meson.build | 12 +++++++++++- meson_options.txt | 2 ++ subprojects/libucontext | 1 + util/coroutine-ucontext.c | 9 +++++++++ 6 files changed, 46 insertions(+), 4 deletions(-) create mode 160000 subprojects/libucontext diff --git a/.gitmodules b/.gitmodules index c28831c50ab4..33f90687266b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -64,3 +64,6 @@ [submodule "roms/vbootrom"] path = roms/vbootrom url = https://gitlab.com/qemu-project/vbootrom.git +[submodule "libucontext"] + path = subprojects/libucontext + url = https://github.com/utmapp/libucontext.git diff --git a/configure b/configure index a2736ecf161b..34f237ae653e 100755 --- a/configure +++ b/configure @@ -1781,7 +1781,7 @@ Advanced options (experts only): --oss-lib path to OSS library --cpu=CPU Build for host CPU [$cpu] --with-coroutine=BACKEND coroutine backend. Supported options: - ucontext, sigaltstack, windows + ucontext, libucontext, sigaltstack, windows --enable-gcov enable test coverage analysis with gcov --disable-blobs disable installing provided firmware blobs --with-vss-sdk=SDK-path enable Windows VSS support in QEMU Guest Agent @@ -4504,6 +4504,8 @@ if test "$coroutine" = ""; then coroutine=win32 elif test "$ucontext_works" = "yes"; then coroutine=ucontext + elif test "$ios" = "yes"; then + coroutine=libucontext else coroutine=sigaltstack fi @@ -4527,12 +4529,27 @@ else error_exit "only the 'windows' coroutine backend is valid for Windows" fi ;; + libucontext) + ;; *) error_exit "unknown coroutine backend $coroutine" ;; esac fi +case $coroutine in +libucontext) + git_submodules="${git_submodules} subprojects/libucontext" + mkdir -p libucontext + coroutine_impl=ucontext + libucontext="enabled" + ;; +*) + coroutine_impl=$coroutine + libucontext="disabled" + ;; +esac + if test "$coroutine_pool" = ""; then coroutine_pool=yes fi @@ -5884,7 +5901,7 @@ if test "$qom_cast_debug" = "yes" ; then echo "CONFIG_QOM_CAST_DEBUG=y" >> 
$config_host_mak fi -echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak +echo "CONFIG_COROUTINE_BACKEND=$coroutine_impl" >> $config_host_mak if test "$coroutine_pool" = "yes" ; then echo "CONFIG_COROUTINE_POOL=1" >> $config_host_mak else @@ -6451,7 +6468,7 @@ NINJA=$ninja $meson setup \ -Dlibnfs=$libnfs -Diconv=$iconv -Dcurses=$curses -Dlibudev=$libudev\ -Drbd=$rbd -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse \ -Dzstd=$zstd -Dseccomp=$seccomp -Dvirtfs=$virtfs -Dcap_ng=$cap_ng \ - -Dattr=$attr -Ddefault_devices=$default_devices \ + -Dattr=$attr -Ddefault_devices=$default_devices -Ducontext=$libucontext \ -Ddocs=$docs -Dsphinx_build=$sphinx_build -Dinstall_blobs=$blobs \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ diff --git a/meson.build b/meson.build index 892314dcbf6c..d9f0d44535cb 100644 --- a/meson.build +++ b/meson.build @@ -1571,9 +1571,18 @@ if not fdt.found() and fdt_required.length() > 0 error('fdt not available but required by targets ' + ', '.join(fdt_required)) endif +ucontext = dependency('libucontext', kwargs: static_kwargs, required : false) +if not ucontext.found() and get_option('ucontext').enabled() + libucontext_proj = subproject('libucontext', + default_options: ['default_library=static', + 'freestanding=true']) + ucontext = libucontext_proj.get_variable('libucontext_dep') +endif + config_host_data.set('CONFIG_CAPSTONE', capstone.found()) config_host_data.set('CONFIG_FDT', fdt.found()) config_host_data.set('CONFIG_SLIRP', slirp.found()) +config_host_data.set('CONFIG_LIBUCONTEXT', ucontext.found()) ##################### # Generated sources # @@ -1821,7 +1830,7 @@ util_ss.add_all(trace_ss) util_ss = util_ss.apply(config_all, strict: false) libqemuutil = static_library('qemuutil', sources: util_ss.sources() + stub_ss.sources() + genh, - dependencies: [util_ss.dependencies(), m, glib, socket, malloc]) + dependencies: 
[util_ss.dependencies(), m, glib, socket, malloc, ucontext]) qemuutil = declare_dependency(link_with: libqemuutil, sources: genh + version_res) @@ -2545,6 +2554,7 @@ summary(summary_info, bool_yn: true, section: 'Targets and accelerators') # Block layer summary_info = {} +summary_info += {'libucontext support': ucontext.found()} summary_info += {'coroutine backend': config_host['CONFIG_COROUTINE_BACKEND']} summary_info += {'coroutine pool': config_host['CONFIG_COROUTINE_POOL'] == '1'} if have_block diff --git a/meson_options.txt b/meson_options.txt index 4594d42769d6..6c29ea93300a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -110,6 +110,8 @@ option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', description: 'SEEK_HOLE/SEEK_DATA support for FUSE exports') +option('ucontext', type : 'feature', value : 'disabled', + description: 'libucontext support') option('vhost_user_blk_server', type: 'feature', value: 'auto', description: 'build vhost-user-blk server') diff --git a/subprojects/libucontext b/subprojects/libucontext new file mode 160000 index 000000000000..9b1d8f01a6e9 --- /dev/null +++ b/subprojects/libucontext @@ -0,0 +1 @@ +Subproject commit 9b1d8f01a6e99166f9808c79966abe10786de8b6 diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c index 904b375192ca..220c57a743af 100644 --- a/util/coroutine-ucontext.c +++ b/util/coroutine-ucontext.c @@ -23,7 +23,16 @@ #undef _FORTIFY_SOURCE #endif #include "qemu/osdep.h" +#if defined(CONFIG_LIBUCONTEXT) +#include +#define ucontext_t libucontext_ucontext_t +#define getcontext libucontext_getcontext +#define setcontext libucontext_setcontext +#define swapcontext libucontext_swapcontext +#define makecontext libucontext_makecontext +#else #include +#endif #include "qemu/coroutine_int.h" #ifdef CONFIG_VALGRIND_H From e16a18d2d9240b234a56055857baa8035ded1fbd Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Sat, 27 Mar 
2021 15:46:31 -0600 Subject: [PATCH 12/36] get building for an iOS target, as well --- block/file-posix.c | 21 +++++++++++++++++++++ configure | 41 ++++++++++++++++++++++++++++++++++++++++- include/qemu/osdep.h | 2 +- meson.build | 7 +++---- tests/qtest/meson.build | 4 ++-- 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 9b6d7ddda3d5..e0b0e56db375 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -279,6 +279,13 @@ static int raw_normalize_devicepath(const char **filename, Error **errp) } #endif +#if defined(CONFIG_IOS) +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) +{ + return -ENOTSUP; /* not supported on iOS */ +} +#else /* CONFIG_IOS */ + /* * Get logical block size via ioctl. On success store it in @sector_size_p. */ @@ -312,6 +319,8 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) return success ? 0 : -errno; } +#endif + /** * Get physical block size of @fd. * On success, store it in @blk_size and return 0. 
@@ -1408,12 +1417,24 @@ static bool preadv_present = true; static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* preadv introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return preadv(fd, iov, nr_iov, offset); } static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* pwritev introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return pwritev(fd, iov, nr_iov, offset); } diff --git a/configure b/configure index 34f237ae653e..ac874fda5b02 100755 --- a/configure +++ b/configure @@ -571,6 +571,21 @@ EOF compile_object } +check_ios() { + cat > $TMPC < $TMPC < @@ -613,7 +628,11 @@ elif check_define __DragonFly__ ; then elif check_define __NetBSD__; then targetos='NetBSD' elif check_define __APPLE__; then - targetos='Darwin' + if check_ios ; then + targetos='iOS' + else + targetos='Darwin' + fi else # This is a fatal error, but don't report it yet, because we # might be going to just print the --help text, or it might @@ -629,6 +648,22 @@ case $targetos in Darwin) HOST_DSOSUF=".dylib" ;; +iOS) + bsd="yes" + darwin="yes" + ios="yes" + if [ "$cpu" = "x86_64" ] ; then + QEMU_CFLAGS="-arch x86_64 $QEMU_CFLAGS" + QEMU_LDFLAGS="-arch x86_64 $QEMU_LDFLAGS" + fi + host_block_device_support="no" + audio_drv_list="" + audio_possible_drivers="" + QEMU_LDFLAGS="-framework CoreFoundation $QEMU_LDFLAGS" + # Disable attempts to use ObjectiveC features in os/object.h since they + # won't work when we're compiling with gcc as a C compiler. 
+ QEMU_CFLAGS="-DOS_OBJECT_USE_OBJC=0 $QEMU_CFLAGS" +;; SunOS) # $(uname -m) returns i86pc even on an x86_64 box, so default based on isainfo if test -z "$cpu" && test "$(isainfo -k)" = "amd64"; then @@ -5549,6 +5584,10 @@ if test "$darwin" = "yes" ; then echo "CONFIG_DARWIN=y" >> $config_host_mak fi +if test "$ios" = "yes" ; then + echo "CONFIG_IOS=y" >> $config_host_mak +fi + if test "$solaris" = "yes" ; then echo "CONFIG_SOLARIS=y" >> $config_host_mak fi diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 5bd1a6776915..76cfa8f83562 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -685,7 +685,7 @@ size_t qemu_get_host_physmem(void); /** * Platforms which do not support system() return ENOSYS */ -#ifndef HAVE_SYSTEM_FUNCTION +#if !defined(HAVE_SYSTEM_FUNCTION) || defined(CONFIG_IOS) #define system platform_does_not_support_system static inline int platform_does_not_support_system(const char *command) { diff --git a/meson.build b/meson.build index d9f0d44535cb..bcb175d82265 100644 --- a/meson.build +++ b/meson.build @@ -181,7 +181,7 @@ if targetos == 'windows' include_directories: include_directories('.')) elif targetos == 'darwin' coref = dependency('appleframeworks', modules: 'CoreFoundation') - iokit = dependency('appleframeworks', modules: 'IOKit', required: false) + iokit = dependency('appleframeworks', modules: 'IOKit', required: 'CONFIG_IOS' not in config_host) elif targetos == 'sunos' socket = [cc.find_library('socket'), cc.find_library('nsl'), @@ -1056,8 +1056,7 @@ if get_option('cfi') add_global_link_arguments(cfi_flags, native: false, language: ['c', 'cpp', 'objc']) endif -have_host_block_device = (targetos != 'darwin' or - cc.has_header('IOKit/storage/IOMedia.h')) +have_host_block_device = (targetos != 'darwin' or (cc.has_header('IOKit/storage/IOMedia.h') and ('CONFIG_IOS' not in config_host))) ################# # config-host.h # @@ -1153,7 +1152,7 @@ config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) 
config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) -config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h')) +config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h') and ('CONFIG_IOS' not in config_host)) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index ba6ecaed3256..dad92c996a72 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -47,12 +47,11 @@ qtests_i386 = \ (config_all_devices.has_key('CONFIG_TPM_TIS_ISA') ? ['tpm-tis-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_ISA') ? ['tpm-tis-swtpm-test'] : []) + \ (config_all_devices.has_key('CONFIG_RTL8139_PCI') ? ['rtl8139-test'] : []) + \ + (not config_host.has_key('CONFIG_IOS') ? ['bios-tables-test', 'hd-geo-test'] : []) + \ qtests_pci + \ ['fdc-test', 'ide-test', - 'hd-geo-test', 'boot-order-test', - 'bios-tables-test', 'rtc-test', 'i440fx-test', 'fuzz-test', @@ -160,6 +159,7 @@ qtests_aarch64 = \ (cpu != 'arm' ? ['bios-tables-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? ['tpm-tis-device-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? ['tpm-tis-device-swtpm-test'] : []) + \ + (cpu != 'arm' and not config_host.has_key('CONFIG_IOS') ? 
['bios-tables-test'] : []) + \ ['arm-cpu-features', 'numa-test', 'boot-serial-test', From 4de86e6de1f071d33ce130db1062ebac8fe90a5e Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Mon, 29 Mar 2021 13:26:08 -0600 Subject: [PATCH 13/36] TCTI: add TCTI TCG backend for acceleration on non-JIT AArch64 --- accel/tcg/translate-all.c | 6 +- configure | 10 +- disas.c | 2 + include/disas/dis-asm.h | 1 + include/exec/exec-all.h | 4 + include/tcg/tcg.h | 2 +- meson.build | 24 + meson_options.txt | 2 + scripts/mtest2make.py | 24 +- tcg/aarch64-tcti/README.md | 1026 +++++++++++++++++++ tcg/aarch64-tcti/tcg-target-con-set.h | 21 + tcg/aarch64-tcti/tcg-target-con-str.h | 11 + tcg/aarch64-tcti/tcg-target.c.inc | 1347 +++++++++++++++++++++++++ tcg/aarch64-tcti/tcg-target.h | 220 ++++ tcg/aarch64-tcti/tcti-gadget-gen.py | 788 +++++++++++++++ tcg/tcg.c | 14 +- 16 files changed, 3476 insertions(+), 26 deletions(-) create mode 100644 tcg/aarch64-tcti/README.md create mode 100644 tcg/aarch64-tcti/tcg-target-con-set.h create mode 100644 tcg/aarch64-tcti/tcg-target-con-str.h create mode 100644 tcg/aarch64-tcti/tcg-target.c.inc create mode 100644 tcg/aarch64-tcti/tcg-target.h create mode 100755 tcg/aarch64-tcti/tcti-gadget-gen.py diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index e03d48ae7abc..bef19e744841 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1130,7 +1130,7 @@ static bool alloc_code_gen_buffer_anon(size_t size, int prot, return true; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) #ifdef CONFIG_POSIX #include "qemu/memfd.h" @@ -1257,7 +1257,7 @@ static bool alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp) static bool alloc_code_gen_buffer_splitwx(size_t size, Error **errp) { -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) # ifdef CONFIG_DARWIN return 
alloc_code_gen_buffer_splitwx_vmremap(size, errp); # endif @@ -1290,7 +1290,7 @@ static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) prot = PROT_READ | PROT_WRITE | PROT_EXEC; flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) /* The tcg interpreter does not need execute permission. */ prot = PROT_READ | PROT_WRITE; #elif defined(CONFIG_DARWIN) diff --git a/configure b/configure index ac874fda5b02..b80b9e619641 100755 --- a/configure +++ b/configure @@ -362,6 +362,7 @@ tsan="no" fortify_source="$default_feature" strip_opt="yes" tcg_interpreter="false" +tcg_threaded_interpreter="false" bigendian="no" mingw32="no" gcov="no" @@ -1149,6 +1150,10 @@ for opt do ;; --enable-tcg-interpreter) tcg_interpreter="true" ;; + --disable-tcg-tcti) tcg_threaded_interpreter="false" + ;; + --enable-tcg-tcti) tcg_threaded_interpreter="true" + ;; --disable-cap-ng) cap_ng="disabled" ;; --enable-cap-ng) cap_ng="enabled" @@ -6512,9 +6517,8 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ - $cross_arg \ - "$PWD" "$source_path" + -Dtcg_interpreter=$tcg_interpreter -Dtcg_threaded_interpreter=$tcg_threaded_interpreter\ + -Dshared_lib=$shared_lib $cross_arg "$PWD" "$source_path" if test "$?" 
-ne 0 ; then error_exit "meson setup failed" diff --git a/disas.c b/disas.c index a61f95b580b8..cea0f9019e49 100644 --- a/disas.c +++ b/disas.c @@ -152,6 +152,8 @@ static void initialize_debug_host(CPUDebug *s) #endif #if defined(CONFIG_TCG_INTERPRETER) s->info.print_insn = print_insn_tci; +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) + s->info.print_insn = print_insn_tcti; #elif defined(__i386__) s->info.mach = bfd_mach_i386_i386; s->info.print_insn = print_insn_i386; diff --git a/include/disas/dis-asm.h b/include/disas/dis-asm.h index 13fa1edd411e..ded69ba2ffaa 100644 --- a/include/disas/dis-asm.h +++ b/include/disas/dis-asm.h @@ -411,6 +411,7 @@ typedef struct disassemble_info { typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *); int print_insn_tci(bfd_vma, disassemble_info*); +int print_insn_tcti(bfd_vma, disassemble_info*); int print_insn_big_mips (bfd_vma, disassemble_info*); int print_insn_little_mips (bfd_vma, disassemble_info*); int print_insn_nanomips (bfd_vma, disassemble_info*); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index b7b3c0ef12d9..0b879e939abd 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -543,7 +543,11 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); #if defined(CONFIG_TCG_INTERPRETER) extern __thread uintptr_t tci_tb_ptr; # define GETPC() tci_tb_ptr +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) +extern __thread uintptr_t tcti_call_return_address; +# define GETPC() tcti_call_return_address #else +/* Note that this is correct for TCTI also; whose gadget behaves like native code. 
*/ # define GETPC() \ ((uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0))) #endif diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 0f0695e90da2..cfcd069bf3f6 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1296,7 +1296,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi) #define TB_EXIT_IDXMAX 1 #define TB_EXIT_REQUESTED 3 -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) uintptr_t tcg_qemu_tb_exec(CPUArchState *env, const void *tb_ptr); #else typedef uintptr_t tcg_prologue_fn(CPUArchState *env, const void *tb_ptr); diff --git a/meson.build b/meson.build index bcb175d82265..f3dddc075a47 100644 --- a/meson.build +++ b/meson.build @@ -58,6 +58,7 @@ python = import('python').find_installation() supported_oses = ['windows', 'freebsd', 'netbsd', 'openbsd', 'darwin', 'sunos', 'linux'] supported_cpus = ['ppc', 'ppc64', 's390x', 'riscv32', 'riscv64', 'x86', 'x86_64', 'arm', 'aarch64', 'mips', 'mips64', 'sparc', 'sparc64'] +tcti_supported_cpus = ['aarch64'] cpu = host_machine.cpu_family() targetos = host_machine.system() @@ -246,6 +247,25 @@ if not get_option('tcg').disabled() endif if get_option('tcg_interpreter') tcg_arch = 'tci' + elif get_option('tcg_threaded_interpreter') + if cpu not in tcti_supported_cpus + error('Unsupported CPU @0@ for TCTI, try --enable-tcg-interpreter'.format(cpu)) + else + warning('TCTI is extremely experimental and incomplete! Things might break!') + tcg_arch = '@0@-tcti'.format(cpu) + endif + + # Tell our compiler how to generate our TCTI gadgets. 
+ gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) + tcti_gadgets = custom_target('tcti-gadgets.c.inc', + output: 'tcti-gadgets.c.inc', + input: gadget_generator, + command: [find_program(gadget_generator), '@OUTPUT@'], + build_by_default: true, + build_always_stale: false) + + genh += tcti_gadgets + elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' elif config_host['ARCH'] == 's390x' @@ -1280,6 +1300,8 @@ foreach target : target_dirs config_all += { sym: 'y' } if sym == 'CONFIG_TCG' and tcg_arch == 'tci' config_target += { 'CONFIG_TCG_INTERPRETER': 'y' } + elif sym == 'CONFIG_TCG' and tcg_arch.endswith('tcti') + config_target += { 'CONFIG_TCG_THREADED_INTERPRETER': 'y' } elif sym == 'CONFIG_XEN' and have_xen_pci_passthrough config_target += { 'CONFIG_XEN_PCI_PASSTHROUGH': 'y' } endif @@ -2539,6 +2561,8 @@ summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')} if config_all.has_key('CONFIG_TCG') if get_option('tcg_interpreter') summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'} + elif get_option('tcg_threaded_interpreter') + summary_info += {'TCG backend': 'TCTI (TCG with threaded-dispatch bytecode interpreter, experimental and slow; but faster than TCI)'} else summary_info += {'TCG backend': 'native (@0@)'.format(cpu)} endif diff --git a/meson_options.txt b/meson_options.txt index 6c29ea93300a..5aa68672c2ff 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,8 @@ option('tcg', type: 'feature', value: 'auto', description: 'TCG support') option('tcg_interpreter', type: 'boolean', value: false, description: 'TCG with bytecode interpreter (experimental and slow)') +option('tcg_threaded_interpreter', type: 'boolean', value: false, + description: 'TCG with threaded-dispatch bytecode interpreter (experimental and slow, but less slow than TCI)') option('cfi', type: 'boolean', value: 'false', description: 'Control-Flow Integrity (CFI)') option('cfi_debug', type: 'boolean', value: 
'false', diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py index cbbcba100d86..48bf9acafc99 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -75,18 +75,18 @@ def process_tests(test, targets, suites): print('run-test-%d: $(.test.deps.%d)' % (i,i)) print('\t@$(call .test.run,%d,$(.test.output-format))' % (i,)) - test_suites = test['suite'] or ['default'] - is_slow = any(s.endswith('-slow') for s in test_suites) - for s in test_suites: - # The suite name in the introspection info is "PROJECT:SUITE" - s = s.split(':')[1] - if s.endswith('-slow'): - s = s[:-5] - if is_slow: - suites[s].slow_tests.append(i) - else: - suites[s].tests.append(i) - suites[s].executables.add(executable) + #test_suites = test['suite'] or ['default'] + #is_slow = any(s.endswith('-slow') for s in test_suites) + #for s in test_suites: + # # The suite name in the introspection info is "PROJECT:SUITE" + # s = s.split(':')[1] + # if s.endswith('-slow'): + # s = s[:-5] + # if is_slow: + # suites[s].slow_tests.append(i) + # else: + # suites[s].tests.append(i) + # suites[s].executables.add(executable) def emit_prolog(suites, prefix): all_tap = ' '.join(('%s-report-%s.tap' % (prefix, k) for k in suites.keys())) diff --git a/tcg/aarch64-tcti/README.md b/tcg/aarch64-tcti/README.md new file mode 100644 index 000000000000..eb848e5a9e57 --- /dev/null +++ b/tcg/aarch64-tcti/README.md @@ -0,0 +1,1026 @@ +# QEMU Tiny-Code Threaded Interpreter (AArch64) + +A TCG backend that chains together JOP/ROP-ish gadgets to massively reduce interpreter overhead vs TCI. +Platform-dependent; but usable when JIT isn't available; e.g. on platforms that lack WX mappings. The general idea is to squish the addresses of a gadget sequence into a "queue" and then write each gadget so it ends in a "dequeue-jump". + +Execution occurs by jumping into the first gadget, and letting it just play back some linear-overhead native code sequences for a while.
 + +Since TCG-TCI is optimized for sets of 16 GP registers and aarch64 has 30, we could easily keep JIT/QEMU and guest state separate, and since 16\*16 is reasonably small we could actually have a set of reasonable gadgets for each combination of operands. + + +## Register Convention + +| Regs | Use | +| :------ | :-------------------- | +| x1-x15 | Guest Registers | +| x24 | TCTI temporary | +| x25 | saved IP during call | +| x26 | TCTI temporary | +| x27 | TCTI temporary | +| x28 | Thread-stream pointer | +| x30 | Link register | +| SP | Stack Pointer, host | +| PC | Program Counter, host | + +In pseudocode: + +| Symbol | Meaning | +| :----- | :---------------------------------- | +| Rd | stand-in for destination register | +| Rn | stand-in for first source register | +| Rm | stand-in for second source register | + +## Gadget Structure + +### End of gadget + +Each gadget ends by advancing our bytecode pointer, and then executing from the new location. + +```asm +# Load our next gadget address from our bytecode stream, advancing it, and jump to the next gadget. + +ldr x27, [x28], #8\n +br x27 +``` + +## Calling into QEMU's C codebase + +When calling into C, we lose control over which registers are used. Accordingly, we'll need to save +registers relevant to TCTI: + +```asm +str x25, [sp, #-16]! +stp x14, x15, [sp, #-16]! +stp x12, x13, [sp, #-16]! +stp x10, x11, [sp, #-16]! +stp x8, x9, [sp, #-16]! +stp x6, x7, [sp, #-16]! +stp x4, x5, [sp, #-16]! +stp x2, x3, [sp, #-16]! +stp x0, x1, [sp, #-16]! +stp x28, lr, [sp, #-16]! +``` + +Upon returning to the gadget stream, we'll then restore them.
 + +```asm +ldp x28, lr, [sp], #16 +ldp x0, x1, [sp], #16 +ldp x2, x3, [sp], #16 +ldp x4, x5, [sp], #16 +ldp x6, x7, [sp], #16 +ldp x8, x9, [sp], #16 +ldp x10, x11, [sp], #16 +ldp x12, x13, [sp], #16 +ldp x14, x15, [sp], #16 +ldr x25, [sp], #16 +``` + +## TCG Operations + +Each operation needs an implementation for every platform; and probably a set of gadgets for each possible set of operands. + +At 14 GP registers, that means that + +1 operand =\> 16 gadgets +2 operands =\> 256 gadgets +3 operands =\> 4096 gadgets + +### call + +Calls a helper function by address. + +**IR Format**: `call <address>` +**Gadget type:** single + +```asm + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. This is necessary so the GETPC() + # macro works correctly as used in helper functions. + "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +``` + +### br + +Branches to a given immediate address. Branches are unconditional. + +**IR Format**: `br <address>` +**Gadget type:** single + +```asm +# Use our immediate argument as our new bytecode-pointer location. +ldr x28, [x28] +``` + +### setcond_i32 + +Performs a comparison between two 32-bit operands. + +**IR Format**: `setcond32 <cond>, Rd, Rn, Rm` +**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960) + +```asm +subs Wd, Wn, Wm +cset Wd, <cond> +``` + +| QEMU Cond | AArch64 Cond | +| :-------- | :----------- | +| EQ | EQ | +| NE | NE | +| LT | LT | +| GE | GE | +| LE | LE | +| GT | GT | +| LTU | LO | +| GEU | HS | +| LEU | LS | +| GTU | HI | + +### setcond_i64 + +Performs a comparison between two 64-bit operands.
+ +**IR Format**: `setcond64 , Rd, Rn, Rm` +**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960) + +```asm +subs Xd, Xn, Xm +cset Xd, +``` + +Comparison chart is the same as the `_i32` variant. + +### brcond_i32 + +Compares two 32-bit numbers, and branches if the comparison is true. + +**IR Format**: `brcond Rn, Rm, ` +**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560) + +```asm +# Perform our comparison and conditional branch. +subs Wrz, Wn, Wm +br taken + + # Consume the branch target, without using it. + add x28, x28, #8 + + # Perform our end-of-instruction epilogue. + + +taken: + + # Update our bytecode pointer to take the label. + ldr x28, [x28] +``` + +Comparison chart is the same as in `setcond_i32` . + +### brcond_i64 + +Compares two 64-bit numbers, and branches if the comparison is true. + +**IR Format**: `brcond Rn, Rm, ` +**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560) + +```asm +# Perform our comparison and conditional branch. +subs Xrz, Xn, Xm +br taken + + # Consume the branch target, without using it. + add x28, x28, #8 + + # Perform our end-of-instruction epilogue. + + +taken: + + # Update our bytecode pointer to take the label. + ldr x28, [x28] +``` + +Comparison chart is the same as in `setcond_i32` . + +### mov_i32 + +Moves a value from a register to another register. + +**IR Format**: `mov Rd, Rn` +**Gadget type:** gadget per `Rd` + `Rn` combo (256) + +```asm +mov Rd, Rn +``` + +### mov_i64 + +Moves a value from a register to another register. + +**IR Format**: `mov Rd, Rn` +**Gadget type:** gadget per `Rd` + `Rn` combo (256) + +```asm +mov Xd, Xn +``` + +### tci_movi_i32 + +Moves an 32b immediate into a register. + +**IR Format**: `mov Rd, #imm32` +**Gadget type:** gadget per `Rd` (16) + +```asm +ldr w27, [x28], #4 +mov Wd, w27 +``` + +### tci_movi_i64 + +Moves an 64b immediate into a register. 
+ +**IR Format**: `mov Rd, #imm64` +**Gadget type:** gadget per `Rd` (16) + +```asm +ldr x27, [x28], #4 +mov Xd, x27 +``` + +### ld8u_i32 / ld8u_i64 + +Load byte from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrb Xd, [Xn, x27] +``` + +### ld8s_i32 / ld8s_i64 + +Load byte from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsb Xd, [Xn, x27] +``` + +### ld16u_i32 / ld16u_i64 + +Load 16b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrh Wd, [Xn, x27] +``` + +### ld16s_i32 / ld16s_i64 + +Load 16b from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsh Xd, [Xn, x27] +``` + +### ld32u_i32 / ld32u_i64 + +Load 32b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldr Wd, [Xn, x27] +``` + +### ld32s_i64 + +Load 32b from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsw Xd, [Xn, x27] +``` + +### ld_i64 + +Load 64b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldr Xd, [Xn, x27] +``` + +### st8_i32 / st8_i64 + +Stores byte from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +strb Wd, [Xn, x27] +``` + +### st16_i32 / st16_i64 + +Stores 16b from register to host memory. 
+ +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +strh Wd, [Xn, x27] +``` + +### st_i32 / st32_i64 + +Stores 32b from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +str Wd, [Xn, x27] +``` + +### st_i64 + +Stores 64b from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +str Xd, [Xn, x27] +``` + +### qemu_ld_i32 + +Loads 32b from _guest_ memory to register. + +**IR Format**: `ld Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +### qemu_ld_i64 + +Loads 64b from _guest_ memory to register. + +**IR Format**: `ld Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +### qemu_st_i32 + +Stores 32b from a register to _guest_ memory. + +**IR Format**: `st Rd, , ` +**Gadget type:** thunk per `Rd` into C impl + +### qemu_st_i64 + +Stores 64b from a register to _guest_ memory. + +**IR Format**: `st Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +#### Note + +See note on `qemu_ld_i32`. + +### add_i32 + +Adds two 32-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +add Wd, Wn, Wm +``` + +### add_i64 + +Adds two 64-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +add Xd, Xn, Xm +``` + +### sub_i32 + +Subtracts two 32-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +Sub Wd, Wn, Wm +``` + +### sub_i64 + +Subtracts two 64-bit numbers. + +**IR Format**: `sub Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sub Xd, Xn, Xm +``` + +### mul_i32 + +Multiplies two 32-bit numbers. 
+ +**IR Format**: `mul Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +mul Wd, Wn, Wm +``` + +### mul_i64 + +Multiplies two 64-bit numbers. + +**IR Format**: `mul Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +mul Xd, Xn, Xm +``` + +### div_i32 + +Divides two 32-bit numbers; considering them signed. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv Wd, Wn, Wm +``` + +### div_i64 + +Divides two 64-bit numbers; considering them signed. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv Xd, Xn, Xm +``` + +### divu_i32 + +Divides two 32-bit numbers; considering them unsigned. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv Wd, Wn, Wm +``` + +### divu_i64 + +Divides two 32-bit numbers; considering them unsigned. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv Xd, Xn, Xm +``` + +### rem_i32 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them signed. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv w27, Wn, Wm +msub Wd, w27, Wm, Wn +``` + +### rem_i64 + +Computes the division remainder (modulus) of two 64-bit numbers; considering them signed. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv x27, Xn, Xm +msub Xd, x27, Xm, Xn +``` + +### remu_i32 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv w27, Wn, Wm +msub Wd, w27, Wm, Wn +``` + +### remu_i64 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned. 
+ +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv x27, Xn, Xm +msub Xd, x27, Xm, Xn +``` + +### not_i32 + +Logically inverts a 32-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +mvn Wd, Wn +``` + +### not_i64 + +Logically inverts a 64-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +mvn Xd, Xn +``` + +### neg_i32 + +Arithmetically inverts (two's compliment) a 32-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +neg Wd, Wn +``` + +### neg_i64 + +Arithmetically inverts (two's compliment) a 64-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +neg Xd, Xn +``` + +### and_i32 + +Logically ANDs two 32-bit numbers. + +**IR Format**: `and Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +and Wd, Wn, Wm +``` + +### and_i64 + +Logically ANDs two 64-bit numbers. + +**IR Format**: `and Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +and Xd, Xn, Xm +``` + +### or_i32 + +Logically ORs two 32-bit numbers. + +**IR Format**: `or Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +or Wd, Wn, Wm +``` + +### or_i64 + +Logically ORs two 64-bit numbers. + +**IR Format**: `or Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +or Xd, Xn, Xm +``` + +### xor_i32 + +Logically XORs two 32-bit numbers. + +**IR Format**: `xor Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +eor Wd, Wn, Wm +``` + +### xor_i64 + +Logically XORs two 64-bit numbers. + +**IR Format**: `xor Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +eor Xd, Xn, Xm +``` + +### shl_i32 + +Logically shifts a 32-bit number left. 
+ +**IR Format**: `shl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsl Wd, Wn, Wm +``` + +### shl_i64 + +Logically shifts a 64-bit number left. + +**IR Format**: `shl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsl Xd, Xn, Xm +``` + +### shr_i32 + +Logically shifts a 32-bit number right. + +**IR Format**: `shr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsr Wd, Wn, Wm +``` + +### shr_i64 + +Logically shifts a 64-bit number right. + +**IR Format**: `shr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsr Xd, Xn, Xm +``` + +### sar_i32 + +Arithmetically shifts a 32-bit number right. + +**IR Format**: `sar Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +asr Wd, Wn, Wm +``` + +### sar_i64 + +Arithmetically shifts a 64-bit number right. + +**IR Format**: `sar Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +asr Xd, Xn, Xm +``` + +### rotl_i32 + +Rotates a 32-bit number left. + +**IR Format**: `rotl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +rol Wd, Wn, Wm +``` + +### rotl_i64 + +Rotates a 64-bit number left. + +**IR Format**: `rotl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +rol Xd, Xn, Xm +``` + +### rotr_i32 + +Rotates a 32-bit number right. + +**IR Format**: `rotr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +ror Wd, Wn, Wm +``` + +### rotr_i64 + +Rotates a 64-bit number right. + +**IR Format**: `rotr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +ror Xd, Xn, Xm +``` + +### deposit_i32 + +Optional; not currently implementing. + +### deposit_i64 + +Optional; not currently implementing. + +### ext8s_i32 + +Sign extends the lower 8b of a register into a 32b destination. 
+ +**IR Format**: `ext8s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtb Wd, Wn +``` + +### ext8s_i64 + +Sign extends the lower 8b of a register into a 64b destination. + +**IR Format**: `ext8s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtb Xd, Wn +``` + +### ext8u_i32 + +Zero extends the lower 8b of a register into a 32b destination. + +**IR Format**: `ext8u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xff +``` + +### ext8u_i64 + +Zero extends the lower 8b of a register into a 64b destination. + +**IR Format**: `ext8u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xff +``` + +### ext16s_i32 + +Sign extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxth Xd, Wn +``` + +### ext16s_i64 + +Sign extends the lower 16b of a register into a 64b destination. + +**IR Format**: `ext16s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxth Xd, Wn +``` + +### ext16u_i32 + +Zero extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Wd, Wn, #0xffff +``` + +### ext16u_i64 + +Zero extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Wd, Wn, #0xffff +``` + +### ext32s_i64 + +Sign extends the lower 32b of a register into a 64b destination. + +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### ext32u_i64 + +Zero extends the lower 32b of a register into a 64b destination. + +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### ext_i32_i64 + +Sign extends the lower 32b of a register into a 64b destination. 
+ +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### extu_i32_i64 + +Zero extends the lower 32b of a register into a 32b destination. + +**IR Format**: `ext32u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xffffffff +``` + +### bswap16_i32 + +Byte-swaps a 16b quantity. + +**IR Format**: `bswap16 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev w27, Wn +lsr Wd, w27, #16 +``` + +### bswap16_i64 + +Byte-swaps a 16b quantity. + +**IR Format**: `bswap16 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev w27, Wn +lsr Wd, w27, #16 +``` + +### bswap32_i32 + +Byte-swaps a 32b quantity. + +**IR Format**: `bswap32 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Wd, Wn +``` + +### bswap32_i64 + +Byte-swaps a 32b quantity. + +**IR Format**: `bswap32 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Wd, Wn +``` + +### bswap64_i64 + +Byte-swaps a 64b quantity. + +**IR Format**: `bswap64 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Xd, Xn +``` + +### exit_tb + +Exits the translation block. Has no gadget; but instead inserts the address of the translation block epilogue. + + +### mb + +Memory barrier. + +**IR Format**: `mb ` +**Gadget type:** gadget per type + +```asm +# !!! TODO +``` + +#### Note + +We still need to look up out how to map QEMU MB types map to AArch64 ones. This might take nuance. diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h new file mode 100644 index 000000000000..f51b7bcb13e7 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target-con-set.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* + * TCI target-specific constraint sets. + * Copyright (c) 2021 Linaro + */ + +/* + * C_On_Im(...) defines a constraint set with outputs and inputs. 
+ * Each operand should be a sequence of constraint letters as defined by + * tcg-target-con-str.h; the constraint combination is inclusive or. + */ +C_O0_I2(r, r) +C_O0_I3(r, r, r) +C_O0_I4(r, r, r, r) +C_O1_I1(r, r) +C_O1_I2(r, 0, r) +C_O1_I2(r, r, r) +C_O1_I4(r, r, r, r, r) +C_O2_I1(r, r, r) +C_O2_I2(r, r, r, r) +C_O2_I4(r, r, r, r, r, r) diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h new file mode 100644 index 000000000000..87c0f19e9c2e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target-con-str.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Define TCI target-specific operand constraints. + * Copyright (c) 2021 Linaro + */ + +/* + * Define constraint letters for register sets: + * REGS(letter, register_mask) + */ +REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc new file mode 100644 index 000000000000..d7bb67a92140 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -0,0 +1,1347 @@ +/* + * Tiny Code Threaded Intepreter for QEMU + * + * Copyright (c) 2021 Kate Temkin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 + +// Grab our gadget definitions. +// FIXME: use the system path instead of hardcoding this? +#include "tcti-gadgets.c.inc" + +/* Marker for missing code. */ +#define TODO() \ + do { \ + fprintf(stderr, "TODO %s:%u: %s()\n", \ + __FILE__, __LINE__, __func__); \ + tcg_abort(); \ + } while (0) + + +/* Enable TCTI assertions only when debugging TCG (and without NDEBUG defined). + * Without assertions, the interpreter runs much faster. */ +#if defined(CONFIG_DEBUG_TCG) +# define tcti_assert(cond) assert(cond) +#else +# define tcti_assert(cond) ((void)0) +#endif + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. 
+ */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) +{ + switch (op) { + case INDEX_op_ld8u_i32: + case INDEX_op_ld8s_i32: + case INDEX_op_ld16u_i32: + case INDEX_op_ld16s_i32: + case INDEX_op_ld_i32: + case INDEX_op_ld8u_i64: + case INDEX_op_ld8s_i64: + case INDEX_op_ld16u_i64: + case INDEX_op_ld16s_i64: + case INDEX_op_ld32u_i64: + case INDEX_op_ld32s_i64: + case INDEX_op_ld_i64: + case INDEX_op_not_i32: + case INDEX_op_not_i64: + case INDEX_op_neg_i32: + case INDEX_op_neg_i64: + case INDEX_op_ext8s_i32: + case INDEX_op_ext8s_i64: + case INDEX_op_ext16s_i32: + case INDEX_op_ext16s_i64: + case INDEX_op_ext8u_i32: + case INDEX_op_ext8u_i64: + case INDEX_op_ext16u_i32: + case INDEX_op_ext16u_i64: + case INDEX_op_ext32s_i64: + case INDEX_op_ext32u_i64: + case INDEX_op_ext_i32_i64: + case INDEX_op_extu_i32_i64: + case INDEX_op_bswap16_i32: + case INDEX_op_bswap16_i64: + case INDEX_op_bswap32_i32: + case INDEX_op_bswap32_i64: + case INDEX_op_bswap64_i64: + return C_O1_I1(r, r); + + case INDEX_op_st8_i32: + case INDEX_op_st16_i32: + case INDEX_op_st_i32: + case INDEX_op_st8_i64: + case INDEX_op_st16_i64: + case 
INDEX_op_st32_i64: + case INDEX_op_st_i64: + return C_O0_I2(r, r); + + case INDEX_op_div_i32: + case INDEX_op_div_i64: + case INDEX_op_divu_i32: + case INDEX_op_divu_i64: + case INDEX_op_rem_i32: + case INDEX_op_rem_i64: + case INDEX_op_remu_i32: + case INDEX_op_remu_i64: + case INDEX_op_add_i32: + case INDEX_op_add_i64: + case INDEX_op_sub_i32: + case INDEX_op_sub_i64: + case INDEX_op_mul_i32: + case INDEX_op_mul_i64: + case INDEX_op_and_i32: + case INDEX_op_and_i64: + case INDEX_op_andc_i32: + case INDEX_op_andc_i64: + case INDEX_op_eqv_i32: + case INDEX_op_eqv_i64: + case INDEX_op_nand_i32: + case INDEX_op_nand_i64: + case INDEX_op_nor_i32: + case INDEX_op_nor_i64: + case INDEX_op_or_i32: + case INDEX_op_or_i64: + case INDEX_op_orc_i32: + case INDEX_op_orc_i64: + case INDEX_op_xor_i32: + case INDEX_op_xor_i64: + case INDEX_op_shl_i32: + case INDEX_op_shl_i64: + case INDEX_op_shr_i32: + case INDEX_op_shr_i64: + case INDEX_op_sar_i32: + case INDEX_op_sar_i64: + case INDEX_op_rotl_i32: + case INDEX_op_rotl_i64: + case INDEX_op_rotr_i32: + case INDEX_op_rotr_i64: + case INDEX_op_setcond_i32: + case INDEX_op_setcond_i64: + return C_O1_I2(r, r, r); + + case INDEX_op_brcond_i32: + case INDEX_op_brcond_i64: + return C_O0_I2(r, r); + + case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_i64: + return C_O1_I2(r, r, r); + case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_i64: + return C_O0_I3(r, r, r); + + default: + g_assert_not_reached(); + } +} + +static const int tcg_target_reg_alloc_order[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + /* + TCG_REG_R14, // AREG0 + TCG_REG_R15, // SP + */ +}; + +#if MAX_OPC_PARAM_IARGS != 6 +# error Fix needed, number of supported input arguments changed! 
+#endif + +static const int tcg_target_call_iarg_regs[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, +}; + +static const int tcg_target_call_oarg_regs[] = { + TCG_REG_R0, +}; + +#ifdef CONFIG_DEBUG_TCG +static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { + "r00", + "r01", + "r02", + "r03", + "r04", + "r05", + "r06", + "r07", + "r08", + "r09", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15", +}; +#endif + +static bool patch_reloc(tcg_insn_unit *code_ptr, int type, + intptr_t value, intptr_t addend) +{ + /* tcg_out_reloc always uses the same type, addend. */ + tcg_debug_assert(type == sizeof(tcg_target_long)); + tcg_debug_assert(addend == 0); + tcg_debug_assert(value != 0); + if (TCG_TARGET_REG_BITS == 32) { + tcg_patch32(code_ptr, value); + } else { + tcg_patch64(code_ptr, value); + } + return true; +} + +#if defined(CONFIG_DEBUG_TCG_INTERPRETER) +/* Show current bytecode. Used by tcg interpreter. */ +void tci_disas(uint8_t opc) +{ + const TCGOpDef *def = &tcg_op_defs[opc]; + fprintf(stderr, "TCG %s %u, %u, %u\n", + def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs); +} +#endif + +/* Write value (native size). */ +static void tcg_out_immediate(TCGContext *s, tcg_target_ulong v) +{ + if (TCG_TARGET_REG_BITS == 32) { + //tcg_out32(s, v); + tcg_out64(s, v); + } else { + tcg_out64(s, v); + } +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, uintptr_t jmp_rw, uintptr_t addr) +{ + /* Get a pointer to our immediate, which exists after a single pointer. */ + uintptr_t immediate_addr = jmp_rw; + + /* Patch it to be match our target address. */ + qatomic_set((uint64_t *)immediate_addr, addr); +} + + +/** + * TCTI Thunk Helpers + */ + +#ifdef CONFIG_SOFTMMU + +// TODO: relocate these prototypes? 
+tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); + +tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int8_t)helper_ret_ldub_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_le_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_le_ldul_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_be_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr); +} + +#else +#error TCTI currently only supports use of the soft MMU. +#endif + + +/** + * TCTI Emmiter Helpers + */ + + +/* Write gadget pointer. */ +static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +{ + tcg_out_immediate(s, (tcg_target_ulong)gadget); +} + +/* Write gadget pointer, plus 64b immediate. 
*/ +static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +{ + tcg_out_nullary_gadget(s, gadget); + tcg_out64(s, immediate); +} + + +/* Write gadget pointer (one register). */ +static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0]); +} + + +/* Write gadget pointer (two registers). */ +static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); +} + + +/* Write gadget pointer (three registers). */ +static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); +} + + +/** + * Version of our LDST generator that defers to more optimized gadgets selectively. + */ +static void tcg_out_ldst_gadget_inner(TCGContext *s, + void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + unsigned reg0, unsigned reg1, uint32_t offset) +{ + int64_t extended_offset = (int32_t)offset; + bool is_negative = (extended_offset < 0); + + // Optimal case: we have a gadget that handles our specific offset, so we don't need to encode + // an immediate. This saves us a bunch of speed. :) + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. 
+ if (!is_negative) + { + uint64_t shifted_offset = (extended_offset >> 3); + bool aligned_to_8B = ((extended_offset & 0b111) == 0); + + bool have_optimized_gadget = (extended_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + bool have_shifted_gadget = (shifted_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + + // More optimal case: we have a gadget that directly encodes the argument. + if (have_optimized_gadget) { + tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + return; + } + + // Special case: it's frequent to have low-numbered positive offsets that are aligned + // to 16B boundaries + else if(aligned_to_8B && have_shifted_gadget) { + tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + return; + } + } + else { + uint64_t negated_offset = -(extended_offset); + + // More optimal case: we have a gadget that directly encodes the argument. + if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { + tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + return; + } + } + + // Less optimal case: we don't have a gadget specifically for this. Emit the general case immediate. + tcg_out_binary_gadget(s, gadget_base, reg0, reg1); + tcg_out64(s, extended_offset); //tcg_out32(s, offset); +} + +/* Shorthand for the above, that prevents us from having to specify the name three times. */ +#define tcg_out_ldst_gadget(s, name, a, b, c) \ + tcg_out_ldst_gadget_inner(s, name, \ + name ## _imm, \ + name ## _sh8_imm, \ + name ## _neg_imm, \ + a, b, c) + + + +/* Write label. */ +static void tcti_out_label(TCGContext *s, TCGLabel *label) +{ + if (label->has_value) { + tcg_out64(s, label->u.value); + tcg_debug_assert(label->u.value); + } else { + tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), label, 0); + s->code_ptr += sizeof(tcg_target_ulong); + } +} + +/** + * Generate a register-to-register MOV. 
+ */ +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) +{ + tcg_debug_assert(ret != arg); + + if (type == TCG_TYPE_I32) { + tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); + } else { + tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + } + + + return true; +} + + +static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + bool is_negative = (arg < 0); + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + return; + } + } + else { + + } + + // Emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i32, t0); + tcg_out64(s, arg); // TODO: make 32b? +} + + +static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + uint8_t is_negative = arg < 0; + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + return; + } + } + else { + + } + + // TODO: optimize the negative case, too? + + // Less optimal case: emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i64, t0); + tcg_out64(s, arg); +} + + +/** + * Generate an immediate-to-register MOV. + */ +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long arg) +{ + if (type == TCG_TYPE_I32) { + tcg_out_movi_i32(s, t0, arg); + } else { + tcg_out_movi_i64(s, t0, arg); + } +} + +/** + * Generate a CALL. 
 */
static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
{
    // The call gadget reads its 64-bit target from the bytecode stream.
    tcg_out_nullary_gadget(s, gadget_call);
    tcg_out64(s, (uintptr_t)arg);
}

/**
 * Generates LD instructions.
 */
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
                       intptr_t arg2)
{

    if (type == TCG_TYPE_I32) {
        tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2);
    } else {
        tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2);
    }
}


/**
 * Generate every other operation.
 *
 * Translates one TCG opcode into its gadget(s), plus any inline immediates
 * the gadget expects to find in the bytecode stream after itself.
 */
//static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args)
void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args)
{
    switch (opc) {

    // Exit translation, and return back to QEMU.
    case INDEX_op_exit_tb:
        // Emit a simple gadget with a known return code.
        tcg_out_imm64_gadget(s, gadget_exit_tb, args[0]);
        break;

    // Jump to a translation block.
    case INDEX_op_goto_tb:

        // If we're using a direct jump, we'll emit a "relocation" that can be used
        // to patch our gadget stream with the target address, later.
        if (s->tb_jmp_insn_offset) {
            // Emit our gadget.
            tcg_out_nullary_gadget(s, gadget_br);

            // Place our current instruction into our "relocation table", so it can
            // be patched once we know where the branch will target...
            s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);

            // ... and emit our relocation.
            tcg_out64(s, args[0]);


        } else {
            /* Indirect jump method. */
            TODO();
        }
        set_jmp_reset_offset(s, args[0]);
        break;

    // Simple branch.
    case INDEX_op_br:
        tcg_out_nullary_gadget(s, gadget_br);
        tcti_out_label(s, arg_label(args[0]));
        break;


    // Set condition flag.
    // a0 = Rd, a1 = Rn, a2 = Rm
    case INDEX_op_setcond_i32:
    {
        void *gadget;

        // We have to emit a different gadget per condition; we'll select which.
        switch(args[3]) {
        case TCG_COND_EQ:  gadget = gadget_setcond_i32_eq; break;
        case TCG_COND_NE:  gadget = gadget_setcond_i32_ne; break;
        case TCG_COND_LT:  gadget = gadget_setcond_i32_lt; break;
        case TCG_COND_GE:  gadget = gadget_setcond_i32_ge; break;
        case TCG_COND_LE:  gadget = gadget_setcond_i32_le; break;
        case TCG_COND_GT:  gadget = gadget_setcond_i32_gt; break;
        case TCG_COND_LTU: gadget = gadget_setcond_i32_lo; break;
        case TCG_COND_GEU: gadget = gadget_setcond_i32_hs; break;
        case TCG_COND_LEU: gadget = gadget_setcond_i32_ls; break;
        case TCG_COND_GTU: gadget = gadget_setcond_i32_hi; break;
        default:
            g_assert_not_reached();
        }

        tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]);
        break;
    }

    case INDEX_op_setcond_i64:
    {
        void *gadget;

        // We have to emit a different gadget per condition; we'll select which.
        switch(args[3]) {
        case TCG_COND_EQ:  gadget = gadget_setcond_i64_eq; break;
        case TCG_COND_NE:  gadget = gadget_setcond_i64_ne; break;
        case TCG_COND_LT:  gadget = gadget_setcond_i64_lt; break;
        case TCG_COND_GE:  gadget = gadget_setcond_i64_ge; break;
        case TCG_COND_LE:  gadget = gadget_setcond_i64_le; break;
        case TCG_COND_GT:  gadget = gadget_setcond_i64_gt; break;
        case TCG_COND_LTU: gadget = gadget_setcond_i64_lo; break;
        case TCG_COND_GEU: gadget = gadget_setcond_i64_hs; break;
        case TCG_COND_LEU: gadget = gadget_setcond_i64_ls; break;
        case TCG_COND_GTU: gadget = gadget_setcond_i64_hi; break;
        default:
            g_assert_not_reached();
        }

        tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]);
        break;
    }

    /**
     * Load instructions.
     */

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
        tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld8s_i32:
        tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld8s_i64:
        tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
        tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld16s_i32:
        tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld16s_i64:
        tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
        tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld_i64:
        tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_ld32s_i64:
        tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]);
        break;


    /**
     * Store instructions.
     */
    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]);
        break;

    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
        tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]);
        break;

    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_st_i64:
        tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]);
        break;

    /**
     * Arithmetic instructions.
     */

    case INDEX_op_add_i32:
        tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_sub_i32:
        tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_mul_i32:
        tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_and_i32:
        tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */
        tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i32). */
        tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_eqv_i32). */
        tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_or_i32:
        tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_xor_i32:
        tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_shl_i32:
        tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_shr_i32:
        tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_sar_i32:
        tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]);
        break;

    //case INDEX_op_rotr_i32:  /* Optional (TCG_TARGET_HAS_rot_i32). */
    //    tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]);
    //    break;

    //case INDEX_op_rotl_i32:  /* Optional (TCG_TARGET_HAS_rot_i32). */
    //    tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]);

    case INDEX_op_add_i64:
        tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_sub_i64:
        tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_mul_i64:
        tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_and_i64:
        tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */
        tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */
        tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */
        tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]);
        break;

    //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */
    //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */

    case INDEX_op_or_i64:
        tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_xor_i64:
        tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_shl_i64:
        tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_shr_i64:
        tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_sar_i64:
        tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]);
        break;

    //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */
    //    tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]);
    //    break;

    //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */
    //    tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]);
    //    break;

    case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */
        tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */
        tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */
        tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */
        tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]);
        break;

    case INDEX_op_brcond_i64:
    {
        static uint8_t last_brcond_i64 = 0;
        void *gadget;

        // We have to emit a different gadget per condition; we'll select which.
        switch(args[2]) {
        case TCG_COND_EQ:  gadget = gadget_brcond_i64_eq; break;
        case TCG_COND_NE:  gadget = gadget_brcond_i64_ne; break;
        case TCG_COND_LT:  gadget = gadget_brcond_i64_lt; break;
        case TCG_COND_GE:  gadget = gadget_brcond_i64_ge; break;
        case TCG_COND_LE:  gadget = gadget_brcond_i64_le; break;
        case TCG_COND_GT:  gadget = gadget_brcond_i64_gt; break;
        case TCG_COND_LTU: gadget = gadget_brcond_i64_lo; break;
        case TCG_COND_GEU: gadget = gadget_brcond_i64_hs; break;
        case TCG_COND_LEU: gadget = gadget_brcond_i64_ls; break;
        case TCG_COND_GTU: gadget = gadget_brcond_i64_hi; break;
        default:
            g_assert_not_reached();
        }

        // We'll select which branch gadget to use based on a cycling counter.
        // This means we'll pick one of 16 identical brconds. Spreading this out
        // helps the processor's branch prediction be less "squished", as not every
        // branch is going through the same instruction.
        tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]);
        last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS;

        // Branch target immediate.
        tcti_out_label(s, arg_label(args[3]));
        break;
    }


    case INDEX_op_bswap16_i32: /* Optional (TCG_TARGET_HAS_bswap16_i32). */
    case INDEX_op_bswap16_i64: /* Optional (TCG_TARGET_HAS_bswap16_i64). */
        tcg_out_binary_gadget(s, gadget_bswap16, args[0], args[1]);
        break;

    case INDEX_op_bswap32_i32: /* Optional (TCG_TARGET_HAS_bswap32_i32). */
    case INDEX_op_bswap32_i64: /* Optional (TCG_TARGET_HAS_bswap32_i64). */
        tcg_out_binary_gadget(s, gadget_bswap32, args[0], args[1]);
        break;

    case INDEX_op_bswap64_i64: /* Optional (TCG_TARGET_HAS_bswap64_i64). */
        tcg_out_binary_gadget(s, gadget_bswap64, args[0], args[1]);
        break;

    case INDEX_op_not_i64: /* Optional (TCG_TARGET_HAS_not_i64). */
        tcg_out_binary_gadget(s, gadget_not_i64, args[0], args[1]);
        break;

    case INDEX_op_neg_i64: /* Optional (TCG_TARGET_HAS_neg_i64). */
        tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]);
        break;

    case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */
        tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]);
        break;

    case INDEX_op_ext8u_i32: /* Optional (TCG_TARGET_HAS_ext8u_i32). */
    case INDEX_op_ext8u_i64: /* Optional (TCG_TARGET_HAS_ext8u_i64). */
        tcg_out_binary_gadget(s, gadget_ext8u, args[0], args[1]);
        break;

    case INDEX_op_ext16s_i64: /* Optional (TCG_TARGET_HAS_ext16s_i64). */
        tcg_out_binary_gadget(s, gadget_ext16s_i64, args[0], args[1]);
        break;

    case INDEX_op_ext16u_i32: /* Optional (TCG_TARGET_HAS_ext16u_i32). */
    case INDEX_op_ext16u_i64: /* Optional (TCG_TARGET_HAS_ext16u_i64). */
        tcg_out_binary_gadget(s, gadget_ext16u, args[0], args[1]);
        break;

    case INDEX_op_ext32s_i64: /* Optional (TCG_TARGET_HAS_ext32s_i64). */
    case INDEX_op_ext_i32_i64:
        tcg_out_binary_gadget(s, gadget_ext32s_i64, args[0], args[1]);
        break;

    case INDEX_op_ext32u_i64: /* Optional (TCG_TARGET_HAS_ext32u_i64). */
    case INDEX_op_extu_i32_i64:
        tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]);
        break;

    case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */
        tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]);
        break;

    case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */
        tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]);
        break;

    case INDEX_op_ext8s_i32: /* Optional (TCG_TARGET_HAS_ext8s_i32). */
        tcg_out_binary_gadget(s, gadget_ext8s_i32, args[0], args[1]);
        break;

    case INDEX_op_ext16s_i32: /* Optional (TCG_TARGET_HAS_ext16s_i32). */
        tcg_out_binary_gadget(s, gadget_ext16s_i32, args[0], args[1]);
        break;

    case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */
        tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */
        tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */
        tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */
        tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]);
        break;

    case INDEX_op_brcond_i32:
    {
        static uint8_t last_brcond_i32 = 0;
        void *gadget;

        // We have to emit a different gadget per condition; we'll select which.
        switch(args[2]) {
        case TCG_COND_EQ:  gadget = gadget_brcond_i32_eq; break;
        case TCG_COND_NE:  gadget = gadget_brcond_i32_ne; break;
        case TCG_COND_LT:  gadget = gadget_brcond_i32_lt; break;
        case TCG_COND_GE:  gadget = gadget_brcond_i32_ge; break;
        case TCG_COND_LE:  gadget = gadget_brcond_i32_le; break;
        case TCG_COND_GT:  gadget = gadget_brcond_i32_gt; break;
        case TCG_COND_LTU: gadget = gadget_brcond_i32_lo; break;
        case TCG_COND_GEU: gadget = gadget_brcond_i32_hs; break;
        case TCG_COND_LEU: gadget = gadget_brcond_i32_ls; break;
        case TCG_COND_GTU: gadget = gadget_brcond_i32_hi; break;
        default:
            g_assert_not_reached();
        }

        // We'll select which branch gadget to use based on a cycling counter.
        // This means we'll pick one of 16 identical brconds. Spreading this out
        // helps the processor's branch prediction be less "squished", as not every
        // branch is going through the same instruction.
        tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]);
        last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS;

        // Branch target immediate.
        tcti_out_label(s, arg_label(args[3]));

        break;
    }

    case INDEX_op_qemu_ld_i32:
    {
        MemOp opc = get_memop(args[2]);
        unsigned a_bits = get_alignment_bits(opc);
        unsigned s_bits = opc & MO_SIZE;

        void *gadget;

        // Select the gadget family matching this MMU's TLB offset within its structure.
        switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {
        case  -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32,  a_bits, s_bits); break;
        case  -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32,  a_bits, s_bits); break;
        case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break;
        default:   LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32);           break;
        }

        // Args:
        // - an immediate32 encodes our operation index
        tcg_out_binary_gadget(s, gadget, args[0], args[1]);
        tcg_out64(s, args[2]); // TODO: fix encoding to be 4b
        break;
    }

    case INDEX_op_qemu_ld_i64:
    {
        MemOp opc = get_memop(args[2]);
        unsigned a_bits = get_alignment_bits(opc);
        unsigned s_bits = opc & MO_SIZE;

        void *gadget;

        // Special optimization case: if we have an operation/target of 0x3A,
        // this is a common case. Delegate to our special-case handler.
        if (args[2] == 0x3a) {
            switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {

            case -64:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_ld_leq_aligned_mode3a_off64_i64 :
                    gadget_qemu_ld_leq_unaligned_mode3a_off64_i64;
                break;
            case -96:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_ld_leq_aligned_mode3a_off96_i64 :
                    gadget_qemu_ld_leq_unaligned_mode3a_off96_i64;
                break;
            case -128:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_ld_leq_aligned_mode3a_off128_i64 :
                    gadget_qemu_ld_leq_unaligned_mode3a_off128_i64;
                break;

            default:
                gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64;
                break;
            }
            tcg_out_binary_gadget(s, gadget, args[0], args[1]);
        }
        // Otherwise, handle the generic case.
        else {
            switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {
            case  -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64,  a_bits, s_bits); break;
            case  -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64,  a_bits, s_bits); break;
            case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break;
            default:   LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64);           break;
            }
            // Args:
            // - an immediate32 encodes our operation index
            tcg_out_binary_gadget(s, gadget, args[0], args[1]);
            tcg_out64(s, args[2]); // TODO: fix encoding to be 4b
        }

        break;
    }

    case INDEX_op_qemu_st_i32:
    {
        MemOp opc = get_memop(args[2]);
        unsigned a_bits = get_alignment_bits(opc);
        unsigned s_bits = opc & MO_SIZE;

        void *gadget;

        switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {
        case  -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32,  a_bits, s_bits); break;
        case  -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32,  a_bits, s_bits); break;
        case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break;
        default:   ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32);           break;
        }

        // Args:
        // - our gadget encodes the target and address registers
        // - an immediate32 encodes our operation index
        tcg_out_binary_gadget(s, gadget, args[0], args[1]);
        tcg_out64(s, args[2]); // FIXME: double encoded
        break;
    }

    case INDEX_op_qemu_st_i64:
    {
        MemOp opc = get_memop(args[2]);
        unsigned a_bits = get_alignment_bits(opc);
        unsigned s_bits = opc & MO_SIZE;

        void *gadget;

        // Special optimization case: if we have an operation/target of 0x3A,
        // this is a common case. Delegate to our special-case handler.
        if (args[2] == 0x3a) {
            switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {

            case -64:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_st_leq_aligned_mode3a_off64_i64 :
                    gadget_qemu_st_leq_unaligned_mode3a_off64_i64;
                break;
            case -96:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_st_leq_aligned_mode3a_off96_i64 :
                    gadget_qemu_st_leq_unaligned_mode3a_off96_i64;
                break;
            case -128:
                gadget = (a_bits >= s_bits) ?
                    gadget_qemu_st_leq_aligned_mode3a_off128_i64 :
                    gadget_qemu_st_leq_unaligned_mode3a_off128_i64;
                break;

            default:
                gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64;
                break;
            }
            tcg_out_binary_gadget(s, gadget, args[0], args[1]);
        }
        // Otherwise, handle the generic case.
        else {
            switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) {
            case  -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64,  a_bits, s_bits); break;
            case  -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64,  a_bits, s_bits); break;
            case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break;
            default:   ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64);           break;
            }

            // Args:
            // - our gadget encodes the target and address registers
            // - an immediate32 encodes our operation index
            tcg_out_binary_gadget(s, gadget, args[0], args[1]);
            tcg_out64(s, args[2]); // FIXME: double encoded
        }

        break;
    }

    // Memory barriers.
    case INDEX_op_mb:
    {
        // GNU range-designated table: default every combination to the full
        // barrier, then override the cheaper cases.
        // NOTE(review): TCG_MO_LD_ST maps to the load-barrier gadget here —
        // confirm that gadget_mb_ld orders loads against later stores.
        static void* sync[] = {
            [0 ... TCG_MO_ALL]            = gadget_mb_all,
            [TCG_MO_ST_ST]                = gadget_mb_st,
            [TCG_MO_LD_LD]                = gadget_mb_ld,
            [TCG_MO_LD_ST]                = gadget_mb_ld,
            [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld,
        };
        tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]);

        break;
    }

    case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
    case INDEX_op_mov_i64:
    case INDEX_op_call:    /* Always emitted via tcg_out_call. */
    default:
        tcg_abort();
    }
}

/**
 * Generate immediate stores.
+ */ +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, + intptr_t arg2) +{ + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + } +} + +static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, + TCGReg base, intptr_t ofs) +{ + return false; +} + +/* Test if a constant matches the constraint. */ +static int tcg_target_const_match(tcg_target_long val, TCGType type, + const TCGArgConstraint *arg_ct) +{ + /* No need to return 0 or 1, 0 or != 0 is good enough. */ + return arg_ct->ct & TCG_CT_CONST; +} + +static void tcg_target_init(TCGContext *s) +{ + /* The current code uses uint8_t for tcg operations. */ + tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); + + /* Registers available for 32 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; + /* Registers available for 64 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; + + /* TODO: Which registers should be set here? */ + tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + + s->reserved_regs = 0; + tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); + + /* We use negative offsets from "sp" so that we can distinguish + stores that might pretend to be call arguments. */ + tcg_set_frame(s, TCG_REG_CALL_STACK, -CPU_TEMP_BUF_NLONGS * sizeof(long), CPU_TEMP_BUF_NLONGS * sizeof(long)); +} + +/* Generate global QEMU prologue and epilogue code. */ +static inline void tcg_target_qemu_prologue(TCGContext *s) +{ + // No prologue; as we're interpreted. +} + + +/** + * TCTI 'interpreter' bootstrap. + */ + +// Store the current return address during helper calls. +__thread uintptr_t tcti_call_return_address; + +/* Dispatch the bytecode stream contained in our translation buffer. 
*/ +uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ptr) +{ + // Create our per-CPU temporary storage. + long tcg_temps[CPU_TEMP_BUF_NLONGS]; + + uint64_t return_value = 0; + uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS); + uintptr_t pc_mirror = (uintptr_t)&tcti_call_return_address; + + // Ensure our target configuration hasn't changed. + tcti_assert(TCG_AREG0 == TCG_REG_R14); + tcti_assert(TCG_REG_CALL_STACK == TCG_REG_R15); + + asm( + // Our threaded-dispatch prologue needs to set up things for our machine to run. + // This means: + // - Set up TCG_AREG0 (R14) to point to our architectural state. + // - Set up TCG_REG_CALL_STACK (R15) to point to our temporary buffer. + // - Point x28 (our bytecode "instruction pointer") to the relevant stream address. + "ldr x14, %[areg0]\n" + "ldr x15, %[sp_value]\n" + "ldr x25, %[pc_mirror]\n" + "ldr x28, %[start_tb_ptr]\n" + + // To start our code, we'll -call- the gadget at the first bytecode pointer. + // Note that we call/branch-with-link, here; so our TB_EXIT gadget can RET in order + // to return to this point when things are complete. + "ldr x27, [x28], #8\n" + "blr x27\n" + + // Finally, we'll copy out our final return value. + "str x0, %[return_value]\n" + + : [return_value] "=m" (return_value) + + : [areg0] "m" (env), + [sp_value] "m" (sp_value), + [start_tb_ptr] "m" (v_tb_ptr), + [pc_mirror] "m" (pc_mirror) + + // We touch _every_ one of the lower registers, as we use these to execute directly. + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + + // We also use x26/x27 for temporary values, and x28 as our bytecode poitner. + "x25", "x26", "x27", "x28", "cc", "memory" + ); + + return return_value; +} + + +/** + * Disassembly output support. + */ +#include + + +/* Disassemble TCI bytecode. 
*/ +int print_insn_tcti(bfd_vma addr, disassemble_info *info) +{ + Dl_info symbol_info = {}; + char symbol_name[48] ; + + int status; + uint64_t block; + + // Read the relevant pointer. + status = info->read_memory_func(addr, (void *)&block, sizeof(block), info); + if (status != 0) { + info->memory_error_func(status, addr, info); + return -1; + } + + // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. + dladdr((void *)block, &symbol_info); + + if(symbol_info.dli_sname != 0) { + strlcpy(symbol_name, symbol_info.dli_sname, 47); + info->fprintf_func(info->stream, "%s", symbol_name); + } else { + info->fprintf_func(info->stream, "%016llx", block); + } + + return sizeof(block); +} + + diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h new file mode 100644 index 000000000000..fa2ae5c40a3e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.h @@ -0,0 +1,220 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2009, 2011 Stefan Weil + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/*
 * This code implements a TCG which does not generate machine code for some
 * real target machine but which generates virtual machine code for an
 * interpreter. Interpreted pseudo code is slow, but it works on any host.
 *
 * Some remarks might help in understanding the code:
 *
 * "target" or "TCG target" is the machine which runs the generated code.
 * This is different to the usual meaning in QEMU where "target" is the
 * emulated machine. So normally QEMU host is identical to TCG target.
 * Here the TCG target is a virtual machine, but this virtual machine must
 * use the same word size like the real machine.
 * Therefore, we need both 32 and 64 bit virtual machines (interpreter).
 */

#ifndef TCG_TARGET_H
#define TCG_TARGET_H

#if UINTPTR_MAX == UINT32_MAX
# error We only support AArch64 running in 64B mode.
#elif UINTPTR_MAX == UINT64_MAX
# define TCG_TARGET_REG_BITS 64
#else
# error Unknown pointer size for tcti target
#endif

#define TCG_TARGET_INSN_UNIT_SIZE 1
#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32

// We're an interpreted target; even if we're JIT-compiling to our interpreter's
// pseudo-native bytecode. We'll indicate that we're interpreted.
#define TCG_TARGET_INTERPRETER 1

//
// Supported optional instructions.
//

// Divs.
#define TCG_TARGET_HAS_div_i32          1
#define TCG_TARGET_HAS_rem_i32          1
#define TCG_TARGET_HAS_div_i64          1
#define TCG_TARGET_HAS_rem_i64          1

// Extends.
#define TCG_TARGET_HAS_ext8s_i32        1
#define TCG_TARGET_HAS_ext16s_i32       1
#define TCG_TARGET_HAS_ext8u_i32        1
#define TCG_TARGET_HAS_ext16u_i32       1
#define TCG_TARGET_HAS_ext8s_i64        1
#define TCG_TARGET_HAS_ext16s_i64       1
#define TCG_TARGET_HAS_ext32s_i64       1
#define TCG_TARGET_HAS_ext8u_i64        1
#define TCG_TARGET_HAS_ext16u_i64       1
#define TCG_TARGET_HAS_ext32u_i64       1

// Logicals.
#define TCG_TARGET_HAS_neg_i32          1
#define TCG_TARGET_HAS_not_i32          1
#define TCG_TARGET_HAS_neg_i64          1
#define TCG_TARGET_HAS_not_i64          1

#define TCG_TARGET_HAS_andc_i32         1
#define TCG_TARGET_HAS_orc_i32          1
#define TCG_TARGET_HAS_eqv_i32          1
#define TCG_TARGET_HAS_andc_i64         1
#define TCG_TARGET_HAS_eqv_i64          1
#define TCG_TARGET_HAS_orc_i64          1

// We don't currently support rotates, since AArch64 lacks ROL.
// We'll fix this later.
#define TCG_TARGET_HAS_rot_i32          0
#define TCG_TARGET_HAS_rot_i64          0

// Swaps.
#define TCG_TARGET_HAS_bswap16_i32      1
#define TCG_TARGET_HAS_bswap32_i32      1
#define TCG_TARGET_HAS_bswap16_i64      1
#define TCG_TARGET_HAS_bswap32_i64      1
#define TCG_TARGET_HAS_bswap64_i64      1
#define TCG_TARGET_HAS_MEMORY_BSWAP     1

// Specify we'll handle direct jumps.
#define TCG_TARGET_HAS_direct_jump      1

//
// Potential TODOs.
//

// TODO: implement DEPOSIT as BFI.
#define TCG_TARGET_HAS_deposit_i32      0
#define TCG_TARGET_HAS_deposit_i64      0

// TODO: implement EXTRACT as BFX.
#define TCG_TARGET_HAS_extract_i32      0
#define TCG_TARGET_HAS_sextract_i32     0
#define TCG_TARGET_HAS_extract_i64      0
#define TCG_TARGET_HAS_sextract_i64     0

// TODO: it might be worth writing a gadget for this
#define TCG_TARGET_HAS_movcond_i32      0
#define TCG_TARGET_HAS_movcond_i64      0

//
// Unsupported instructions.
//

// ARMv8 doesn't have instructions for NAND/NOR.
#define TCG_TARGET_HAS_nand_i32         0
#define TCG_TARGET_HAS_nor_i32          0
#define TCG_TARGET_HAS_nor_i64          0
#define TCG_TARGET_HAS_nand_i64         0

// AArch64's CLZ is implemented without a condition operand, so it doesn't map
// directly onto TCG's conditional clz/ctz — left unimplemented for now.
#define TCG_TARGET_HAS_clz_i32          0
#define TCG_TARGET_HAS_ctz_i32          0
#define TCG_TARGET_HAS_ctpop_i32        0
#define TCG_TARGET_HAS_clz_i64          0
#define TCG_TARGET_HAS_ctz_i64          0
#define TCG_TARGET_HAS_ctpop_i64        0


// GOTO_PTR is too complex to emit a simple gadget for.
// We'll let C handle it, since the overhead is similar.
#define TCG_TARGET_HAS_goto_ptr         0

// We don't have a simple gadget for this, since we're always assuming softmmu.
#define TCG_TARGET_HAS_qemu_st8_i32     0

// No AArch64 equivalent.
#define TCG_TARGET_HAS_extrl_i64_i32    0
#define TCG_TARGET_HAS_extrh_i64_i32    0

#define TCG_TARGET_HAS_extract2_i64     0

// These should always be zero on our 64-bit platform.
#define TCG_TARGET_HAS_muls2_i64        0
#define TCG_TARGET_HAS_add2_i32         0
#define TCG_TARGET_HAS_sub2_i32         0
#define TCG_TARGET_HAS_mulu2_i32        0
#define TCG_TARGET_HAS_add2_i64         0
#define TCG_TARGET_HAS_sub2_i64         0
#define TCG_TARGET_HAS_mulu2_i64        0
#define TCG_TARGET_HAS_muluh_i64        0
#define TCG_TARGET_HAS_mulsh_i64        0
#define TCG_TARGET_HAS_extract2_i32     0
#define TCG_TARGET_HAS_muls2_i32        0
#define TCG_TARGET_HAS_muluh_i32        0
#define TCG_TARGET_HAS_mulsh_i32        0

//
// Platform metadata.
//

// Number of registers available.
// It might make sense to up these, since we can also use x16 -> x25?
#define TCG_TARGET_NB_REGS 16

/* List of registers which are used by TCG. */
typedef enum {
    TCG_REG_R0 = 0,
    TCG_REG_R1,
    TCG_REG_R2,
    TCG_REG_R3,
    TCG_REG_R4,
    TCG_REG_R5,
    TCG_REG_R6,
    TCG_REG_R7,
    TCG_REG_R8,
    TCG_REG_R9,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,

    // Aliases: R14 holds the architectural-state pointer; R15 is our
    // virtual stack pointer.
    TCG_AREG0          = TCG_REG_R14,
    TCG_REG_CALL_STACK = TCG_REG_R15,
} TCGReg;

// Specify the shape of the stack our runtime will use.
#define TCG_TARGET_CALL_STACK_OFFSET    0
#define TCG_TARGET_STACK_ALIGN          16

// We're interpreted, so we'll use our own code to run TB_EXEC.
#define HAVE_TCG_QEMU_TB_EXEC

// We'll need to enforce memory ordering with barriers.
#define TCG_TARGET_DEFAULT_MO  (0)

void tci_disas(uint8_t opc);

void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t);


#endif /* TCG_TARGET_H */
diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py
new file mode 100755
index 000000000000..1296f6d0c2d7
--- /dev/null
+++ b/tcg/aarch64-tcti/tcti-gadget-gen.py
@@ -0,0 +1,788 @@
#!/usr/bin/env python3
""" Gadget-code generator for QEMU TCTI on AArch64.

Generates a C-code include file containing 'gadgets' for use by TCTI.
"""

import sys
import itertools

# Get a handle on the file we'll be working with, and redirect print to it.
if len(sys.argv) > 1:
    out_file = open(sys.argv[1], "w")

    # Hook our print function, so it always outputs to the relevant file.
    # NOTE: this shadows the builtin `print` at module scope; every helper
    # below writes through this hook when an output file was given.
    core_print = print
    print = lambda *a, **k : core_print(*a, **k, file=out_file)

# Epilogue code follows at the end of each gadget, and handles continuing execution.
EPILOGUE = (
    # Load our next gadget address from our bytecode stream, advancing it.
    "ldr x27, [x28], #8",

    # Jump to the next gadget.
    "br x27"
)

# The number of general-purpose registers we're affording the TCG. This must match
# the configuration in the TCTI target (TCG_TARGET_NB_REGS in tcg-target.h).
TCG_REGISTER_COUNT   = 16
TCG_REGISTER_NUMBERS = list(range(TCG_REGISTER_COUNT))

# Helper that provides each of the AArch64 condition codes of interest.
ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"]

# We'll create a variety of gadgets that assume the MMU's TLB is stored at certain
# offsets into its structure. These should match the offsets in tcg-target.c.in.
QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ]

# Statistics.
+gadgets = 0 +instructions = 0 + +def simple(name, *lines): + """ Generates a simple gadget that needs no per-register specialization. """ + + global gadgets, instructions + + gadgets += 1 + + # Create our C/ASM framing. + #print(f"__attribute__((naked)) static void gadget_{name}(void)") + print(f"__attribute__((naked)) void gadget_{name}(void);") + print(f"__attribute__((naked)) void gadget_{name}(void)") + print("{") + + # Add the core gadget + print("\tasm(") + for line in lines + EPILOGUE: + print(f"\t\t\"{line} \\n\"") + instructions += 1 + print("\t);") + + # End our framing. + print("}\n") + + +def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): + """ Generates a collection of gadgtes with register substitutions. """ + + def substitutions_for_letter(letter, number, line): + """ Helper that transforms Wd => w1, implementing gadget substitutions. """ + + # Register substitutions... + line = line.replace(f"X{letter}", f"x{number}") + line = line.replace(f"W{letter}", f"w{number}") + + # ... immediate substitutions. + line = line.replace(f"I{letter}", f"{number}") + return line + + + # Build a list of all the various stages we'll iterate over... + immediate_parameters = list(immediate_range) + parameters = ([TCG_REGISTER_NUMBERS] * len(substitutions)) + + # ... adding immediates, if need be. + if immediate_parameters: + parameters.append(immediate_parameters) + substitutions = substitutions + ['i'] + + # Generate a list of register-combinations we'll support. + permutations = itertools.product(*parameters) + + # For each permutation... + for permutation in permutations: + new_lines = lines + + # Replace each placeholder element with its proper value... + for index, element in enumerate(permutation): + letter = substitutions[index] + number = element + + # Create new gadgets for the releavnt line... + new_lines = [substitutions_for_letter(letter, number, line) for line in new_lines] + + # ... and emit the gadget. 
+ permutation_id = "_arg".join(str(number) for number in permutation) + simple(f"{name}_arg{permutation_id}", *new_lines) + + +def with_dnm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ + with_register_substitutions(name, ("d", "n", "m"), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for m in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_dn_immediate(name, *lines, immediate_range): + """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ + with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for d in TCG_REGISTER_NUMBERS: + print("\t{") + + # N array + for n in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for i in immediate_range: + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + + print("},") + print("\t},") + print("};") + + +def with_pair(name, substitutions, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, substitutions, *lines) + + # Print out an array that contains all of our gadgets, for lookup. 
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # N array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for b in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + +def math_dnm(name, mnemonic): + """ Equivalent to `with_dnm`, but creates a _i32 and _i64 variant. For simple math. """ + with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") + with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") + +def math_dn(name, mnemonic): + """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. """ + with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + + +def with_nm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xn, and Xm, and equivalents. """ + with_pair(name, ('n', 'm',), *lines) + + +def with_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. """ + with_pair(name, ('d', 'n',), *lines) + + +def ldst_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. + + This variant is optimized for loads and stores, and optimizes common offset cases. + """ + + # + # Simple case: create our gadgets. + # + with_dn(name, "ldr x27, [x28], #8", *lines) + + # + # Optimization case: create variants of our gadgets with our offsets replaced with common immediates. 
+ # + immediate_lines_pos = [line.replace("x27", "#Ii") for line in lines] + with_dn_immediate(f"{name}_imm", *immediate_lines_pos, immediate_range=range(64)) + + immediate_lines_aligned = [line.replace("x27", "#(Ii << 3)") for line in lines] + with_dn_immediate(f"{name}_sh8_imm", *immediate_lines_aligned, immediate_range=range(64)) + + immediate_lines_neg = [line.replace("x27", "#-Ii") for line in lines] + with_dn_immediate(f"{name}_neg_imm", *immediate_lines_neg, immediate_range=range(64)) + + +def with_single(name, substitution, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, (substitution,), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + for n in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{n}", end=", ") + + print("};") + + +def with_d_immediate(name, *lines, immediate_range=range(0)): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # I array + for b in immediate_range: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + + +def with_d(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd. """ + with_single(name, 'd', *lines) + + +# Assembly code for saving our machine state before entering the C runtime. +C_CALL_PROLOGUE = [ + # Store our machine state. 
+ "str x25, [sp, #-16]!", + "stp x14, x15, [sp, #-16]!", + "stp x12, x13, [sp, #-16]!", + "stp x10, x11, [sp, #-16]!", + "stp x8, x9, [sp, #-16]!", + "stp x6, x7, [sp, #-16]!", + "stp x4, x5, [sp, #-16]!", + "stp x2, x3, [sp, #-16]!", + "stp x0, x1, [sp, #-16]!", + "stp x28, lr, [sp, #-16]!", +] + +# Assembly code for restoring our machine state after leaving the C runtime. +C_CALL_EPILOGUE = [ + "ldp x28, lr, [sp], #16", + "ldp x0, x1, [sp], #16", + "ldp x2, x3, [sp], #16", + "ldp x4, x5, [sp], #16", + "ldp x6, x7, [sp], #16", + "ldp x8, x9, [sp], #16", + "ldp x10, x11, [sp], #16", + "ldp x12, x13, [sp], #16", + "ldp x14, x15, [sp], #16", + "ldr x25, [sp], #16", +] + + +def create_tlb_fastpath(is_aligned, is_write, offset, miss_label="0"): + """ Creates a set of instructions that perform a soft-MMU TLB lookup. + + This is used for `qemu_ld`/qemu_st` instructions; to emit a prologue that + hopefully helps us skip a slow call into the C runtime when a Guest Virtual + -> Host Virtual mapping is in the softmmu's TLB. + + This "fast-path" prelude behaves as follows: + - If a TLB entry is found for the address stored in Xn, then x27 + is stored to an "addend" that can be added to the guest virtual addres + to get the host virtual address (the address in our local memory space). + - If a TLB entry isn't found, it branches to the "miss_label" (by default, 0:), + so address lookup can be handled by the fastpath. + + Clobbers x24, and x26; provides output in x27. + """ + + fast_path = [ + # Load env_tlb(env)->f[mmu_idx].{mask,table} into {x26,x27}. + f"ldp x26, x27, [x14, #-{offset}]", + + # Extract the TLB index from the address into X26. + "and x26, x26, Xn, lsr #7", # Xn = addr regsiter + + # Add the tlb_table pointer, creating the CPUTLBEntry address into X27. + "add x27, x27, x26", + + # Load the tlb comparator into X26, and the fast path addend into X27. 
+ "ldr x26, [x27, #8]" if is_write else "ldr x26, [x27]", + "ldr x27, [x27, #0x18]", + + ] + + if is_aligned: + fast_path.extend([ + # Store the page mask part of the address into X24. + "and x24, Xn, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + else: + fast_path.extend([ + # If we're not aligned, add in our alignment value to ensure we don't + # don't straddle the end of a page. + "add x24, Xn, #7", + + # Store the page mask part of the address into X24. + "and x24, x24, #0xfffffffffffff000", + + # Compare the masked address with the TLB value. + "cmp x26, x24", + + # If we're not equal, this isn't a TLB hit. Jump to our miss handler. + f"b.ne {miss_label}f", + ]) + + return fast_path + + + +def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. """ + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. + if fastpath and not force_slowpath: + fastpath_ops = [ + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=False, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! 
+ *EPILOGUE, + ] + # Otherwise, we'll save arguments for our slow path. + else: + fastpath_ops = [] + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + "mov x27, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Placed in x27 below.] + # - Move our operation info into x2, from an immediate32. + # - Move the next bytecode pointer into x3, from x28. + "mov x0, x14", + "mov x1, x27", + f"mov x2, #{immediate}" if (immediate is not None) else "ldr x2, [x28], #8", + "mov x3, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Temporarily store our result in a register that won't get trashed. + "mov x27, x0", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript... + *postscript, + + # ... and place our results in the target register. + "mov Wd, w27" if is_32b else "mov Xd, x27" + ) + + +def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. """ + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. 
+ if fastpath and not force_slowpath: + fastpath_ops = [ + + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=True, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + else: + fastpath_ops = [] + + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + # Move our arguments into registers that we're not actively using. + # This ensures that they won't be trounced by our calling convention + # if this is reading values from x0-x4. + "mov w27, Wd" if is_32b else "mov x27, Xd", + "mov x26, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Moved into x26 above]. + # - Move our target value into x2. [Moved into x27 above]. + # - Move our operation info into x3, from an immediate32. + # - Move the next bytecode pointer into x4, from x28. + "mov x0, x14", + "mov x1, x26", + "mov x2, x27", + f"mov x3, #{immediate}" if (immediate is not None) else "ldr x3, [x28], #8", + "mov x4, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript. + *postscript + ) + + +# +# Gadget definitions. +# + +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") + +# Call a C language helper function by address. +simple("call", + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. 
+ "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +) + +# Branch to a given immediate address. +simple("br", + # Use our immediate argument as our new bytecode-pointer location. + "ldr x28, [x28]" +) + +# Exit from a translation buffer execution. +simple("exit_tb", + + # We have a single immediate argument, which contains our return code. + # Place it into x0, as one would a return code. + "ldr x0, [x28], #8", + + # And finally, return back to the code that invoked our gadget stream. + "ret" +) + + +for condition in ARCH_CONDITION_CODES: + + # Performs a comparison between two operands. + with_dnm(f"setcond_i32_{condition}", + "subs Wd, Wn, Wm", + f"cset Wd, {condition}" + ) + with_dnm(f"setcond_i64_{condition}", + "subs Xd, Xn, Xm", + f"cset Xd, {condition}" + ) + + # + # NOTE: we use _dnm for the conditional branches, even though we don't + # actually do anything different based on the d argument. This gemerates + # effectively 16 identical `brcond` gadgets for each condition; which we + # use in the backend to spread out the actual branch sources we use. + # + # This is a slight mercy for the branch predictor, as not every conditional + # branch is funneled throught the same address. + # + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i32_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Wzr, Wn, Wm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + # Branches iff a given comparison is true. 
+ with_dnm(f'brcond_i64_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Xzr, Xn, Xm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + +# MOV variants. +with_dn("mov_i32", "mov Wd, Wn") +with_dn("mov_i64", "mov Xd, Xn") +with_d("movi_i32", "ldr Wd, [x28], #8") +with_d("movi_i64", "ldr Xd, [x28], #8") + +# Create MOV variants that have common constants built in to the gadget. +# This optimization helps costly reads from memories for simple operations. +with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) +with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) + +# LOAD variants. +# TODO: should the signed variants have X variants for _i64? +ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") +ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") +ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +# STORE variants. +ldst_dn("st8", "strb Wd, [Xn, x27]") +ldst_dn("st16", "strh Wd, [Xn, x27]") +ldst_dn("st_i32", "str Wd, [Xn, x27]") +ldst_dn("st_i64", "str Xd, [Xn, x27]") + +# QEMU LD/ST are handled in our C runtime rather than with simple gadgets, +# as they're nontrivial. + +# Trivial arithmetic. 
+math_dnm("add" , "add" ) +math_dnm("sub" , "sub" ) +math_dnm("mul" , "mul" ) +math_dnm("div" , "sdiv") +math_dnm("divu", "udiv") + +# Division remainder +with_dnm("rem_i32", "sdiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("rem_i64", "sdiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") + +# Trivial logical. +math_dn( "not", "mvn") +math_dn( "neg", "neg") +math_dnm("and", "and") +math_dnm("andc", "bic") +math_dnm("or", "orr") +math_dnm("orc", "orn") +math_dnm("xor", "eor") +math_dnm("eqv", "eon") +math_dnm("shl", "lsl") +math_dnm("shr", "lsr") +math_dnm("sar", "asr") + +# AArch64 lacks a Rotate Left; so we instead rotate right by a negative. +# TODO: validate this? +#math_dnm("rotr", "ror") +#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") + +# Numeric extension. +math_dn("ext8s", "sxtb") +with_dn("ext8u", "and Xd, Xn, #0xff") +math_dn("ext16s", "sxth") +with_dn("ext16u", "and Wd, Wn, #0xffff") +with_dn("ext32s_i64", "sxtw Xd, Wn") +with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") + +# Byte swapping. +with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") +with_dn("bswap32", "rev Wd, Wn") +with_dn("bswap64", "rev Xd, Xn") + +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + +# Handlers for QEMU_LD, which handles guest <- host loads. 
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", + fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", + fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", + fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special variant for the most common mode, as a speedup optimization. + ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. 
+ ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besw_{subtype}", None, None, "helper_be_lduw_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beul_{subtype}", None, None, "helper_be_ldul_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besl_{subtype}", None, None, "helper_be_ldul_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beq_{subtype}", None, None, "helper_be_ldq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Handlers for QEMU_ST, which handles guest -> host stores. +for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stw_mmu", + fastpath_32b=["strh Wd, [Xn, x27]"], fastpath_64b=["strh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stl_mmu", + fastpath_32b=["str Wd, [Xn, x27]"], fastpath_64b=["str Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special optimization for the most common modes. 
+ st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beul_{subtype}", None, None, "helper_be_stl_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beq_{subtype}", None, None, "helper_be_stq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Statistics. +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") diff --git a/tcg/tcg.c b/tcg/tcg.c index 457a40837feb..0eb06e0ed059 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -162,7 +162,7 @@ TCGv_env cpu_env = 0; const void *tcg_code_gen_epilogue; uintptr_t tcg_splitwx_diff; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_prologue_fn *tcg_qemu_tb_exec; #endif @@ -1200,7 +1200,7 @@ void tcg_prologue_init(TCGContext *s) region.start = buf0; region.end = buf0 + total_size; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_qemu_tb_exec = (tcg_prologue_fn *)tcg_splitwx_to_rx(buf0); #endif @@ -1226,7 +1226,7 @@ void tcg_prologue_init(TCGContext *s) #endif buf1 = s->code_ptr; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(buf0), (uintptr_t)buf0, tcg_ptr_byte_diff(buf1, buf0)); #endif @@ -1955,7 +1955,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) #endif #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) 
&& !defined(CONFIG_TCG_THREADED_INTERPRETER) /* We have 64-bit values in one register, but need to pass as two separate parameters. Split them. */ int orig_sizemask = sizemask; @@ -2005,7 +2005,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) pi = 0; if (ret != NULL) { #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) if (orig_sizemask & 1) { /* The 32-bit ABI is going to return the 64-bit value in the %o0/%o1 register pair. Prepare for this by using @@ -2083,7 +2083,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) tcg_debug_assert(pi <= ARRAY_SIZE(op->args)); #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* Free all of the parts we allocated above. */ for (i = real_args = 0; i < orig_nargs; ++i) { int is_64bit = orig_sizemask & (1 << (i+1)*2); @@ -4763,7 +4763,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) return -2; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* flush instruction cache */ flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf), (uintptr_t)s->code_buf, From 4f59dc2db05b7a1b2654adaee1f03be1d08fb048 Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sat, 10 Oct 2020 19:52:38 -0700 Subject: [PATCH 14/36] block: feature detection for host block support On Darwin (iOS), there are no system level APIs for directly accessing host block devices. We detect this at configure time. 
--- block/file-posix.c | 33 ++++++++++++++++++++++----------- meson.build | 6 +++++- qapi/block-core.json | 10 +++++++--- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 20e14f8e96ba..08d7a1891ffd 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -42,6 +42,8 @@ #include "scsi/constants.h" #if defined(__APPLE__) && (__MACH__) +#include +#if defined(HAVE_HOST_BLOCK_DEVICE) #include #include #include @@ -52,6 +54,7 @@ //#include #include #include +#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */ #endif #ifdef __sun__ @@ -181,7 +184,17 @@ typedef struct BDRVRawReopenState { bool check_cache_dropped; } BDRVRawReopenState; -static int fd_open(BlockDriverState *bs); +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) { + return 0; + } + return -EIO; +} + static int64_t raw_getlength(BlockDriverState *bs); typedef struct RawPosixAIOData { @@ -3027,6 +3040,7 @@ static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs) return stats; } +#if defined(HAVE_HOST_BLOCK_DEVICE) static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) { BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); @@ -3036,6 +3050,7 @@ static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs) return stats; } +#endif /* HAVE_HOST_BLOCK_DEVICE */ static QemuOptsList raw_create_opts = { .name = "raw-create-opts", @@ -3260,6 +3275,8 @@ BlockDriver bdrv_file = { /***********************************************/ /* host device */ +#if defined(HAVE_HOST_BLOCK_DEVICE) + #if defined(__APPLE__) && defined(__MACH__) static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize, int flags); @@ -3552,16 +3569,6 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) } #endif /* linux */ -static int fd_open(BlockDriverState *bs) -{ - 
BDRVRawState *s = bs->opaque; - - /* this is just to ensure s->fd is sane (its called by io ops) */ - if (s->fd >= 0) - return 0; - return -EIO; -} - static coroutine_fn int hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { @@ -3885,6 +3892,8 @@ static BlockDriver bdrv_host_cdrom = { }; #endif /* __FreeBSD__ */ +#endif /* HAVE_HOST_BLOCK_DEVICE */ + static void bdrv_file_init(void) { /* @@ -3892,6 +3901,7 @@ static void bdrv_file_init(void) * registered last will get probed first. */ bdrv_register(&bdrv_file); +#if defined(HAVE_HOST_BLOCK_DEVICE) bdrv_register(&bdrv_host_device); #ifdef __linux__ bdrv_register(&bdrv_host_cdrom); @@ -3899,6 +3909,7 @@ static void bdrv_file_init(void) #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) bdrv_register(&bdrv_host_cdrom); #endif +#endif /* HAVE_HOST_BLOCK_DEVICE */ } block_init(bdrv_file_init); diff --git a/meson.build b/meson.build index c6f4b0cf5e8a..78aa4670fd92 100644 --- a/meson.build +++ b/meson.build @@ -181,7 +181,7 @@ if targetos == 'windows' include_directories: include_directories('.')) elif targetos == 'darwin' coref = dependency('appleframeworks', modules: 'CoreFoundation') - iokit = dependency('appleframeworks', modules: 'IOKit') + iokit = dependency('appleframeworks', modules: 'IOKit', required: false) elif targetos == 'sunos' socket = [cc.find_library('socket'), cc.find_library('nsl'), @@ -1056,6 +1056,9 @@ if get_option('cfi') add_global_link_arguments(cfi_flags, native: false, language: ['c', 'cpp', 'objc']) endif +have_host_block_device = (targetos != 'darwin' or + cc.has_header('IOKit/storage/IOMedia.h')) + ################# # config-host.h # ################# @@ -1149,6 +1152,7 @@ config_host_data.set('HAVE_PTY_H', cc.has_header('pty.h')) config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) 
+config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) diff --git a/qapi/block-core.json b/qapi/block-core.json index 6d227924d06c..ea0d2725d261 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -897,7 +897,8 @@ 'discriminator': 'driver', 'data': { 'file': 'BlockStatsSpecificFile', - 'host_device': 'BlockStatsSpecificFile', + 'host_device': { 'type': 'BlockStatsSpecificFile', + 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, 'nvme': 'BlockStatsSpecificNvme' } } ## @@ -2814,7 +2815,9 @@ { 'enum': 'BlockdevDriver', 'data': [ 'blkdebug', 'blklogwrites', 'blkreplay', 'blkverify', 'bochs', 'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', - 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', + 'gluster', 'host_cdrom', + {'name': 'host_device', 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, + 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, @@ -4017,7 +4020,8 @@ 'ftps': 'BlockdevOptionsCurlFtps', 'gluster': 'BlockdevOptionsGluster', 'host_cdrom': 'BlockdevOptionsFile', - 'host_device':'BlockdevOptionsFile', + 'host_device': { 'type': 'BlockdevOptionsFile', + 'if': 'defined(HAVE_HOST_BLOCK_DEVICE)' }, 'http': 'BlockdevOptionsCurlHttp', 'https': 'BlockdevOptionsCurlHttps', 'iscsi': 'BlockdevOptionsIscsi', From 1913d4d2a7a16e3e3ea8725bd42564c4da22b617 Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Thu, 21 Jan 2021 16:12:00 -0800 Subject: [PATCH 15/36] block: check for sys/disk.h Some BSD platforms do not have this header. 
--- block.c | 2 +- meson.build | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index c5b887cec196..7f33709c94da 100644 --- a/block.c +++ b/block.c @@ -54,7 +54,7 @@ #ifdef CONFIG_BSD #include #include -#ifndef __DragonFly__ +#if defined(HAVE_SYS_DISK_H) #include #endif #endif diff --git a/meson.build b/meson.build index 78aa4670fd92..202b1e0f1940 100644 --- a/meson.build +++ b/meson.build @@ -1153,6 +1153,7 @@ config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) +config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h')) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) From 36c2febd7f65c98cc3661d788e10b02e3d9d8c7e Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sun, 7 Mar 2021 16:52:56 -0800 Subject: [PATCH 16/36] block: detect DKIOCGETBLOCKCOUNT/SIZE before use iOS hosts do not have these defined so we fallback to the default behaviour. 
Co-authored-by: Warner Losh
---
 block/file-posix.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 08d7a1891ffd..dcd2a2375bfe 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2321,8 +2321,10 @@ static int64_t raw_getlength(BlockDriverState *bs)
 again:
 #endif
     if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+        size = 0;
 #ifdef DIOCGMEDIASIZE
         if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+            size = 0;
 #elif defined(DIOCGPART)
         {
                 struct partinfo pi;
@@ -2331,9 +2333,7 @@ static int64_t raw_getlength(BlockDriverState *bs)
             else
                 size = 0;
         }
-        if (size == 0)
-#endif
-#if defined(__APPLE__) && defined(__MACH__)
+#elif defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
         {
             uint64_t sectors = 0;
             uint32_t sector_size = 0;
@@ -2341,19 +2341,15 @@ static int64_t raw_getlength(BlockDriverState *bs)
             if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
                 && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
                 size = sectors * sector_size;
-            } else {
-                size = lseek(fd, 0LL, SEEK_END);
-                if (size < 0) {
-                    return -errno;
-                }
             }
         }
-#else
-        size = lseek(fd, 0LL, SEEK_END);
+#endif
+        if (size == 0) {
+            size = lseek(fd, 0LL, SEEK_END);
+        }
         if (size < 0) {
             return -errno;
         }
-#endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
     switch(s->type) {
     case FTYPE_CD:
From d4e9108ef1b5f98bf1beeb14529aa4684736e57a Mon Sep 17 00:00:00 2001
From: Joelle van Dyne
Date: Thu, 21 Jan 2021 16:31:09 -0800
Subject: [PATCH 17/36] slirp: feature detection for smbd

Replace Windows specific macro with a more generic feature detection
macro. Allows slirp smb feature to be disabled manually as well.
---
 configure   | 26 +++++++++++++++++++++++---
 meson.build |  2 +-
 net/slirp.c | 16 ++++++++--------
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/configure b/configure
index 535e6a9269be..7fec65c90c51 100755
--- a/configure
+++ b/configure
@@ -466,6 +466,7 @@ multiprocess="auto"
 malloc_trim="auto"
 
 gio="$default_feature"
+slirp_smbd="auto"
 
 # parse CC options second
 for opt do
@@ -835,8 +836,6 @@ do
   fi
 done
 
-: ${smbd=${SMBD-/usr/sbin/smbd}}
-
 # Default objcc to clang if available, otherwise use CC
 if has clang; then
   objcc=clang
@@ -1564,6 +1563,10 @@ for opt do
   --enable-gio) gio=yes
   ;;
   --disable-gio) gio=no
+  ;;
+  --enable-slirp-smbd) slirp_smbd=yes
+  ;;
+  --disable-slirp-smbd) slirp_smbd=no
   ;;
   *)
       echo "ERROR: unknown option $opt"
@@ -1919,6 +1922,7 @@ disabled with --disable-FEATURE, default is enabled if available
   fuse-lseek      SEEK_HOLE/SEEK_DATA support for FUSE exports
   multiprocess    Out of process device emulation support
   gio             libgio support
+  slirp-smbd      use smbd (at path --smbd=*) in slirp networking
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -5255,6 +5259,19 @@ case "$slirp" in
     ;;
 esac
 
+# Check for slirp smbd support
+: ${smbd=${SMBD-/usr/sbin/smbd}}
+if test "$slirp_smbd" != "no" ; then
+  if test "$mingw32" = "yes" ; then
+    if test "$slirp_smbd" = "yes" ; then
+      error_exit "Host smbd not supported on this platform."
+ fi + slirp_smbd=no + else + slirp_smbd=yes + fi +fi + ########################################## # check for usable __NR_keyctl syscall @@ -5530,7 +5546,10 @@ fi if test "$guest_agent" = "yes" ; then echo "CONFIG_GUEST_AGENT=y" >> $config_host_mak fi -echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak +if test "$slirp_smbd" = "yes" ; then + echo "CONFIG_SLIRP_SMBD=y" >> $config_host_mak + echo "CONFIG_SMBD_COMMAND=\"$smbd\"" >> $config_host_mak +fi if test "$vde" = "yes" ; then echo "CONFIG_VDE=y" >> $config_host_mak echo "VDE_LIBS=$vde_libs" >> $config_host_mak diff --git a/meson.build b/meson.build index 202b1e0f1940..3c8905517637 100644 --- a/meson.build +++ b/meson.build @@ -2456,7 +2456,7 @@ summary_info += {'genisoimage': config_host['GENISOIMAGE']} if targetos == 'windows' and config_host.has_key('CONFIG_GUEST_AGENT') summary_info += {'wixl': wixl.found() ? wixl.full_path() : false} endif -if slirp_opt != 'disabled' +if slirp_opt != 'disabled' and 'CONFIG_SLIRP_SMBD' in config_host summary_info += {'smbd': config_host['CONFIG_SMBD_COMMAND']} endif summary(summary_info, bool_yn: true, section: 'Host binaries') diff --git a/net/slirp.c b/net/slirp.c index a9fdc7a08f24..ae0e6c625eaf 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -27,7 +27,7 @@ #include "net/slirp.h" -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) #include #include #endif @@ -91,7 +91,7 @@ typedef struct SlirpState { Slirp *slirp; Notifier poll_notifier; Notifier exit_notifier; -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) gchar *smb_dir; #endif GSList *fwd; @@ -104,7 +104,7 @@ static QTAILQ_HEAD(, SlirpState) slirp_stacks = static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp); static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp); -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) static int slirp_smb(SlirpState *s, const char *exported_dir, struct in_addr vserver_addr, Error **errp); static void slirp_smb_cleanup(SlirpState *s); @@ -377,7 
+377,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, struct in6_addr ip6_prefix; struct in6_addr ip6_host; struct in6_addr ip6_dns; -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) struct in_addr smbsrv = { .s_addr = 0 }; #endif NetClientState *nc; @@ -490,7 +490,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, return -1; } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) if (vsmbserver && !inet_aton(vsmbserver, &smbsrv)) { error_setg(errp, "Failed to parse SMB address"); return -1; @@ -720,7 +720,7 @@ static int net_slirp_init(NetClientState *peer, const char *model, QAPI_LIST_APPEND(stored_guestfwd, element); } } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) if (smb_export) { if (slirp_smb(s, smb_export, smbsrv, errp) < 0) { goto error; @@ -912,7 +912,7 @@ void hmp_hostfwd_add(Monitor *mon, const QDict *qdict) } -#ifndef _WIN32 +#if defined(CONFIG_SLIRP_SMBD) /* automatic user mode samba server configuration */ static void slirp_smb_cleanup(SlirpState *s) @@ -1027,7 +1027,7 @@ static int slirp_smb(SlirpState* s, const char *exported_dir, return 0; } -#endif /* !defined(_WIN32) */ +#endif /* defined(CONFIG_SLIRP_SMBD) */ static int guestfwd_can_read(void *opaque) { From e034ab0b4dd9ba3a5a70d9910ad5b863ef256616 Mon Sep 17 00:00:00 2001 From: osy <50960678+osy@users.noreply.github.com> Date: Sun, 7 Mar 2021 17:24:50 -0800 Subject: [PATCH 18/36] meson: option to build as shared library On iOS, we cannot fork() new processes, so the best way to load QEMU into an app is through a shared library. We add a new configure option `--enable-shared-lib` that will build the bulk of QEMU into a shared lib. The usual executables will then link to the library. 
--- configure | 14 ++++++++++++-- meson.build | 40 ++++++++++++++++++++++++++++++++++------ meson_options.txt | 2 ++ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 7fec65c90c51..4440743e8475 100755 --- a/configure +++ b/configure @@ -463,6 +463,7 @@ gettext="auto" fuse="auto" fuse_lseek="auto" multiprocess="auto" +shared_lib="false" malloc_trim="auto" gio="$default_feature" @@ -1567,6 +1568,10 @@ for opt do ;; --disable-slirp-smbd) slirp_smbd=no ;; + --enable-shared-lib) shared_lib=true + ;; + --disable-shared-lib) shared_lib=false + ;; *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1794,6 +1799,7 @@ Advanced options (experts only): enable plugins via shared library loading --disable-containers don't use containers for cross-building --gdb=GDB-path gdb to use for gdbstub tests [$gdb_bin] + --enable-shared-lib build QEMU as a shared library Optional features, enabled with --enable-FEATURE and disabled with --disable-FEATURE, default is enabled if available @@ -6362,7 +6368,11 @@ echo "ranlib = [$(meson_quote $ranlib)]" >> $cross if has $sdl2_config; then echo "sdl2-config = [$(meson_quote $sdl2_config)]" >> $cross fi -echo "strip = [$(meson_quote $strip)]" >> $cross +if test "$shared_lib" = "true"; then + echo "strip = [$(meson_quote $strip), '-x']" >> $cross +else + echo "strip = [$(meson_quote $strip)]" >> $cross +fi echo "windres = [$(meson_quote $windres)]" >> $cross if test "$cross_compile" = "yes"; then cross_arg="--cross-file config-meson.cross" @@ -6444,7 +6454,7 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter \ + -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ $cross_arg \ "$PWD" "$source_path" diff --git 
a/meson.build b/meson.build index 3c8905517637..80e9305d5a64 100644 --- a/meson.build +++ b/meson.build @@ -2217,14 +2217,31 @@ foreach target : target_dirs arch_srcs += target_specific.sources() arch_deps += target_specific.dependencies() - lib = static_library('qemu-' + target, + if get_option('shared_lib') + build_lib_args = { + 'target_type': 'shared_library', + 'install': true, + 'dependencies': arch_deps + deps, + 'link_language': link_language, + 'link_depends': [block_syms, qemu_syms], + 'link_args': link_args + cc.get_supported_link_arguments(['-Wl,-U,_qemu_main']) + } + else + build_lib_args = { + 'target_type': 'static_library', + 'install': false, + 'dependencies': arch_deps, + 'name_suffix': 'fa' + } + endif + + lib = build_target('qemu-' + target, sources: arch_srcs + genh, - dependencies: arch_deps, objects: objects, include_directories: target_inc, c_args: c_args, build_by_default: false, - name_suffix: 'fa') + kwargs: build_lib_args) if target.endswith('-softmmu') execs = [{ @@ -2258,6 +2275,17 @@ foreach target : target_dirs 'dependencies': [] }] endif + if get_option('shared_lib') + build_exe_args = { + 'link_with': lib, + 'link_args': link_args + cc.get_supported_link_arguments(['-Wl,--exclude-libs,ALL']) + } + else + build_exe_args = { + 'objects': lib.extract_all_objects(recursive: true), + 'link_args': link_args + } + endif foreach exe: execs exe_name = exe['name'] exe_sign = 'CONFIG_HVF' in config_target @@ -2269,11 +2297,10 @@ foreach target : target_dirs install: true, c_args: c_args, dependencies: arch_deps + deps + exe['dependencies'], - objects: lib.extract_all_objects(recursive: true), link_language: link_language, link_depends: [block_syms, qemu_syms] + exe.get('link_depends', []), - link_args: link_args, - gui_app: exe['gui']) + gui_app: exe['gui'], + kwargs: build_exe_args) if exe_sign emulators += {exe['name'] : custom_target(exe['name'], @@ -2440,6 +2467,7 @@ endif summary_info += {'Doc directory': get_option('docdir')} 
summary_info += {'Build directory': meson.current_build_dir()} summary_info += {'Source path': meson.current_source_dir()} +summary_info += {'build shared lib': get_option('shared_lib')} summary_info += {'GIT submodules': config_host['GIT_SUBMODULES']} summary(summary_info, bool_yn: true, section: 'Directories') diff --git a/meson_options.txt b/meson_options.txt index 9734019995a0..4594d42769d6 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -6,6 +6,8 @@ option('qemu_firmwarepath', type : 'string', value : '', description: 'search PATH for firmware files') option('sphinx_build', type : 'string', value : '', description: 'Use specified sphinx-build [$sphinx_build] for building document (default to be empty)') +option('shared_lib', type : 'boolean', value : false, + description: 'build QEMU as a shared library') option('default_devices', type : 'boolean', value : true, description: 'Include a default selection of devices in emulators') From 17ebce776d24f2d9346ba647dd8bbea1d8aa2c55 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Fri, 8 Jun 2018 13:19:58 +0200 Subject: [PATCH 19/36] hw/display: add virtio-ramfb device Like virtio-vga, but using ramfb instead of legacy vga. Useful for booting from OVMF into Windows ARM which expects a linear FB. 
--- hw/display/meson.build | 1 + hw/display/virtio-ramfb.c | 188 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 hw/display/virtio-ramfb.c diff --git a/hw/display/meson.build b/hw/display/meson.build index 9d79e3951d9e..14f5fa39f4c1 100644 --- a/hw/display/meson.build +++ b/hw/display/meson.build @@ -60,6 +60,7 @@ if config_all_devices.has_key('CONFIG_VIRTIO_GPU') virtio_gpu_ss.add(when: ['CONFIG_VIRTIO_GPU', 'CONFIG_VIRGL'], if_true: [files('virtio-gpu-3d.c'), pixman, virgl]) virtio_gpu_ss.add(when: 'CONFIG_VHOST_USER_GPU', if_true: files('vhost-user-gpu.c')) + virtio_gpu_ss.add(when: 'CONFIG_FW_CFG_DMA', if_true: files('virtio-ramfb.c')) hw_display_modules += {'virtio-gpu': virtio_gpu_ss} endif diff --git a/hw/display/virtio-ramfb.c b/hw/display/virtio-ramfb.c new file mode 100644 index 000000000000..d08bb90a14d4 --- /dev/null +++ b/hw/display/virtio-ramfb.c @@ -0,0 +1,188 @@ +#include "qemu/osdep.h" +#include "hw/pci/pci.h" +#include "ui/console.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-gpu-pci.h" +#include "qapi/error.h" +#include "hw/display/ramfb.h" +#include "qom/object.h" + +/* + * virtio-ramfb-base: This extends VirtioPCIProxy. 
+ */ +#define TYPE_VIRTIO_RAMFB_BASE "virtio-ramfb-base" +OBJECT_DECLARE_TYPE(VirtIORAMFBBase, VirtIORAMFBBaseClass, + VIRTIO_RAMFB_BASE) + +struct VirtIORAMFBBase { + VirtIOPCIProxy parent_obj; + + VirtIOGPUBase *vgpu; + RAMFBState *ramfb; +}; + +struct VirtIORAMFBBaseClass { + VirtioPCIClass parent_class; + + DeviceReset parent_reset; +}; + +static void virtio_ramfb_invalidate_display(void *opaque) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->enable) { + g->hw_ops->invalidate(g); + } +} + +static void virtio_ramfb_update_display(void *opaque) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->enable) { + g->hw_ops->gfx_update(g); + } else { + ramfb_display_update(g->scanout[0].con, vramfb->ramfb); + } +} + +static int virtio_ramfb_ui_info(void *opaque, uint32_t idx, QemuUIInfo *info) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->hw_ops->ui_info) { + return g->hw_ops->ui_info(g, idx, info); + } + return -1; +} + +static void virtio_ramfb_gl_block(void *opaque, bool block) +{ + VirtIORAMFBBase *vramfb = opaque; + VirtIOGPUBase *g = vramfb->vgpu; + + if (g->hw_ops->gl_block) { + g->hw_ops->gl_block(g, block); + } +} + +static const GraphicHwOps virtio_ramfb_ops = { + .invalidate = virtio_ramfb_invalidate_display, + .gfx_update = virtio_ramfb_update_display, + .ui_info = virtio_ramfb_ui_info, + .gl_block = virtio_ramfb_gl_block, +}; + +static const VMStateDescription vmstate_virtio_ramfb = { + .name = "virtio-ramfb", + .version_id = 2, + .minimum_version_id = 2, + .fields = (VMStateField[]) { + /* no pci stuff here, saving the virtio device will handle that */ + /* FIXME */ + VMSTATE_END_OF_LIST() + } +}; + +/* RAMFB device wrapper around PCI device around virtio GPU */ +static void virtio_ramfb_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIORAMFBBase *vramfb = VIRTIO_RAMFB_BASE(vpci_dev); + VirtIOGPUBase *g = vramfb->vgpu; + int i; + + /* init 
virtio bits */ + virtio_pci_force_virtio_1(vpci_dev); + if (!qdev_realize(DEVICE(g), BUS(&vpci_dev->bus), errp)) { + return; + } + + /* init ramfb */ + vramfb->ramfb = ramfb_setup(errp); + graphic_console_set_hwops(g->scanout[0].con, &virtio_ramfb_ops, vramfb); + + for (i = 0; i < g->conf.max_outputs; i++) { + object_property_set_link(OBJECT(g->scanout[i].con), "device", + OBJECT(vpci_dev), &error_abort); + } +} + +static void virtio_ramfb_reset(DeviceState *dev) +{ + VirtIORAMFBBaseClass *klass = VIRTIO_RAMFB_BASE_GET_CLASS(dev); + + /* reset virtio-gpu */ + klass->parent_reset(dev); +} + +static Property virtio_ramfb_base_properties[] = { + DEFINE_VIRTIO_GPU_PCI_PROPERTIES(VirtIOPCIProxy), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_ramfb_base_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + VirtIORAMFBBaseClass *v = VIRTIO_RAMFB_BASE_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + device_class_set_props(dc, virtio_ramfb_base_properties); + dc->vmsd = &vmstate_virtio_ramfb; + dc->hotpluggable = false; + device_class_set_parent_reset(dc, virtio_ramfb_reset, + &v->parent_reset); + + k->realize = virtio_ramfb_realize; + pcidev_k->class_id = PCI_CLASS_DISPLAY_OTHER; +} + +static TypeInfo virtio_ramfb_base_info = { + .name = TYPE_VIRTIO_RAMFB_BASE, + .parent = TYPE_VIRTIO_PCI, + .instance_size = sizeof(VirtIORAMFBBase), + .class_size = sizeof(VirtIORAMFBBaseClass), + .class_init = virtio_ramfb_base_class_init, + .abstract = true, +}; + +#define TYPE_VIRTIO_RAMFB "virtio-ramfb" + +typedef struct VirtIORAMFB VirtIORAMFB; +DECLARE_INSTANCE_CHECKER(VirtIORAMFB, VIRTIO_RAMFB, + TYPE_VIRTIO_RAMFB) + +struct VirtIORAMFB { + VirtIORAMFBBase parent_obj; + + VirtIOGPU vdev; +}; + +static void virtio_ramfb_inst_initfn(Object *obj) +{ + VirtIORAMFB *dev = VIRTIO_RAMFB(obj); + + 
virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_GPU); + VIRTIO_RAMFB_BASE(dev)->vgpu = VIRTIO_GPU_BASE(&dev->vdev); +} + +static VirtioPCIDeviceTypeInfo virtio_ramfb_info = { + .generic_name = TYPE_VIRTIO_RAMFB, + .parent = TYPE_VIRTIO_RAMFB_BASE, + .instance_size = sizeof(VirtIORAMFB), + .instance_init = virtio_ramfb_inst_initfn, +}; + +static void virtio_ramfb_register_types(void) +{ + type_register_static(&virtio_ramfb_base_info); + virtio_pci_types_register(&virtio_ramfb_info); +} + +type_init(virtio_ramfb_register_types) From 138287176bcd6f91506bafce274dd8741716617b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 25 Jan 2021 11:34:26 +0400 Subject: [PATCH 20/36] slirp: update to git master MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git cherry-diff: Commits on bacb71f1c3ed5f40e393afd8be81bedfba13a401 branch that is not on 8f43a99191afb47ca3f3c6972f6306209f367ece branch + 1021b0dc38d39f1dc95a296fe3e05a24a087cdc6 disable_dns option + 0f94ceec752592e4ac632a24e3c64a97dd09bf4c limit vnameserver_addr to port 53 + b57bafa852ef16b133907a13678ec69e9531f177 libslirp.h: fix SlirpConfig v3 documentation + 1abf18b2b5edb462797629ed47ad4515a195686e Update CHANGELOG + ff4ecf9b6c6542b24b4ac6ea178be9d44e159f79 Release v4.3.0 + 21f1d933050a40d62612c6274c32de60b811d9ea changelog: post-release + 376187c4b14c795763d472214812826eebe7e9c2 Release v4.3.1 + 73336e08902a7e826f7d960453df037380266186 changelog: post-release + 5c1c9d43be61571608e9b14615045b67b830daf5 udp, udp6, icmp: handle TTL value + 73ed49ab71998d4288e71e954ef6214b70f23d79 icmp, icmp6: Add icmp_forward_error and icmp6_forward_error + 7a4840a57ec7dbc37cca1ab96f058a9610b26950 udp, udp6, icmp, icmp6: Enable forwarding errors on Linux + e9b2bc19ae652a2907f247e621b2e4773bdd2aab TCPIPHDR_DELTA: Fix potential negative value + 39f9a363eec082f04513413046321abd04163148 .gitlab-ci: add a Coverity stage + 
1b0093b973cfa0dc041522e5d4e6f576b2df642e sosendoob: better document what urgc is used for + 5b9ad89ebbb8afa50162c9156fabd5fc56291088 Add G_GNUC_PRINTF to local function slirp_vsnprintf + 8a808aa493980e212b4d5f5465330905c8294e59 meson: remove meson-dist script + 0b669b5fbe4d3c25a682a67f1059d8633c963b3d meson: support compiling as subproject + 9f82a47b81f2864422b82c1e40e51a2ed9c6ac32 Add DNS resolving for iOS + c0eac03e8ce1b9a743231f2fe21e7cb579fc9339 Remove the QEMU-special make build-system + 1bfd4d9368f9fa2e4f0731e1266bec05bbc83a80 socket: consume empty packets + 92413be68914f8cae2f5bad4bf3ab8491dcbc5d7 Release v4.4.0 + 07e8cfac69766081871ab620d9f16a630543d302 changelog: post-release + 4c4e035813313d02b63fdeb920d56fb2fdc0a5b1 Remove some needless (void)casts + eee9db9d115d91aa82f33685c4e76d656db92976 fork_exec_child_setup: improve signal handling + 216f434a018b3af182a4f31bbe5a00daee170343 Fix unused variables Signed-off-by: Marc-AndrĂ© Lureau Message-Id: <20210125073427.3970606-2-marcandre.lureau@redhat.com> --- slirp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slirp b/slirp index 8f43a99191af..5dce846e3ee8 160000 --- a/slirp +++ b/slirp @@ -1 +1 @@ -Subproject commit 8f43a99191afb47ca3f3c6972f6306209f367ece +Subproject commit 5dce846e3ee82d93462bc637bb0db2fd49f0fc5a From c78951da1b67f293b750a1b6a7c9ff7e0575eccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 25 Jan 2021 11:34:27 +0400 Subject: [PATCH 21/36] build-sys: make libslirp a meson subproject MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the manual build. 
Signed-off-by: Marc-AndrĂ© Lureau Message-Id: <20210125073427.3970606-3-marcandre.lureau@redhat.com> --- .gitmodules | 6 ++--- configure | 2 +- meson.build | 63 +++----------------------------------------- slirp | 1 - subprojects/libslirp | 1 + 5 files changed, 9 insertions(+), 64 deletions(-) delete mode 160000 slirp create mode 160000 subprojects/libslirp diff --git a/.gitmodules b/.gitmodules index 08b1b48a09f4..c28831c50ab4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -49,9 +49,9 @@ [submodule "roms/edk2"] path = roms/edk2 url = https://gitlab.com/qemu-project/edk2.git -[submodule "slirp"] - path = slirp - url = https://gitlab.com/qemu-project/libslirp.git +[submodule "subprojects/libslirp"] + path = subprojects/libslirp + url = https://git.qemu.org/git/libslirp.git [submodule "roms/opensbi"] path = roms/opensbi url = https://gitlab.com/qemu-project/opensbi.git diff --git a/configure b/configure index 4440743e8475..12de8035e42d 100755 --- a/configure +++ b/configure @@ -5259,7 +5259,7 @@ case "$slirp" in auto | enabled | internal) # Simpler to always update submodule, even if not needed. 
if test "$git_submodules_action" != "ignore"; then - git_submodules="${git_submodules} slirp" + git_submodules="${git_submodules} subprojects/libslirp" fi ;; esac diff --git a/meson.build b/meson.build index 80e9305d5a64..8b1e240a3ec3 100644 --- a/meson.build +++ b/meson.build @@ -1504,7 +1504,7 @@ slirp_opt = 'disabled' if have_system slirp_opt = get_option('slirp') if slirp_opt in ['enabled', 'auto', 'system'] - have_internal = fs.exists(meson.current_source_dir() / 'slirp/meson.build') + have_internal = fs.exists(meson.current_source_dir() / 'subprojects/libslirp/meson.build') slirp = dependency('slirp', kwargs: static_kwargs, method: 'pkg-config', required: slirp_opt == 'system' or @@ -1518,64 +1518,9 @@ if have_system endif endif if slirp_opt == 'internal' - slirp_deps = [] - if targetos == 'windows' - slirp_deps = cc.find_library('iphlpapi') - endif - slirp_conf = configuration_data() - slirp_conf.set('SLIRP_MAJOR_VERSION', meson.project_version().split('.')[0]) - slirp_conf.set('SLIRP_MINOR_VERSION', meson.project_version().split('.')[1]) - slirp_conf.set('SLIRP_MICRO_VERSION', meson.project_version().split('.')[2]) - slirp_conf.set_quoted('SLIRP_VERSION_STRING', meson.project_version()) - slirp_cargs = ['-DG_LOG_DOMAIN="Slirp"'] - slirp_files = [ - 'slirp/src/arp_table.c', - 'slirp/src/bootp.c', - 'slirp/src/cksum.c', - 'slirp/src/dhcpv6.c', - 'slirp/src/dnssearch.c', - 'slirp/src/if.c', - 'slirp/src/ip6_icmp.c', - 'slirp/src/ip6_input.c', - 'slirp/src/ip6_output.c', - 'slirp/src/ip_icmp.c', - 'slirp/src/ip_input.c', - 'slirp/src/ip_output.c', - 'slirp/src/mbuf.c', - 'slirp/src/misc.c', - 'slirp/src/ncsi.c', - 'slirp/src/ndp_table.c', - 'slirp/src/sbuf.c', - 'slirp/src/slirp.c', - 'slirp/src/socket.c', - 'slirp/src/state.c', - 'slirp/src/stream.c', - 'slirp/src/tcp_input.c', - 'slirp/src/tcp_output.c', - 'slirp/src/tcp_subr.c', - 'slirp/src/tcp_timer.c', - 'slirp/src/tftp.c', - 'slirp/src/udp.c', - 'slirp/src/udp6.c', - 'slirp/src/util.c', - 
'slirp/src/version.c', - 'slirp/src/vmstate.c', - ] - - configure_file( - input : 'slirp/src/libslirp-version.h.in', - output : 'libslirp-version.h', - configuration: slirp_conf) - - slirp_inc = include_directories('slirp', 'slirp/src') - libslirp = static_library('slirp', - build_by_default: false, - sources: slirp_files, - c_args: slirp_cargs, - include_directories: slirp_inc) - slirp = declare_dependency(link_with: libslirp, - dependencies: slirp_deps, - include_directories: slirp_inc) + libslirp = subproject('libslirp', + default_options: ['default_library=static']) + slirp = libslirp.get_variable('libslirp_dep') endif endif diff --git a/slirp b/slirp deleted file mode 160000 index 5dce846e3ee8..000000000000 --- a/slirp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5dce846e3ee82d93462bc637bb0db2fd49f0fc5a diff --git a/subprojects/libslirp b/subprojects/libslirp new file mode 160000 index 000000000000..bacb71f1c3ed --- /dev/null +++ b/subprojects/libslirp @@ -0,0 +1 @@ +Subproject commit bacb71f1c3ed5f40e393afd8be81bedfba13a401 From 5db66c67b993632f5d0b158cf52eeb56845c0dbc Mon Sep 17 00:00:00 2001 From: Roman Bolshakov Date: Wed, 10 Feb 2021 13:55:27 +0300 Subject: [PATCH 22/36] util/osdep: Avoid mprotect() RWX->NONE on Big Sur 11.2 There's a change in mprotect() behaviour [1] in the latest macOS on M1 and it's not yet clear if it's going to be fixed by Apple. For now we can avoid unsupported mprotect() calls. QEMU and qtests work fine without it. 1. 
https://gist.github.com/hikalium/75ae822466ee4da13cbbe486498a191f Buglink: https://bugs.launchpad.net/qemu/+bug/1914849 Apple-Feedback: FB8994773 Signed-off-by: Roman Bolshakov Message-Id: <20210210105527.74943-1-r.bolshakov@yadro.com> --- util/osdep.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/util/osdep.c b/util/osdep.c index 66d01b9160fb..1edd7b1caf9c 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -111,6 +111,12 @@ int qemu_mprotect_none(void *addr, size_t size) #ifdef _WIN32 return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS); #else +# if defined(__APPLE__) && defined(__arm64__) + if (__builtin_available(macOS 11.2, *)) { + /* mprotect() in macOS 11.2 can't switch RWX to NONE */ + return 0; + } +# endif return qemu_mprotect__osdep(addr, size, PROT_NONE); #endif } From a6b59f6dee48f0d9d858f5daa2e38e91aba7d949 Mon Sep 17 00:00:00 2001 From: osy <50960678+osy@users.noreply.github.com> Date: Mon, 4 Jan 2021 14:04:27 -0800 Subject: [PATCH 23/36] tcg: custom APRR implementation --- accel/tcg/cpu-exec.c | 1 + accel/tcg/translate-all.c | 1 + include/qemu/osdep.h | 28 ---------- include/tcg/tcg-apple-jit.h | 100 ++++++++++++++++++++++++++++++++++++ tcg/tcg.c | 1 + util/osdep.c | 4 ++ 6 files changed, 107 insertions(+), 28 deletions(-) create mode 100644 include/tcg/tcg-apple-jit.h diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index f62f12e717ca..07f8e3604a67 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -26,6 +26,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg/tcg.h" +#include "tcg/tcg-apple-jit.h" #include "qemu/atomic.h" #include "qemu/compiler.h" #include "qemu/timer.h" diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index f32df8b24042..0ca0a8c5c23c 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -27,6 +27,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg/tcg.h" +#include "tcg/tcg-apple-jit.h" #if defined(CONFIG_USER_ONLY) #include 
"qemu.h" #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index ba15be9c569c..5bd1a6776915 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -119,10 +119,6 @@ extern int daemon(int, int); #include "sysemu/os-posix.h" #endif -#ifdef __APPLE__ -#include -#endif - #include "glib-compat.h" #include "qemu/typedefs.h" @@ -686,30 +682,6 @@ char *qemu_get_host_name(Error **errp); */ size_t qemu_get_host_physmem(void); -/* - * Toggle write/execute on the pages marked MAP_JIT - * for the current thread. - */ -#if defined(MAC_OS_VERSION_11_0) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 -static inline void qemu_thread_jit_execute(void) -{ - if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(true); - } -} - -static inline void qemu_thread_jit_write(void) -{ - if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); - } -} -#else -static inline void qemu_thread_jit_write(void) {} -static inline void qemu_thread_jit_execute(void) {} -#endif - /** * Platforms which do not support system() return ENOSYS */ diff --git a/include/tcg/tcg-apple-jit.h b/include/tcg/tcg-apple-jit.h new file mode 100644 index 000000000000..7f25dab809a3 --- /dev/null +++ b/include/tcg/tcg-apple-jit.h @@ -0,0 +1,100 @@ +/* + * Apple Silicon functions for JIT handling + * + * Copyright (c) 2020 osy + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef TCG_APPLE_JIT_H +#define TCG_APPLE_JIT_H + +/* + * APRR handling + * Credits to: https://siguza.github.io/APRR/ + * Reversed from /usr/lib/system/libsystem_pthread.dylib + */ + +#if defined(__aarch64__) && defined(CONFIG_DARWIN) + +#define _COMM_PAGE_START_ADDRESS (0x0000000FFFFFC000ULL) /* In TTBR0 */ +#define _COMM_PAGE_APRR_SUPPORT (_COMM_PAGE_START_ADDRESS + 0x10C) +#define _COMM_PAGE_APPR_WRITE_ENABLE (_COMM_PAGE_START_ADDRESS + 0x110) +#define _COMM_PAGE_APRR_WRITE_DISABLE (_COMM_PAGE_START_ADDRESS + 0x118) + +static __attribute__((__always_inline__)) bool jit_write_protect_supported(void) +{ + /* Access shared kernel page at fixed memory location. */ + uint8_t aprr_support = *(volatile uint8_t *)_COMM_PAGE_APRR_SUPPORT; + return aprr_support > 0; +} + +/* write protect enable = write disable */ +static __attribute__((__always_inline__)) void jit_write_protect(int enabled) +{ + /* Access shared kernel page at fixed memory location. */ + uint8_t aprr_support = *(volatile uint8_t *)_COMM_PAGE_APRR_SUPPORT; + if (aprr_support == 0 || aprr_support > 3) { + return; + } else if (aprr_support == 1) { + __asm__ __volatile__ ( + "mov x0, %0\n" + "ldr x0, [x0]\n" + "msr S3_4_c15_c2_7, x0\n" + "isb sy\n" + :: "r" (enabled ? _COMM_PAGE_APRR_WRITE_DISABLE + : _COMM_PAGE_APPR_WRITE_ENABLE) + : "memory", "x0" + ); + } else { + __asm__ __volatile__ ( + "mov x0, %0\n" + "ldr x0, [x0]\n" + "msr S3_6_c15_c1_5, x0\n" + "isb sy\n" + :: "r" (enabled ? 
_COMM_PAGE_APRR_WRITE_DISABLE + : _COMM_PAGE_APPR_WRITE_ENABLE) + : "memory", "x0" + ); + } +} + +#else /* defined(__aarch64__) && defined(CONFIG_DARWIN) */ + +static __attribute__((__always_inline__)) bool jit_write_protect_supported(void) +{ + return false; +} + +static __attribute__((__always_inline__)) void jit_write_protect(int enabled) +{ +} + +#endif + +static inline void qemu_thread_jit_execute(void) +{ + if (jit_write_protect_supported()) { + jit_write_protect(true); + } +} + +static inline void qemu_thread_jit_write(void) +{ + if (jit_write_protect_supported()) { + jit_write_protect(false); + } +} + +#endif /* define TCG_APPLE_JIT_H */ diff --git a/tcg/tcg.c b/tcg/tcg.c index 1fbe0b686d57..5b0750685102 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -49,6 +49,7 @@ #include "hw/boards.h" #endif +#include "tcg/tcg-apple-jit.h" #include "tcg/tcg-op.h" #if UINTPTR_MAX == UINT32_MAX diff --git a/util/osdep.c b/util/osdep.c index 1edd7b1caf9c..cb20608292ef 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -39,6 +39,10 @@ extern int madvise(char *, size_t, int); #include "qemu/error-report.h" #include "monitor/monitor.h" +#ifdef CONFIG_DARWIN +#include "tcg/tcg-apple-jit.h" +#endif + static bool fips_enabled = false; static const char *hw_version = QEMU_HW_VERSION; From ad73633ab31a025128b70e847fb3a2a77e295247 Mon Sep 17 00:00:00 2001 From: Joelle van Dyne Date: Sun, 3 Jan 2021 08:58:11 -0800 Subject: [PATCH 24/36] coroutine: add libucontext as external library iOS does not support ucontext natively for aarch64 and the sigaltstack is also unsupported (even worse, it fails silently, see: https://openradar.appspot.com/13002712 ) As a workaround we include a library implementation of ucontext and add it as a build option. 
--- .gitmodules | 3 +++ configure | 23 ++++++++++++++++++++--- meson.build | 12 +++++++++++- meson_options.txt | 2 ++ subprojects/libucontext | 1 + util/coroutine-ucontext.c | 9 +++++++++ 6 files changed, 46 insertions(+), 4 deletions(-) create mode 160000 subprojects/libucontext diff --git a/.gitmodules b/.gitmodules index c28831c50ab4..33f90687266b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -64,3 +64,6 @@ [submodule "roms/vbootrom"] path = roms/vbootrom url = https://gitlab.com/qemu-project/vbootrom.git +[submodule "libucontext"] + path = subprojects/libucontext + url = https://github.com/utmapp/libucontext.git diff --git a/configure b/configure index 12de8035e42d..1c5c9fc0f06a 100755 --- a/configure +++ b/configure @@ -1785,7 +1785,7 @@ Advanced options (experts only): --oss-lib path to OSS library --cpu=CPU Build for host CPU [$cpu] --with-coroutine=BACKEND coroutine backend. Supported options: - ucontext, sigaltstack, windows + ucontext, libucontext, sigaltstack, windows --enable-gcov enable test coverage analysis with gcov --disable-blobs disable installing provided firmware blobs --with-vss-sdk=SDK-path enable Windows VSS support in QEMU Guest Agent @@ -4521,6 +4521,8 @@ if test "$coroutine" = ""; then coroutine=win32 elif test "$ucontext_works" = "yes"; then coroutine=ucontext + elif test "$ios" = "yes"; then + coroutine=libucontext else coroutine=sigaltstack fi @@ -4544,12 +4546,27 @@ else error_exit "only the 'windows' coroutine backend is valid for Windows" fi ;; + libucontext) + ;; *) error_exit "unknown coroutine backend $coroutine" ;; esac fi +case $coroutine in +libucontext) + git_submodules="${git_submodules} subprojects/libucontext" + mkdir -p libucontext + coroutine_impl=ucontext + libucontext="enabled" + ;; +*) + coroutine_impl=$coroutine + libucontext="disabled" + ;; +esac + if test "$coroutine_pool" = ""; then coroutine_pool=yes fi @@ -5886,7 +5903,7 @@ if test "$qom_cast_debug" = "yes" ; then echo "CONFIG_QOM_CAST_DEBUG=y" >> 
$config_host_mak fi -echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak +echo "CONFIG_COROUTINE_BACKEND=$coroutine_impl" >> $config_host_mak if test "$coroutine_pool" = "yes" ; then echo "CONFIG_COROUTINE_POOL=1" >> $config_host_mak else @@ -6449,7 +6466,7 @@ NINJA=$ninja $meson setup \ -Dlibnfs=$libnfs -Diconv=$iconv -Dcurses=$curses -Dlibudev=$libudev\ -Drbd=$rbd -Dlzo=$lzo -Dsnappy=$snappy -Dlzfse=$lzfse \ -Dzstd=$zstd -Dseccomp=$seccomp -Dvirtfs=$virtfs -Dcap_ng=$cap_ng \ - -Dattr=$attr -Ddefault_devices=$default_devices \ + -Dattr=$attr -Ddefault_devices=$default_devices -Ducontext=$libucontext \ -Ddocs=$docs -Dsphinx_build=$sphinx_build -Dinstall_blobs=$blobs \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ diff --git a/meson.build b/meson.build index 8b1e240a3ec3..7968c40f630f 100644 --- a/meson.build +++ b/meson.build @@ -1583,9 +1583,18 @@ if not fdt.found() and fdt_required.length() > 0 error('fdt not available but required by targets ' + ', '.join(fdt_required)) endif +ucontext = dependency('libucontext', kwargs: static_kwargs, required : false) +if not ucontext.found() and get_option('ucontext').enabled() + libucontext_proj = subproject('libucontext', + default_options: ['default_library=static', + 'freestanding=true']) + ucontext = libucontext_proj.get_variable('libucontext_dep') +endif + config_host_data.set('CONFIG_CAPSTONE', capstone.found()) config_host_data.set('CONFIG_FDT', fdt.found()) config_host_data.set('CONFIG_SLIRP', slirp.found()) +config_host_data.set('CONFIG_LIBUCONTEXT', ucontext.found()) ##################### # Generated sources # @@ -1833,7 +1842,7 @@ util_ss.add_all(trace_ss) util_ss = util_ss.apply(config_all, strict: false) libqemuutil = static_library('qemuutil', sources: util_ss.sources() + stub_ss.sources() + genh, - dependencies: [util_ss.dependencies(), m, glib, socket, malloc]) + dependencies: 
[util_ss.dependencies(), m, glib, socket, malloc, ucontext]) qemuutil = declare_dependency(link_with: libqemuutil, sources: genh + version_res) @@ -2577,6 +2586,7 @@ summary(summary_info, bool_yn: true, section: 'Targets and accelerators') # Block layer summary_info = {} +summary_info += {'libucontext support': ucontext.found()} summary_info += {'coroutine backend': config_host['CONFIG_COROUTINE_BACKEND']} summary_info += {'coroutine pool': config_host['CONFIG_COROUTINE_POOL'] == '1'} if have_block diff --git a/meson_options.txt b/meson_options.txt index 4594d42769d6..6c29ea93300a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -110,6 +110,8 @@ option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', description: 'SEEK_HOLE/SEEK_DATA support for FUSE exports') +option('ucontext', type : 'feature', value : 'disabled', + description: 'libucontext support') option('vhost_user_blk_server', type: 'feature', value: 'auto', description: 'build vhost-user-blk server') diff --git a/subprojects/libucontext b/subprojects/libucontext new file mode 160000 index 000000000000..9b1d8f01a6e9 --- /dev/null +++ b/subprojects/libucontext @@ -0,0 +1 @@ +Subproject commit 9b1d8f01a6e99166f9808c79966abe10786de8b6 diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c index 904b375192ca..220c57a743af 100644 --- a/util/coroutine-ucontext.c +++ b/util/coroutine-ucontext.c @@ -23,7 +23,16 @@ #undef _FORTIFY_SOURCE #endif #include "qemu/osdep.h" +#if defined(CONFIG_LIBUCONTEXT) +#include +#define ucontext_t libucontext_ucontext_t +#define getcontext libucontext_getcontext +#define setcontext libucontext_setcontext +#define swapcontext libucontext_swapcontext +#define makecontext libucontext_makecontext +#else #include +#endif #include "qemu/coroutine_int.h" #ifdef CONFIG_VALGRIND_H From 6c39f9e99bd7c284d2328026ff6ab0b509a43fbd Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Sat, 27 Mar 
2021 15:46:31 -0600 Subject: [PATCH 25/36] get building for an iOS target, as well --- block/file-posix.c | 21 +++++++++++++++++++++ configure | 41 ++++++++++++++++++++++++++++++++++++++++- include/qemu/osdep.h | 2 +- meson.build | 7 +++---- tests/qtest/meson.build | 4 ++-- 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index dcd2a2375bfe..913b82210ba3 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -279,6 +279,13 @@ static int raw_normalize_devicepath(const char **filename, Error **errp) } #endif +#if defined(CONFIG_IOS) +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) +{ + return -ENOTSUP; /* not supported on iOS */ +} +#else /* CONFIG_IOS */ + /* * Get logical block size via ioctl. On success store it in @sector_size_p. */ @@ -312,6 +319,8 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) return success ? 0 : -errno; } +#endif + /** * Get physical block size of @fd. * On success, store it in @blk_size and return 0. 
@@ -1403,12 +1412,24 @@ static bool preadv_present = true; static ssize_t qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* preadv introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return preadv(fd, iov, nr_iov, offset); } static ssize_t qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) { +#ifdef CONFIG_DARWIN /* pwritev introduced in macOS 11 */ + if (!__builtin_available(macOS 11, iOS 14, watchOS 7, tvOS 14, *)) { + preadv_present = false; + return -ENOSYS; + } else +#endif return pwritev(fd, iov, nr_iov, offset); } diff --git a/configure b/configure index 1c5c9fc0f06a..edb2a5257f9c 100755 --- a/configure +++ b/configure @@ -572,6 +572,21 @@ EOF compile_object } +check_ios() { + cat > $TMPC < $TMPC < @@ -614,7 +629,11 @@ elif check_define __DragonFly__ ; then elif check_define __NetBSD__; then targetos='NetBSD' elif check_define __APPLE__; then - targetos='Darwin' + if check_ios ; then + targetos='iOS' + else + targetos='Darwin' + fi else # This is a fatal error, but don't report it yet, because we # might be going to just print the --help text, or it might @@ -630,6 +649,22 @@ case $targetos in Darwin) HOST_DSOSUF=".dylib" ;; +iOS) + bsd="yes" + darwin="yes" + ios="yes" + if [ "$cpu" = "x86_64" ] ; then + QEMU_CFLAGS="-arch x86_64 $QEMU_CFLAGS" + QEMU_LDFLAGS="-arch x86_64 $QEMU_LDFLAGS" + fi + host_block_device_support="no" + audio_drv_list="" + audio_possible_drivers="" + QEMU_LDFLAGS="-framework CoreFoundation $QEMU_LDFLAGS" + # Disable attempts to use ObjectiveC features in os/object.h since they + # won't work when we're compiling with gcc as a C compiler. 
+ QEMU_CFLAGS="-DOS_OBJECT_USE_OBJC=0 $QEMU_CFLAGS" +;; SunOS) # $(uname -m) returns i86pc even on an x86_64 box, so default based on isainfo if test -z "$cpu" && test "$(isainfo -k)" = "amd64"; then @@ -5551,6 +5586,10 @@ if test "$darwin" = "yes" ; then echo "CONFIG_DARWIN=y" >> $config_host_mak fi +if test "$ios" = "yes" ; then + echo "CONFIG_IOS=y" >> $config_host_mak +fi + if test "$solaris" = "yes" ; then echo "CONFIG_SOLARIS=y" >> $config_host_mak fi diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 5bd1a6776915..76cfa8f83562 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -685,7 +685,7 @@ size_t qemu_get_host_physmem(void); /** * Platforms which do not support system() return ENOSYS */ -#ifndef HAVE_SYSTEM_FUNCTION +#if !defined(HAVE_SYSTEM_FUNCTION) || defined(CONFIG_IOS) #define system platform_does_not_support_system static inline int platform_does_not_support_system(const char *command) { diff --git a/meson.build b/meson.build index 7968c40f630f..6c758cb0c8ad 100644 --- a/meson.build +++ b/meson.build @@ -181,7 +181,7 @@ if targetos == 'windows' include_directories: include_directories('.')) elif targetos == 'darwin' coref = dependency('appleframeworks', modules: 'CoreFoundation') - iokit = dependency('appleframeworks', modules: 'IOKit', required: false) + iokit = dependency('appleframeworks', modules: 'IOKit', required: 'CONFIG_IOS' not in config_host) elif targetos == 'sunos' socket = [cc.find_library('socket'), cc.find_library('nsl'), @@ -1056,8 +1056,7 @@ if get_option('cfi') add_global_link_arguments(cfi_flags, native: false, language: ['c', 'cpp', 'objc']) endif -have_host_block_device = (targetos != 'darwin' or - cc.has_header('IOKit/storage/IOMedia.h')) +have_host_block_device = (targetos != 'darwin' or (cc.has_header('IOKit/storage/IOMedia.h') and ('CONFIG_IOS' not in config_host))) ################# # config-host.h # @@ -1153,7 +1152,7 @@ config_host_data.set('HAVE_SYS_IOCCOM_H', cc.has_header('sys/ioccom.h')) 
config_host_data.set('HAVE_SYS_KCOV_H', cc.has_header('sys/kcov.h')) config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include ')) config_host_data.set('HAVE_HOST_BLOCK_DEVICE', have_host_block_device) -config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h')) +config_host_data.set('HAVE_SYS_DISK_H', cc.has_header('sys/disk.h') and ('CONFIG_IOS' not in config_host)) config_host_data.set('CONFIG_PREADV', cc.has_function('preadv', prefix: '#include ')) diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index 902cfef7cb2f..5d2b43c7c6ce 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -68,12 +68,11 @@ qtests_i386 = \ (config_all_devices.has_key('CONFIG_TPM_TIS_ISA') ? ['tpm-tis-swtpm-test'] : []) + \ (config_all_devices.has_key('CONFIG_RTL8139_PCI') ? ['rtl8139-test'] : []) + \ (config_all_devices.has_key('CONFIG_E1000E_PCI_EXPRESS') ? ['fuzz-e1000e-test'] : []) + \ + (not config_host.has_key('CONFIG_IOS') ? ['bios-tables-test', 'hd-geo-test'] : []) + \ qtests_pci + \ ['fdc-test', 'ide-test', - 'hd-geo-test', 'boot-order-test', - 'bios-tables-test', 'rtc-test', 'i440fx-test', 'fw_cfg-test', @@ -182,6 +181,7 @@ qtests_aarch64 = \ (cpu != 'arm' ? ['bios-tables-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? ['tpm-tis-device-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? ['tpm-tis-device-swtpm-test'] : []) + \ + (cpu != 'arm' and not config_host.has_key('CONFIG_IOS') ? 
['bios-tables-test'] : []) + \ ['arm-cpu-features', 'numa-test', 'boot-serial-test', From 1e4d72b004c26724cd049798b3370492016cd3b0 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Mon, 29 Mar 2021 13:26:08 -0600 Subject: [PATCH 26/36] TCTI: add TCTI TCG backend for acceleration on non-JIT AArch64 --- accel/tcg/translate-all.c | 6 +- configure | 10 +- disas.c | 2 + include/disas/dis-asm.h | 1 + include/exec/exec-all.h | 4 + include/tcg/tcg.h | 2 +- meson.build | 24 + meson_options.txt | 2 + scripts/mtest2make.py | 24 +- tcg/aarch64-tcti/README.md | 1026 +++++++++++++++++++ tcg/aarch64-tcti/tcg-target-con-set.h | 21 + tcg/aarch64-tcti/tcg-target-con-str.h | 11 + tcg/aarch64-tcti/tcg-target.c.inc | 1347 +++++++++++++++++++++++++ tcg/aarch64-tcti/tcg-target.h | 220 ++++ tcg/aarch64-tcti/tcti-gadget-gen.py | 788 +++++++++++++++ tcg/tcg.c | 14 +- 16 files changed, 3476 insertions(+), 26 deletions(-) create mode 100644 tcg/aarch64-tcti/README.md create mode 100644 tcg/aarch64-tcti/tcg-target-con-set.h create mode 100644 tcg/aarch64-tcti/tcg-target-con-str.h create mode 100644 tcg/aarch64-tcti/tcg-target.c.inc create mode 100644 tcg/aarch64-tcti/tcg-target.h create mode 100755 tcg/aarch64-tcti/tcti-gadget-gen.py diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 0ca0a8c5c23c..b25b17e44c84 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -1129,7 +1129,7 @@ static bool alloc_code_gen_buffer_anon(size_t size, int prot, return true; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) #ifdef CONFIG_POSIX #include "qemu/memfd.h" @@ -1256,7 +1256,7 @@ static bool alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp) static bool alloc_code_gen_buffer_splitwx(size_t size, Error **errp) { -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) # ifdef CONFIG_DARWIN return 
alloc_code_gen_buffer_splitwx_vmremap(size, errp); # endif @@ -1289,7 +1289,7 @@ static bool alloc_code_gen_buffer(size_t size, int splitwx, Error **errp) prot = PROT_READ | PROT_WRITE | PROT_EXEC; flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) /* The tcg interpreter does not need execute permission. */ prot = PROT_READ | PROT_WRITE; #elif defined(CONFIG_DARWIN) diff --git a/configure b/configure index edb2a5257f9c..d0053845169c 100755 --- a/configure +++ b/configure @@ -362,6 +362,7 @@ tsan="no" fortify_source="$default_feature" strip_opt="yes" tcg_interpreter="false" +tcg_threaded_interpreter="false" bigendian="no" mingw32="no" gcov="no" @@ -1150,6 +1151,10 @@ for opt do ;; --enable-tcg-interpreter) tcg_interpreter="true" ;; + --disable-tcg-tcti) tcg_threaded_interpreter="false" + ;; + --enable-tcg-tcti) tcg_threaded_interpreter="true" + ;; --disable-cap-ng) cap_ng="disabled" ;; --enable-cap-ng) cap_ng="enabled" @@ -6510,9 +6515,8 @@ NINJA=$ninja $meson setup \ -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \ -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi \ $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \ - -Dtcg_interpreter=$tcg_interpreter -Dshared_lib=$shared_lib \ - $cross_arg \ - "$PWD" "$source_path" + -Dtcg_interpreter=$tcg_interpreter -Dtcg_threaded_interpreter=$tcg_threaded_interpreter\ + -Dshared_lib=$shared_lib $cross_arg "$PWD" "$source_path" if test "$?" 
-ne 0 ; then error_exit "meson setup failed" diff --git a/disas.c b/disas.c index a61f95b580b8..cea0f9019e49 100644 --- a/disas.c +++ b/disas.c @@ -152,6 +152,8 @@ static void initialize_debug_host(CPUDebug *s) #endif #if defined(CONFIG_TCG_INTERPRETER) s->info.print_insn = print_insn_tci; +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) + s->info.print_insn = print_insn_tcti; #elif defined(__i386__) s->info.mach = bfd_mach_i386_i386; s->info.print_insn = print_insn_i386; diff --git a/include/disas/dis-asm.h b/include/disas/dis-asm.h index 13fa1edd411e..ded69ba2ffaa 100644 --- a/include/disas/dis-asm.h +++ b/include/disas/dis-asm.h @@ -411,6 +411,7 @@ typedef struct disassemble_info { typedef int (*disassembler_ftype) (bfd_vma, disassemble_info *); int print_insn_tci(bfd_vma, disassemble_info*); +int print_insn_tcti(bfd_vma, disassemble_info*); int print_insn_big_mips (bfd_vma, disassemble_info*); int print_insn_little_mips (bfd_vma, disassemble_info*); int print_insn_nanomips (bfd_vma, disassemble_info*); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 6b036cae8f65..a8f2295decd2 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -543,7 +543,11 @@ void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr); #if defined(CONFIG_TCG_INTERPRETER) extern __thread uintptr_t tci_tb_ptr; # define GETPC() tci_tb_ptr +#elif defined(CONFIG_TCG_THREADED_INTERPRETER) +extern __thread uintptr_t tcti_call_return_address; +# define GETPC() tcti_call_return_address #else +/* Note that this is correct for TCTI also; whose gadget behaves like native code. 
*/ # define GETPC() \ ((uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0))) #endif diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 0f0695e90da2..cfcd069bf3f6 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -1296,7 +1296,7 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi) #define TB_EXIT_IDXMAX 1 #define TB_EXIT_REQUESTED 3 -#ifdef CONFIG_TCG_INTERPRETER +#if defined(CONFIG_TCG_INTERPRETER) || defined(CONFIG_TCG_THREADED_INTERPRETER) uintptr_t tcg_qemu_tb_exec(CPUArchState *env, const void *tb_ptr); #else typedef uintptr_t tcg_prologue_fn(CPUArchState *env, const void *tb_ptr); diff --git a/meson.build b/meson.build index 6c758cb0c8ad..2bcd98dd0865 100644 --- a/meson.build +++ b/meson.build @@ -58,6 +58,7 @@ python = import('python').find_installation() supported_oses = ['windows', 'freebsd', 'netbsd', 'openbsd', 'darwin', 'sunos', 'linux'] supported_cpus = ['ppc', 'ppc64', 's390x', 'riscv32', 'riscv64', 'x86', 'x86_64', 'arm', 'aarch64', 'mips', 'mips64', 'sparc', 'sparc64'] +tcti_supported_cpus = ['aarch64'] cpu = host_machine.cpu_family() targetos = host_machine.system() @@ -246,6 +247,25 @@ if not get_option('tcg').disabled() endif if get_option('tcg_interpreter') tcg_arch = 'tci' + elif get_option('tcg_threaded_interpreter') + if cpu not in tcti_supported_cpus + error('Unsupported CPU @0@ for TCTI, try --enable-tcg-interpreter'.format(cpu)) + else + warning('TCTI is extremely experimental and incomplete! Things might break!') + tcg_arch = '@0@-tcti'.format(cpu) + endif + + # Tell our compiler how to generate our TCTI gadgets. 
+ gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) + tcti_gadgets = custom_target('tcti-gadgets.c.inc', + output: 'tcti-gadgets.c.inc', + input: gadget_generator, + command: [find_program(gadget_generator), '@OUTPUT@'], + build_by_default: true, + build_always_stale: false) + + genh += tcti_gadgets + elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' elif config_host['ARCH'] == 's390x' @@ -1280,6 +1300,8 @@ foreach target : target_dirs config_all += { sym: 'y' } if sym == 'CONFIG_TCG' and tcg_arch == 'tci' config_target += { 'CONFIG_TCG_INTERPRETER': 'y' } + elif sym == 'CONFIG_TCG' and tcg_arch.endswith('tcti') + config_target += { 'CONFIG_TCG_THREADED_INTERPRETER': 'y' } elif sym == 'CONFIG_XEN' and have_xen_pci_passthrough config_target += { 'CONFIG_XEN_PCI_PASSTHROUGH': 'y' } endif @@ -2571,6 +2593,8 @@ summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')} if config_all.has_key('CONFIG_TCG') if get_option('tcg_interpreter') summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'} + elif get_option('tcg_threaded_interpreter') + summary_info += {'TCG backend': 'TCTI (TCG with threaded-dispatch bytecode interpreter, experimental and slow; but faster than TCI)'} else summary_info += {'TCG backend': 'native (@0@)'.format(cpu)} endif diff --git a/meson_options.txt b/meson_options.txt index 6c29ea93300a..5aa68672c2ff 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,8 @@ option('tcg', type: 'feature', value: 'auto', description: 'TCG support') option('tcg_interpreter', type: 'boolean', value: false, description: 'TCG with bytecode interpreter (experimental and slow)') +option('tcg_threaded_interpreter', type: 'boolean', value: false, + description: 'TCG with threaded-dispatch bytecode interpreter (experimental and slow, but less slow than TCI)') option('cfi', type: 'boolean', value: 'false', description: 'Control-Flow Integrity (CFI)') option('cfi_debug', type: 'boolean', value: 
'false', diff --git a/scripts/mtest2make.py index ee072c05025a..b0467ab56545 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -75,18 +75,18 @@ def process_tests(test, targets, suites): print('run-test-%d: $(.test.deps.%d)' % (i,i)) print('\t@$(call .test.run,%d,$(.test.output-format))' % (i,)) - test_suites = test['suite'] or ['default'] - is_slow = any(s.endswith('-slow') for s in test_suites) - for s in test_suites: - # The suite name in the introspection info is "PROJECT:SUITE" - s = s.split(':')[1] - if s.endswith('-slow'): - s = s[:-5] - if is_slow: - suites[s].slow_tests.append(i) - else: - suites[s].tests.append(i) - suites[s].executables.add(executable) + #test_suites = test['suite'] or ['default'] + #is_slow = any(s.endswith('-slow') for s in test_suites) + #for s in test_suites: + # # The suite name in the introspection info is "PROJECT:SUITE" + # s = s.split(':')[1] + # if s.endswith('-slow'): + # s = s[:-5] + # if is_slow: + # suites[s].slow_tests.append(i) + # else: + # suites[s].tests.append(i) + # suites[s].executables.add(executable) def emit_prolog(suites, prefix): all_tap = ' '.join(('%s-report-%s.tap' % (prefix, k) for k in suites.keys())) diff --git a/tcg/aarch64-tcti/README.md b/tcg/aarch64-tcti/README.md new file mode 100644 index 000000000000..eb848e5a9e57 --- /dev/null +++ b/tcg/aarch64-tcti/README.md @@ -0,0 +1,1026 @@ +# QEMU Tiny-Code Threaded Interpreter (AArch64) + +A TCG backend that chains together JOP/ROP-ish gadgets to massively reduce interpreter overhead vs TCI. +Platform-dependent; but usable when JIT isn't available; e.g. on platforms that lack WX mappings. The general idea is to squish the addresses of a gadget sequence into a "queue" and then write each gadget so it ends in a "dequeue-jump". + +Execution occurs by jumping into the first gadget, and letting it just play back some linear-overhead native code sequences for a while. 
+ +Since TCG-TCI is optimized for sets of 16 GP registers and aarch64 has 30, we could easily keep JIT/QEMU and guest state separate, and since 16\*16 is reasonably small we could actually have a set of reasonable gadgets for each combination of operands. + + +## Register Convention + +| Regs | Use | +| :------ | :-------------------- | +| x1-x15 | Guest Registers | +| x24 | TCTI temporary | +| x25 | saved IP during call | +| x26 | TCTI temporary | +| x27 | TCTI temporary | +| x28 | Thread-stream pointer | +| x30 | Link register | +| SP | Stack Pointer, host | +| PC | Program Counter, host | + +In pseudocode: + +| Symbol | Meaning | +| :----- | :---------------------------------- | +| Rd | stand-in for destination register | +| Rn | stand-in for first source register | +| Rm | stand-in for second source register | + +## Gadget Structure + +### End of gadget + +Each gadget ends by advancing our bytecode pointer, and then executing from the new location. + +```asm +# Load our next gadget address from our bytecode stream, advancing it, and jump to the next gadget. + +ldr x27, [x28], #8\n +br x27 +``` + +## Calling into QEMU's C codebase + +When calling into C, we lose control over which registers are used. Accordingly, we'll need to save +registers relevant to TCTI: + +```asm +str x25, [sp, #-16]! +stp x14, x15, [sp, #-16]! +stp x12, x13, [sp, #-16]! +stp x10, x11, [sp, #-16]! +stp x8, x9, [sp, #-16]! +stp x6, x7, [sp, #-16]! +stp x4, x5, [sp, #-16]! +stp x2, x3, [sp, #-16]! +stp x0, x1, [sp, #-16]! +stp x28, lr, [sp, #-16]! +``` + +Upon returning to the gadget stream, we'll then restore them. 
 + +```asm +ldp x28, lr, [sp], #16 +ldp x0, x1, [sp], #16 +ldp x2, x3, [sp], #16 +ldp x4, x5, [sp], #16 +ldp x6, x7, [sp], #16 +ldp x8, x9, [sp], #16 +ldp x10, x11, [sp], #16 +ldp x12, x13, [sp], #16 +ldp x14, x15, [sp], #16 +ldr x25, [sp], #16 +``` + +## TCG Operations + +Each operation needs an implementation for every platform; and probably a set of gadgets for each possible set of operands. + +At 14 GP registers, that means that + +1 operand =\> 16 gadgets +2 operands =\> 256 gadgets +3 operands =\> 4096 gadgets + +### call + +Calls a helper function by address. + +**IR Format**: `br ` +**Gadget type:** single + +```asm + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. This is necessary so the GETPC() + # macro works correctly as used in helper functions. + "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +``` + +### br + +Branches to a given immediate address. Branches are unconditional. + +**IR Format**: `br ` +**Gadget type:** single + +```asm +# Use our immediate argument as our new bytecode-pointer location. +ldr x28, [x28] +``` + +### setcond_i32 + +Performs a comparison between two 32-bit operands. + +**IR Format**: `setcond32 , Rd, Rn, Rm` +**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960) + +```asm +subs Wd, Wn, Wm +cset Wd, +``` + +| QEMU Cond | AArch64 Cond | +| :-------- | :----------- | +| EQ | EQ | +| NE | NE | +| LT | LT | +| GE | GE | +| LE | LE | +| GT | GT | +| LTU | LO | +| GEU | HS | +| LEU | LS | +| GTU | HI | + +### setcond_i64 + +Performs a comparison between two 64-bit operands. 
+ +**IR Format**: `setcond64 , Rd, Rn, Rm` +**Gadget type:** treated as 10 operations with variants for every `Rd`/`Rn`/`Rm` (40,960) + +```asm +subs Xd, Xn, Xm +cset Xd, +``` + +Comparison chart is the same as the `_i32` variant. + +### brcond_i32 + +Compares two 32-bit numbers, and branches if the comparison is true. + +**IR Format**: `brcond Rn, Rm, ` +**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560) + +```asm +# Perform our comparison and conditional branch. +subs Wrz, Wn, Wm +br taken + + # Consume the branch target, without using it. + add x28, x28, #8 + + # Perform our end-of-instruction epilogue. + + +taken: + + # Update our bytecode pointer to take the label. + ldr x28, [x28] +``` + +Comparison chart is the same as in `setcond_i32` . + +### brcond_i64 + +Compares two 64-bit numbers, and branches if the comparison is true. + +**IR Format**: `brcond Rn, Rm, ` +**Gadget type:** treated as 10 operations with variants for every `Rn`/`Rm` (2560) + +```asm +# Perform our comparison and conditional branch. +subs Xrz, Xn, Xm +br taken + + # Consume the branch target, without using it. + add x28, x28, #8 + + # Perform our end-of-instruction epilogue. + + +taken: + + # Update our bytecode pointer to take the label. + ldr x28, [x28] +``` + +Comparison chart is the same as in `setcond_i32` . + +### mov_i32 + +Moves a value from a register to another register. + +**IR Format**: `mov Rd, Rn` +**Gadget type:** gadget per `Rd` + `Rn` combo (256) + +```asm +mov Rd, Rn +``` + +### mov_i64 + +Moves a value from a register to another register. + +**IR Format**: `mov Rd, Rn` +**Gadget type:** gadget per `Rd` + `Rn` combo (256) + +```asm +mov Xd, Xn +``` + +### tci_movi_i32 + +Moves an 32b immediate into a register. + +**IR Format**: `mov Rd, #imm32` +**Gadget type:** gadget per `Rd` (16) + +```asm +ldr w27, [x28], #4 +mov Wd, w27 +``` + +### tci_movi_i64 + +Moves an 64b immediate into a register. 
+ +**IR Format**: `mov Rd, #imm64` +**Gadget type:** gadget per `Rd` (16) + +```asm +ldr x27, [x28], #4 +mov Xd, x27 +``` + +### ld8u_i32 / ld8u_i64 + +Load byte from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrb Xd, [Xn, x27] +``` + +### ld8s_i32 / ld8s_i64 + +Load byte from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsb Xd, [Xn, x27] +``` + +### ld16u_i32 / ld16u_i64 + +Load 16b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrh Wd, [Xn, x27] +``` + +### ld16s_i32 / ld16s_i64 + +Load 16b from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsh Xd, [Xn, x27] +``` + +### ld32u_i32 / ld32u_i64 + +Load 32b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldr Wd, [Xn, x27] +``` + +### ld32s_i64 + +Load 32b from host memory to register; sign extending. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldrsw Xd, [Xn, x27] +``` + +### ld_i64 + +Load 64b from host memory to register. + +**IR Format**: `ldr Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +ldr Xd, [Xn, x27] +``` + +### st8_i32 / st8_i64 + +Stores byte from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +strb Wd, [Xn, x27] +``` + +### st16_i32 / st16_i64 + +Stores 16b from register to host memory. 
+ +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +strh Wd, [Xn, x27] +``` + +### st_i32 / st32_i64 + +Stores 32b from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +str Wd, [Xn, x27] +``` + +### st_i64 + +Stores 64b from register to host memory. + +**IR Format**: `str Rd, Rn, ` +**Gadget type:** gadget per `Rd` & `Rn` (256) + +```asm +ldrsw x27, [x28], #4 +str Xd, [Xn, x27] +``` + +### qemu_ld_i32 + +Loads 32b from _guest_ memory to register. + +**IR Format**: `ld Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +### qemu_ld_i64 + +Loads 64b from _guest_ memory to register. + +**IR Format**: `ld Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +### qemu_st_i32 + +Stores 32b from a register to _guest_ memory. + +**IR Format**: `st Rd, , ` +**Gadget type:** thunk per `Rd` into C impl + +### qemu_st_i64 + +Stores 64b from a register to _guest_ memory. + +**IR Format**: `st Rd, , ` +**Gadget type:** thunk per `Rd` into C impl? + +#### Note + +See note on `qemu_ld_i32`. + +### add_i32 + +Adds two 32-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +add Wd, Wn, Wm +``` + +### add_i64 + +Adds two 64-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +add Xd, Xn, Xm +``` + +### sub_i32 + +Subtracts two 32-bit numbers. + +**IR Format**: `add Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +Sub Wd, Wn, Wm +``` + +### sub_i64 + +Subtracts two 64-bit numbers. + +**IR Format**: `sub Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sub Xd, Xn, Xm +``` + +### mul_i32 + +Multiplies two 32-bit numbers. 
+ +**IR Format**: `mul Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +mul Wd, Wn, Wm +``` + +### mul_i64 + +Multiplies two 64-bit numbers. + +**IR Format**: `mul Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +mul Xd, Xn, Xm +``` + +### div_i32 + +Divides two 32-bit numbers; considering them signed. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv Wd, Wn, Wm +``` + +### div_i64 + +Divides two 64-bit numbers; considering them signed. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv Xd, Xn, Xm +``` + +### divu_i32 + +Divides two 32-bit numbers; considering them unsigned. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv Wd, Wn, Wm +``` + +### divu_i64 + +Divides two 32-bit numbers; considering them unsigned. + +**IR Format**: `div Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv Xd, Xn, Xm +``` + +### rem_i32 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them signed. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv w27, Wn, Wm +msub Wd, w27, Wm, Wn +``` + +### rem_i64 + +Computes the division remainder (modulus) of two 64-bit numbers; considering them signed. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +sdiv x27, Xn, Xm +msub Xd, x27, Xm, Xn +``` + +### remu_i32 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned. + +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv w27, Wn, Wm +msub Wd, w27, Wm, Wn +``` + +### remu_i64 + +Computes the division remainder (modulus) of two 32-bit numbers; considering them unsigned. 
+ +**IR Format**: `rem Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +udiv x27, Xn, Xm +msub Xd, x27, Xm, Xn +``` + +### not_i32 + +Logically inverts a 32-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +mvn Wd, Wn +``` + +### not_i64 + +Logically inverts a 64-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +mvn Xd, Xn +``` + +### neg_i32 + +Arithmetically inverts (two's compliment) a 32-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +neg Wd, Wn +``` + +### neg_i64 + +Arithmetically inverts (two's compliment) a 64-bit number. + +**IR Format**: `not Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +neg Xd, Xn +``` + +### and_i32 + +Logically ANDs two 32-bit numbers. + +**IR Format**: `and Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +and Wd, Wn, Wm +``` + +### and_i64 + +Logically ANDs two 64-bit numbers. + +**IR Format**: `and Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +and Xd, Xn, Xm +``` + +### or_i32 + +Logically ORs two 32-bit numbers. + +**IR Format**: `or Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +or Wd, Wn, Wm +``` + +### or_i64 + +Logically ORs two 64-bit numbers. + +**IR Format**: `or Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +or Xd, Xn, Xm +``` + +### xor_i32 + +Logically XORs two 32-bit numbers. + +**IR Format**: `xor Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +eor Wd, Wn, Wm +``` + +### xor_i64 + +Logically XORs two 64-bit numbers. + +**IR Format**: `xor Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +eor Xd, Xn, Xm +``` + +### shl_i32 + +Logically shifts a 32-bit number left. 
+ +**IR Format**: `shl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsl Wd, Wn, Wm +``` + +### shl_i64 + +Logically shifts a 64-bit number left. + +**IR Format**: `shl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsl Xd, Xn, Xm +``` + +### shr_i32 + +Logically shifts a 32-bit number right. + +**IR Format**: `shr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsr Wd, Wn, Wm +``` + +### shr_i64 + +Logically shifts a 64-bit number right. + +**IR Format**: `shr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +lsr Xd, Xn, Xm +``` + +### sar_i32 + +Arithmetically shifts a 32-bit number right. + +**IR Format**: `sar Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +asr Wd, Wn, Wm +``` + +### sar_i64 + +Arithmetically shifts a 64-bit number right. + +**IR Format**: `sar Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +asr Xd, Xn, Xm +``` + +### rotl_i32 + +Rotates a 32-bit number left. + +**IR Format**: `rotl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +rol Wd, Wn, Wm +``` + +### rotl_i64 + +Rotates a 64-bit number left. + +**IR Format**: `rotl Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +rol Xd, Xn, Xm +``` + +### rotr_i32 + +Rotates a 32-bit number right. + +**IR Format**: `rotr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +ror Wd, Wn, Wm +``` + +### rotr_i64 + +Rotates a 64-bit number right. + +**IR Format**: `rotr Rd, Rn, Rm` +**Gadget type:** gadget per `Rd`, `Rn`, `Rm` (4096) + +```asm +ror Xd, Xn, Xm +``` + +### deposit_i32 + +Optional; not currently implementing. + +### deposit_i64 + +Optional; not currently implementing. + +### ext8s_i32 + +Sign extends the lower 8b of a register into a 32b destination. 
+ +**IR Format**: `ext8s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtb Wd, Wn +``` + +### ext8s_i64 + +Sign extends the lower 8b of a register into a 64b destination. + +**IR Format**: `ext8s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtb Xd, Wn +``` + +### ext8u_i32 + +Zero extends the lower 8b of a register into a 32b destination. + +**IR Format**: `ext8u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xff +``` + +### ext8u_i64 + +Zero extends the lower 8b of a register into a 64b destination. + +**IR Format**: `ext8u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xff +``` + +### ext16s_i32 + +Sign extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxth Xd, Wn +``` + +### ext16s_i64 + +Sign extends the lower 16b of a register into a 64b destination. + +**IR Format**: `ext16s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxth Xd, Wn +``` + +### ext16u_i32 + +Zero extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Wd, Wn, #0xffff +``` + +### ext16u_i64 + +Zero extends the lower 16b of a register into a 32b destination. + +**IR Format**: `ext16u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Wd, Wn, #0xffff +``` + +### ext32s_i64 + +Sign extends the lower 32b of a register into a 64b destination. + +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### ext32u_i64 + +Zero extends the lower 32b of a register into a 64b destination. + +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### ext_i32_i64 + +Sign extends the lower 32b of a register into a 64b destination. 
+ +**IR Format**: `ext32s Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +sxtw Xd, Wn +``` + +### extu_i32_i64 + +Zero extends the lower 32b of a register into a 32b destination. + +**IR Format**: `ext32u Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +and Xd, Xn, #0xffffffff +``` + +### bswap16_i32 + +Byte-swaps a 16b quantity. + +**IR Format**: `bswap16 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev w27, Wn +lsr Wd, w27, #16 +``` + +### bswap16_i64 + +Byte-swaps a 16b quantity. + +**IR Format**: `bswap16 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev w27, Wn +lsr Wd, w27, #16 +``` + +### bswap32_i32 + +Byte-swaps a 32b quantity. + +**IR Format**: `bswap32 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Wd, Wn +``` + +### bswap32_i64 + +Byte-swaps a 32b quantity. + +**IR Format**: `bswap32 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Wd, Wn +``` + +### bswap64_i64 + +Byte-swaps a 64b quantity. + +**IR Format**: `bswap64 Rd, Rn` +**Gadget type:** gadget per `Rd`, `Rn` (256) + +```asm +rev Xd, Xn +``` + +### exit_tb + +Exits the translation block. Has no gadget; but instead inserts the address of the translation block epilogue. + + +### mb + +Memory barrier. + +**IR Format**: `mb ` +**Gadget type:** gadget per type + +```asm +# !!! TODO +``` + +#### Note + +We still need to look up out how to map QEMU MB types map to AArch64 ones. This might take nuance. diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h new file mode 100644 index 000000000000..f51b7bcb13e7 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target-con-set.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* + * TCI target-specific constraint sets. + * Copyright (c) 2021 Linaro + */ + +/* + * C_On_Im(...) defines a constraint set with outputs and inputs. 
+ * Each operand should be a sequence of constraint letters as defined by + * tcg-target-con-str.h; the constraint combination is inclusive or. + */ +C_O0_I2(r, r) +C_O0_I3(r, r, r) +C_O0_I4(r, r, r, r) +C_O1_I1(r, r) +C_O1_I2(r, 0, r) +C_O1_I2(r, r, r) +C_O1_I4(r, r, r, r, r) +C_O2_I1(r, r, r) +C_O2_I2(r, r, r, r) +C_O2_I4(r, r, r, r, r, r) diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h new file mode 100644 index 000000000000..87c0f19e9c2e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target-con-str.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Define TCI target-specific operand constraints. + * Copyright (c) 2021 Linaro + */ + +/* + * Define constraint letters for register sets: + * REGS(letter, register_mask) + */ +REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc new file mode 100644 index 000000000000..d7bb67a92140 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -0,0 +1,1347 @@ +/* + * Tiny Code Threaded Intepreter for QEMU + * + * Copyright (c) 2021 Kate Temkin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 + +// Grab our gadget definitions. +// FIXME: use the system path instead of hardcoding this? +#include "tcti-gadgets.c.inc" + +/* Marker for missing code. */ +#define TODO() \ + do { \ + fprintf(stderr, "TODO %s:%u: %s()\n", \ + __FILE__, __LINE__, __func__); \ + tcg_abort(); \ + } while (0) + + +/* Enable TCTI assertions only when debugging TCG (and without NDEBUG defined). + * Without assertions, the interpreter runs much faster. */ +#if defined(CONFIG_DEBUG_TCG) +# define tcti_assert(cond) assert(cond) +#else +# define tcti_assert(cond) ((void)0) +#endif + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. 
+ */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) +{ + switch (op) { + case INDEX_op_ld8u_i32: + case INDEX_op_ld8s_i32: + case INDEX_op_ld16u_i32: + case INDEX_op_ld16s_i32: + case INDEX_op_ld_i32: + case INDEX_op_ld8u_i64: + case INDEX_op_ld8s_i64: + case INDEX_op_ld16u_i64: + case INDEX_op_ld16s_i64: + case INDEX_op_ld32u_i64: + case INDEX_op_ld32s_i64: + case INDEX_op_ld_i64: + case INDEX_op_not_i32: + case INDEX_op_not_i64: + case INDEX_op_neg_i32: + case INDEX_op_neg_i64: + case INDEX_op_ext8s_i32: + case INDEX_op_ext8s_i64: + case INDEX_op_ext16s_i32: + case INDEX_op_ext16s_i64: + case INDEX_op_ext8u_i32: + case INDEX_op_ext8u_i64: + case INDEX_op_ext16u_i32: + case INDEX_op_ext16u_i64: + case INDEX_op_ext32s_i64: + case INDEX_op_ext32u_i64: + case INDEX_op_ext_i32_i64: + case INDEX_op_extu_i32_i64: + case INDEX_op_bswap16_i32: + case INDEX_op_bswap16_i64: + case INDEX_op_bswap32_i32: + case INDEX_op_bswap32_i64: + case INDEX_op_bswap64_i64: + return C_O1_I1(r, r); + + case INDEX_op_st8_i32: + case INDEX_op_st16_i32: + case INDEX_op_st_i32: + case INDEX_op_st8_i64: + case INDEX_op_st16_i64: + case 
INDEX_op_st32_i64: + case INDEX_op_st_i64: + return C_O0_I2(r, r); + + case INDEX_op_div_i32: + case INDEX_op_div_i64: + case INDEX_op_divu_i32: + case INDEX_op_divu_i64: + case INDEX_op_rem_i32: + case INDEX_op_rem_i64: + case INDEX_op_remu_i32: + case INDEX_op_remu_i64: + case INDEX_op_add_i32: + case INDEX_op_add_i64: + case INDEX_op_sub_i32: + case INDEX_op_sub_i64: + case INDEX_op_mul_i32: + case INDEX_op_mul_i64: + case INDEX_op_and_i32: + case INDEX_op_and_i64: + case INDEX_op_andc_i32: + case INDEX_op_andc_i64: + case INDEX_op_eqv_i32: + case INDEX_op_eqv_i64: + case INDEX_op_nand_i32: + case INDEX_op_nand_i64: + case INDEX_op_nor_i32: + case INDEX_op_nor_i64: + case INDEX_op_or_i32: + case INDEX_op_or_i64: + case INDEX_op_orc_i32: + case INDEX_op_orc_i64: + case INDEX_op_xor_i32: + case INDEX_op_xor_i64: + case INDEX_op_shl_i32: + case INDEX_op_shl_i64: + case INDEX_op_shr_i32: + case INDEX_op_shr_i64: + case INDEX_op_sar_i32: + case INDEX_op_sar_i64: + case INDEX_op_rotl_i32: + case INDEX_op_rotl_i64: + case INDEX_op_rotr_i32: + case INDEX_op_rotr_i64: + case INDEX_op_setcond_i32: + case INDEX_op_setcond_i64: + return C_O1_I2(r, r, r); + + case INDEX_op_brcond_i32: + case INDEX_op_brcond_i64: + return C_O0_I2(r, r); + + case INDEX_op_qemu_ld_i32: + case INDEX_op_qemu_ld_i64: + return C_O1_I2(r, r, r); + case INDEX_op_qemu_st_i32: + case INDEX_op_qemu_st_i64: + return C_O0_I3(r, r, r); + + default: + g_assert_not_reached(); + } +} + +static const int tcg_target_reg_alloc_order[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, + TCG_REG_R6, + TCG_REG_R7, + TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, + /* + TCG_REG_R14, // AREG0 + TCG_REG_R15, // SP + */ +}; + +#if MAX_OPC_PARAM_IARGS != 6 +# error Fix needed, number of supported input arguments changed! 
+#endif + +static const int tcg_target_call_iarg_regs[] = { + TCG_REG_R0, + TCG_REG_R1, + TCG_REG_R2, + TCG_REG_R3, + TCG_REG_R4, + TCG_REG_R5, +}; + +static const int tcg_target_call_oarg_regs[] = { + TCG_REG_R0, +}; + +#ifdef CONFIG_DEBUG_TCG +static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { + "r00", + "r01", + "r02", + "r03", + "r04", + "r05", + "r06", + "r07", + "r08", + "r09", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15", +}; +#endif + +static bool patch_reloc(tcg_insn_unit *code_ptr, int type, + intptr_t value, intptr_t addend) +{ + /* tcg_out_reloc always uses the same type, addend. */ + tcg_debug_assert(type == sizeof(tcg_target_long)); + tcg_debug_assert(addend == 0); + tcg_debug_assert(value != 0); + if (TCG_TARGET_REG_BITS == 32) { + tcg_patch32(code_ptr, value); + } else { + tcg_patch64(code_ptr, value); + } + return true; +} + +#if defined(CONFIG_DEBUG_TCG_INTERPRETER) +/* Show current bytecode. Used by tcg interpreter. */ +void tci_disas(uint8_t opc) +{ + const TCGOpDef *def = &tcg_op_defs[opc]; + fprintf(stderr, "TCG %s %u, %u, %u\n", + def->name, def->nb_oargs, def->nb_iargs, def->nb_cargs); +} +#endif + +/* Write value (native size). */ +static void tcg_out_immediate(TCGContext *s, tcg_target_ulong v) +{ + if (TCG_TARGET_REG_BITS == 32) { + //tcg_out32(s, v); + tcg_out64(s, v); + } else { + tcg_out64(s, v); + } +} + +void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx, uintptr_t jmp_rw, uintptr_t addr) +{ + /* Get a pointer to our immediate, which exists after a single pointer. */ + uintptr_t immediate_addr = jmp_rw; + + /* Patch it to be match our target address. */ + qatomic_set((uint64_t *)immediate_addr, addr); +} + + +/** + * TCTI Thunk Helpers + */ + +#ifdef CONFIG_SOFTMMU + +// TODO: relocate these prototypes? 
+tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr); + +tcg_target_ulong helper_ret_ldub_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int8_t)helper_ret_ldub_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_le_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_le_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_le_ldul_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_lduw_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int16_t)helper_be_lduw_mmu(env, addr, oi, retaddr); +} + +tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) +{ + return (int32_t)helper_be_ldul_mmu(env, addr, oi, retaddr); +} + +#else +#error TCTI currently only supports use of the soft MMU. +#endif + + +/** + * TCTI Emmiter Helpers + */ + + +/* Write gadget pointer. */ +static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +{ + tcg_out_immediate(s, (tcg_target_ulong)gadget); +} + +/* Write gadget pointer, plus 64b immediate. 
*/ +static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +{ + tcg_out_nullary_gadget(s, gadget); + tcg_out64(s, immediate); +} + + +/* Write gadget pointer (one register). */ +static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0]); +} + + +/* Write gadget pointer (two registers). */ +static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); +} + + +/* Write gadget pointer (three registers). */ +static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +{ + tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); +} + + +/** + * Version of our LDST generator that defers to more optimized gadgets selectively. + */ +static void tcg_out_ldst_gadget_inner(TCGContext *s, + void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + unsigned reg0, unsigned reg1, uint32_t offset) +{ + int64_t extended_offset = (int32_t)offset; + bool is_negative = (extended_offset < 0); + + // Optimal case: we have a gadget that handles our specific offset, so we don't need to encode + // an immediate. This saves us a bunch of speed. :) + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. 
+ if (!is_negative) + { + uint64_t shifted_offset = (extended_offset >> 3); + bool aligned_to_8B = ((extended_offset & 0b111) == 0); + + bool have_optimized_gadget = (extended_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + bool have_shifted_gadget = (shifted_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN); + + // More optimal case: we have a gadget that directly encodes the argument. + if (have_optimized_gadget) { + tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + return; + } + + // Special case: it's frequent to have low-numbered positive offsets that are aligned + // to 16B boundaries + else if(aligned_to_8B && have_shifted_gadget) { + tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + return; + } + } + else { + uint64_t negated_offset = -(extended_offset); + + // More optimal case: we have a gadget that directly encodes the argument. + if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { + tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + return; + } + } + + // Less optimal case: we don't have a gadget specifically for this. Emit the general case immediate. + tcg_out_binary_gadget(s, gadget_base, reg0, reg1); + tcg_out64(s, extended_offset); //tcg_out32(s, offset); +} + +/* Shorthand for the above, that prevents us from having to specify the name three times. */ +#define tcg_out_ldst_gadget(s, name, a, b, c) \ + tcg_out_ldst_gadget_inner(s, name, \ + name ## _imm, \ + name ## _sh8_imm, \ + name ## _neg_imm, \ + a, b, c) + + + +/* Write label. */ +static void tcti_out_label(TCGContext *s, TCGLabel *label) +{ + if (label->has_value) { + tcg_out64(s, label->u.value); + tcg_debug_assert(label->u.value); + } else { + tcg_out_reloc(s, s->code_ptr, sizeof(tcg_target_ulong), label, 0); + s->code_ptr += sizeof(tcg_target_ulong); + } +} + +/** + * Generate a register-to-register MOV. 
+ */ +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) +{ + tcg_debug_assert(ret != arg); + + if (type == TCG_TYPE_I32) { + tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); + } else { + tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + } + + + return true; +} + + +static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + bool is_negative = (arg < 0); + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + return; + } + } + else { + + } + + // Emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i32, t0); + tcg_out64(s, arg); // TODO: make 32b? +} + + +static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) +{ + uint8_t is_negative = arg < 0; + + // We handle positive and negative gadgets separately, in order to allow for asymmetrical + // collections of pre-made gadgets. + if (!is_negative) + { + // More optimal case: we have a gadget that directly encodes the argument. + if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { + tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + return; + } + } + else { + + } + + // TODO: optimize the negative case, too? + + // Less optimal case: emit the mov and its immediate. + tcg_out_unary_gadget(s, gadget_movi_i64, t0); + tcg_out64(s, arg); +} + + +/** + * Generate an immediate-to-register MOV. + */ +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long arg) +{ + if (type == TCG_TYPE_I32) { + tcg_out_movi_i32(s, t0, arg); + } else { + tcg_out_movi_i64(s, t0, arg); + } +} + +/** + * Generate a CALL. 
+ */ +static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) +{ + tcg_out_nullary_gadget(s, gadget_call); + tcg_out64(s, (uintptr_t)arg); +} + +/** + * Generates LD instructions. + */ +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, + intptr_t arg2) +{ + + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + } +} + + +/** + * Generate every other operation. + */ +//static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) +{ + switch (opc) { + + // Exit translation, and return back to QEMU. + case INDEX_op_exit_tb: + // Emit a simple gadget with a known return code. + tcg_out_imm64_gadget(s, gadget_exit_tb, args[0]); + break; + + // Jump to a translation block. + case INDEX_op_goto_tb: + + // If we're using a direct jump, we'll emit a "relocation" that can be usd + // to patch our gadget stream with the target address, later. + if (s->tb_jmp_insn_offset) { + // Emit our gadget. + tcg_out_nullary_gadget(s, gadget_br); + + // Place our current instruction into our "relocation table", so it can + // be patched once we know where the branch will target... + s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s); + + // ... and emit our relocation. + tcg_out64(s, args[0]); + + + } else { + /* Indirect jump method. */ + TODO(); + } + set_jmp_reset_offset(s, args[0]); + break; + + // Simple branch. + case INDEX_op_br: + tcg_out_nullary_gadget(s, gadget_br); + tcti_out_label(s, arg_label(args[0])); + break; + + + // Set condition flag. + // a0 = Rd, a1 = Rn, a2 = Rm + case INDEX_op_setcond_i32: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. 
+ switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i32_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i32_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i32_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i32_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i32_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i32_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i32_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i32_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i32_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i32_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + case INDEX_op_setcond_i64: + { + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[3]) { + case TCG_COND_EQ: gadget = gadget_setcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_setcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_setcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_setcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_setcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_setcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_setcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_setcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_setcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_setcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + tcg_out_ternary_gadget(s, gadget, args[0], args[1], args[2]); + break; + } + + /** + * Load instructions. 
+ */ + + case INDEX_op_ld8u_i32: + case INDEX_op_ld8u_i64: + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i32: + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld8s_i64: + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16u_i32: + case INDEX_op_ld16u_i64: + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i32: + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ld16s_i64: + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i32: + case INDEX_op_ld32u_i64: + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + break; + + case INDEX_op_ld_i64: + tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ld32s_i64: + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + break; + + + /** + * Store instructions. + */ + case INDEX_op_st8_i32: + case INDEX_op_st8_i64: + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); + break; + + case INDEX_op_st16_i32: + case INDEX_op_st16_i64: + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i32: + case INDEX_op_st32_i64: + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_st_i64: + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + break; + + /** + * Arithmetic instructions. 
+ */ + + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i32: + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i32: + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i32: + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_or_i32: + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i32: + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i32: + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i32: + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i32: + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). 
*/ + // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + + case INDEX_op_add_i64: + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sub_i64: + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_mul_i64: + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_and_i64: + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ + //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + + case INDEX_op_or_i64: + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_xor_i64: + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shl_i64: + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_shr_i64: + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_sar_i64: + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + break; + + //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + // break; + + //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). 
*/ + // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + // break; + + case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i64: + { + static uint8_t last_brcond_i64 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. + switch(args[2]) { + case TCG_COND_EQ: gadget = gadget_brcond_i64_eq; break; + case TCG_COND_NE: gadget = gadget_brcond_i64_ne; break; + case TCG_COND_LT: gadget = gadget_brcond_i64_lt; break; + case TCG_COND_GE: gadget = gadget_brcond_i64_ge; break; + case TCG_COND_LE: gadget = gadget_brcond_i64_le; break; + case TCG_COND_GT: gadget = gadget_brcond_i64_gt; break; + case TCG_COND_LTU: gadget = gadget_brcond_i64_lo; break; + case TCG_COND_GEU: gadget = gadget_brcond_i64_hs; break; + case TCG_COND_LEU: gadget = gadget_brcond_i64_ls; break; + case TCG_COND_GTU: gadget = gadget_brcond_i64_hi; break; + default: + g_assert_not_reached(); + } + + // We'll select the which branch to used based on a cycling counter. + // This means we'll pick one of 16 identical brconds. Spreading this out + // helps the processor's branch prediction be less "squished", as not every + // branch is going throuh the same instruction. + tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + + // Branch target immediate. 
+ tcti_out_label(s, arg_label(args[3])); + break; + } + + + case INDEX_op_bswap16_i32: /* Optional (TCG_TARGET_HAS_bswap16_i32). */ + case INDEX_op_bswap16_i64: /* Optional (TCG_TARGET_HAS_bswap16_i64). */ + tcg_out_binary_gadget(s, gadget_bswap16, args[0], args[1]); + break; + + case INDEX_op_bswap32_i32: /* Optional (TCG_TARGET_HAS_bswap32_i32). */ + case INDEX_op_bswap32_i64: /* Optional (TCG_TARGET_HAS_bswap32_i64). */ + tcg_out_binary_gadget(s, gadget_bswap32, args[0], args[1]); + break; + + case INDEX_op_bswap64_i64: /* Optional (TCG_TARGET_HAS_bswap64_i64). */ + tcg_out_binary_gadget(s, gadget_bswap64, args[0], args[1]); + break; + + case INDEX_op_not_i64: /* Optional (TCG_TARGET_HAS_not_i64). */ + tcg_out_binary_gadget(s, gadget_not_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i64: /* Optional (TCG_TARGET_HAS_neg_i64). */ + tcg_out_binary_gadget(s, gadget_neg_i64, args[0], args[1]); + break; + + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ + tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); + break; + + case INDEX_op_ext8u_i32: /* Optional (TCG_TARGET_HAS_ext8u_i32). */ + case INDEX_op_ext8u_i64: /* Optional (TCG_TARGET_HAS_ext8u_i64). */ + tcg_out_binary_gadget(s, gadget_ext8u, args[0], args[1]); + break; + + case INDEX_op_ext16s_i64: /* Optional (TCG_TARGET_HAS_ext16s_i64). */ + tcg_out_binary_gadget(s, gadget_ext16s_i64, args[0], args[1]); + break; + + case INDEX_op_ext16u_i32: /* Optional (TCG_TARGET_HAS_ext16u_i32). */ + case INDEX_op_ext16u_i64: /* Optional (TCG_TARGET_HAS_ext16u_i64). */ + tcg_out_binary_gadget(s, gadget_ext16u, args[0], args[1]); + break; + + case INDEX_op_ext32s_i64: /* Optional (TCG_TARGET_HAS_ext32s_i64). */ + case INDEX_op_ext_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32s_i64, args[0], args[1]); + break; + + case INDEX_op_ext32u_i64: /* Optional (TCG_TARGET_HAS_ext32u_i64). 
*/ + case INDEX_op_extu_i32_i64: + tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); + break; + + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ + tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); + break; + + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */ + tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); + break; + + case INDEX_op_ext8s_i32: /* Optional (TCG_TARGET_HAS_ext8s_i32). */ + tcg_out_binary_gadget(s, gadget_ext8s_i32, args[0], args[1]); + break; + + case INDEX_op_ext16s_i32: /* Optional (TCG_TARGET_HAS_ext16s_i32). */ + tcg_out_binary_gadget(s, gadget_ext16s_i32, args[0], args[1]); + break; + + case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_brcond_i32: + { + static uint8_t last_brcond_i32 = 0; + void *gadget; + + // We have to emit a different gadget per condition; we'll select which. 
+ switch(args[2]) {
+ case TCG_COND_EQ: gadget = gadget_brcond_i32_eq; break;
+ case TCG_COND_NE: gadget = gadget_brcond_i32_ne; break;
+ case TCG_COND_LT: gadget = gadget_brcond_i32_lt; break;
+ case TCG_COND_GE: gadget = gadget_brcond_i32_ge; break;
+ case TCG_COND_LE: gadget = gadget_brcond_i32_le; break;
+ case TCG_COND_GT: gadget = gadget_brcond_i32_gt; break;
+ case TCG_COND_LTU: gadget = gadget_brcond_i32_lo; break;
+ case TCG_COND_GEU: gadget = gadget_brcond_i32_hs; break;
+ case TCG_COND_LEU: gadget = gadget_brcond_i32_ls; break;
+ case TCG_COND_GTU: gadget = gadget_brcond_i32_hi; break;
+ default:
+ g_assert_not_reached();
+ }
+
+ // We'll select which branch gadget to use based on a cycling counter.
+ // This means we'll pick one of 16 identical brconds. Spreading this out
+ // helps the processor's branch prediction be less "squished", as not every
+ // branch is going through the same instruction.
+ tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]);
+ last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS;
+
+ // Branch target immediate.
+ tcti_out_label(s, arg_label(args[3])); + + break; + } + + case INDEX_op_qemu_ld_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + break; + } + + case INDEX_op_qemu_ld_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. 
+ else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + // Args: + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // TODO: fix encoding to be 4b + } + + break; + } + + case INDEX_op_qemu_st_i32: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + break; + } + + case INDEX_op_qemu_st_i64: + { + MemOp opc = get_memop(args[2]); + unsigned a_bits = get_alignment_bits(opc); + unsigned s_bits = opc & MO_SIZE; + + void *gadget; + + // Special optimization case: if we have an operation/target of 0x3A, + // this is a common case. Delegate to our special-case handler. + if (args[2] == 0x3a) { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + + case -64: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off64_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off64_i64; + break; + case -96: + gadget = (a_bits >= s_bits) ? 
+ gadget_qemu_st_leq_aligned_mode3a_off96_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off96_i64; + break; + case -128: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off128_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off128_i64; + break; + + default: + gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; + break; + } + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } + // Otherwise, handle the generic case. + else { + switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; + case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; + case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + } + + // Args: + // - our gadget encodes the target and address registers + // - an immediate32 encodes our operation index + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + tcg_out64(s, args[2]); // FIXME: double encoded + } + + break; + } + + // Memory barriers. + case INDEX_op_mb: + { + static void* sync[] = { + [0 ... TCG_MO_ALL] = gadget_mb_all, + [TCG_MO_ST_ST] = gadget_mb_st, + [TCG_MO_LD_LD] = gadget_mb_ld, + [TCG_MO_LD_ST] = gadget_mb_ld, + [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, + }; + tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + + break; + } + + case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ + case INDEX_op_mov_i64: + case INDEX_op_call: /* Always emitted via tcg_out_call. */ + default: + tcg_abort(); + } +} + +/** + * Generate immediate stores. 
+ */ +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, + intptr_t arg2) +{ + if (type == TCG_TYPE_I32) { + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + } else { + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + } +} + +static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, + TCGReg base, intptr_t ofs) +{ + return false; +} + +/* Test if a constant matches the constraint. */ +static int tcg_target_const_match(tcg_target_long val, TCGType type, + const TCGArgConstraint *arg_ct) +{ + /* No need to return 0 or 1, 0 or != 0 is good enough. */ + return arg_ct->ct & TCG_CT_CONST; +} + +static void tcg_target_init(TCGContext *s) +{ + /* The current code uses uint8_t for tcg operations. */ + tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); + + /* Registers available for 32 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; + /* Registers available for 64 bit operations. */ + tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; + + /* TODO: Which registers should be set here? */ + tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + + s->reserved_regs = 0; + tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); + + /* We use negative offsets from "sp" so that we can distinguish + stores that might pretend to be call arguments. */ + tcg_set_frame(s, TCG_REG_CALL_STACK, -CPU_TEMP_BUF_NLONGS * sizeof(long), CPU_TEMP_BUF_NLONGS * sizeof(long)); +} + +/* Generate global QEMU prologue and epilogue code. */ +static inline void tcg_target_qemu_prologue(TCGContext *s) +{ + // No prologue; as we're interpreted. +} + + +/** + * TCTI 'interpreter' bootstrap. + */ + +// Store the current return address during helper calls. +__thread uintptr_t tcti_call_return_address; + +/* Dispatch the bytecode stream contained in our translation buffer. 
*/ +uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ptr) +{ + // Create our per-CPU temporary storage. + long tcg_temps[CPU_TEMP_BUF_NLONGS]; + + uint64_t return_value = 0; + uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS); + uintptr_t pc_mirror = (uintptr_t)&tcti_call_return_address; + + // Ensure our target configuration hasn't changed. + tcti_assert(TCG_AREG0 == TCG_REG_R14); + tcti_assert(TCG_REG_CALL_STACK == TCG_REG_R15); + + asm( + // Our threaded-dispatch prologue needs to set up things for our machine to run. + // This means: + // - Set up TCG_AREG0 (R14) to point to our architectural state. + // - Set up TCG_REG_CALL_STACK (R15) to point to our temporary buffer. + // - Point x28 (our bytecode "instruction pointer") to the relevant stream address. + "ldr x14, %[areg0]\n" + "ldr x15, %[sp_value]\n" + "ldr x25, %[pc_mirror]\n" + "ldr x28, %[start_tb_ptr]\n" + + // To start our code, we'll -call- the gadget at the first bytecode pointer. + // Note that we call/branch-with-link, here; so our TB_EXIT gadget can RET in order + // to return to this point when things are complete. + "ldr x27, [x28], #8\n" + "blr x27\n" + + // Finally, we'll copy out our final return value. + "str x0, %[return_value]\n" + + : [return_value] "=m" (return_value) + + : [areg0] "m" (env), + [sp_value] "m" (sp_value), + [start_tb_ptr] "m" (v_tb_ptr), + [pc_mirror] "m" (pc_mirror) + + // We touch _every_ one of the lower registers, as we use these to execute directly. + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + + // We also use x26/x27 for temporary values, and x28 as our bytecode poitner. + "x25", "x26", "x27", "x28", "cc", "memory" + ); + + return return_value; +} + + +/** + * Disassembly output support. + */ +#include + + +/* Disassemble TCI bytecode. 
*/ +int print_insn_tcti(bfd_vma addr, disassemble_info *info) +{ + Dl_info symbol_info = {}; + char symbol_name[48] ; + + int status; + uint64_t block; + + // Read the relevant pointer. + status = info->read_memory_func(addr, (void *)&block, sizeof(block), info); + if (status != 0) { + info->memory_error_func(status, addr, info); + return -1; + } + + // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. + dladdr((void *)block, &symbol_info); + + if(symbol_info.dli_sname != 0) { + strlcpy(symbol_name, symbol_info.dli_sname, 47); + info->fprintf_func(info->stream, "%s", symbol_name); + } else { + info->fprintf_func(info->stream, "%016llx", block); + } + + return sizeof(block); +} + + diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h new file mode 100644 index 000000000000..fa2ae5c40a3e --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.h @@ -0,0 +1,220 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2009, 2011 Stefan Weil + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * This code implements a TCG which does not generate machine code for some + * real target machine but which generates virtual machine code for an + * interpreter. Interpreted pseudo code is slow, but it works on any host. + * + * Some remarks might help in understanding the code: + * + * "target" or "TCG target" is the machine which runs the generated code. + * This is different to the usual meaning in QEMU where "target" is the + * emulated machine. So normally QEMU host is identical to TCG target. + * Here the TCG target is a virtual machine, but this virtual machine must + * use the same word size like the real machine. + * Therefore, we need both 32 and 64 bit virtual machines (interpreter). + */ + +#ifndef TCG_TARGET_H +#define TCG_TARGET_H + +#if UINTPTR_MAX == UINT32_MAX +# error We only support AArch64 running in 64B mode. +#elif UINTPTR_MAX == UINT64_MAX +# define TCG_TARGET_REG_BITS 64 +#else +# error Unknown pointer size for tcti target +#endif + +#define TCG_TARGET_INSN_UNIT_SIZE 1 +#define TCG_TARGET_TLB_DISPLACEMENT_BITS 32 + +// We're an interpreted target; even if we're JIT-compiling to our interpreter's +// weird psuedo-native bytecode. We'll indicate that we're intepreted. +#define TCG_TARGET_INTERPRETER 1 + +// +// Supported optional instructions. +// + +// Divs. +#define TCG_TARGET_HAS_div_i32 1 +#define TCG_TARGET_HAS_rem_i32 1 +#define TCG_TARGET_HAS_div_i64 1 +#define TCG_TARGET_HAS_rem_i64 1 + +// Extends. 
+#define TCG_TARGET_HAS_ext8s_i32 1 +#define TCG_TARGET_HAS_ext16s_i32 1 +#define TCG_TARGET_HAS_ext8u_i32 1 +#define TCG_TARGET_HAS_ext16u_i32 1 +#define TCG_TARGET_HAS_ext8s_i64 1 +#define TCG_TARGET_HAS_ext16s_i64 1 +#define TCG_TARGET_HAS_ext32s_i64 1 +#define TCG_TARGET_HAS_ext8u_i64 1 +#define TCG_TARGET_HAS_ext16u_i64 1 +#define TCG_TARGET_HAS_ext32u_i64 1 + +// Logicals. +#define TCG_TARGET_HAS_neg_i32 1 +#define TCG_TARGET_HAS_not_i32 1 +#define TCG_TARGET_HAS_neg_i64 1 +#define TCG_TARGET_HAS_not_i64 1 + +#define TCG_TARGET_HAS_andc_i32 1 +#define TCG_TARGET_HAS_orc_i32 1 +#define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_andc_i64 1 +#define TCG_TARGET_HAS_eqv_i64 1 +#define TCG_TARGET_HAS_orc_i64 1 + +// We don't curretly support rotates, since AArch64 lacks ROL. +// We'll fix this later. +#define TCG_TARGET_HAS_rot_i32 0 +#define TCG_TARGET_HAS_rot_i64 0 + +// Swaps. +#define TCG_TARGET_HAS_bswap16_i32 1 +#define TCG_TARGET_HAS_bswap32_i32 1 +#define TCG_TARGET_HAS_bswap16_i64 1 +#define TCG_TARGET_HAS_bswap32_i64 1 +#define TCG_TARGET_HAS_bswap64_i64 1 +#define TCG_TARGET_HAS_MEMORY_BSWAP 1 + +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + +// +// Potential TODOs. +// + +// TODO: implement DEPOSIT as BFI. +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 + +// TODO: implement EXTRACT as BFX. +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 + +// TODO: it might be worth writing a gadget for this +#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 + +// +// Unsupported instructions. +// + +// ARMv8 doesn't have instructions for NAND/NOR. 
+#define TCG_TARGET_HAS_nand_i32 0
+#define TCG_TARGET_HAS_nor_i32 0
+#define TCG_TARGET_HAS_nor_i64 0
+#define TCG_TARGET_HAS_nand_i64 0
+
+// aarch64's CLZ is implemented without a condition, so these remain unimplemented for now.
+#define TCG_TARGET_HAS_clz_i32 0
+#define TCG_TARGET_HAS_ctz_i32 0
+#define TCG_TARGET_HAS_ctpop_i32 0
+#define TCG_TARGET_HAS_clz_i64 0
+#define TCG_TARGET_HAS_ctz_i64 0
+#define TCG_TARGET_HAS_ctpop_i64 0
+
+
+// GOTO_PTR is too complex to emit a simple gadget for.
+// We'll let C handle it, since the overhead is similar.
+#define TCG_TARGET_HAS_goto_ptr 0
+
+// We don't have a simple gadget for this, since we're always assuming softmmu.
+#define TCG_TARGET_HAS_qemu_st8_i32 0
+
+// No AArch64 equivalent.
+#define TCG_TARGET_HAS_extrl_i64_i32 0
+#define TCG_TARGET_HAS_extrh_i64_i32 0
+
+#define TCG_TARGET_HAS_extract2_i64 0
+
+// These should always be zero on our 64-bit platform.
+#define TCG_TARGET_HAS_muls2_i64 0
+#define TCG_TARGET_HAS_add2_i32 0
+#define TCG_TARGET_HAS_sub2_i32 0
+#define TCG_TARGET_HAS_mulu2_i32 0
+#define TCG_TARGET_HAS_add2_i64 0
+#define TCG_TARGET_HAS_sub2_i64 0
+#define TCG_TARGET_HAS_mulu2_i64 0
+#define TCG_TARGET_HAS_muluh_i64 0
+#define TCG_TARGET_HAS_mulsh_i64 0
+#define TCG_TARGET_HAS_extract2_i32 0
+#define TCG_TARGET_HAS_muls2_i32 0
+#define TCG_TARGET_HAS_muluh_i32 0
+#define TCG_TARGET_HAS_mulsh_i32 0
+
+//
+// Platform metadata.
+//
+
+// Number of registers available.
+// It might make sense to up these, since we can also use x16 -> x25?
+#define TCG_TARGET_NB_REGS 16
+
+/* List of registers which are used by TCG. */
+typedef enum {
+ TCG_REG_R0 = 0,
+ TCG_REG_R1,
+ TCG_REG_R2,
+ TCG_REG_R3,
+ TCG_REG_R4,
+ TCG_REG_R5,
+ TCG_REG_R6,
+ TCG_REG_R7,
+ TCG_REG_R8,
+ TCG_REG_R9,
+ TCG_REG_R10,
+ TCG_REG_R11,
+ TCG_REG_R12,
+ TCG_REG_R13,
+ TCG_REG_R14,
+ TCG_REG_R15,
+
+ TCG_AREG0 = TCG_REG_R14,
+ TCG_REG_CALL_STACK = TCG_REG_R15,
+} TCGReg;
+
+// Specify the shape of the stack our runtime will use.
+#define TCG_TARGET_CALL_STACK_OFFSET 0 +#define TCG_TARGET_STACK_ALIGN 16 + +// We're interpreted, so we'll use our own code to run TB_EXEC. +#define HAVE_TCG_QEMU_TB_EXEC + +// We'll need to enforce memory ordering with barriers. +#define TCG_TARGET_DEFAULT_MO (0) + +void tci_disas(uint8_t opc); + +void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + +#endif /* TCG_TARGET_H */ diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py new file mode 100755 index 000000000000..1296f6d0c2d7 --- /dev/null +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -0,0 +1,788 @@ +#!/usr/bin/env python3 +""" Gadget-code generator for QEMU TCTI on AArch64. + +Generates a C-code include file containing 'gadgets' for use by TCTI. +""" + +import sys +import itertools + +# Get a handle on the file we'll be working with, and redirect print to it. +if len(sys.argv) > 1: + out_file = open(sys.argv[1], "w") + + # Hook our print function, so it always outputs to the relevant file. + core_print = print + print = lambda *a, **k : core_print(*a, **k, file=out_file) + +# Epilogue code follows at the end of each gadget, and handles continuing execution. +EPILOGUE = ( + # Load our next gadget address from our bytecode stream, advancing it. + "ldr x27, [x28], #8", + + # Jump to the next gadget. + "br x27" +) + +# The number of general-purpose registers we're affording the TCG. This must match +# the configuration in the TCTI target. +TCG_REGISTER_COUNT = 16 +TCG_REGISTER_NUMBERS = list(range(TCG_REGISTER_COUNT)) + +# Helper that provides each of the AArch64 condition codes of interest. +ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] + +# We'll create a variety of gadgets that assume the MMU's TLB is stored at certain +# offsets into its structure. These should match the offsets in tcg-target.c.in. +QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] + +# Statistics. 
+gadgets = 0 +instructions = 0 + +def simple(name, *lines): + """ Generates a simple gadget that needs no per-register specialization. """ + + global gadgets, instructions + + gadgets += 1 + + # Create our C/ASM framing. + #print(f"__attribute__((naked)) static void gadget_{name}(void)") + print(f"__attribute__((naked)) void gadget_{name}(void);") + print(f"__attribute__((naked)) void gadget_{name}(void)") + print("{") + + # Add the core gadget + print("\tasm(") + for line in lines + EPILOGUE: + print(f"\t\t\"{line} \\n\"") + instructions += 1 + print("\t);") + + # End our framing. + print("}\n") + + +def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): + """ Generates a collection of gadgtes with register substitutions. """ + + def substitutions_for_letter(letter, number, line): + """ Helper that transforms Wd => w1, implementing gadget substitutions. """ + + # Register substitutions... + line = line.replace(f"X{letter}", f"x{number}") + line = line.replace(f"W{letter}", f"w{number}") + + # ... immediate substitutions. + line = line.replace(f"I{letter}", f"{number}") + return line + + + # Build a list of all the various stages we'll iterate over... + immediate_parameters = list(immediate_range) + parameters = ([TCG_REGISTER_NUMBERS] * len(substitutions)) + + # ... adding immediates, if need be. + if immediate_parameters: + parameters.append(immediate_parameters) + substitutions = substitutions + ['i'] + + # Generate a list of register-combinations we'll support. + permutations = itertools.product(*parameters) + + # For each permutation... + for permutation in permutations: + new_lines = lines + + # Replace each placeholder element with its proper value... + for index, element in enumerate(permutation): + letter = substitutions[index] + number = element + + # Create new gadgets for the releavnt line... + new_lines = [substitutions_for_letter(letter, number, line) for line in new_lines] + + # ... and emit the gadget. 
+ permutation_id = "_arg".join(str(number) for number in permutation)
+ simple(f"{name}_arg{permutation_id}", *new_lines)
+
+
+def with_dnm(name, *lines):
+ """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """
+ with_register_substitutions(name, ("d", "n", "m"), *lines)
+
+ # Print out an array that contains all of our gadgets, for lookup.
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="")
+ print("{")
+
+ # D array
+ for d in TCG_REGISTER_NUMBERS:
+ print("\t{")
+
+ # N array
+ for n in TCG_REGISTER_NUMBERS:
+ print("\t\t{", end="")
+
+ # M array
+ for m in TCG_REGISTER_NUMBERS:
+ print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ")
+
+ print("},")
+ print("\t},")
+ print("};")
+
+
+def with_dn_immediate(name, *lines, immediate_range):
+ """ Generates a collection of gadgets with substitutions for Xd, Xn, and an immediate, and equivalents. """
+ with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range)
+
+ # Print out an array that contains all of our gadgets, for lookup.
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="")
+ print("{")
+
+ # D array
+ for d in TCG_REGISTER_NUMBERS:
+ print("\t{")
+
+ # N array
+ for n in TCG_REGISTER_NUMBERS:
+ print("\t\t{", end="")
+
+ # Immediate array
+ for i in immediate_range:
+ print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ")
+
+ print("},")
+ print("\t},")
+ print("};")
+
+
+def with_pair(name, substitutions, *lines):
+ """ Generates a collection of gadgets with two substitutions."""
+ with_register_substitutions(name, substitutions, *lines)
+
+ # Print out an array that contains all of our gadgets, for lookup.
+ print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + # N array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # M array + for b in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + +def math_dnm(name, mnemonic): + """ Equivalent to `with_dnm`, but creates a _i32 and _i64 variant. For simple math. """ + with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") + with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") + +def math_dn(name, mnemonic): + """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. """ + with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + + +def with_nm(name, *lines): + """ Generates a collection of gadgets with substitutions for Xn, and Xm, and equivalents. """ + with_pair(name, ('n', 'm',), *lines) + + +def with_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. """ + with_pair(name, ('d', 'n',), *lines) + + +def ldst_dn(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd, and Xn, and equivalents. + + This variant is optimized for loads and stores, and optimizes common offset cases. + """ + + # + # Simple case: create our gadgets. + # + with_dn(name, "ldr x27, [x28], #8", *lines) + + # + # Optimization case: create variants of our gadgets with our offsets replaced with common immediates. 
+ # + immediate_lines_pos = [line.replace("x27", "#Ii") for line in lines] + with_dn_immediate(f"{name}_imm", *immediate_lines_pos, immediate_range=range(64)) + + immediate_lines_aligned = [line.replace("x27", "#(Ii << 3)") for line in lines] + with_dn_immediate(f"{name}_sh8_imm", *immediate_lines_aligned, immediate_range=range(64)) + + immediate_lines_neg = [line.replace("x27", "#-Ii") for line in lines] + with_dn_immediate(f"{name}_neg_imm", *immediate_lines_neg, immediate_range=range(64)) + + +def with_single(name, substitution, *lines): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, (substitution,), *lines) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") + print("{") + + for n in TCG_REGISTER_NUMBERS: + print(f"gadget_{name}_arg{n}", end=", ") + + print("};") + + +def with_d_immediate(name, *lines, immediate_range=range(0)): + """ Generates a collection of gadgets with two subtstitutions.""" + with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + + # Print out an array that contains all of our gadgets, for lookup. + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") + print("{") + + # D array + for a in TCG_REGISTER_NUMBERS: + print("\t\t{", end="") + + # I array + for b in immediate_range: + print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + + print("},") + print("};") + + + +def with_d(name, *lines): + """ Generates a collection of gadgets with substitutions for Xd. """ + with_single(name, 'd', *lines) + + +# Assembly code for saving our machine state before entering the C runtime. +C_CALL_PROLOGUE = [ + # Store our machine state. 
+    "str x25, [sp, #-16]!",
+    "stp x14, x15, [sp, #-16]!",
+    "stp x12, x13, [sp, #-16]!",
+    "stp x10, x11, [sp, #-16]!",
+    "stp x8, x9, [sp, #-16]!",
+    "stp x6, x7, [sp, #-16]!",
+    "stp x4, x5, [sp, #-16]!",
+    "stp x2, x3, [sp, #-16]!",
+    "stp x0, x1, [sp, #-16]!",
+    "stp x28, lr, [sp, #-16]!",
+]
+
+# Assembly code for restoring our machine state after leaving the C runtime.
+C_CALL_EPILOGUE = [
+    "ldp x28, lr, [sp], #16",
+    "ldp x0, x1, [sp], #16",
+    "ldp x2, x3, [sp], #16",
+    "ldp x4, x5, [sp], #16",
+    "ldp x6, x7, [sp], #16",
+    "ldp x8, x9, [sp], #16",
+    "ldp x10, x11, [sp], #16",
+    "ldp x12, x13, [sp], #16",
+    "ldp x14, x15, [sp], #16",
+    "ldr x25, [sp], #16",
+]
+
+
+def create_tlb_fastpath(is_aligned, is_write, offset, miss_label="0"):
+    """ Creates a set of instructions that perform a soft-MMU TLB lookup.
+
+    This is used for `qemu_ld`/`qemu_st` instructions; to emit a prologue that
+    hopefully helps us skip a slow call into the C runtime when a Guest Virtual
+    -> Host Virtual mapping is in the softmmu's TLB.
+
+    This "fast-path" prelude behaves as follows:
+    - If a TLB entry is found for the address stored in Xn, then x27
+      is loaded with an "addend" that can be added to the guest virtual address
+      to get the host virtual address (the address in our local memory space).
+    - If a TLB entry isn't found, it branches to the "miss_label" (by default, 0:),
+      so address lookup can be handled by the slow path.
+
+    Clobbers x24, and x26; provides output in x27.
+    """
+
+    fast_path = [
+        # Load env_tlb(env)->f[mmu_idx].{mask,table} into {x26,x27}.
+        f"ldp x26, x27, [x14, #-{offset}]",
+
+        # Extract the TLB index from the address into X26.
+        "and x26, x26, Xn, lsr #7", # Xn = addr register
+
+        # Add the tlb_table pointer, creating the CPUTLBEntry address into X27.
+        "add x27, x27, x26",
+
+        # Load the tlb comparator into X26, and the fast path addend into X27.
+        "ldr x26, [x27, #8]" if is_write else "ldr x26, [x27]",
+        "ldr x27, [x27, #0x18]",
+
+    ]
+
+    if is_aligned:
+        fast_path.extend([
+            # Store the page mask part of the address into X24.
+            "and x24, Xn, #0xfffffffffffff000",
+
+            # Compare the masked address with the TLB value.
+            "cmp x26, x24",
+
+            # If we're not equal, this isn't a TLB hit. Jump to our miss handler.
+            f"b.ne {miss_label}f",
+        ])
+    else:
+        fast_path.extend([
+            # If we're not aligned, add in our alignment value to ensure we
+            # don't straddle the end of a page.
+            "add x24, Xn, #7",
+
+            # Store the page mask part of the address into X24.
+            "and x24, x24, #0xfffffffffffff000",
+
+            # Compare the masked address with the TLB value.
+            "cmp x26, x24",
+
+            # If we're not equal, this isn't a TLB hit. Jump to our miss handler.
+            f"b.ne {miss_label}f",
+        ])
+
+    return fast_path
+
+
+
+def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False):
+    """ Creates a thunk into our C runtime for a QEMU LD operation. """
+
+    # Use only offset 0 (no real offset) if we're forcing slowpath;
+    # otherwise, use all of our allowed MMU offsets.
+    offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS
+    for offset in offsets:
+        for is_32b in (True, False):
+            fastpath = fastpath_32b if is_32b else fastpath_64b
+
+            gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64"
+            postscript = () if immediate else ("add x28, x28, #8",)
+
+            # If we have a pure-assembly fast path, start our gadget with it.
+            if fastpath and not force_slowpath:
+                fastpath_ops = [
+                    # Create a fastpath that jumps to miss_label on a TLB miss,
+                    # or sets x27 to the TLB addend on a TLB hit.
+                    *create_tlb_fastpath(is_aligned=is_aligned, is_write=False, offset=offset),
+
+                    # On a hit, we can just perform an appropriate load...
+                    *fastpath,
+
+                    # Run our patch-up post-script, if we have one.
+                    *postscript,
+
+                    # ... and then we're done!
+ *EPILOGUE, + ] + # Otherwise, we'll save arguments for our slow path. + else: + fastpath_ops = [] + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + "mov x27, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Placed in x27 below.] + # - Move our operation info into x2, from an immediate32. + # - Move the next bytecode pointer into x3, from x28. + "mov x0, x14", + "mov x1, x27", + f"mov x2, #{immediate}" if (immediate is not None) else "ldr x2, [x28], #8", + "mov x3, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Temporarily store our result in a register that won't get trashed. + "mov x27, x0", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript... + *postscript, + + # ... and place our results in the target register. + "mov Wd, w27" if is_32b else "mov Xd, x27" + ) + + +def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=False, force_slowpath=False): + """ Creates a thunk into our C runtime for a QEMU ST operation. """ + + # Use only offset 0 (no real offset) if we're forcing slowpath; + # otherwise, use all of our allowed MMU offsets. + offsets = [0] if force_slowpath else QEMU_ALLOWED_MMU_OFFSETS + for offset in offsets: + + for is_32b in (True, False): + fastpath = fastpath_32b if is_32b else fastpath_64b + + gadget_name = f"{name}_off{offset}_i32" if is_32b else f"{name}_off{offset}_i64" + postscript = () if immediate else ("add x28, x28, #8",) + + # If we have a pure-assembly fast path, start our gadget with it. 
+ if fastpath and not force_slowpath: + fastpath_ops = [ + + # Create a fastpath that jumps to miss_lable on a TLB miss, + # or sets x27 to the TLB addend on a TLB hit. + *create_tlb_fastpath(is_aligned=is_aligned, is_write=True, offset=offset), + + # On a hit, we can just perform an appropriate load... + *fastpath, + + # Run our patch-up post-script, if we have one. + *postscript, + + # ... and then we're done! + *EPILOGUE, + ] + else: + fastpath_ops = [] + + + # + # If we're not taking our fast path, we'll call into our C runtime to take the slow path. + # + with_dn(gadget_name, + *fastpath_ops, + + "0:", + # Move our arguments into registers that we're not actively using. + # This ensures that they won't be trounced by our calling convention + # if this is reading values from x0-x4. + "mov w27, Wd" if is_32b else "mov x27, Xd", + "mov x26, Xn", + + # Save our registers in preparation for entering a C call. + *C_CALL_PROLOGUE, + + # Per our calling convention: + # - Move our architectural environment into x0, from x14. + # - Move our target address into x1. [Moved into x26 above]. + # - Move our target value into x2. [Moved into x27 above]. + # - Move our operation info into x3, from an immediate32. + # - Move the next bytecode pointer into x4, from x28. + "mov x0, x14", + "mov x1, x26", + "mov x2, x27", + f"mov x3, #{immediate}" if (immediate is not None) else "ldr x3, [x28], #8", + "mov x4, x28", + + # Perform our actual core code. + f"bl _{slowpath_helper}", + + # Restore our registers after our C call. + *C_CALL_EPILOGUE, + + # Finally, call our postscript. + *postscript + ) + + +# +# Gadget definitions. +# + +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") + +# Call a C language helper function by address. +simple("call", + # Get our C runtime function's location as a pointer-sized immediate... + "ldr x27, [x28], #8", + + # Store our TB return address for our helper. 
+ "str x28, [x25]", + + # Prepare ourselves to call into our C runtime... + *C_CALL_PROLOGUE, + + # ... perform the call itself ... + "blr x27", + + # Save the result of our call for later. + "mov x27, x0", + + # ... and restore our environment. + *C_CALL_EPILOGUE, + + # Restore our return value. + "mov x0, x27" +) + +# Branch to a given immediate address. +simple("br", + # Use our immediate argument as our new bytecode-pointer location. + "ldr x28, [x28]" +) + +# Exit from a translation buffer execution. +simple("exit_tb", + + # We have a single immediate argument, which contains our return code. + # Place it into x0, as one would a return code. + "ldr x0, [x28], #8", + + # And finally, return back to the code that invoked our gadget stream. + "ret" +) + + +for condition in ARCH_CONDITION_CODES: + + # Performs a comparison between two operands. + with_dnm(f"setcond_i32_{condition}", + "subs Wd, Wn, Wm", + f"cset Wd, {condition}" + ) + with_dnm(f"setcond_i64_{condition}", + "subs Xd, Xn, Xm", + f"cset Xd, {condition}" + ) + + # + # NOTE: we use _dnm for the conditional branches, even though we don't + # actually do anything different based on the d argument. This gemerates + # effectively 16 identical `brcond` gadgets for each condition; which we + # use in the backend to spread out the actual branch sources we use. + # + # This is a slight mercy for the branch predictor, as not every conditional + # branch is funneled throught the same address. + # + + # Branches iff a given comparison is true. + with_dnm(f'brcond_i32_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Wzr, Wn, Wm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + # Branches iff a given comparison is true. 
+ with_dnm(f'brcond_i64_{condition}', + + # Grab our immediate argument. + "ldr x27, [x28], #8", + + # Perform our comparison and conditional branch. + "subs Xzr, Xn, Xm", + f"b{condition} 1f", + + "0:", # not taken + # Perform our end-of-instruction epilogue. + *EPILOGUE, + + "1:" # taken + # Update our bytecode pointer to take the label. + "mov x28, x27" + ) + + +# MOV variants. +with_dn("mov_i32", "mov Wd, Wn") +with_dn("mov_i64", "mov Xd, Xn") +with_d("movi_i32", "ldr Wd, [x28], #8") +with_d("movi_i64", "ldr Xd, [x28], #8") + +# Create MOV variants that have common constants built in to the gadget. +# This optimization helps costly reads from memories for simple operations. +with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) +with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) + +# LOAD variants. +# TODO: should the signed variants have X variants for _i64? +ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") +ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") +ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +# STORE variants. +ldst_dn("st8", "strb Wd, [Xn, x27]") +ldst_dn("st16", "strh Wd, [Xn, x27]") +ldst_dn("st_i32", "str Wd, [Xn, x27]") +ldst_dn("st_i64", "str Xd, [Xn, x27]") + +# QEMU LD/ST are handled in our C runtime rather than with simple gadgets, +# as they're nontrivial. + +# Trivial arithmetic. 
+math_dnm("add" , "add" ) +math_dnm("sub" , "sub" ) +math_dnm("mul" , "mul" ) +math_dnm("div" , "sdiv") +math_dnm("divu", "udiv") + +# Division remainder +with_dnm("rem_i32", "sdiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("rem_i64", "sdiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") +with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") + +# Trivial logical. +math_dn( "not", "mvn") +math_dn( "neg", "neg") +math_dnm("and", "and") +math_dnm("andc", "bic") +math_dnm("or", "orr") +math_dnm("orc", "orn") +math_dnm("xor", "eor") +math_dnm("eqv", "eon") +math_dnm("shl", "lsl") +math_dnm("shr", "lsr") +math_dnm("sar", "asr") + +# AArch64 lacks a Rotate Left; so we instead rotate right by a negative. +# TODO: validate this? +#math_dnm("rotr", "ror") +#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") + +# Numeric extension. +math_dn("ext8s", "sxtb") +with_dn("ext8u", "and Xd, Xn, #0xff") +math_dn("ext16s", "sxth") +with_dn("ext16u", "and Wd, Wn, #0xffff") +with_dn("ext32s_i64", "sxtw Xd, Wn") +with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") + +# Byte swapping. +with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") +with_dn("bswap32", "rev Wd, Wn") +with_dn("bswap64", "rev Xd, Xn") + +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + +# Handlers for QEMU_LD, which handles guest <- host loads. 
+for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", + fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", + fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", + fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special variant for the most common mode, as a speedup optimization. + ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. 
+ ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besw_{subtype}", None, None, "helper_be_lduw_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beul_{subtype}", None, None, "helper_be_ldul_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_besl_{subtype}", None, None, "helper_be_ldul_mmu_signed", + is_aligned=is_aligned, force_slowpath=is_slowpath) + ld_thunk(f"qemu_ld_beq_{subtype}", None, None, "helper_be_ldq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Handlers for QEMU_ST, which handles guest -> host stores. +for subtype in ('aligned', 'unaligned', 'slowpath'): + is_aligned = (subtype == 'aligned') + is_slowpath = (subtype == 'slowpath') + + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stw_mmu", + fastpath_32b=["strh Wd, [Xn, x27]"], fastpath_64b=["strh Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stl_mmu", + fastpath_32b=["str Wd, [Xn, x27]"], fastpath_64b=["str Wd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + st_thunk(f"qemu_st_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + + # Special optimization for the most common modes. 
+ st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x3a + ) + + # For now, leave the rare/big-endian stuff slow-path only. + st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beul_{subtype}", None, None, "helper_be_stl_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + st_thunk(f"qemu_st_beq_{subtype}", None, None, "helper_be_stq_mmu", + is_aligned=is_aligned, force_slowpath=is_slowpath) + + +# Statistics. +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") diff --git a/tcg/tcg.c b/tcg/tcg.c index 5b0750685102..ec832d92d0e6 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -165,7 +165,7 @@ TCGv_env cpu_env = 0; const void *tcg_code_gen_epilogue; uintptr_t tcg_splitwx_diff; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_prologue_fn *tcg_qemu_tb_exec; #endif @@ -1227,7 +1227,7 @@ void tcg_prologue_init(TCGContext *s) region.start = buf0; region.end = buf0 + total_size; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) tcg_qemu_tb_exec = (tcg_prologue_fn *)tcg_splitwx_to_rx(buf0); #endif @@ -1253,7 +1253,7 @@ void tcg_prologue_init(TCGContext *s) #endif buf1 = s->code_ptr; -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(buf0), (uintptr_t)buf0, tcg_ptr_byte_diff(buf1, buf0)); #endif @@ -1981,7 +1981,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) #endif #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) 
&& !defined(CONFIG_TCG_THREADED_INTERPRETER) /* We have 64-bit values in one register, but need to pass as two separate parameters. Split them. */ int orig_sizemask = sizemask; @@ -2031,7 +2031,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) pi = 0; if (ret != NULL) { #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) if (orig_sizemask & 1) { /* The 32-bit ABI is going to return the 64-bit value in the %o0/%o1 register pair. Prepare for this by using @@ -2109,7 +2109,7 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args) tcg_debug_assert(pi <= ARRAY_SIZE(op->args)); #if defined(__sparc__) && !defined(__arch64__) \ - && !defined(CONFIG_TCG_INTERPRETER) + && !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* Free all of the parts we allocated above. */ for (i = real_args = 0; i < orig_nargs; ++i) { int is_64bit = orig_sizemask & (1 << (i+1)*2); @@ -4789,7 +4789,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) return -2; } -#ifndef CONFIG_TCG_INTERPRETER +#if !defined(CONFIG_TCG_INTERPRETER) && !defined(CONFIG_TCG_THREADED_INTERPRETER) /* flush instruction cache */ flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf), (uintptr_t)s->code_buf, From c77bed1d5cf3a935ded239b21665512f5bfb618c Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Tue, 27 Apr 2021 17:23:16 +0000 Subject: [PATCH 27/36] split gadgets into multiple files --- meson.build | 42 ++++-- tcg/aarch64-tcti/tcg-target.c.inc | 33 +++-- tcg/aarch64-tcti/tcti-gadget-gen.py | 212 ++++++++++++++++++++-------- 3 files changed, 213 insertions(+), 74 deletions(-) diff --git a/meson.build b/meson.build index 2bcd98dd0865..52f40e6b77e8 100644 --- a/meson.build +++ b/meson.build @@ -257,14 +257,38 @@ if not get_option('tcg').disabled() # Tell our compiler how to generate our TCTI gadgets. 
gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) - tcti_gadgets = custom_target('tcti-gadgets.c.inc', - output: 'tcti-gadgets.c.inc', - input: gadget_generator, - command: [find_program(gadget_generator), '@OUTPUT@'], - build_by_default: true, - build_always_stale: false) - - genh += tcti_gadgets + gadgets = [ + 'tcti_misc_gadgets.c', + 'tcti_conditionals_gadgets.c', + 'tcti_mov_gadgets.c', + 'tcti_load_gadgets.c', + 'tcti_store_gadgets.c', + 'tcti_arithmetic_gadgets.c', + 'tcti_logical_gadgets.c', + 'tcti_extension_gadgets.c', + 'tcti_byteswap_gadgets.c', + 'tcti_qemu_ld_gadgets.c', + 'tcti_qemu_st_gadgets.c', + 'tcti_misc_gadgets.h', + 'tcti_conditionals_gadgets.h', + 'tcti_mov_gadgets.h', + 'tcti_load_gadgets.h', + 'tcti_store_gadgets.h', + 'tcti_arithmetic_gadgets.h', + 'tcti_logical_gadgets.h', + 'tcti_extension_gadgets.h', + 'tcti_byteswap_gadgets.h', + 'tcti_qemu_ld_gadgets.h', + 'tcti_qemu_st_gadgets.h', + 'tcti_gadgets.h'] + + tcti_gadgets = custom_target('tcti-gadgets.h', + output: gadgets, + input: gadget_generator, + command: [find_program(gadget_generator)], + build_by_default: true, + build_always_stale: false) + elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' @@ -1943,6 +1967,8 @@ if get_option('b_lto') endif common_ss.add(pagevary) specific_ss.add(files('page-vary.c')) +specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('disas/tci.c', 'tcg/tci.c')) +specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') subdir('disas') diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index d7bb67a92140..315033502ef7 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -22,11 +22,13 @@ * THE SOFTWARE. */ +//#define TCTI_GADGET_RICH_DISASSEMBLY + #define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 -// Grab our gadget definitions. -// FIXME: use the system path instead of hardcoding this? 
-#include "tcti-gadgets.c.inc" + +// Grab our gadget headers. +#include "tcti_gadgets.h" /* Marker for missing code. */ #define TODO() \ @@ -575,6 +577,12 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, } +static void warn_slow_memop(const TCGArg arg) +{ + fprintf(stderr, "--- NOTE: emitting non-optimized memop at offset %i\n", TLB_MASK_TABLE_OFS(get_mmuidx(arg))); +} + + /** * Generate every other operation. */ @@ -1032,7 +1040,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; - default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); warn_slow_memop(args[2]); break; } // Args: @@ -1083,7 +1091,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; - default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); warn_slow_memop(args[2]); break; } // Args: // - an immediate32 encodes our operation index @@ -1106,7 +1114,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; - default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], 
slowpath_off0_i32); warn_slow_memop(args[2]); break; } // Args: @@ -1158,7 +1166,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; - default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); warn_slow_memop(args[2]); break; } // Args: @@ -1318,8 +1326,11 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ /* Disassemble TCI bytecode. */ int print_insn_tcti(bfd_vma addr, disassemble_info *info) { + +#ifdef TCTI_GADGET_RICH_DISASSEMBLY Dl_info symbol_info = {}; char symbol_name[48] ; +#endif int status; uint64_t block; @@ -1331,16 +1342,22 @@ int print_insn_tcti(bfd_vma addr, disassemble_info *info) return -1; } +#ifdef TCTI_GADGET_RICH_DISASSEMBLY // Most of our disassembly stream will be gadgets. Try to get their names, for nice output. dladdr((void *)block, &symbol_info); if(symbol_info.dli_sname != 0) { - strlcpy(symbol_name, symbol_info.dli_sname, 47); + strncpy(symbol_name, symbol_info.dli_sname, sizeof(symbol_name)); + symbol_name[sizeof(symbol_name) - 1] = 0; info->fprintf_func(info->stream, "%s", symbol_name); } else { info->fprintf_func(info->stream, "%016llx", block); } +#else + info->fprintf_func(info->stream, "%016llx", block); +#endif + return sizeof(block); } diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 1296f6d0c2d7..51471719a48b 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -7,14 +7,6 @@ import sys import itertools -# Get a handle on the file we'll be working with, and redirect print to it. 
-if len(sys.argv) > 1: - out_file = open(sys.argv[1], "w") - - # Hook our print function, so it always outputs to the relevant file. - core_print = print - print = lambda *a, **k : core_print(*a, **k, file=out_file) - # Epilogue code follows at the end of each gadget, and handles continuing execution. EPILOGUE = ( # Load our next gadget address from our bytecode stream, advancing it. @@ -40,6 +32,48 @@ gadgets = 0 instructions = 0 +# Files to write to. +current_collection = "basic" +output_files = {} + +# Create a top-level header. +top_header = open("tcti_gadgets.h", "w") +print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=top_header) + +def _get_output_files(): + """ Gathers the output C and H files for a given gadget-cluster name. """ + + # If we don't have an output file for this already, create it. + return output_files[current_collection] + + +def START_COLLECTION(name): + """ Sets the name of the current collection. """ + + global current_collection + + # Create the relevant output files + new_c_file = open(f"tcti_{name}_gadgets.c", "w") + new_h_file = open(f"tcti_{name}_gadgets.h", "w") + output_files[name] = (new_c_file, new_h_file) + + # Add the file to our gadget collection. + print(f'#include "tcti_{name}_gadgets.h"', file=top_header) + + # Add generated messages to the relevant collection. + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_c_file) + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_h_file) + + # Start our C file with inclusion of the relevant header. + print(f'\n#include "tcti_{name}_gadgets.h"\n', file=new_c_file) + + # Start our H file with a simple pragma-guard, for speed. + print('\n#pragma once\n', file=new_h_file) + + # Finally, set the global active collection. + current_collection = name + + def simple(name, *lines): """ Generates a simple gadget that needs no per-register specialization. 
""" @@ -47,21 +81,23 @@ def simple(name, *lines): gadgets += 1 + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + # Create our C/ASM framing. - #print(f"__attribute__((naked)) static void gadget_{name}(void)") - print(f"__attribute__((naked)) void gadget_{name}(void);") - print(f"__attribute__((naked)) void gadget_{name}(void)") - print("{") + print(f"__attribute__((naked)) void gadget_{name}(void);", file=h_file) + print(f"__attribute__((naked)) void gadget_{name}(void)", file=c_file) + print("{", file=c_file) # Add the core gadget - print("\tasm(") + print("\tasm(", file=c_file) for line in lines + EPILOGUE: - print(f"\t\t\"{line} \\n\"") + print(f"\t\t\"{line} \\n\"", file=c_file) instructions += 1 - print("\t);") + print("\t);", file=c_file) # End our framing. - print("}\n") + print("}\n", file=c_file) def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): @@ -112,70 +148,87 @@ def with_dnm(name, *lines): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ("d", "n", "m"), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for m in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{m}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_dn_immediate(name, *lines, immediate_range): """ Generates a collection of gadgets with substitutions for Xd, Xn, and Xm, and equivalents. """ with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + # Print out an extern. + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for d in TCG_REGISTER_NUMBERS: - print("\t{") + print("\t{", file=c_file) # N array for n in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for i in immediate_range: - print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ") + print(f"gadget_{name}_arg{d}_arg{n}_arg{i}", end=", ", file=c_file) - print("},") - print("\t},") - print("};") + print("},", file=c_file) + print("\t},", file=c_file) + print("};", file=c_file) def with_pair(name, substitutions, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, substitutions, *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) # N array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # M array for b in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) def math_dnm(name, mnemonic): @@ -227,34 +280,44 @@ def with_single(name, substitution, *lines): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, (substitution,), *lines) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. - print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print("{", file=c_file) for n in TCG_REGISTER_NUMBERS: - print(f"gadget_{name}_arg{n}", end=", ") + print(f"gadget_{name}_arg{n}", end=", ", file=c_file) - print("};") + print("};", file=c_file) def with_d_immediate(name, *lines, immediate_range=range(0)): """ Generates a collection of gadgets with two subtstitutions.""" with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) + # Fetch the files we'll be using for output. + c_file, h_file = _get_output_files() + + print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + # Print out an array that contains all of our gadgets, for lookup. 
- print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="") - print("{") + print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print("{", file=c_file) # D array for a in TCG_REGISTER_NUMBERS: - print("\t\t{", end="") + print("\t\t{", end="", file=c_file) # I array for b in immediate_range: - print(f"gadget_{name}_arg{a}_arg{b}", end=", ") + print(f"gadget_{name}_arg{a}_arg{b}", end=", ", file=c_file) - print("},") - print("};") + print("},", file=c_file) + print("};", file=c_file) @@ -413,7 +476,7 @@ def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "mov x3, x28", # Perform our actual core code. - f"bl _{slowpath_helper}", + f"bl {slowpath_helper}", # Temporarily store our result in a register that won't get trashed. "mov x27, x0", @@ -493,7 +556,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "mov x4, x28", # Perform our actual core code. - f"bl _{slowpath_helper}", + f"bl {slowpath_helper}", # Restore our registers after our C call. *C_CALL_EPILOGUE, @@ -507,7 +570,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # Gadget definitions. # -print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n") +START_COLLECTION("misc") # Call a C language helper function by address. simple("call", @@ -550,6 +613,14 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ret" ) +# Memory barriers. +simple("mb_all", "dmb ish") +simple("mb_st", "dmb ishst") +simple("mb_ld", "dmb ishld") + + +START_COLLECTION("conditionals") + for condition in ARCH_CONDITION_CODES: @@ -612,6 +683,9 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) +START_COLLECTION("mov") + + # MOV variants. 
with_dn("mov_i32", "mov Wd, Wn") with_dn("mov_i64", "mov Xd, Xn") @@ -623,17 +697,24 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) +START_COLLECTION("load_unsigned") + # LOAD variants. # TODO: should the signed variants have X variants for _i64? ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("load_signed") + ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") -ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") -ldst_dn("ld32u", "ldr Wd, [Xn, x27]") ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") -ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("store") # STORE variants. ldst_dn("st8", "strb Wd, [Xn, x27]") @@ -644,6 +725,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # QEMU LD/ST are handled in our C runtime rather than with simple gadgets, # as they're nontrivial. +START_COLLECTION("arithmetic") + # Trivial arithmetic. math_dnm("add" , "add" ) math_dnm("sub" , "sub" ) @@ -657,6 +740,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dnm("remu_i32", "udiv w27, Wn, Wm", "msub Wd, w27, Wm, Wn") with_dnm("remu_i64", "udiv x27, Xn, Xm", "msub Xd, x27, Xm, Xn") +START_COLLECTION("logical") + # Trivial logical. math_dn( "not", "mvn") math_dn( "neg", "neg") @@ -676,6 +761,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, #with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") #with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") +START_COLLECTION("extension") + # Numeric extension. 
math_dn("ext8s", "sxtb") with_dn("ext8u", "and Xd, Xn, #0xff") @@ -684,15 +771,14 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dn("ext32s_i64", "sxtw Xd, Wn") with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") +START_COLLECTION("byteswap") + # Byte swapping. with_dn("bswap16", "rev w27, Wn", "lsr Wd, w27, #16") with_dn("bswap32", "rev Wd, Wn") with_dn("bswap64", "rev Xd, Xn") -# Memory barriers. -simple("mb_all", "dmb ish") -simple("mb_st", "dmb ishst") -simple("mb_ld", "dmb ishld") +START_COLLECTION("qemu_ld") # Handlers for QEMU_LD, which handles guest <- host loads. for subtype in ('aligned', 'unaligned', 'slowpath'): @@ -747,6 +833,9 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) +START_COLLECTION("qemu_st") + + # Handlers for QEMU_ST, which handles guest -> host stores. for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') @@ -784,5 +873,12 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) +# Print a list of output files generated. +output_c_filenames = ", ".join(f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) +output_h_filenames = ", ".join(f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) + +print("Sources generated:", file=sys.stderr) +print(f"output: [{output_c_filenames}, {output_h_filenames}, 'tcti_gadgets.h']", file=sys.stderr) + # Statistics. 
sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") From 744075b9ee623e3090340c576743a1b7024c2358 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Tue, 27 Apr 2021 14:02:53 -0600 Subject: [PATCH 28/36] TCTI: dramatically speed up build & reduce footprint --- configure | 1 + meson.build | 66 +++++++++++++++----------- tcg/aarch64-tcti/tcg-target.c.inc | 24 ++++++++-- tcg/aarch64-tcti/tcti-gadget-gen.py | 72 ++++++++++++++++------------- 4 files changed, 99 insertions(+), 64 deletions(-) diff --git a/configure b/configure index d0053845169c..c3637f198428 100755 --- a/configure +++ b/configure @@ -1604,6 +1604,7 @@ for opt do --enable-gio) gio=yes ;; --disable-gio) gio=no + ;; --enable-slirp-smbd) slirp_smbd=yes ;; --disable-slirp-smbd) slirp_smbd=no diff --git a/meson.build b/meson.build index 52f40e6b77e8..02a6354802c7 100644 --- a/meson.build +++ b/meson.build @@ -121,6 +121,9 @@ add_global_arguments(config_host['QEMU_CXXFLAGS'].split(), add_global_link_arguments(config_host['QEMU_LDFLAGS'].split(), native: false, language: ['c', 'cpp', 'objc']) +add_global_arguments('-Wno-unused-command-line-argument', native: false, language: ['c', 'objc']) +add_global_link_arguments('-ldl', native: false, language: ['c', 'cpp', 'objc']) + if targetos == 'linux' add_project_arguments('-isystem', meson.current_source_dir() / 'linux-headers', '-isystem', 'linux-headers', @@ -258,30 +261,41 @@ if not get_option('tcg').disabled() # Tell our compiler how to generate our TCTI gadgets. 
gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) gadgets = [ - 'tcti_misc_gadgets.c', - 'tcti_conditionals_gadgets.c', - 'tcti_mov_gadgets.c', - 'tcti_load_gadgets.c', - 'tcti_store_gadgets.c', - 'tcti_arithmetic_gadgets.c', - 'tcti_logical_gadgets.c', - 'tcti_extension_gadgets.c', - 'tcti_byteswap_gadgets.c', - 'tcti_qemu_ld_gadgets.c', - 'tcti_qemu_st_gadgets.c', - 'tcti_misc_gadgets.h', - 'tcti_conditionals_gadgets.h', - 'tcti_mov_gadgets.h', - 'tcti_load_gadgets.h', - 'tcti_store_gadgets.h', - 'tcti_arithmetic_gadgets.h', - 'tcti_logical_gadgets.h', - 'tcti_extension_gadgets.h', - 'tcti_byteswap_gadgets.h', - 'tcti_qemu_ld_gadgets.h', - 'tcti_qemu_st_gadgets.h', - 'tcti_gadgets.h'] - + 'tcti_gadgets.h', + 'tcti_misc_gadgets.c', + 'tcti_misc_gadgets.s', + 'tcti_misc_gadgets.h', + 'tcti_conditionals_gadgets.c', + 'tcti_conditionals_gadgets.s', + 'tcti_conditionals_gadgets.h', + 'tcti_mov_gadgets.c', + 'tcti_mov_gadgets.s', + 'tcti_mov_gadgets.h', + 'tcti_load_gadgets.c', + 'tcti_load_gadgets.s', + 'tcti_load_gadgets.h', + 'tcti_store_gadgets.c', + 'tcti_store_gadgets.s', + 'tcti_store_gadgets.h', + 'tcti_arithmetic_gadgets.c', + 'tcti_arithmetic_gadgets.s', + 'tcti_arithmetic_gadgets.h', + 'tcti_logical_gadgets.c', + 'tcti_logical_gadgets.s', + 'tcti_logical_gadgets.h', + 'tcti_extension_gadgets.c', + 'tcti_extension_gadgets.s', + 'tcti_extension_gadgets.h', + 'tcti_byteswap_gadgets.c', + 'tcti_byteswap_gadgets.s', + 'tcti_byteswap_gadgets.h', + 'tcti_qemu_ld_gadgets.c', + 'tcti_qemu_ld_gadgets.s', + 'tcti_qemu_ld_gadgets.h', + 'tcti_qemu_st_gadgets.c', + 'tcti_qemu_st_gadgets.s', + 'tcti_qemu_st_gadgets.h', + ] tcti_gadgets = custom_target('tcti-gadgets.h', output: gadgets, input: gadget_generator, @@ -498,7 +512,7 @@ endif rt = cc.find_library('rt', required: false) libdl = not_found if 'CONFIG_PLUGIN' in config_host - libdl = cc.find_library('dl', required: true) + libdl = dependency('dl', required: true) endif libiscsi = not_found if not 
get_option('libiscsi').auto() or have_block @@ -1967,7 +1981,7 @@ if get_option('b_lto') endif common_ss.add(pagevary) specific_ss.add(files('page-vary.c')) -specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('disas/tci.c', 'tcg/tci.c')) +specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c')) specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index 315033502ef7..7cf6acdc3dfa 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -22,7 +22,9 @@ * THE SOFTWARE. */ -//#define TCTI_GADGET_RICH_DISASSEMBLY + +// Rich disassembly is nice in theory, but it's -slow-. +#define TCTI_GADGET_RICH_DISASSEMBLY #define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 @@ -1037,6 +1039,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1062,7 +1065,11 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // this is a common case. Delegate to our special-case handler. if (args[2] == 0x3a) { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - + case -32: + gadget = (a_bits >= s_bits) ? + gadget_qemu_ld_leq_aligned_mode3a_off32_i64 : + gadget_qemu_ld_leq_unaligned_mode3a_off32_i64; + break; case -64: gadget = (a_bits >= s_bits) ? gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : @@ -1088,6 +1095,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Otherwise, handle the generic case. 
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; @@ -1111,6 +1119,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1137,7 +1146,11 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // this is a common case. Delegate to our special-case handler. if (args[2] == 0x3a) { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - + case -32: + gadget = (a_bits >= s_bits) ? + gadget_qemu_st_leq_aligned_mode3a_off32_i64 : + gadget_qemu_st_leq_unaligned_mode3a_off32_i64; + break; case -64: gadget = (a_bits >= s_bits) ? gadget_qemu_st_leq_aligned_mode3a_off64_i64 : @@ -1163,6 +1176,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Otherwise, handle the generic case. 
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; @@ -1351,11 +1365,11 @@ int print_insn_tcti(bfd_vma addr, disassemble_info *info) symbol_name[sizeof(symbol_name) - 1] = 0; info->fprintf_func(info->stream, "%s", symbol_name); } else { - info->fprintf_func(info->stream, "%016llx", block); + info->fprintf_func(info->stream, "%016lx", block); } #else - info->fprintf_func(info->stream, "%016llx", block); + info->fprintf_func(info->stream, "%016lx", block); #endif return sizeof(block); diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 51471719a48b..800702fad3db 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -4,6 +4,7 @@ Generates a C-code include file containing 'gadgets' for use by TCTI. """ +import os import sys import itertools @@ -26,7 +27,7 @@ # We'll create a variety of gadgets that assume the MMU's TLB is stored at certain # offsets into its structure. These should match the offsets in tcg-target.c.in. -QEMU_ALLOWED_MMU_OFFSETS = [ 64, 96, 128 ] +QEMU_ALLOWED_MMU_OFFSETS = [ 32, 64, 96, 128 ] # Statistics. gadgets = 0 @@ -54,14 +55,16 @@ def START_COLLECTION(name): # Create the relevant output files new_c_file = open(f"tcti_{name}_gadgets.c", "w") + new_s_file = open(f"tcti_{name}_gadgets.s", "w") new_h_file = open(f"tcti_{name}_gadgets.h", "w") - output_files[name] = (new_c_file, new_h_file) + output_files[name] = (new_c_file, new_s_file, new_h_file) # Add the file to our gadget collection. print(f'#include "tcti_{name}_gadgets.h"', file=top_header) # Add generated messages to the relevant collection. 
print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_c_file) + print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_s_file) print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_h_file) # Start our C file with inclusion of the relevant header. @@ -82,22 +85,19 @@ def simple(name, *lines): gadgets += 1 # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() # Create our C/ASM framing. - print(f"__attribute__((naked)) void gadget_{name}(void);", file=h_file) - print(f"__attribute__((naked)) void gadget_{name}(void)", file=c_file) - print("{", file=c_file) + print(f"void gadget_{name}(void);", file=h_file) + print(f".global gadget_{name}", file=s_file) + print(f"gadget_{name}:", file=s_file) # Add the core gadget - print("\tasm(", file=c_file) for line in lines + EPILOGUE: - print(f"\t\t\"{line} \\n\"", file=c_file) + print(f"\t{line}", file=s_file) instructions += 1 - print("\t);", file=c_file) - # End our framing. - print("}\n", file=c_file) + print(f"", file=s_file) def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): @@ -149,7 +149,7 @@ def with_dnm(name, *lines): with_register_substitutions(name, ("d", "n", "m"), *lines) # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() # Print out an extern. print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) @@ -180,7 +180,7 @@ def with_dn_immediate(name, *lines, immediate_range): with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() # Print out an extern. 
print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) @@ -211,7 +211,7 @@ def with_pair(name, substitutions, *lines): with_register_substitutions(name, substitutions, *lines) # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) @@ -236,10 +236,10 @@ def math_dnm(name, mnemonic): with_dnm(f'{name}_i32', f"{mnemonic} Wd, Wn, Wm") with_dnm(f'{name}_i64', f"{mnemonic} Xd, Xn, Xm") -def math_dn(name, mnemonic): +def math_dn(name, mnemonic, source_is_wn=False): """ Equivalent to `with_dn`, but creates a _i32 and _i64 variant. For simple math. """ with_dn(f'{name}_i32', f"{mnemonic} Wd, Wn") - with_dn(f'{name}_i64', f"{mnemonic} Xd, Xn") + with_dn(f'{name}_i64', f"{mnemonic} Xd, Wn" if source_is_wn else f"{mnemonic} Xd, Xn") def with_nm(name, *lines): @@ -281,7 +281,7 @@ def with_single(name, substitution, *lines): with_register_substitutions(name, (substitution,), *lines) # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) @@ -300,7 +300,7 @@ def with_d_immediate(name, *lines, immediate_range=range(0)): with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) # Fetch the files we'll be using for output. - c_file, h_file = _get_output_files() + c_file, s_file, h_file = _get_output_files() print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) @@ -651,7 +651,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x27, [x28], #8", # Perform our comparison and conditional branch. 
- "subs Wzr, Wn, Wm", + "subs wzr, Wn, Wm", f"b{condition} 1f", "0:", # not taken @@ -670,7 +670,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x27, [x28], #8", # Perform our comparison and conditional branch. - "subs Xzr, Xn, Xm", + "subs xzr, Xn, Xm", f"b{condition} 1f", "0:", # not taken @@ -697,22 +697,19 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) -START_COLLECTION("load_unsigned") +START_COLLECTION("load") # LOAD variants. # TODO: should the signed variants have X variants for _i64? ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") -ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") -ldst_dn("ld32u", "ldr Wd, [Xn, x27]") -ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") - -START_COLLECTION("load_signed") - ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") START_COLLECTION("store") @@ -764,9 +761,9 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, START_COLLECTION("extension") # Numeric extension. -math_dn("ext8s", "sxtb") +math_dn("ext8s", "sxtb", source_is_wn=True) with_dn("ext8u", "and Xd, Xn, #0xff") -math_dn("ext16s", "sxth") +math_dn("ext16s", "sxth", source_is_wn=True) with_dn("ext16u", "and Wd, Wn, #0xffff") with_dn("ext32s_i64", "sxtw Xd, Wn") with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") @@ -874,11 +871,20 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # Print a list of output files generated. 
-output_c_filenames = ", ".join(f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) -output_h_filenames = ", ".join(f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) +output_c_filenames = (f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) +output_s_filenames = (f"'tcti_{name}_gadgets.s'" for name in output_files.keys()) +output_h_filenames = (f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) + +print("Sources generated:", file=sys.stderr) +print(f"gadgets = [", file=sys.stderr) +print(" tcti_gadgets.h,", file=sys.stderr) + +for name in output_files.keys(): + print(f" tcti_{name}_gadgets.c,", file=sys.stderr) + print(f" tcti_{name}_gadgets.s,", file=sys.stderr) + print(f" tcti_{name}_gadgets.h,", file=sys.stderr) -print("Sources generated:", file=sys.stderr) -print(f"output: [{output_c_filenames}, {output_h_filenames}, 'tcti_gadgets.h']", file=sys.stderr) +print(f"]", file=sys.stderr) # Statistics. sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") From 6764fe306fe541ee7768b545b4d853b5fa74b275 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Wed, 28 Apr 2021 05:37:42 -0600 Subject: [PATCH 29/36] TCTI: more modular hacks --- meson.build | 69 ++++++++++++++++++++++----- tcg/aarch64-tcti/tcg-target.c.inc | 18 +++---- tcg/aarch64-tcti/tcti-gadget-gen.py | 73 ++++++++++++++++++----------- 3 files changed, 112 insertions(+), 48 deletions(-) diff --git a/meson.build b/meson.build index 02a6354802c7..dfd1606bfa30 100644 --- a/meson.build +++ b/meson.build @@ -265,15 +265,21 @@ if not get_option('tcg').disabled() 'tcti_misc_gadgets.c', 'tcti_misc_gadgets.s', 'tcti_misc_gadgets.h', - 'tcti_conditionals_gadgets.c', - 'tcti_conditionals_gadgets.s', - 'tcti_conditionals_gadgets.h', + 'tcti_setcond_gadgets.c', + 'tcti_setcond_gadgets.s', + 'tcti_setcond_gadgets.h', + 'tcti_brcond_gadgets.c', + 'tcti_brcond_gadgets.s', + 'tcti_brcond_gadgets.h', 'tcti_mov_gadgets.c', 
'tcti_mov_gadgets.s', 'tcti_mov_gadgets.h', - 'tcti_load_gadgets.c', - 'tcti_load_gadgets.s', - 'tcti_load_gadgets.h', + 'tcti_load_signed_gadgets.c', + 'tcti_load_signed_gadgets.s', + 'tcti_load_signed_gadgets.h', + 'tcti_load_unsigned_gadgets.c', + 'tcti_load_unsigned_gadgets.s', + 'tcti_load_unsigned_gadgets.h', 'tcti_store_gadgets.c', 'tcti_store_gadgets.s', 'tcti_store_gadgets.h', @@ -289,12 +295,51 @@ if not get_option('tcg').disabled() 'tcti_byteswap_gadgets.c', 'tcti_byteswap_gadgets.s', 'tcti_byteswap_gadgets.h', - 'tcti_qemu_ld_gadgets.c', - 'tcti_qemu_ld_gadgets.s', - 'tcti_qemu_ld_gadgets.h', - 'tcti_qemu_st_gadgets.c', - 'tcti_qemu_st_gadgets.s', - 'tcti_qemu_st_gadgets.h', + 'tcti_qemu_ld_aligned_signed_le_gadgets.c', + 'tcti_qemu_ld_aligned_signed_le_gadgets.s', + 'tcti_qemu_ld_aligned_signed_le_gadgets.h', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.c', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.s', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.h', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.c', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.s', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.h', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.s', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.c', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.s', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.h', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.c', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.s', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.h', + 'tcti_qemu_ld_aligned_be_gadgets.c', + 'tcti_qemu_ld_aligned_be_gadgets.s', + 'tcti_qemu_ld_aligned_be_gadgets.h', + 'tcti_qemu_ld_unaligned_be_gadgets.c', + 'tcti_qemu_ld_unaligned_be_gadgets.s', + 'tcti_qemu_ld_unaligned_be_gadgets.h', + 'tcti_qemu_ld_slowpath_be_gadgets.c', + 'tcti_qemu_ld_slowpath_be_gadgets.s', + 'tcti_qemu_ld_slowpath_be_gadgets.h', + 'tcti_qemu_st_aligned_le_gadgets.c', + 'tcti_qemu_st_aligned_le_gadgets.s', + 
'tcti_qemu_st_aligned_le_gadgets.h', + 'tcti_qemu_st_unaligned_le_gadgets.c', + 'tcti_qemu_st_unaligned_le_gadgets.s', + 'tcti_qemu_st_unaligned_le_gadgets.h', + 'tcti_qemu_st_slowpath_le_gadgets.c', + 'tcti_qemu_st_slowpath_le_gadgets.s', + 'tcti_qemu_st_slowpath_le_gadgets.h', + 'tcti_qemu_st_aligned_be_gadgets.c', + 'tcti_qemu_st_aligned_be_gadgets.s', + 'tcti_qemu_st_aligned_be_gadgets.h', + 'tcti_qemu_st_unaligned_be_gadgets.c', + 'tcti_qemu_st_unaligned_be_gadgets.s', + 'tcti_qemu_st_unaligned_be_gadgets.h', + 'tcti_qemu_st_slowpath_be_gadgets.c', + 'tcti_qemu_st_slowpath_be_gadgets.s', + 'tcti_qemu_st_slowpath_be_gadgets.h', ] tcti_gadgets = custom_target('tcti-gadgets.h', output: gadgets, diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index 7cf6acdc3dfa..b5da8e228162 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -365,13 +365,13 @@ tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, /* Write gadget pointer. */ -static void tcg_out_nullary_gadget(TCGContext *s, void *gadget) +static void tcg_out_nullary_gadget(TCGContext *s, const void *gadget) { tcg_out_immediate(s, (tcg_target_ulong)gadget); } /* Write gadget pointer, plus 64b immediate. */ -static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong immediate) +static void tcg_out_imm64_gadget(TCGContext *s, const void *gadget, tcg_target_ulong immediate) { tcg_out_nullary_gadget(s, gadget); tcg_out64(s, immediate); @@ -379,21 +379,21 @@ static void tcg_out_imm64_gadget(TCGContext *s, void *gadget, tcg_target_ulong i /* Write gadget pointer (one register). */ -static void tcg_out_unary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +static void tcg_out_unary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) { tcg_out_nullary_gadget(s, gadget_base[reg0]); } /* Write gadget pointer (two registers). 
*/ -static void tcg_out_binary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +static void tcg_out_binary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) { tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); } /* Write gadget pointer (three registers). */ -static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) { tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); } @@ -403,10 +403,10 @@ static void tcg_out_ternary_gadget(TCGContext *s, void *gadget_base[TCG_TARGET_N * Version of our LDST generator that defers to more optimized gadgets selectively. */ static void tcg_out_ldst_gadget_inner(TCGContext *s, - void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], - void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + const void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], unsigned reg0, unsigned reg1, uint32_t offset) { int64_t extended_offset = (int32_t)offset; diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 800702fad3db..3501988acc80 
100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -53,6 +53,10 @@ def START_COLLECTION(name): global current_collection + # If we already have a collection for this, skip it. + if name in output_files: + return + # Create the relevant output files new_c_file = open(f"tcti_{name}_gadgets.c", "w") new_s_file = open(f"tcti_{name}_gadgets.s", "w") @@ -152,10 +156,10 @@ def with_dnm(name, *lines): c_file, s_file, h_file = _get_output_files() # Print out an extern. - print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) # Print out an array that contains all of our gadgets, for lookup. - print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) print("{", file=c_file) # D array @@ -183,10 +187,10 @@ def with_dn_immediate(name, *lines, immediate_range): c_file, s_file, h_file = _get_output_files() # Print out an extern. - print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) # Print out an array that contains all of our gadgets, for lookup. - print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}] = ", end="", file=c_file) print("{", file=c_file) # D array @@ -213,10 +217,10 @@ def with_pair(name, substitutions, *lines): # Fetch the files we'll be using for output. 
c_file, s_file, h_file = _get_output_files() - print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) # Print out an array that contains all of our gadgets, for lookup. - print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}] = ", end="", file=c_file) print("{", file=c_file) # N array @@ -283,10 +287,10 @@ def with_single(name, substitution, *lines): # Fetch the files we'll be using for output. c_file, s_file, h_file = _get_output_files() - print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) + print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) # Print out an array that contains all of our gadgets, for lookup. - print(f"void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="", file=c_file) + print(f"const void* gadget_{name}[{TCG_REGISTER_COUNT}] = ", end="", file=c_file) print("{", file=c_file) for n in TCG_REGISTER_NUMBERS: @@ -619,11 +623,12 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, simple("mb_ld", "dmb ishld") -START_COLLECTION("conditionals") for condition in ARCH_CONDITION_CODES: + START_COLLECTION("setcond") + # Performs a comparison between two operands. with_dnm(f"setcond_i32_{condition}", "subs Wd, Wn, Wm", @@ -644,6 +649,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # branch is funneled throught the same address. # + START_COLLECTION("brcond") + # Branches iff a given comparison is true. 
with_dnm(f'brcond_i32_{condition}', @@ -697,19 +704,22 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_d_immediate("movi_imm_i32", "mov Wd, #Ii", immediate_range=range(64)) with_d_immediate("movi_imm_i64", "mov Xd, #Ii", immediate_range=range(64)) -START_COLLECTION("load") +START_COLLECTION("load_unsigned") # LOAD variants. # TODO: should the signed variants have X variants for _i64? ldst_dn("ld8u", "ldrb Wd, [Xn, x27]") +ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") +ldst_dn("ld32u", "ldr Wd, [Xn, x27]") +ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") + +START_COLLECTION("load_signed") + ldst_dn("ld8s_i32", "ldrsb Wd, [Xn, x27]") ldst_dn("ld8s_i64", "ldrsb Xd, [Xn, x27]") -ldst_dn("ld16u", "ldrh Wd, [Xn, x27]") ldst_dn("ld16s_i32", "ldrsh Wd, [Xn, x27]") ldst_dn("ld16s_i64", "ldrsh Xd, [Xn, x27]") -ldst_dn("ld32u", "ldr Wd, [Xn, x27]") ldst_dn("ld32s_i64", "ldrsw Xd, [Xn, x27]") -ldst_dn("ld_i64", "ldr Xd, [Xn, x27]") START_COLLECTION("store") @@ -775,48 +785,54 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dn("bswap32", "rev Wd, Wn") with_dn("bswap64", "rev Xd, Xn") -START_COLLECTION("qemu_ld") # Handlers for QEMU_LD, which handles guest <- host loads. 
for subtype in ('aligned', 'unaligned', 'slowpath'): is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_ld_{subtype}_unsigned_le") + ld_thunk(f"qemu_ld_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", - fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leuw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu", fastpath_32b=["ldrh Wd, [Xn, x27]"], fastpath_64b=["ldrh Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_lduw_mmu_signed", - fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leul_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu", fastpath_32b=["ldr Wd, [Xn, x27]"], fastpath_64b=["ldr Wd, [Xn, x27]"], force_slowpath=is_slowpath, ) - ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", - fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], - force_slowpath=is_slowpath, - ) ld_thunk(f"qemu_ld_leq_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], force_slowpath=is_slowpath, ) + START_COLLECTION(f"qemu_ld_{subtype}_signed_le") + + ld_thunk(f"qemu_ld_sb_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu_signed", + fastpath_32b=["ldrsb Wd, [Xn, x27]"], fastpath_64b=["ldrsb Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesw_{subtype}", is_aligned=is_aligned, 
slowpath_helper="helper_le_lduw_mmu_signed", + fastpath_32b=["ldrsh Wd, [Xn, x27]"], fastpath_64b=["ldrsh Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + ld_thunk(f"qemu_ld_lesl_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_le_ldul_mmu_signed", + fastpath_32b=["ldrsw Xd, [Xn, x27]"], fastpath_64b=["ldrsw Xd, [Xn, x27]"], + force_slowpath=is_slowpath, + ) + # Special variant for the most common mode, as a speedup optimization. ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_ld_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. ld_thunk(f"qemu_ld_beuw_{subtype}", None, None, "helper_be_lduw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) @@ -830,7 +846,6 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) -START_COLLECTION("qemu_st") # Handlers for QEMU_ST, which handles guest -> host stores. @@ -838,6 +853,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned = (subtype == 'aligned') is_slowpath = (subtype == 'slowpath') + START_COLLECTION(f"qemu_st_{subtype}_le") + st_thunk(f"qemu_st_ub_{subtype}", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], force_slowpath=is_slowpath, @@ -861,6 +878,8 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, force_slowpath=is_slowpath, immediate=0x3a ) + START_COLLECTION(f"qemu_st_{subtype}_be") + # For now, leave the rare/big-endian stuff slow-path only. 
st_thunk(f"qemu_st_beuw_{subtype}", None, None, "helper_be_stw_mmu", is_aligned=is_aligned, force_slowpath=is_slowpath) From 69c9fe3f2edc904e93f3dded97d50e02db27dbed Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Thu, 29 Apr 2021 04:23:23 -0600 Subject: [PATCH 30/36] TCTI: modularize things more for fast clang --- configure | 14 ++++++++ meson.build | 50 +++++++++++------------------ tcg/aarch64-tcti/tcti-gadget-gen.py | 43 +++++++++++++------------ 3 files changed, 55 insertions(+), 52 deletions(-) diff --git a/configure b/configure index c3637f198428..9858e71f538a 100755 --- a/configure +++ b/configure @@ -479,6 +479,8 @@ for opt do ;; --cc=*) CC="$optarg" ;; + --ld=*) LD="$optarg" + ;; --cxx=*) CXX="$optarg" ;; --cpu=*) cpu="$optarg" @@ -908,6 +910,8 @@ for opt do ;; --cc=*) ;; + --ld=*) ld="$optarg" + ;; --host-cc=*) host_cc="$optarg" ;; --cxx=*) @@ -1696,6 +1700,13 @@ case "$cpu" in # No special flags required for other host CPUs esac +# XXX +QEMU_LDFLAGS="$QEMU_LDFLAGS -Wl,-no_deduplicate" +QEMU_LDFLAGS="$QEMU_LDFLAGS -Wl,-random_uuid" +#QEMU_LDFLAGS="$QEMU_LDFLAGS -Wl,-force_load" +QEMU_LDFLAGS="$QEMU_LDFLAGS -Wl,-no_compact_unwind" + + eval "cross_cc_${cpu}=\$host_cc" cross_cc_vars="$cross_cc_vars cross_cc_${cpu}" QEMU_CFLAGS="$CPU_CFLAGS $QEMU_CFLAGS" @@ -6473,6 +6484,9 @@ else fi mv $cross config-meson.cross +export CC_LD="$ld" +export CXX_LD="$ld" + rm -rf meson-private meson-info meson-logs unset staticpic if ! 
version_ge "$($meson --version)" 0.56.0; then diff --git a/meson.build b/meson.build index dfd1606bfa30..7927a1d163ab 100644 --- a/meson.build +++ b/meson.build @@ -142,6 +142,7 @@ if link_language == 'cpp' endif if host_machine.system() == 'darwin' add_languages('objc', required: false, native: false) + add_project_link_arguments(['-fvisibility-inlines-hidden', '-Xlinker', '-no_deduplicate'], native: false, language: ['c', 'cpp', 'objc']) endif sparse = find_program('cgcc', required: get_option('sparse')) @@ -260,95 +261,68 @@ if not get_option('tcg').disabled() # Tell our compiler how to generate our TCTI gadgets. gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) - gadgets = [ + tcti_sources = [ 'tcti_gadgets.h', - 'tcti_misc_gadgets.c', 'tcti_misc_gadgets.s', 'tcti_misc_gadgets.h', - 'tcti_setcond_gadgets.c', 'tcti_setcond_gadgets.s', 'tcti_setcond_gadgets.h', - 'tcti_brcond_gadgets.c', 'tcti_brcond_gadgets.s', 'tcti_brcond_gadgets.h', - 'tcti_mov_gadgets.c', 'tcti_mov_gadgets.s', 'tcti_mov_gadgets.h', - 'tcti_load_signed_gadgets.c', 'tcti_load_signed_gadgets.s', 'tcti_load_signed_gadgets.h', - 'tcti_load_unsigned_gadgets.c', 'tcti_load_unsigned_gadgets.s', 'tcti_load_unsigned_gadgets.h', - 'tcti_store_gadgets.c', 'tcti_store_gadgets.s', 'tcti_store_gadgets.h', - 'tcti_arithmetic_gadgets.c', 'tcti_arithmetic_gadgets.s', 'tcti_arithmetic_gadgets.h', - 'tcti_logical_gadgets.c', 'tcti_logical_gadgets.s', 'tcti_logical_gadgets.h', - 'tcti_extension_gadgets.c', 'tcti_extension_gadgets.s', 'tcti_extension_gadgets.h', - 'tcti_byteswap_gadgets.c', 'tcti_byteswap_gadgets.s', 'tcti_byteswap_gadgets.h', - 'tcti_qemu_ld_aligned_signed_le_gadgets.c', 'tcti_qemu_ld_aligned_signed_le_gadgets.s', 'tcti_qemu_ld_aligned_signed_le_gadgets.h', - 'tcti_qemu_ld_unaligned_signed_le_gadgets.c', 'tcti_qemu_ld_unaligned_signed_le_gadgets.s', 'tcti_qemu_ld_unaligned_signed_le_gadgets.h', - 'tcti_qemu_ld_slowpath_signed_le_gadgets.c', 
'tcti_qemu_ld_slowpath_signed_le_gadgets.s', 'tcti_qemu_ld_slowpath_signed_le_gadgets.h', - 'tcti_qemu_ld_aligned_unsigned_le_gadgets.c', 'tcti_qemu_ld_aligned_unsigned_le_gadgets.s', 'tcti_qemu_ld_aligned_unsigned_le_gadgets.h', - 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.c', 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.s', 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.h', - 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.c', 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.s', 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.h', - 'tcti_qemu_ld_aligned_be_gadgets.c', 'tcti_qemu_ld_aligned_be_gadgets.s', 'tcti_qemu_ld_aligned_be_gadgets.h', - 'tcti_qemu_ld_unaligned_be_gadgets.c', 'tcti_qemu_ld_unaligned_be_gadgets.s', 'tcti_qemu_ld_unaligned_be_gadgets.h', - 'tcti_qemu_ld_slowpath_be_gadgets.c', 'tcti_qemu_ld_slowpath_be_gadgets.s', 'tcti_qemu_ld_slowpath_be_gadgets.h', - 'tcti_qemu_st_aligned_le_gadgets.c', 'tcti_qemu_st_aligned_le_gadgets.s', 'tcti_qemu_st_aligned_le_gadgets.h', - 'tcti_qemu_st_unaligned_le_gadgets.c', 'tcti_qemu_st_unaligned_le_gadgets.s', 'tcti_qemu_st_unaligned_le_gadgets.h', - 'tcti_qemu_st_slowpath_le_gadgets.c', 'tcti_qemu_st_slowpath_le_gadgets.s', 'tcti_qemu_st_slowpath_le_gadgets.h', - 'tcti_qemu_st_aligned_be_gadgets.c', 'tcti_qemu_st_aligned_be_gadgets.s', 'tcti_qemu_st_aligned_be_gadgets.h', - 'tcti_qemu_st_unaligned_be_gadgets.c', 'tcti_qemu_st_unaligned_be_gadgets.s', 'tcti_qemu_st_unaligned_be_gadgets.h', - 'tcti_qemu_st_slowpath_be_gadgets.c', 'tcti_qemu_st_slowpath_be_gadgets.s', 'tcti_qemu_st_slowpath_be_gadgets.h', ] tcti_gadgets = custom_target('tcti-gadgets.h', - output: gadgets, + output: tcti_sources, input: gadget_generator, command: [find_program(gadget_generator)], build_by_default: true, build_always_stale: false) - elif config_host['ARCH'] == 'sparc64' tcg_arch = 'sparc' elif config_host['ARCH'] == 's390x' @@ -1806,6 +1780,7 @@ qom_ss = ss.source_set() softmmu_ss = ss.source_set() specific_fuzz_ss = ss.source_set() specific_ss = 
ss.source_set() +tcti_ss = ss.source_set() stub_ss = ss.source_set() trace_ss = ss.source_set() user_ss = ss.source_set() @@ -2027,7 +2002,10 @@ endif common_ss.add(pagevary) specific_ss.add(files('page-vary.c')) specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c')) -specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) + +# FIXME: This is being used for now for development quickness, but these realy should be +# added to a gadget-specific shared library. +tcti_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') subdir('disas') @@ -2197,6 +2175,12 @@ common_all = static_library('common', dependencies: common_all.dependencies(), name_suffix: 'fa') +tcti_all = common_ss.apply(config_all, strict: false) +tcti_all = shared_library('tcti', + build_by_default: false, + sources: tcti_gadgets, + name_suffix: 'fa') + feature_to_c = find_program('scripts/feature_to_c.sh') emulators = {} @@ -2270,7 +2254,7 @@ foreach target : target_dirs arch_deps += t.dependencies() target_common = common_ss.apply(config_target, strict: false) - objects = common_all.extract_objects(target_common.sources()) + objects = [common_all.extract_objects(target_common.sources()), tcti_all.extract_all_objects()] deps = target_common.dependencies() target_specific = specific_ss.apply(config_target, strict: false) @@ -2301,7 +2285,9 @@ foreach target : target_dirs include_directories: target_inc, c_args: c_args, build_by_default: false, - kwargs: build_lib_args) + kwargs: build_lib_args, + link_with: tcti_all + ) if target.endswith('-softmmu') execs = [{ diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 3501988acc80..60dba2ffc907 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -59,16 +59,14 @@ def START_COLLECTION(name): # Create the relevant output files new_c_file = open(f"tcti_{name}_gadgets.c", "w") - new_s_file = 
open(f"tcti_{name}_gadgets.s", "w") new_h_file = open(f"tcti_{name}_gadgets.h", "w") - output_files[name] = (new_c_file, new_s_file, new_h_file) + output_files[name] = (new_c_file, new_h_file) # Add the file to our gadget collection. print(f'#include "tcti_{name}_gadgets.h"', file=top_header) # Add generated messages to the relevant collection. print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_c_file) - print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_s_file) print("/* Automatically generated by tcti-gadget-gen.py. Do not edit. */\n", file=new_h_file) # Start our C file with inclusion of the relevant header. @@ -81,7 +79,7 @@ def START_COLLECTION(name): current_collection = name -def simple(name, *lines): +def simple(name, *lines, export=True): """ Generates a simple gadget that needs no per-register specialization. """ global gadgets, instructions @@ -89,19 +87,26 @@ def simple(name, *lines): gadgets += 1 # Fetch the files we'll be using for output. - c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() # Create our C/ASM framing. - print(f"void gadget_{name}(void);", file=h_file) - print(f".global gadget_{name}", file=s_file) - print(f"gadget_{name}:", file=s_file) + if export: + print(f"__attribute__((naked)) void gadget_{name}(void);", file=h_file) + print(f"__attribute__((naked)) void gadget_{name}(void)", file=c_file) + else: + print(f"static __attribute__((naked)) void gadget_{name}(void)", file=c_file) + + print("{", file=c_file) # Add the core gadget + print("\tasm(", file=c_file) for line in lines + EPILOGUE: - print(f"\t{line}", file=s_file) + print(f"\t\t\"{line} \\n\"", file=c_file) instructions += 1 + print("\t);", file=c_file) - print(f"", file=s_file) + # End our framing. 
+ print("}\n", file=c_file) def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): @@ -145,7 +150,7 @@ def substitutions_for_letter(letter, number, line): # ... and emit the gadget. permutation_id = "_arg".join(str(number) for number in permutation) - simple(f"{name}_arg{permutation_id}", *new_lines) + simple(f"{name}_arg{permutation_id}", *new_lines, export=False) def with_dnm(name, *lines): @@ -153,7 +158,7 @@ def with_dnm(name, *lines): with_register_substitutions(name, ("d", "n", "m"), *lines) # Fetch the files we'll be using for output. - c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() # Print out an extern. print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) @@ -184,7 +189,7 @@ def with_dn_immediate(name, *lines, immediate_range): with_register_substitutions(name, ["d", "n"], *lines, immediate_range=immediate_range) # Fetch the files we'll be using for output. - c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() # Print out an extern. print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) @@ -215,7 +220,7 @@ def with_pair(name, substitutions, *lines): with_register_substitutions(name, substitutions, *lines) # Fetch the files we'll be using for output. - c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}][{TCG_REGISTER_COUNT}];", file=h_file) @@ -285,7 +290,7 @@ def with_single(name, substitution, *lines): with_register_substitutions(name, (substitution,), *lines) # Fetch the files we'll be using for output. 
- c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() print(f"extern const void* gadget_{name}[{TCG_REGISTER_COUNT}];", file=h_file) @@ -304,7 +309,7 @@ def with_d_immediate(name, *lines, immediate_range=range(0)): with_register_substitutions(name, ['d'], *lines, immediate_range=immediate_range) # Fetch the files we'll be using for output. - c_file, s_file, h_file = _get_output_files() + c_file, h_file = _get_output_files() print(f"extern void* gadget_{name}[{TCG_REGISTER_COUNT}][{len(immediate_range)}];", file=h_file) @@ -480,7 +485,7 @@ def ld_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "mov x3, x28", # Perform our actual core code. - f"bl {slowpath_helper}", + f"bl _{slowpath_helper}", # Temporarily store our result in a register that won't get trashed. "mov x27, x0", @@ -560,7 +565,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "mov x4, x28", # Perform our actual core code. - f"bl {slowpath_helper}", + f"bl _{slowpath_helper}", # Restore our registers after our C call. *C_CALL_EPILOGUE, @@ -891,7 +896,6 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # Print a list of output files generated. 
output_c_filenames = (f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) -output_s_filenames = (f"'tcti_{name}_gadgets.s'" for name in output_files.keys()) output_h_filenames = (f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) print("Sources generated:", file=sys.stderr) @@ -900,7 +904,6 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, for name in output_files.keys(): print(f" tcti_{name}_gadgets.c,", file=sys.stderr) - print(f" tcti_{name}_gadgets.s,", file=sys.stderr) print(f" tcti_{name}_gadgets.h,", file=sys.stderr) print(f"]", file=sys.stderr) From cd4b5da233938fe85dbc7488ed61f3525fc7e1b5 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Fri, 30 Apr 2021 11:40:17 -0600 Subject: [PATCH 31/36] TCTI: optimize I/O operation and helper call speeds --- meson.build | 72 ++++++++-------- tcg/aarch64-tcti/tcg-target.c.inc | 122 ++++++++++++++-------------- tcg/aarch64-tcti/tcti-gadget-gen.py | 41 +++++----- 3 files changed, 117 insertions(+), 118 deletions(-) diff --git a/meson.build b/meson.build index 7927a1d163ab..6d9d804bf1aa 100644 --- a/meson.build +++ b/meson.build @@ -263,57 +263,57 @@ if not get_option('tcg').disabled() gadget_generator = 'tcg/@0@/tcti-gadget-gen.py'.format(tcg_arch) tcti_sources = [ 'tcti_gadgets.h', - 'tcti_misc_gadgets.s', + 'tcti_misc_gadgets.c', 'tcti_misc_gadgets.h', - 'tcti_setcond_gadgets.s', + 'tcti_setcond_gadgets.c', 'tcti_setcond_gadgets.h', - 'tcti_brcond_gadgets.s', + 'tcti_brcond_gadgets.c', 'tcti_brcond_gadgets.h', - 'tcti_mov_gadgets.s', + 'tcti_mov_gadgets.c', 'tcti_mov_gadgets.h', - 'tcti_load_signed_gadgets.s', + 'tcti_load_signed_gadgets.c', 'tcti_load_signed_gadgets.h', - 'tcti_load_unsigned_gadgets.s', + 'tcti_load_unsigned_gadgets.c', 'tcti_load_unsigned_gadgets.h', - 'tcti_store_gadgets.s', + 'tcti_store_gadgets.c', 'tcti_store_gadgets.h', - 'tcti_arithmetic_gadgets.s', + 'tcti_arithmetic_gadgets.c', 'tcti_arithmetic_gadgets.h', - 'tcti_logical_gadgets.s', + 
'tcti_logical_gadgets.c', 'tcti_logical_gadgets.h', - 'tcti_extension_gadgets.s', + 'tcti_extension_gadgets.c', 'tcti_extension_gadgets.h', - 'tcti_byteswap_gadgets.s', + 'tcti_byteswap_gadgets.c', 'tcti_byteswap_gadgets.h', - 'tcti_qemu_ld_aligned_signed_le_gadgets.s', + 'tcti_qemu_ld_aligned_signed_le_gadgets.c', 'tcti_qemu_ld_aligned_signed_le_gadgets.h', - 'tcti_qemu_ld_unaligned_signed_le_gadgets.s', + 'tcti_qemu_ld_unaligned_signed_le_gadgets.c', 'tcti_qemu_ld_unaligned_signed_le_gadgets.h', - 'tcti_qemu_ld_slowpath_signed_le_gadgets.s', + 'tcti_qemu_ld_slowpath_signed_le_gadgets.c', 'tcti_qemu_ld_slowpath_signed_le_gadgets.h', - 'tcti_qemu_ld_aligned_unsigned_le_gadgets.s', + 'tcti_qemu_ld_aligned_unsigned_le_gadgets.c', 'tcti_qemu_ld_aligned_unsigned_le_gadgets.h', - 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.s', + 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.c', 'tcti_qemu_ld_unaligned_unsigned_le_gadgets.h', - 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.s', + 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.c', 'tcti_qemu_ld_slowpath_unsigned_le_gadgets.h', - 'tcti_qemu_ld_aligned_be_gadgets.s', + 'tcti_qemu_ld_aligned_be_gadgets.c', 'tcti_qemu_ld_aligned_be_gadgets.h', - 'tcti_qemu_ld_unaligned_be_gadgets.s', + 'tcti_qemu_ld_unaligned_be_gadgets.c', 'tcti_qemu_ld_unaligned_be_gadgets.h', - 'tcti_qemu_ld_slowpath_be_gadgets.s', + 'tcti_qemu_ld_slowpath_be_gadgets.c', 'tcti_qemu_ld_slowpath_be_gadgets.h', - 'tcti_qemu_st_aligned_le_gadgets.s', + 'tcti_qemu_st_aligned_le_gadgets.c', 'tcti_qemu_st_aligned_le_gadgets.h', - 'tcti_qemu_st_unaligned_le_gadgets.s', + 'tcti_qemu_st_unaligned_le_gadgets.c', 'tcti_qemu_st_unaligned_le_gadgets.h', - 'tcti_qemu_st_slowpath_le_gadgets.s', + 'tcti_qemu_st_slowpath_le_gadgets.c', 'tcti_qemu_st_slowpath_le_gadgets.h', - 'tcti_qemu_st_aligned_be_gadgets.s', + 'tcti_qemu_st_aligned_be_gadgets.c', 'tcti_qemu_st_aligned_be_gadgets.h', - 'tcti_qemu_st_unaligned_be_gadgets.s', + 'tcti_qemu_st_unaligned_be_gadgets.c', 
'tcti_qemu_st_unaligned_be_gadgets.h', - 'tcti_qemu_st_slowpath_be_gadgets.s', + 'tcti_qemu_st_slowpath_be_gadgets.c', 'tcti_qemu_st_slowpath_be_gadgets.h', ] tcti_gadgets = custom_target('tcti-gadgets.h', @@ -1780,7 +1780,7 @@ qom_ss = ss.source_set() softmmu_ss = ss.source_set() specific_fuzz_ss = ss.source_set() specific_ss = ss.source_set() -tcti_ss = ss.source_set() +#tcti_ss = ss.source_set() stub_ss = ss.source_set() trace_ss = ss.source_set() user_ss = ss.source_set() @@ -2004,8 +2004,8 @@ specific_ss.add(files('page-vary.c')) specific_ss.add(when: 'CONFIG_TCG_INTERPRETER', if_true: files('tcg/tci.c')) # FIXME: This is being used for now for development quickness, but these realy should be -# added to a gadget-specific shared library. -tcti_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) +# added to a gadget-specific shared library (tcti_ss). +specific_ss.add(when: 'CONFIG_TCG_THREADED_INTERPRETER', if_true: tcti_gadgets) subdir('backends') subdir('disas') @@ -2175,11 +2175,11 @@ common_all = static_library('common', dependencies: common_all.dependencies(), name_suffix: 'fa') -tcti_all = common_ss.apply(config_all, strict: false) -tcti_all = shared_library('tcti', - build_by_default: false, - sources: tcti_gadgets, - name_suffix: 'fa') +#tcti_all = common_ss.apply(config_all, strict: false) +#tcti_all = shared_library('tcti', +# build_by_default: false, +# sources: tcti_gadgets, +# name_suffix: 'fa') feature_to_c = find_program('scripts/feature_to_c.sh') @@ -2254,7 +2254,7 @@ foreach target : target_dirs arch_deps += t.dependencies() target_common = common_ss.apply(config_target, strict: false) - objects = [common_all.extract_objects(target_common.sources()), tcti_all.extract_all_objects()] + objects = [common_all.extract_objects(target_common.sources())] #, tcti_all.extract_all_objects()] deps = target_common.dependencies() target_specific = specific_ss.apply(config_target, strict: false) @@ -2286,7 +2286,7 @@ foreach target : 
target_dirs c_args: c_args, build_by_default: false, kwargs: build_lib_args, - link_with: tcti_all + #link_with: tcti_all ) if target.endswith('-softmmu') diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index b5da8e228162..4233acf10976 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -104,6 +104,39 @@ } +#define LOOKUP_SPECIAL_CASE_LDST_GADGET(arg, name, mode) \ + switch(TLB_MASK_TABLE_OFS(get_mmuidx(arg))) { \ + case -32: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off32_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off32_i64; \ + break; \ + case -48: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off48_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off48_i64; \ + break; \ + case -64: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off64_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off64_i64; \ + break; \ + case -96: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off96_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off96_i64; \ + break; \ + case -128: \ + gadget = (a_bits >= s_bits) ? 
\ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off128_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off128_i64; \ + break;\ + default: \ + gadget = gadget_qemu_ ## name ## _slowpath_ ## mode ## _off0_i64; \ + break; \ + } + + static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) { switch (op) { @@ -1040,6 +1073,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1061,46 +1095,29 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. - if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - case -32: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off32_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off32_i64; - break; - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off64_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off96_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_ld_leq_aligned_mode3a_off128_i64 : - gadget_qemu_ld_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_ld_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. 
+ if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { case -32: LD_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: LD_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); warn_slow_memop(args[2]); break; } + // Args: // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); @@ -1120,6 +1137,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i32, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; @@ -1142,41 +1160,23 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an operation/target of 0x3A, - // this is a common case. Delegate to our special-case handler. 
- if (args[2] == 0x3a) { - switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - case -32: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off32_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off32_i64; - break; - case -64: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off64_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off64_i64; - break; - case -96: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off96_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off96_i64; - break; - case -128: - gadget = (a_bits >= s_bits) ? - gadget_qemu_st_leq_aligned_mode3a_off128_i64 : - gadget_qemu_st_leq_unaligned_mode3a_off128_i64; - break; - - default: - gadget = gadget_qemu_st_leq_slowpath_mode3a_off0_i64; - break; - } + // Special optimization case: if we have an common case. + // Delegate to our special-case handler. + if (args[2] == 0x02) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_ub, mode02) tcg_out_binary_gadget(s, gadget, args[0], args[1]); - } + } else if (args[2] == 0x32) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode32) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } else if(args[2] == 0x3a) { + LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_leq, mode3a) + tcg_out_binary_gadget(s, gadget, args[0], args[1]); + } // Otherwise, handle the generic case. 
else { switch(TLB_MASK_TABLE_OFS(get_mmuidx(args[2]))) { - case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i32, a_bits, s_bits); break; + case -32: ST_MEMOP_HANDLER(gadget, args[2], off32_i64, a_bits, s_bits); break; + case -48: ST_MEMOP_HANDLER(gadget, args[2], off48_i64, a_bits, s_bits); break; case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 60dba2ffc907..62b52a01cd26 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -27,7 +27,7 @@ # We'll create a variety of gadgets that assume the MMU's TLB is stored at certain # offsets into its structure. These should match the offsets in tcg-target.c.in. -QEMU_ALLOWED_MMU_OFFSETS = [ 32, 64, 96, 128 ] +QEMU_ALLOWED_MMU_OFFSETS = [ 32, 48, 64, 96, 128 ] # Statistics. gadgets = 0 @@ -337,31 +337,14 @@ def with_d(name, *lines): # Assembly code for saving our machine state before entering the C runtime. C_CALL_PROLOGUE = [ - # Store our machine state. - "str x25, [sp, #-16]!", "stp x14, x15, [sp, #-16]!", - "stp x12, x13, [sp, #-16]!", - "stp x10, x11, [sp, #-16]!", - "stp x8, x9, [sp, #-16]!", - "stp x6, x7, [sp, #-16]!", - "stp x4, x5, [sp, #-16]!", - "stp x2, x3, [sp, #-16]!", - "stp x0, x1, [sp, #-16]!", "stp x28, lr, [sp, #-16]!", ] # Assembly code for restoring our machine state after leaving the C runtime. 
C_CALL_EPILOGUE = [ - "ldp x28, lr, [sp], #16", - "ldp x0, x1, [sp], #16", - "ldp x2, x3, [sp], #16", - "ldp x4, x5, [sp], #16", - "ldp x6, x7, [sp], #16", - "ldp x8, x9, [sp], #16", - "ldp x10, x11, [sp], #16", - "ldp x12, x13, [sp], #16", + "ldp x28, lr, [sp], #16", "ldp x14, x15, [sp], #16", - "ldr x25, [sp], #16", ] @@ -781,7 +764,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, math_dn("ext16s", "sxth", source_is_wn=True) with_dn("ext16u", "and Wd, Wn, #0xffff") with_dn("ext32s_i64", "sxtw Xd, Wn") -with_dn("ext32u_i64", "and Xd, Xn, #0xffffffff") +with_dn("ext32u_i64", "mov Wd, Wn") START_COLLECTION("byteswap") @@ -830,7 +813,15 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, force_slowpath=is_slowpath, ) - # Special variant for the most common mode, as a speedup optimization. + # Special variant for the most common modes, as a speedup optimization. + ld_thunk(f"qemu_ld_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_ldub_mmu", + fastpath_32b=["ldrb Wd, [Xn, x27]"], fastpath_64b=["ldrb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + ld_thunk(f"qemu_ld_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", + fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x32 + ) ld_thunk(f"qemu_ld_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_ldq_mmu", fastpath_32b=["ldr Xd, [Xn, x27]"], fastpath_64b=["ldr Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a @@ -878,6 +869,14 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) # Special optimization for the most common modes. 
+ st_thunk(f"qemu_st_ub_{subtype}_mode02", is_aligned=is_aligned, slowpath_helper="helper_ret_stb_mmu", + fastpath_32b=["strb Wd, [Xn, x27]"], fastpath_64b=["strb Wd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x02 + ) + st_thunk(f"qemu_st_leq_{subtype}_mode32", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", + fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], + force_slowpath=is_slowpath, immediate=0x32 + ) st_thunk(f"qemu_st_leq_{subtype}_mode3a", is_aligned=is_aligned, slowpath_helper="helper_le_stq_mmu", fastpath_32b=["str Xd, [Xn, x27]"], fastpath_64b=["str Xd, [Xn, x27]"], force_slowpath=is_slowpath, immediate=0x3a From 026956a4743e392ca1314bbecfcd020b13b6fe51 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Tue, 4 May 2021 06:19:41 -0600 Subject: [PATCH 32/36] TCTI: minor gadget optimizations --- tcg/aarch64-tcti/tcg-target.c.inc | 186 +++++++++++++++------------- tcg/aarch64-tcti/tcg-target.h | 57 +++++---- tcg/aarch64-tcti/tcti-gadget-gen.py | 40 +++--- 3 files changed, 151 insertions(+), 132 deletions(-) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index 4233acf10976..3b844086bbb2 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -54,7 +54,7 @@ /** * Macro that defines a look-up tree for named QEMU_LD gadgets. - */ + */ #define LD_MEMOP_LOOKUP(variable, arg, suffix) \ switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ @@ -83,7 +83,7 @@ /** * Macro that defines a look-up tree for named QEMU_ST gadgets. - */ + */ #define ST_MEMOP_LOOKUP(variable, arg, suffix) \ switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ @@ -435,11 +435,11 @@ static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TA /** * Version of our LDST generator that defers to more optimized gadgets selectively. 
*/ -static void tcg_out_ldst_gadget_inner(TCGContext *s, - const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], - const void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - const void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - const void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], +static void tcg_out_ldst_gadget_inner(TCGContext *s, + const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], + const void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], unsigned reg0, unsigned reg1, uint32_t offset) { int64_t extended_offset = (int32_t)offset; @@ -450,7 +450,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. 
- if (!is_negative) + if (!is_negative) { uint64_t shifted_offset = (extended_offset >> 3); bool aligned_to_8B = ((extended_offset & 0b111) == 0); @@ -462,7 +462,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, if (have_optimized_gadget) { tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); return; - } + } // Special case: it's frequent to have low-numbered positive offsets that are aligned // to 16B boundaries @@ -470,7 +470,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); return; } - } + } else { uint64_t negated_offset = -(extended_offset); @@ -532,14 +532,14 @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); return; } - } + } else { } @@ -556,14 +556,14 @@ static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) // We handle positive and negative gadgets separately, in order to allow for asymmetrical // collections of pre-made gadgets. - if (!is_negative) + if (!is_negative) { // More optimal case: we have a gadget that directly encodes the argument. 
if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); return; } - } + } else { } @@ -605,9 +605,9 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld32u, ret, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_ld_i64, ret, arg1, arg2); } } @@ -719,41 +719,41 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case INDEX_op_ld8u_i32: case INDEX_op_ld8u_i64: - tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8u, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i32: - tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld8s_i64: - tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld8s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld16u_i32: case INDEX_op_ld16u_i64: - tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16u, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i32: - tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i32, args[0], args[1], args[2]); break; case INDEX_op_ld16s_i64: - tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld16s_i64, args[0], args[1], args[2]); break; case INDEX_op_ld_i32: case INDEX_op_ld32u_i64: - tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32u, args[0], args[1], args[2]); break; case INDEX_op_ld_i64: - tcg_out_ldst_gadget(s, gadget_ld_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld_i64, 
args[0], args[1], args[2]); break; - + case INDEX_op_ld32s_i64: - tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_ld32s_i64, args[0], args[1], args[2]); break; @@ -762,155 +762,169 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con */ case INDEX_op_st8_i32: case INDEX_op_st8_i64: - tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st8, args[0], args[1], args[2]); break; case INDEX_op_st16_i32: case INDEX_op_st16_i64: - tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st16, args[0], args[1], args[2]); break; case INDEX_op_st_i32: case INDEX_op_st32_i64: - tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i32, args[0], args[1], args[2]); break; case INDEX_op_st_i64: - tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); + tcg_out_ldst_gadget(s, gadget_st_i64, args[0], args[1], args[2]); break; /** * Arithmetic instructions. */ - case INDEX_op_add_i32: - tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); + case INDEX_op_add_i32: + tcg_out_ternary_gadget(s, gadget_add_i32, args[0], args[1], args[2]); break; case INDEX_op_sub_i32: - tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i32, args[0], args[1], args[2]); break; case INDEX_op_mul_i32: - tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nand_i32: /* Optional (TCG_TARGET_HAS_nand_i32). */ + tcg_out_ternary_gadget(s, gadget_nand_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_nor_i32: /* Optional (TCG_TARGET_HAS_nor_i32). 
*/ + tcg_out_ternary_gadget(s, gadget_nor_i32, args[0], args[1], args[2]); break; case INDEX_op_and_i32: - tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i32, args[0], args[1], args[2]); break; case INDEX_op_andc_i32: /* Optional (TCG_TARGET_HAS_andc_i32). */ - tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i32, args[0], args[1], args[2]); break; case INDEX_op_orc_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i32, args[0], args[1], args[2]); break; case INDEX_op_eqv_i32: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i32, args[0], args[1], args[2]); break; case INDEX_op_or_i32: - tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i32, args[0], args[1], args[2]); break; case INDEX_op_xor_i32: - tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i32, args[0], args[1], args[2]); break; case INDEX_op_shl_i32: - tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i32, args[0], args[1], args[2]); break; case INDEX_op_shr_i32: - tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i32, args[0], args[1], args[2]); break; case INDEX_op_sar_i32: - tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i32, args[0], args[1], args[2]); break; - //case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i32: /* Optional (TCG_TARGET_HAS_rot_i32). 
*/ + tcg_out_ternary_gadget(s, gadget_rotr_i32, args[0], args[1], args[2]); + break; - //case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + case INDEX_op_rotl_i32: /* Optional (TCG_TARGET_HAS_rot_i32). */ + tcg_out_ternary_gadget(s, gadget_rotl_i32, args[0], args[1], args[2]); + break; case INDEX_op_add_i64: - tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_add_i64, args[0], args[1], args[2]); break; case INDEX_op_sub_i64: - tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sub_i64, args[0], args[1], args[2]); break; case INDEX_op_mul_i64: - tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_mul_i64, args[0], args[1], args[2]); break; case INDEX_op_and_i64: - tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_and_i64, args[0], args[1], args[2]); break; case INDEX_op_andc_i64: /* Optional (TCG_TARGET_HAS_andc_i64). */ - tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_andc_i64, args[0], args[1], args[2]); break; case INDEX_op_orc_i64: /* Optional (TCG_TARGET_HAS_orc_i64). */ - tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_orc_i64, args[0], args[1], args[2]); break; case INDEX_op_eqv_i64: /* Optional (TCG_TARGET_HAS_eqv_i64). */ - tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_eqv_i64, args[0], args[1], args[2]); break; - //case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). */ - //case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + case INDEX_op_nand_i64: /* Optional (TCG_TARGET_HAS_nand_i64). 
*/ + tcg_out_ternary_gadget(s, gadget_nand_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_nor_i64: /* Optional (TCG_TARGET_HAS_nor_i64). */ + tcg_out_ternary_gadget(s, gadget_nor_i64, args[0], args[1], args[2]); + break; case INDEX_op_or_i64: - tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_or_i64, args[0], args[1], args[2]); break; case INDEX_op_xor_i64: - tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_xor_i64, args[0], args[1], args[2]); break; case INDEX_op_shl_i64: - tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shl_i64, args[0], args[1], args[2]); break; case INDEX_op_shr_i64: - tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_shr_i64, args[0], args[1], args[2]); break; case INDEX_op_sar_i64: - tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_sar_i64, args[0], args[1], args[2]); break; - //case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotl_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + tcg_out_ternary_gadget(s, gadget_rotl_i64, args[0], args[1], args[2]); + break; - //case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ - // tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); - // break; + case INDEX_op_rotr_i64: /* Optional (TCG_TARGET_HAS_rot_i64). */ + tcg_out_ternary_gadget(s, gadget_rotr_i64, args[0], args[1], args[2]); + break; case INDEX_op_div_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i64, args[0], args[1], args[2]); break; case INDEX_op_divu_i64: /* Optional (TCG_TARGET_HAS_div_i64). 
*/ - tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i64, args[0], args[1], args[2]); break; case INDEX_op_rem_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i64, args[0], args[1], args[2]); break; case INDEX_op_remu_i64: /* Optional (TCG_TARGET_HAS_div_i64). */ - tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i64, args[0], args[1], args[2]); break; case INDEX_op_brcond_i64: @@ -1014,19 +1028,19 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con break; case INDEX_op_div_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_div_i32, args[0], args[1], args[2]); break; case INDEX_op_divu_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_divu_i32, args[0], args[1], args[2]); break; case INDEX_op_rem_i32: /* Optional (TCG_TARGET_HAS_div_i32). */ - tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_rem_i32, args[0], args[1], args[2]); break; case INDEX_op_remu_i32: /* Optional (TCG_TARGET_HAS_div_i32). 
*/ - tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); + tcg_out_ternary_gadget(s, gadget_remu_i32, args[0], args[1], args[2]); break; case INDEX_op_brcond_i32: @@ -1081,7 +1095,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con } // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b break; @@ -1095,7 +1109,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an common case. + // Special optimization case: if we have an common case. // Delegate to our special-case handler. if (args[2] == 0x02) { LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], ld_ub, mode02) @@ -1119,7 +1133,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con } // Args: - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // TODO: fix encoding to be 4b } @@ -1146,7 +1160,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded break; @@ -1160,7 +1174,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con void *gadget; - // Special optimization case: if we have an common case. + // Special optimization case: if we have an common case. // Delegate to our special-case handler. 
if (args[2] == 0x02) { LOOKUP_SPECIAL_CASE_LDST_GADGET(args[2], st_ub, mode02) @@ -1185,7 +1199,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Args: // - our gadget encodes the target and address registers - // - an immediate32 encodes our operation index + // - an immediate32 encodes our operation index tcg_out_binary_gadget(s, gadget, args[0], args[1]); tcg_out64(s, args[2]); // FIXME: double encoded } @@ -1223,9 +1237,9 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, intptr_t arg2) { if (type == TCG_TYPE_I32) { - tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i32, arg, arg1, arg2); } else { - tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); + tcg_out_ldst_gadget(s, gadget_st_i64, arg, arg1, arg2); } } @@ -1314,8 +1328,8 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env, const void *v_tb_ : [return_value] "=m" (return_value) - : [areg0] "m" (env), - [sp_value] "m" (sp_value), + : [areg0] "m" (env), + [sp_value] "m" (sp_value), [start_tb_ptr] "m" (v_tb_ptr), [pc_mirror] "m" (pc_mirror) diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index fa2ae5c40a3e..12d06f8cc671 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -55,8 +55,11 @@ // weird psuedo-native bytecode. We'll indicate that we're intepreted. #define TCG_TARGET_INTERPRETER 1 +// Specify we'll handle direct jumps. +#define TCG_TARGET_HAS_direct_jump 1 + // -// Supported optional instructions. +// Supported optional scalar instructions. // // Divs. @@ -77,23 +80,27 @@ #define TCG_TARGET_HAS_ext16u_i64 1 #define TCG_TARGET_HAS_ext32u_i64 1 -// Logicals. +// Negations. #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 +// Logicals. 
#define TCG_TARGET_HAS_andc_i32 1 #define TCG_TARGET_HAS_orc_i32 1 #define TCG_TARGET_HAS_eqv_i32 1 +#define TCG_TARGET_HAS_rot_i32 1 +#define TCG_TARGET_HAS_nand_i32 1 +#define TCG_TARGET_HAS_nor_i32 1 #define TCG_TARGET_HAS_andc_i64 1 #define TCG_TARGET_HAS_eqv_i64 1 #define TCG_TARGET_HAS_orc_i64 1 +#define TCG_TARGET_HAS_rot_i64 1 +#define TCG_TARGET_HAS_nor_i64 1 +#define TCG_TARGET_HAS_nand_i64 1 + -// We don't curretly support rotates, since AArch64 lacks ROL. -// We'll fix this later. -#define TCG_TARGET_HAS_rot_i32 0 -#define TCG_TARGET_HAS_rot_i64 0 // Swaps. #define TCG_TARGET_HAS_bswap16_i32 1 @@ -103,9 +110,6 @@ #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_MEMORY_BSWAP 1 -// Specify we'll handle direct jumps. -#define TCG_TARGET_HAS_direct_jump 1 - // // Potential TODOs. // @@ -120,19 +124,16 @@ #define TCG_TARGET_HAS_extract_i64 0 #define TCG_TARGET_HAS_sextract_i64 0 -// TODO: it might be worth writing a gadget for this -#define TCG_TARGET_HAS_movcond_i32 0 -#define TCG_TARGET_HAS_movcond_i64 0 // -// Unsupported instructions. +// Supported optional vector instructions. // -// ARMv8 doesn't have instructions for NAND/NOR. -#define TCG_TARGET_HAS_nand_i32 0 -#define TCG_TARGET_HAS_nor_i32 0 -#define TCG_TARGET_HAS_nor_i64 0 -#define TCG_TARGET_HAS_nand_i64 0 +// TODO! + +// +// Unsupported instructions. +// // aarch64's CLZ is implemented without a condition, so it #define TCG_TARGET_HAS_clz_i32 0 @@ -142,19 +143,24 @@ #define TCG_TARGET_HAS_ctz_i64 0 #define TCG_TARGET_HAS_ctpop_i64 0 +// We don't currently support gadgets with more than three arguments, +// so we can't yet create movcond gadgets. +#define TCG_TARGET_HAS_movcond_i32 0 +#define TCG_TARGET_HAS_movcond_i64 0 -// GOTO_PTR is too complex to emit a simple gadget for. -// We'll let C handle it, since the overhead is similar. +// GOTO_PTR is too complex to emit a simple gadget for, since it can +// target either interpreted code or our non-existent epilogue. 
#define TCG_TARGET_HAS_goto_ptr 0 -// We don't have a simple gadget for this, since we're always assuming softmmu. -#define TCG_TARGET_HAS_qemu_st8_i32 0 - -// No AArch64 equivalent.a +// No AArch64 equivalent. #define TCG_TARGET_HAS_extrl_i64_i32 0 #define TCG_TARGET_HAS_extrh_i64_i32 0 -#define TCG_TARGET_HAS_extract2_i64 0 +// This operation exists specifically to allow us to provide differing register +// constraints for 8-bit loads and stores. We don't need to do so, so we'll leave +// this unimplemented, as we gain nothing by it. +#define TCG_TARGET_HAS_qemu_st8_i32 0 + // These should always be zero on our 64B platform. #define TCG_TARGET_HAS_muls2_i64 0 @@ -170,6 +176,7 @@ #define TCG_TARGET_HAS_muls2_i32 0 #define TCG_TARGET_HAS_muluh_i32 0 #define TCG_TARGET_HAS_mulsh_i32 0 +#define TCG_TARGET_HAS_extract2_i64 0 // // Platform metadata. diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 62b52a01cd26..138b8c45860d 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -594,6 +594,7 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, "ldr x28, [x28]" ) + # Exit from a translation buffer execution. simple("exit_tb", @@ -645,17 +646,12 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # Grab our immediate argument. "ldr x27, [x28], #8", - # Perform our comparison and conditional branch. + # Perform our comparison... "subs wzr, Wn, Wm", - f"b{condition} 1f", - - "0:", # not taken - # Perform our end-of-instruction epilogue. - *EPILOGUE, - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) # Branches iff a given comparison is true. 
@@ -666,15 +662,10 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, # Perform our comparison and conditional branch. "subs xzr, Xn, Xm", - f"b{condition} 1f", - "0:", # not taken - # Perform our end-of-instruction epilogue. - *EPILOGUE, - - "1:" # taken - # Update our bytecode pointer to take the label. - "mov x28, x27" + # ... and our conditional branch, which selectively sets w28 (our "gadget pointer") + # to the new location, if required. + f"csel x28, x27, x28, {condition}" ) @@ -749,12 +740,19 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, math_dnm("shl", "lsl") math_dnm("shr", "lsr") math_dnm("sar", "asr") +math_dnm("rotr", "ror") # AArch64 lacks a Rotate Left; so we instead rotate right by a negative. -# TODO: validate this? -#math_dnm("rotr", "ror") -#with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") -#with_dnm("rotl_i64", "neg x27, Xm", "ror Xd, Xn, x27") +with_dnm("rotl_i32", "neg w27, Wm", "ror Wd, Wn, w27") +with_dnm("rotl_i64", "neg w27, Wm", "ror Xd, Xn, x27") + +# We'll synthesize several instructions that don't exist; since it's still faster +# to run these as gadgets. 
+with_dnm("nand_i32", "and Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nand_i64", "and Xd, Xn, Xm", "mvn Xd, Xd") +with_dnm("nor_i32", "orr Wd, Wn, Wm", "mvn Wd, Wd") +with_dnm("nor_i64", "orr Xd, Xn, Xm", "mvn Xd, Xd") + START_COLLECTION("extension") From 671b3cefcccaada60b289cdff408b9d29907b7f2 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Tue, 4 May 2021 08:02:19 -0600 Subject: [PATCH 33/36] TCTI: add remaining scalar ops --- meson.build | 2 + tcg/aarch64-tcti/tcg-target.c.inc | 30 ++++++++++++++ tcg/aarch64-tcti/tcg-target.h | 45 +++++++++------------ tcg/aarch64-tcti/tcti-gadget-gen.py | 62 +++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 27 deletions(-) diff --git a/meson.build b/meson.build index 6d9d804bf1aa..df83051ebc2a 100644 --- a/meson.build +++ b/meson.build @@ -283,6 +283,8 @@ if not get_option('tcg').disabled() 'tcti_logical_gadgets.h', 'tcti_extension_gadgets.c', 'tcti_extension_gadgets.h', + 'tcti_bitwise_gadgets.c', + 'tcti_bitwise_gadgets.h', 'tcti_byteswap_gadgets.c', 'tcti_byteswap_gadgets.h', 'tcti_qemu_ld_aligned_signed_le_gadgets.c', diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index 3b844086bbb2..e6020ce59671 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -173,6 +173,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_bswap32_i32: case INDEX_op_bswap32_i64: case INDEX_op_bswap64_i64: + case INDEX_op_extrl_i64_i32: + case INDEX_op_extrh_i64_i32: return C_O1_I1(r, r); case INDEX_op_st8_i32: @@ -226,6 +228,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_rotr_i64: case INDEX_op_setcond_i32: case INDEX_op_setcond_i64: + case INDEX_op_clz_i32: + case INDEX_op_clz_i64: + case INDEX_op_ctz_i32: + case INDEX_op_ctz_i64: return C_O1_I2(r, r, r); case INDEX_op_brcond_i32: @@ -983,6 +989,14 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, 
gadget_neg_i64, args[0], args[1]); break; + case INDEX_op_clz_i64: /* Optional (TCG_TARGET_HAS_clz_i64). */ + tcg_out_ternary_gadget(s, gadget_clz_i64, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i64: /* Optional (TCG_TARGET_HAS_ctz_i64). */ + tcg_out_ternary_gadget(s, gadget_ctz_i64, args[0], args[1], args[2]); + break; + case INDEX_op_ext8s_i64: /* Optional (TCG_TARGET_HAS_ext8s_i64). */ tcg_out_binary_gadget(s, gadget_ext8s_i64, args[0], args[1]); break; @@ -1011,10 +1025,26 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con tcg_out_binary_gadget(s, gadget_ext32u_i64, args[0], args[1]); break; + case INDEX_op_extrl_i64_i32: + tcg_out_binary_gadget(s, gadget_extrl, args[0], args[1]); + break; + + case INDEX_op_extrh_i64_i32: + tcg_out_binary_gadget(s, gadget_extrh, args[0], args[1]); + break; + case INDEX_op_neg_i32: /* Optional (TCG_TARGET_HAS_neg_i32). */ tcg_out_binary_gadget(s, gadget_neg_i32, args[0], args[1]); break; + case INDEX_op_clz_i32: /* Optional (TCG_TARGET_HAS_clz_i32). */ + tcg_out_ternary_gadget(s, gadget_clz_i32, args[0], args[1], args[2]); + break; + + case INDEX_op_ctz_i32: /* Optional (TCG_TARGET_HAS_ctz_i32). */ + tcg_out_ternary_gadget(s, gadget_ctz_i32, args[0], args[1], args[2]); + break; + case INDEX_op_not_i32: /* Optional (TCG_TARGET_HAS_not_i32). */ tcg_out_binary_gadget(s, gadget_not_i32, args[0], args[1]); break; diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index 12d06f8cc671..7087321de68a 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -80,6 +80,10 @@ #define TCG_TARGET_HAS_ext16u_i64 1 #define TCG_TARGET_HAS_ext32u_i64 1 +// Register extractions. +#define TCG_TARGET_HAS_extrl_i64_i32 1 +#define TCG_TARGET_HAS_extrh_i64_i32 1 + // Negations. #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 @@ -100,7 +104,11 @@ #define TCG_TARGET_HAS_nor_i64 1 #define TCG_TARGET_HAS_nand_i64 1 - +// Bitwise operations. 
+#define TCG_TARGET_HAS_clz_i32 1 +#define TCG_TARGET_HAS_ctz_i32 1 +#define TCG_TARGET_HAS_clz_i64 1 +#define TCG_TARGET_HAS_ctz_i64 1 // Swaps. #define TCG_TARGET_HAS_bswap16_i32 1 @@ -110,21 +118,6 @@ #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_MEMORY_BSWAP 1 -// -// Potential TODOs. -// - -// TODO: implement DEPOSIT as BFI. -#define TCG_TARGET_HAS_deposit_i32 0 -#define TCG_TARGET_HAS_deposit_i64 0 - -// TODO: implement EXTRACT as BFX. -#define TCG_TARGET_HAS_extract_i32 0 -#define TCG_TARGET_HAS_sextract_i32 0 -#define TCG_TARGET_HAS_extract_i64 0 -#define TCG_TARGET_HAS_sextract_i64 0 - - // // Supported optional vector instructions. // @@ -135,33 +128,31 @@ // Unsupported instructions. // -// aarch64's CLZ is implemented without a condition, so it -#define TCG_TARGET_HAS_clz_i32 0 -#define TCG_TARGET_HAS_ctz_i32 0 +// There's no direct instruction with which to count the number of ones, +// so we'll leave this implemented as other instructions. #define TCG_TARGET_HAS_ctpop_i32 0 -#define TCG_TARGET_HAS_clz_i64 0 -#define TCG_TARGET_HAS_ctz_i64 0 #define TCG_TARGET_HAS_ctpop_i64 0 // We don't currently support gadgets with more than three arguments, -// so we can't yet create movcond gadgets. +// so we can't yet create movcond, deposit, or extract gadgets. #define TCG_TARGET_HAS_movcond_i32 0 #define TCG_TARGET_HAS_movcond_i64 0 +#define TCG_TARGET_HAS_deposit_i32 0 +#define TCG_TARGET_HAS_deposit_i64 0 +#define TCG_TARGET_HAS_extract_i32 0 +#define TCG_TARGET_HAS_sextract_i32 0 +#define TCG_TARGET_HAS_extract_i64 0 +#define TCG_TARGET_HAS_sextract_i64 0 // GOTO_PTR is too complex to emit a simple gadget for, since it can // target either interpreted code or our non-existent epilogue. #define TCG_TARGET_HAS_goto_ptr 0 -// No AArch64 equivalent. 
-#define TCG_TARGET_HAS_extrl_i64_i32 0 -#define TCG_TARGET_HAS_extrh_i64_i32 0 - // This operation exists specifically to allow us to provide differing register // constraints for 8-bit loads and stores. We don't need to do so, so we'll leave // this unimplemented, as we gain nothing by it. #define TCG_TARGET_HAS_qemu_st8_i32 0 - // These should always be zero on our 64B platform. #define TCG_TARGET_HAS_muls2_i64 0 #define TCG_TARGET_HAS_add2_i32 0 diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 138b8c45860d..0c0e575c4d24 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -753,6 +753,64 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dnm("nor_i32", "orr Wd, Wn, Wm", "mvn Wd, Wd") with_dnm("nor_i64", "orr Xd, Xn, Xm", "mvn Xd, Xd") +START_COLLECTION("bitwise") + +# Count leading zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("clz_i32", + + # Perform the core CLZ into w26. + "clz w26, Wn", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("clz_i64", + + # Perform the core CLZ into w26. + "clz x26, Xn", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Xd, Xm, x26, eq" +) + + +# Count trailing zeroes, with a twist: QEMU requires us to provide +# a default value for when the argument is 0. +with_dnm("ctz_i32", + # Reverse our bits before performing our actual clz. + "rbit w26, Wn", + "clz w26, w26", + + # Check Wn to see if it was zero + "tst Wn, Wn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Wd, Wm, w26, eq" +) +with_dnm("ctz_i64", + + # Perform the core CLZ into w26. 
+ "rbit x26, Xn", + "clz x26, x26", + + # Check Wn to see if it was zero + "tst Xn, Xn", + + # If it was zero, accept the argument provided in Wm. + # Otherwise, accept our result from w26. + "csel Xd, Xm, x26, eq" +) + START_COLLECTION("extension") @@ -764,6 +822,10 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, with_dn("ext32s_i64", "sxtw Xd, Wn") with_dn("ext32u_i64", "mov Wd, Wn") +# Numeric extraction. +with_dn("extrl", "mov Wd, Wn") +with_dn("extrh", "lsr Xd, Xn, #32") + START_COLLECTION("byteswap") # Byte swapping. From 21ad741c1c05b8b1723355857069af979659c8b8 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Wed, 5 May 2021 16:49:30 -0600 Subject: [PATCH 34/36] TCTI: add initial vector instruction support --- meson.build | 6 + tcg/aarch64-tcti/tcg-target-con-set.h | 23 +- tcg/aarch64-tcti/tcg-target-con-str.h | 11 +- tcg/aarch64-tcti/tcg-target.c.inc | 948 ++++++++++++++++++++++---- tcg/aarch64-tcti/tcg-target.h | 81 ++- tcg/aarch64-tcti/tcg-target.opc.h | 15 + tcg/aarch64-tcti/tcti-gadget-gen.py | 177 ++++- 7 files changed, 1090 insertions(+), 171 deletions(-) create mode 100644 tcg/aarch64-tcti/tcg-target.opc.h diff --git a/meson.build b/meson.build index df83051ebc2a..cdbe0933ac9d 100644 --- a/meson.build +++ b/meson.build @@ -317,6 +317,12 @@ if not get_option('tcg').disabled() 'tcti_qemu_st_unaligned_be_gadgets.h', 'tcti_qemu_st_slowpath_be_gadgets.c', 'tcti_qemu_st_slowpath_be_gadgets.h', + 'tcti_simd_base_gadgets.c', + 'tcti_simd_base_gadgets.h', + 'tcti_simd_arithmetic_gadgets.c', + 'tcti_simd_arithmetic_gadgets.h', + 'tcti_simd_logical_gadgets.c', + 'tcti_simd_logical_gadgets.h', ] tcti_gadgets = custom_target('tcti-gadgets.h', output: tcti_sources, diff --git a/tcg/aarch64-tcti/tcg-target-con-set.h b/tcg/aarch64-tcti/tcg-target-con-set.h index f51b7bcb13e7..a0b91bb320f6 100644 --- a/tcg/aarch64-tcti/tcg-target-con-set.h +++ b/tcg/aarch64-tcti/tcg-target-con-set.h @@ -9,13 +9,24 @@ * Each operand should be a 
sequence of constraint letters as defined by * tcg-target-con-str.h; the constraint combination is inclusive or. */ + +// Simple register functions. +C_O0_I1(r) C_O0_I2(r, r) C_O0_I3(r, r, r) -C_O0_I4(r, r, r, r) +//C_O0_I4(r, r, r, r) C_O1_I1(r, r) -C_O1_I2(r, 0, r) C_O1_I2(r, r, r) -C_O1_I4(r, r, r, r, r) -C_O2_I1(r, r, r) -C_O2_I2(r, r, r, r) -C_O2_I4(r, r, r, r, r, r) +//C_O1_I4(r, r, r, r, r) +//C_O2_I1(r, r, r) +//C_O2_I2(r, r, r, r) +//C_O2_I4(r, r, r, r, r, r) + +// Vector functions. +C_O1_I1(w, w) +C_O1_I1(w, r) +C_O0_I2(w, r) +C_O1_I1(w, wr) +C_O1_I2(w, w, w) +C_O1_I3(w, w, w, w) +C_O1_I2(w, 0, w) \ No newline at end of file diff --git a/tcg/aarch64-tcti/tcg-target-con-str.h b/tcg/aarch64-tcti/tcg-target-con-str.h index 87c0f19e9c2e..94d06d3e74a5 100644 --- a/tcg/aarch64-tcti/tcg-target-con-str.h +++ b/tcg/aarch64-tcti/tcg-target-con-str.h @@ -8,4 +8,13 @@ * Define constraint letters for register sets: * REGS(letter, register_mask) */ -REGS('r', MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) +REGS('r', TCG_MASK_GP_REGISTERS) +REGS('w', TCG_MASK_VECTOR_REGISTERS) + +/* + * Define constraint letters for constants: + * CONST(letter, TCG_CT_CONST_* bit set) + */ + +// Simple 64-bit immediates. +CONST('I', 0xFFFFFFFFFFFFFFFF) diff --git a/tcg/aarch64-tcti/tcg-target.c.inc b/tcg/aarch64-tcti/tcg-target.c.inc index e6020ce59671..9852650ca6fe 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -24,7 +24,7 @@ // Rich disassembly is nice in theory, but it's -slow-. -#define TCTI_GADGET_RICH_DISASSEMBLY +//#define TCTI_GADGET_RICH_DISASSEMBLY #define TCTI_GADGET_IMMEDIATE_ARRAY_LEN 64 @@ -49,97 +49,15 @@ # define tcti_assert(cond) ((void)0) #endif -/* Bitfield n...m (in 32 bit value). */ -#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) - -/** - * Macro that defines a look-up tree for named QEMU_LD gadgets. 
- */ -#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ - case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ - case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ - case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ - case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ - case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ - case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ - case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } - - - -/** - * Macro that defines a look-up tree for named QEMU_ST gadgets. 
- */ -#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ - switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ - case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ - case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ - case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ - case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ - case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ - case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ - case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ - default: \ - g_assert_not_reached(); \ - } -#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ - if (a_bits >= s_bits) { \ - ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ - } else { \ - ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ - } - - -#define LOOKUP_SPECIAL_CASE_LDST_GADGET(arg, name, mode) \ - switch(TLB_MASK_TABLE_OFS(get_mmuidx(arg))) { \ - case -32: \ - gadget = (a_bits >= s_bits) ? \ - gadget_qemu_ ## name ## _aligned_ ## mode ## _off32_i64 : \ - gadget_qemu_ ## name ## _unaligned_ ## mode ## _off32_i64; \ - break; \ - case -48: \ - gadget = (a_bits >= s_bits) ? \ - gadget_qemu_ ## name ## _aligned_ ## mode ## _off48_i64 : \ - gadget_qemu_ ## name ## _unaligned_ ## mode ## _off48_i64; \ - break; \ - case -64: \ - gadget = (a_bits >= s_bits) ? \ - gadget_qemu_ ## name ## _aligned_ ## mode ## _off64_i64 : \ - gadget_qemu_ ## name ## _unaligned_ ## mode ## _off64_i64; \ - break; \ - case -96: \ - gadget = (a_bits >= s_bits) ? \ - gadget_qemu_ ## name ## _aligned_ ## mode ## _off96_i64 : \ - gadget_qemu_ ## name ## _unaligned_ ## mode ## _off96_i64; \ - break; \ - case -128: \ - gadget = (a_bits >= s_bits) ? 
\ - gadget_qemu_ ## name ## _aligned_ ## mode ## _off128_i64 : \ - gadget_qemu_ ## name ## _unaligned_ ## mode ## _off128_i64; \ - break;\ - default: \ - gadget = gadget_qemu_ ## name ## _slowpath_ ## mode ## _off0_i64; \ - break; \ - } +/******************************** + * TCG Constraints Definitions * + ********************************/ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) { switch (op) { + case INDEX_op_ld8u_i32: case INDEX_op_ld8s_i32: case INDEX_op_ld16u_i32: @@ -245,12 +163,67 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_qemu_st_i64: return C_O0_I3(r, r, r); + // + // Vector ops. + // + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_xor_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_aa64_sshl_vec: + return C_O1_I2(w, w, w); + case INDEX_op_not_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return C_O1_I1(w, w); + case INDEX_op_ld_vec: + case INDEX_op_dupm_vec: + return C_O1_I1(w, r); + case INDEX_op_st_vec: + return C_O0_I2(w, r); + case INDEX_op_dup_vec: + return C_O1_I1(w, wr); + case INDEX_op_or_vec: + case INDEX_op_andc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_and_vec: + case INDEX_op_orc_vec: + return C_O1_I2(w, w, w); + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, w); + case INDEX_op_bitsel_vec: + return C_O1_I3(w, w, w, w); + case INDEX_op_aa64_sli_vec: + return C_O1_I2(w, 0, w); + default: g_assert_not_reached(); } } static const int tcg_target_reg_alloc_order[] = { + + // General purpose registers, in preference-of-allocation order. 
+ TCG_REG_R8, + TCG_REG_R9, + TCG_REG_R10, + TCG_REG_R11, + TCG_REG_R12, + TCG_REG_R13, TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, @@ -259,16 +232,15 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - /* - TCG_REG_R14, // AREG0 - TCG_REG_R15, // SP - */ + + // Note: we do not allocate R14 or R15, as they're used for our + // special-purpose values. + + // We'll use the high 16 vector register; avoiding the call-saved lower ones. + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, }; #if MAX_OPC_PARAM_IARGS != 6 @@ -289,7 +261,7 @@ static const int tcg_target_call_oarg_regs[] = { }; #ifdef CONFIG_DEBUG_TCG -static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { +static const char *const tcg_target_reg_names[TCG_TARGET_GP_REGS] = { "r00", "r01", "r02", @@ -309,6 +281,98 @@ static const char *const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { }; #endif +/************************* + * TCG Emitter Helpers * + *************************/ + +/* Bitfield n...m (in 32 bit value). */ +#define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m) + +/** + * Macro that defines a look-up tree for named QEMU_LD gadgets. 
+ */ +#define LD_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_ld_ub_ ## suffix; break; \ + case MO_SB: variable = gadget_qemu_ld_sb_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_ld_leuw_ ## suffix; break; \ + case MO_LESW: variable = gadget_qemu_ld_lesw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_ld_leul_ ## suffix; break; \ + case MO_LESL: variable = gadget_qemu_ld_lesl_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_ld_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_ld_beuw_ ## suffix; break; \ + case MO_BESW: variable = gadget_qemu_ld_besw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_ld_beul_ ## suffix; break; \ + case MO_BESL: variable = gadget_qemu_ld_besl_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_ld_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define LD_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + LD_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + LD_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + + +/** + * Macro that defines a look-up tree for named QEMU_ST gadgets. 
+ */ +#define ST_MEMOP_LOOKUP(variable, arg, suffix) \ + switch (get_memop(arg) & (MO_BSWAP | MO_SSIZE)) { \ + case MO_UB: variable = gadget_qemu_st_ub_ ## suffix; break; \ + case MO_LEUW: variable = gadget_qemu_st_leuw_ ## suffix; break; \ + case MO_LEUL: variable = gadget_qemu_st_leul_ ## suffix; break; \ + case MO_LEQ: variable = gadget_qemu_st_leq_ ## suffix; break; \ + case MO_BEUW: variable = gadget_qemu_st_beuw_ ## suffix; break; \ + case MO_BEUL: variable = gadget_qemu_st_beul_ ## suffix; break; \ + case MO_BEQ: variable = gadget_qemu_st_beq_ ## suffix; break; \ + default: \ + g_assert_not_reached(); \ + } +#define ST_MEMOP_HANDLER(variable, arg, suffix, a_bits, s_bits) \ + if (a_bits >= s_bits) { \ + ST_MEMOP_LOOKUP(variable, arg, aligned_ ## suffix ); \ + } else { \ + ST_MEMOP_LOOKUP(gadget, arg, unaligned_ ## suffix); \ + } + + +#define LOOKUP_SPECIAL_CASE_LDST_GADGET(arg, name, mode) \ + switch(TLB_MASK_TABLE_OFS(get_mmuidx(arg))) { \ + case -32: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off32_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off32_i64; \ + break; \ + case -48: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off48_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off48_i64; \ + break; \ + case -64: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off64_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off64_i64; \ + break; \ + case -96: \ + gadget = (a_bits >= s_bits) ? \ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off96_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off96_i64; \ + break; \ + case -128: \ + gadget = (a_bits >= s_bits) ? 
\ + gadget_qemu_ ## name ## _aligned_ ## mode ## _off128_i64 : \ + gadget_qemu_ ## name ## _unaligned_ ## mode ## _off128_i64; \ + break;\ + default: \ + gadget = gadget_qemu_ ## name ## _slowpath_ ## mode ## _off0_i64; \ + break; \ + } + + static bool patch_reloc(tcg_insn_unit *code_ptr, int type, intptr_t value, intptr_t addend) { @@ -404,7 +468,7 @@ tcg_target_ulong helper_be_ldul_mmu_signed(CPUArchState *env, target_ulong addr, /* Write gadget pointer. */ -static void tcg_out_nullary_gadget(TCGContext *s, const void *gadget) +static void tcg_out_gadget(TCGContext *s, const void *gadget) { tcg_out_immediate(s, (tcg_target_ulong)gadget); } @@ -412,40 +476,43 @@ static void tcg_out_nullary_gadget(TCGContext *s, const void *gadget) /* Write gadget pointer, plus 64b immediate. */ static void tcg_out_imm64_gadget(TCGContext *s, const void *gadget, tcg_target_ulong immediate) { - tcg_out_nullary_gadget(s, gadget); + tcg_out_gadget(s, gadget); tcg_out64(s, immediate); } /* Write gadget pointer (one register). */ -static void tcg_out_unary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS], unsigned reg0) +static void tcg_out_unary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS], unsigned reg0) { - tcg_out_nullary_gadget(s, gadget_base[reg0]); + tcg_out_gadget(s, gadget_base[reg0]); } /* Write gadget pointer (two registers). */ -static void tcg_out_binary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1) +static void tcg_out_binary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1]); + tcg_out_gadget(s, gadget_base[reg0][reg1]); } /* Write gadget pointer (three registers). 
*/ -static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], unsigned reg0, unsigned reg1, unsigned reg2) +static void tcg_out_ternary_gadget(TCGContext *s, const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], unsigned reg0, unsigned reg1, unsigned reg2) { - tcg_out_nullary_gadget(s, gadget_base[reg0][reg1][reg2]); + tcg_out_gadget(s, gadget_base[reg0][reg1][reg2]); } +/*************************** + * TCG Scalar Operations * + ***************************/ /** * Version of our LDST generator that defers to more optimized gadgets selectively. */ static void tcg_out_ldst_gadget_inner(TCGContext *s, - const void *gadget_base[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS], - const void *gadget_pos_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - const void *gadget_shifted_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], - const void *gadget_neg_imm[TCG_TARGET_NB_REGS][TCG_TARGET_NB_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_base[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS], + const void *gadget_pos_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_shifted_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], + const void *gadget_neg_imm[TCG_TARGET_GP_REGS][TCG_TARGET_GP_REGS][TCTI_GADGET_IMMEDIATE_ARRAY_LEN], unsigned reg0, unsigned reg1, uint32_t offset) { int64_t extended_offset = (int32_t)offset; @@ -466,14 +533,14 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // More optimal case: we have a gadget that directly encodes the argument. 
if (have_optimized_gadget) { - tcg_out_nullary_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); + tcg_out_gadget(s, gadget_pos_imm[reg0][reg1][extended_offset]); return; } // Special case: it's frequent to have low-numbered positive offsets that are aligned // to 16B boundaries else if(aligned_to_8B && have_shifted_gadget) { - tcg_out_nullary_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); + tcg_out_gadget(s, gadget_shifted_imm[reg0][reg1][shifted_offset]); return; } } @@ -482,7 +549,7 @@ static void tcg_out_ldst_gadget_inner(TCGContext *s, // More optimal case: we have a gadget that directly encodes the argument. if (negated_offset < TCTI_GADGET_IMMEDIATE_ARRAY_LEN) { - tcg_out_nullary_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); + tcg_out_gadget(s, gadget_neg_imm[reg0][reg1][negated_offset]); return; } } @@ -542,7 +609,7 @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) { // More optimal case: we have a gadget that directly encodes the argument. if (arg < ARRAY_SIZE(gadget_movi_imm_i32[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i32[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i32[t0][arg]); return; } } @@ -566,7 +633,7 @@ static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) { // More optimal case: we have a gadget that directly encodes the argument. 
if (arg < ARRAY_SIZE(gadget_movi_imm_i64[t0])) { - tcg_out_nullary_gadget(s, gadget_movi_imm_i64[t0][arg]); + tcg_out_gadget(s, gadget_movi_imm_i64[t0][arg]); return; } } @@ -599,7 +666,7 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg t0, tcg_target_long */ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg) { - tcg_out_nullary_gadget(s, gadget_call); + tcg_out_gadget(s, gadget_call); tcg_out64(s, (uintptr_t)arg); } @@ -618,12 +685,6 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1, } -static void warn_slow_memop(const TCGArg arg) -{ - fprintf(stderr, "--- NOTE: emitting non-optimized memop at offset %i\n", TLB_MASK_TABLE_OFS(get_mmuidx(arg))); -} - - /** * Generate every other operation. */ @@ -645,7 +706,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // to patch our gadget stream with the target address, later. if (s->tb_jmp_insn_offset) { // Emit our gadget. - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); // Place our current instruction into our "relocation table", so it can // be patched once we know where the branch will target... @@ -664,7 +725,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // Simple branch. case INDEX_op_br: - tcg_out_nullary_gadget(s, gadget_br); + tcg_out_gadget(s, gadget_br); tcti_out_label(s, arg_label(args[0])); break; @@ -959,7 +1020,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i64, args[0], args[1]); - last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i64 = (last_brcond_i64 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. 
tcti_out_label(s, arg_label(args[3])); @@ -1099,7 +1160,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con // helps the processor's branch prediction be less "squished", as not every // branch is going throuh the same instruction. tcg_out_ternary_gadget(s, gadget, last_brcond_i32, args[0], args[1]); - last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_NB_REGS; + last_brcond_i32 = (last_brcond_i32 + 1) % TCG_TARGET_GP_REGS; // Branch target immediate. tcti_out_label(s, arg_label(args[3])); @@ -1121,7 +1182,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; - default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); warn_slow_memop(args[2]); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; } // Args: @@ -1159,7 +1220,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: LD_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: LD_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: LD_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; - default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); warn_slow_memop(args[2]); break; + default: LD_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; } // Args: @@ -1185,7 +1246,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i32, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i32, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i32, a_bits, s_bits); break; - default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); 
warn_slow_memop(args[2]); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i32); break; } // Args: @@ -1224,7 +1285,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con case -64: ST_MEMOP_HANDLER(gadget, args[2], off64_i64, a_bits, s_bits); break; case -96: ST_MEMOP_HANDLER(gadget, args[2], off96_i64, a_bits, s_bits); break; case -128: ST_MEMOP_HANDLER(gadget, args[2], off128_i64, a_bits, s_bits); break; - default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); warn_slow_memop(args[2]); break; + default: ST_MEMOP_LOOKUP(gadget, args[2], slowpath_off0_i64); break; } // Args: @@ -1247,7 +1308,7 @@ void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *con [TCG_MO_LD_ST] = gadget_mb_ld, [TCG_MO_LD_ST | TCG_MO_LD_LD] = gadget_mb_ld, }; - tcg_out_nullary_gadget(s, sync[args[0] & TCG_MO_ALL]); + tcg_out_gadget(s, sync[args[0] & TCG_MO_ALL]); break; } @@ -1287,19 +1348,630 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type, return arg_ct->ct & TCG_CT_CONST; } +/*************************** + * TCG Vector Operations * + ***************************/ + +// +// Helper for emitting DUPI (immediate DUP) instructions. +// +#define tcg_out_dupi_gadget(s, name, q, rd, op, cmode, arg) \ + if (q) { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q1[rd][arg]); \ + } else { \ + tcg_out_gadget(s, gadget_ ## name ## _cmode_ ## cmode ## _op ## op ## _q0[rd][arg]); \ + } + + +// +// Helpers for emitting D/Q variant instructions. +// +#define tcg_out_dq_gadget(s, name, arity, is_q, args...) 
\ + if (is_q) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _q, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _d, args); \ + } + +#define tcg_out_unary_dq_gadget(s, name, is_q, a) \ + tcg_out_dq_gadget(s, name, unary, is_q, a) +#define tcg_out_binary_dq_gadget(s, name, is_q, a, b) \ + tcg_out_dq_gadget(s, name, binary, is_q, a, b) +#define tcg_out_ternary_dq_gadget(s, name, is_q, a, b, c) \ + tcg_out_dq_gadget(s, name, ternary, is_q, a, b, c) + + +// +// Helper for emitting the gadget appropriate for a vector's size. +// +#define tcg_out_sized_vector_gadget(s, name, arity, vece, args...) \ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + case MO_64: \ + if (type == TCG_TYPE_V128) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2d, args); \ + } \ + else { \ + g_assert_not_reached(); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } +#define tcg_out_sized_vector_gadget_no64(s, name, arity, vece, args...) 
\ + switch(vece) { \ + case MO_8: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8b, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _16b, args); \ + } \ + break; \ + case MO_16: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4h, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _8h, args); \ + } \ + break; \ + case MO_32: \ + if (type == TCG_TYPE_V64) { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _2s, args); \ + } else { \ + tcg_out_ ## arity ## _gadget(s, gadget_ ## name ## _4s, args); \ + } \ + break; \ + default: \ + g_assert_not_reached(); \ + } + + +#define tcg_out_unary_vector_gadget(s, name, vece, a) \ + tcg_out_sized_vector_gadget(s, name, unary, vece, a) +#define tcg_out_binary_vector_gadget(s, name, vece, a, b) \ + tcg_out_sized_vector_gadget(s, name, binary, vece, a, b) +#define tcg_out_ternary_vector_gadget(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget(s, name, ternary, vece, a, b, c) + +#define tcg_out_ternary_vector_gadget_no64(s, name, vece, a, b, c) \ + tcg_out_sized_vector_gadget_no64(s, name, ternary, vece, a, b, c) + + +#define tcg_out_ternary_vector_gadget_with_scalar(s, name, is_scalar, vece, a, b, c) \ + if (is_scalar) { \ + tcg_out_ternary_gadget(s, gadget_ ## name ## _scalar, w0, w1, w2); \ + } else { \ + tcg_out_ternary_vector_gadget(s, name, vece, w0, w1, w2); \ + } + + +/* Return true if v16 is a valid 16-bit shifted immediate. */ +/* +static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) +{ + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + return false; +} +*/ + +/* Return true if v64 is a valid float64 immediate. 
*/ +/* +static bool is_fimm64(uint64_t v64, int *cmode, int *imm8) +{ + if (extract64(v64, 0, 48) == 0 + && (extract64(v64, 54, 9) == 0x100 + || extract64(v64, 54, 9) == 0x0ff)) { + *cmode = 0xf; + *imm8 = (extract64(v64, 63, 1) << 7) + | (extract64(v64, 54, 1) << 6) + | extract64(v64, 48, 6); + return true; + } + return false; +} +*/ + +/** Core vector operation emission. */ +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned vece, + const TCGArg args[TCG_MAX_OP_ARGS], const int const_args[TCG_MAX_OP_ARGS]) +{ + TCGType type = vecl + TCG_TYPE_V64; + TCGArg r0, r1, r2, w0, w1, w2; + + // Typing flags for vector operations. + bool is_v128 = (type == TCG_TYPE_V128); + bool is_scalar = !is_v128 && (vece == MO_64); + + // Argument shortcuts. + r0 = args[0]; + r1 = args[1]; + r2 = args[2]; + + // Offset argument shortcuts; offset to convert register numbers to gadget numberes. + w0 = args[0] - TCG_REG_V16; + w1 = args[1] - TCG_REG_V16; + w2 = args[2] - TCG_REG_V16; + + // Argument shortcuts, as signed. + int64_t signed_offset_arg = (int32_t)args[2]; + + switch (opc) { + + // Load memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_ld_vec: + tcg_out_binary_dq_gadget(s, ldr, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Store memory -> vector: followed by a 64-bit offset immediate + case INDEX_op_st_vec: + tcg_out_binary_dq_gadget(s, str, is_v128, w0, r1); + tcg_out64(s, signed_offset_arg); + break; + + // Duplciate memory to all vector elements. + case INDEX_op_dupm_vec: + // DUPM handles normalization itself; pass arguments raw. 
+ tcg_out_dupm_vec(s, type, vece, r0, r1, r2); + break; + + case INDEX_op_add_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, add, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_mul_vec: // optional + tcg_out_ternary_vector_gadget_no64(s, mul, vece, w0, w1, w2); + break; + + case INDEX_op_neg_vec: // optional + tcg_out_binary_vector_gadget(s, neg, vece, w0, w1); + break; + + case INDEX_op_abs_vec: // optional + tcg_out_binary_vector_gadget(s, abs, vece, w0, w1); + break; + + case INDEX_op_and_vec: // optional + tcg_out_ternary_dq_gadget(s, and, is_v128, w0, w1, w2); + break; + + case INDEX_op_or_vec: + tcg_out_ternary_dq_gadget(s, or, is_v128, w0, w1, w2); + break; + + case INDEX_op_andc_vec: + tcg_out_ternary_dq_gadget(s, andc, is_v128, w0, w1, w2); + break; + + case INDEX_op_orc_vec: // optional + tcg_out_ternary_dq_gadget(s, orc, is_v128, w0, w1, w2); + break; + + case INDEX_op_xor_vec: + tcg_out_ternary_dq_gadget(s, xor, is_v128, w0, w1, w2); + break; + + case INDEX_op_ssadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ssadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_sssub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, sssub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_usadd_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, usadd, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_ussub_vec: + tcg_out_ternary_vector_gadget_with_scalar(s, ussub, is_scalar, vece, w0, w1, w2); + break; + + case INDEX_op_smax_vec: + TODO(); + break; + + case INDEX_op_smin_vec: + TODO(); + break; + + case INDEX_op_umax_vec: + TODO(); + break; + + case INDEX_op_umin_vec: + TODO(); + break; + + case INDEX_op_not_vec: // optional + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w1); + break; + + case INDEX_op_shli_vec: + TODO(); + break; + + case INDEX_op_shri_vec: + TODO(); + break; + + case 
INDEX_op_sari_vec: + TODO(); + break; + + case INDEX_op_aa64_sli_vec: + TODO(); + break; + case INDEX_op_shlv_vec: + TODO(); + break; + case INDEX_op_aa64_sshl_vec: + TODO(); + break; + case INDEX_op_cmp_vec: + TODO(); + break; + case INDEX_op_bitsel_vec: // optional + TODO(); + break; + + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ + case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ + default: + g_assert_not_reached(); + } +} + + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_neg_vec: + case INDEX_op_abs_vec: + case INDEX_op_not_vec: + //case INDEX_op_cmp_vec: + //case INDEX_op_shli_vec: + //case INDEX_op_shri_vec: + //case INDEX_op_sari_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_usadd_vec: + case INDEX_op_ussub_vec: + //case INDEX_op_shlv_vec: + //case INDEX_op_bitsel_vec: + return 1; + case INDEX_op_rotli_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + case INDEX_op_rotlv_vec: + case INDEX_op_rotrv_vec: + return -1; + case INDEX_op_mul_vec: + //case INDEX_op_smax_vec: + //case INDEX_op_smin_vec: + //case INDEX_op_umax_vec: + //case INDEX_op_umin_vec: + return vece < MO_64; + + default: + return 0; + } +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) 
+{ + va_list va; + TCGv_vec v0, v1, v2, t1, t2, c1; + TCGArg a2; + + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); + a2 = va_arg(va, TCGArg); + va_end(va); + + switch (opc) { + case INDEX_op_rotli_vec: + t1 = tcg_temp_new_vec(type); + tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1)); + vec_gen_4(INDEX_op_aa64_sli_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + /* Right shifts are negative left shifts for AArch64. */ + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + tcg_gen_neg_vec(vece, t1, v2); + opc = (opc == INDEX_op_shrv_vec + ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec); + vec_gen_3(opc, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotlv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_sub_vec(vece, t1, v2, c1); + /* Right shifts are negative left shifts for AArch64. */ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0), + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); + tcg_gen_or_vec(vece, v0, v0, t1); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_rotrv_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + c1 = tcg_constant_vec(type, vece, 8 << vece); + tcg_gen_neg_vec(vece, t1, v2); + tcg_gen_sub_vec(vece, t2, c1, v2); + /* Right shifts are negative left shifts for AArch64. 
*/ + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1), + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); + vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2), + tcgv_vec_arg(v1), tcgv_vec_arg(t2)); + tcg_gen_or_vec(vece, v0, t1, t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + default: + g_assert_not_reached(); + } +} + + +/* Generate DUPI (move immediate) vector ops. */ +/* +static bool tcg_out_optimized_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + bool is_v128 = (type == TCG_TYPE_V128); + int cmode, imm8, i; + + // If we're copying an 8b immediate, we implicitly have a simple gadget for this, + // since there are only 256 possible values * 16 registers. Emit a MOVI gadget implicitly. + if (vece == MO_8) { + imm8 = (uint8_t)v64; + tcg_out_dupi_gadget(s, movi, q, rd, 0, e, imm8); + return true; + } + + // Otherwise, if we have a value that's all 0x00 and 0xFF bytes, + // we can use the scalar variant of MOVI (op=1, cmode=e), which handles + // that case directly. + for (i = imm8 = 0; i < 8; i++) { + uint8_t byte = v64 >> (i * 8); + if (byte == 0xff) { + imm8 |= 1 << i; + } else if (byte != 0) { + goto fail_bytes; + } + } + tcg_out_dupi_gadget(s, movi, q, rd, 1, e, imm8); + return true; + fail_bytes: + + // Handle 16B moves. + if (vece == MO_16) { + uint16_t v16 = v64; + + // Check to see if we have a value representable in as a MOV imm8, possibly via a shift. + if (is_shimm16(v16, &cmode, &imm8)) { + // Output the corret instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). + if (cmode == 0x8) { + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, movi, q, rd, 0, a, imm8); + } + return true; + } + + // Check to see if we have a value representable in as an inverted MOV imm8, possibly via a shift. + if (is_shimm16(~v16, &cmode, &imm8)) { + // Output the corret instruction CMode for either a regular MOVI (8) or a LSL8 MOVI (a). 
+ if (cmode == 0x8) { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, 8, imm8); + } else { + tcg_out_dupi_gadget(s, mvni, q, rd, 0, a, imm8); + } + return true; + } + + // If we can't perform either of the optimizations, we'll need to do this in two steps. + // Normally, we'd emit a gadget for both steps, but in this case that'd result in needing -way- + // too many gadgets. We'll emit two, instead. + tcg_out_dupi_gadget(s, movi, q, rd, 0, 8, v16 & 0xff); + tcg_out_dupi_gadget(s, orr, q, rd, 0, a, v16 >> 8); + return true; + } + + // FIXME: implement 32B move optimizations + + + // Try to create optimized 32B moves. + //else if (vece == MO_32) { + // uint32_t v32 = v64; + // uint32_t n32 = ~v32; + + // if (is_shimm32(v32, &cmode, &imm8) || + // is_soimm32(v32, &cmode, &imm8) || + // is_fimm32(v32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // return; + // } + // if (is_shimm32(n32, &cmode, &imm8) || + // is_soimm32(n32, &cmode, &imm8)) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // return; + // } + + // // + // // Restrict the set of constants to those we can load with + // // two instructions. Others we load from the pool. + // // + // i = is_shimm32_pair(v32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8)); + // return; + // } + // i = is_shimm32_pair(n32, &cmode, &imm8); + // if (i) { + // tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8); + // tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8)); + // return; + // } + //} + + else if (is_fimm64(v64, &cmode, &imm8)) { + tcg_out_insn(s, 3606, MOVI, q, rd, 1, cmode, imm8); + return true; + } + +} +*/ + +/* Emits instructions that can load an immediate into a vector. */ +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) +{ + // Convert Rd into a simple gadget number. 
+ rd = rd - (TCG_REG_V16); + + // First, try to create an optimized implementation, if possible. + /* + if (tcg_out_optimized_dupi_vec(s, type, vece, rd, v64)) { + return; + } + */ + + // If we didn't, we'll need to load the full vector from memory. + // Emit it into our bytecode stream as an immediate; which we'll then + // load inside the gadget. + if (type == TCG_TYPE_V128) { + tcg_out_unary_gadget(s, gadget_ldi_q, rd); + tcg_out64(s, v64); + tcg_out64(s, v64); + } else { + tcg_out_unary_gadget(s, gadget_ldi_d, rd); + tcg_out64(s, v64); + } +} + + +/* Emits instructions that can load a register into a vector. */ +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, TCGReg rs) +{ + // Compute the gadget index for the relevant vector register. + TCGReg wd = rd - (TCG_REG_V16); + + // Emit a DUP gadget to handles the operation. + tcg_out_binary_vector_gadget(s, dup, vece, wd, rs); + return true; +} + +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg r, TCGReg base, intptr_t offset) +{ + int64_t extended_offset = (int32_t)offset; + + // Convert the register into a simple register number for our gadgets. + r = r - TCG_REG_V16; + + // Emit a DUPM gadget... + tcg_out_binary_vector_gadget(s, dupm, vece, r, base); + + // ... and emit its int64 immediate offset. + tcg_out64(s, extended_offset); + + return true; +} + + +/******************************** + * TCG Runtime & Platform Def * + *******************************/ + static void tcg_target_init(TCGContext *s) { /* The current code uses uint8_t for tcg operations. */ tcg_debug_assert(tcg_op_defs_max <= UINT8_MAX); - /* Registers available for 32 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I32] = BIT(TCG_TARGET_NB_REGS) - 1; - /* Registers available for 64 bit operations. */ - tcg_target_available_regs[TCG_TYPE_I64] = BIT(TCG_TARGET_NB_REGS) - 1; - - /* TODO: Which registers should be set here? 
*/ - tcg_target_call_clobber_regs = BIT(TCG_TARGET_NB_REGS) - 1; + // Registers available for each type of operation. + tcg_target_available_regs[TCG_TYPE_I32] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_I64] = TCG_MASK_GP_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V64] = TCG_MASK_VECTOR_REGISTERS; + tcg_target_available_regs[TCG_TYPE_V128] = TCG_MASK_VECTOR_REGISTERS; + + TCGReg unclobbered_registers[] = { + // We don't use registers R16+ in our runtime, so we'll not bother protecting them. + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Per our calling convention. + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + }; + + // Specify which registers are clobbered during call. + tcg_target_call_clobber_regs = -1ull; + for (unsigned i = 0; i < ARRAY_SIZE(unclobbered_registers); ++i) { + tcg_regset_reset_reg(tcg_target_call_clobber_regs, unclobbered_registers[i]); + } + // Specify which local registers we're reserving. + // + // Note that we only have to specify registers that are used in the runtime, + // and so not e.g. the register that contains AREG0, which can never be allocated. s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK); diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index 7087321de68a..f48060dcbfeb 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -122,7 +122,26 @@ // Supported optional vector instructions. // -// TODO! 
+#define TCG_TARGET_HAS_v64 1 +#define TCG_TARGET_HAS_v128 1 +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_abs_vec 1 +#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_rots_vec 0 +#define TCG_TARGET_HAS_rotv_vec 0 +#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 +#define TCG_TARGET_HAS_sat_vec 1 +#define TCG_TARGET_HAS_minmax_vec 1 +#define TCG_TARGET_HAS_bitsel_vec 1 +#define TCG_TARGET_HAS_cmpsel_vec 0 // // Unsupported instructions. @@ -174,30 +193,48 @@ // // Number of registers available. -// It might make sense to up these, since we can also use x16 -> x25? -#define TCG_TARGET_NB_REGS 16 +#define TCG_TARGET_NB_REGS 64 + +// Number of general purpose registers. +#define TCG_TARGET_GP_REGS 16 /* List of registers which are used by TCG. */ typedef enum { - TCG_REG_R0 = 0, - TCG_REG_R1, - TCG_REG_R2, - TCG_REG_R3, - TCG_REG_R4, - TCG_REG_R5, - TCG_REG_R6, - TCG_REG_R7, - TCG_REG_R8, - TCG_REG_R9, - TCG_REG_R10, - TCG_REG_R11, - TCG_REG_R12, - TCG_REG_R13, - TCG_REG_R14, - TCG_REG_R15, - - TCG_AREG0 = TCG_REG_R14, - TCG_REG_CALL_STACK = TCG_REG_R15, + + // General purpose registers. + // Note that we name every _host_ register here; but don't + // necessarily use them; that's determined by the allocation order + // and the number of registers setting above. These just give us the ability + // to refer to these by name. 
+ TCG_REG_R0, TCG_REG_R1, TCG_REG_R2, TCG_REG_R3, + TCG_REG_R4, TCG_REG_R5, TCG_REG_R6, TCG_REG_R7, + TCG_REG_R8, TCG_REG_R9, TCG_REG_R10, TCG_REG_R11, + TCG_REG_R12, TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + TCG_REG_R16, TCG_REG_R17, TCG_REG_R18, TCG_REG_R19, + TCG_REG_R20, TCG_REG_R21, TCG_REG_R22, TCG_REG_R23, + TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27, + TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31, + + // Register aliases. + TCG_AREG0 = TCG_REG_R14, + TCG_REG_CALL_STACK = TCG_REG_R15, + + // Mask that refers to the GP registers. + TCG_MASK_GP_REGISTERS = 0xFFFFul, + + // Vector registers. + TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, + + // Mask that refers to the vector registers. + TCG_MASK_VECTOR_REGISTERS = 0xFFFF000000000000ul, + } TCGReg; // Specify the shape of the stack our runtime will use. diff --git a/tcg/aarch64-tcti/tcg-target.opc.h b/tcg/aarch64-tcti/tcg-target.opc.h new file mode 100644 index 000000000000..bce30accd936 --- /dev/null +++ b/tcg/aarch64-tcti/tcg-target.opc.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2019 Linaro + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. + * + * See the COPYING file in the top-level directory for details. + * + * Target-specific opcodes for host vector expansion. These will be + * emitted by tcg_expand_vec_op. For those familiar with GCC internals, + * consider these to be UNSPEC with names. 
+ */ + +DEF(aa64_sshl_vec, 1, 2, 0, IMPLVEC) +DEF(aa64_sli_vec, 1, 2, 1, IMPLVEC) diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 0c0e575c4d24..17c76fd82f25 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -25,6 +25,9 @@ # Helper that provides each of the AArch64 condition codes of interest. ARCH_CONDITION_CODES = ["eq", "ne", "lt", "ge", "le", "gt", "lo", "hs", "ls", "hi"] +# The list of vector size codes supported on this platform. +VECTOR_SIZES = ['16b', '8b', '4h', '8h', '2s', '4s', '2d'] + # We'll create a variety of gadgets that assume the MMU's TLB is stored at certain # offsets into its structure. These should match the offsets in tcg-target.c.in. QEMU_ALLOWED_MMU_OFFSETS = [ 32, 48, 64, 96, 128 ] @@ -109,9 +112,30 @@ def simple(name, *lines, export=True): print("}\n", file=c_file) + + + + + def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): """ Generates a collection of gadgtes with register substitutions. """ + def _expand_op1_immediate(num): + """ Gets a uncompressed bitfield argument for a given immediate; for NEON instructions. + + Duplciates each bit eight times; converting 0b0100 to 0x00FF0000. + """ + + # Get the number as a binary string... + binstring = bin(num)[2:] + + # ... expand out the values to hex... + hex_string = binstring.replace('1', 'FF').replace('0', '00') + + # ... and return out the new constant. + return f"0x{hex_string}" + + def substitutions_for_letter(letter, number, line): """ Helper that transforms Wd => w1, implementing gadget substitutions. """ @@ -119,8 +143,16 @@ def substitutions_for_letter(letter, number, line): line = line.replace(f"X{letter}", f"x{number}") line = line.replace(f"W{letter}", f"w{number}") - # ... immediate substitutions. + # ... vector register substitutions... 
+ line = line.replace(f"V{letter}", f"v{number + 16}") + line = line.replace(f"D{letter}", f"d{number + 16}") + line = line.replace(f"Q{letter}", f"q{number + 16}") + + # ... regular immediate substitutions... line = line.replace(f"I{letter}", f"{number}") + + # ... and compressed immediate substitutions. + line = line.replace(f"S{letter}", f"{_expand_op1_immediate(number)}") return line @@ -558,6 +590,63 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, ) + +def vector_dn(name, *lines): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + line = line.replace(".S", f".{size}") + + # If this size requires a 32b register, replace Wd with Xd. + if size == "2d": + line = line.replace("Wn", "Xn") + + return line + + + # Create a variant for each size, replacing any placeholders. + for size in VECTOR_SIZES: + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dn(f"{name}_{size}", *sized_lines) + + +def vector_dnm(name, *lines, scalar=None, omit_sizes=()): + """ Creates a set of gadgets for every size of a given vector op. Accepts 'S' as a size placeholder. """ + + def do_size_replacement(line, size): + return line.replace(".S", f".{size}") + + # Create a variant for each size, replacing any placeholders. + for size in VECTOR_SIZES: + if size in omit_sizes: + continue + + sized_lines = (do_size_replacement(line, size) for line in lines) + with_dnm(f"{name}_{size}", *sized_lines) + + if scalar: + if isinstance(scalar, str): + sized_lines = (scalar,) + with_dnm(f"{name}_scalar", *sized_lines) + + +def vector_math_dnm(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", scalar=f"{operation} Dd, Dn, Dm") + + +def vector_logic_dn(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. 
""" + with_dn(f"{name}_d", f"{operation} Vd.8b, Vn.8b") + with_dn(f"{name}_q", f"{operation} Vd.16b, Vn.16b") + + +def vector_logic_dnm(name, operation): + """ Generates a pair of gadgets for vector bitwise logic instructions. """ + with_dnm(f"{name}_d", f"{operation} Vd.8b, Vn.8b, Vm.8b") + with_dnm(f"{name}_q", f"{operation} Vd.16b, Vn.16b, Vm.16b") + + # # Gadget definitions. # @@ -953,6 +1042,86 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, is_aligned=is_aligned, force_slowpath=is_slowpath) +# +# SIMD/Vector ops +# + +# SIMD MOVI instructions. +START_COLLECTION(f"simd_base") + +# Unoptimized/unoptimizable load of a vector64; grabbing an immediate. +with_d("ldi_d", "ldr Dd, [x28], #8") +with_d("ldi_q", "ldr Qd, [x28], #16") + +# General purpose reg -> vec rec loads +vector_dn("dup", "dup Vd.S, Wn") + +# Memory -> vec reg loads. +# The offset of the load is stored in a 64b immediate. + +# Duplicating load. +# TODO: possibly squish the add into the ld1r, if that's valid? +vector_dn("dupm", "ldr x27, [x28], #8", "add x27, x27, Xn", "ld1r {Vd.S}, [x27]") + +# Direct loads. +with_dn("ldr_d", "ldr x27, [x28], #8", "ldr Dd, [Xn, x27]") +with_dn("ldr_q", "ldr x27, [x28], #8", "ldr Qd, [Xn, x27]") + +# vec -> reg stores. +# The offset of the stores is stored in a 64b immediate. 
+with_dn("str_d", "ldr x27, [x28], #8", "str Dd, [Xn, x27]") +with_dn("str_q", "ldr x27, [x28], #8", "str Qd, [Xn, x27]") + + +START_COLLECTION(f"simd_arithmetic") + +vector_math_dnm("add", "add") +vector_math_dnm("usadd", "uqadd") +vector_math_dnm("ssadd", "sqadd") +vector_math_dnm("sub", "sub") +vector_math_dnm("ussub", "uqsub") +vector_math_dnm("sssub", "sqsub") +vector_dnm("mul", "mul Vd.S, Vn.S, Vm.S", omit_sizes=("2d",)) + +START_COLLECTION(f"simd_logical") + +vector_logic_dnm("and", "and") +vector_logic_dnm("andc", "bic") +vector_logic_dnm("or", "orr") +vector_logic_dnm("orc", "orn") +vector_logic_dnm("xor", "eor") +vector_logic_dn( "not", "not") +vector_dn("neg", "neg Vd.S, Vn.S") +vector_dn("abs", "abs Vd.S, Vn.S") + + +""" +START_COLLECTION(f"simd_dupi_optimizations") + +# Simple imm8 movs... +with_d_immediate("movi_cmode_e_op0_q0", "mov Vd.8b, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op0_q1", "mov Vd.16b, #Ii", immediate_range=range(256)) + +# ... all 00/FF movs... +with_d_immediate("movi_cmode_e_op1_q0", "mov Dd, #Si", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op1_q1", "mov Vd.2d, #Si", immediate_range=range(256)) + +# Halfword MOVs. +with_d_immediate("movi_cmode_8_op0_q0", "movi v0.4h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q1", "movi v0.8h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q0", "mvni v0.4h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q1", "mvni v0.8h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q0", "movi v0.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q1", "movi v0.8h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q0", "mvni v0.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q1", "mvni v0.8h, #Ii, lsl #8", immediate_range=range(256)) + +# Halfword ORIs, for building complex MOVs. 
+with_d_immediate("movi_orr_a_op0_q0", "orr v0.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_orr_a_op0_q1", "orr v0.8h, #Ii, lsl #8", immediate_range=range(256)) +""" + + # Print a list of output files generated. output_c_filenames = (f"'tcti_{name}_gadgets.c'" for name in output_files.keys()) output_h_filenames = (f"'tcti_{name}_gadgets.h'" for name in output_files.keys()) @@ -962,10 +1131,10 @@ def st_thunk(name, fastpath_32b, fastpath_64b, slowpath_helper, immediate=None, print(" tcti_gadgets.h,", file=sys.stderr) for name in output_files.keys(): - print(f" tcti_{name}_gadgets.c,", file=sys.stderr) - print(f" tcti_{name}_gadgets.h,", file=sys.stderr) + print(f" 'tcti_{name}_gadgets.c',", file=sys.stderr) + print(f" 'tcti_{name}_gadgets.h',", file=sys.stderr) print(f"]", file=sys.stderr) # Statistics. -sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions ({instructions * 4} B).\n\n") +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions (~{(instructions * 4) // 1024 // 1024} B).\n\n") From 13c9a7c820df2754b308f8b30a4f0a56f47aa023 Mon Sep 17 00:00:00 2001 From: Kate Temkin Date: Thu, 6 May 2021 16:58:56 -0600 Subject: [PATCH 35/36] TCTI: expand vector support to all ops --- meson.build | 2 + tcg/aarch64-tcti/tcg-target.c.inc | 216 +++++++++++++++++----------- tcg/aarch64-tcti/tcg-target.h | 2 +- tcg/aarch64-tcti/tcg-target.opc.h | 1 - tcg/aarch64-tcti/tcti-gadget-gen.py | 69 ++++++--- 5 files changed, 179 insertions(+), 111 deletions(-) diff --git a/meson.build b/meson.build index cdbe0933ac9d..d87943c90234 100644 --- a/meson.build +++ b/meson.build @@ -323,6 +323,8 @@ if not get_option('tcg').disabled() 'tcti_simd_arithmetic_gadgets.h', 'tcti_simd_logical_gadgets.c', 'tcti_simd_logical_gadgets.h', + 'tcti_simd_immediate_gadgets.c', + 'tcti_simd_immediate_gadgets.h', ] tcti_gadgets = custom_target('tcti-gadgets.h', output: tcti_sources, diff --git a/tcg/aarch64-tcti/tcg-target.c.inc 
b/tcg/aarch64-tcti/tcg-target.c.inc index 9852650ca6fe..ea6dfc56f6de 100644 --- a/tcg/aarch64-tcti/tcg-target.c.inc +++ b/tcg/aarch64-tcti/tcg-target.c.inc @@ -207,8 +207,6 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) return C_O1_I2(w, w, w); case INDEX_op_bitsel_vec: return C_O1_I3(w, w, w, w); - case INDEX_op_aa64_sli_vec: - return C_O1_I2(w, 0, w); default: g_assert_not_reached(); @@ -581,24 +579,77 @@ static void tcti_out_label(TCGContext *s, TCGLabel *label) } } -/** - * Generate a register-to-register MOV. - */ + +/* Register to register move using ORR (shifted register with no shift). */ +static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm) +{ + switch(ext) { + case TCG_TYPE_I32: + tcg_out_binary_gadget(s, gadget_mov_i32, rd, rm); + break; + + case TCG_TYPE_I64: + tcg_out_binary_gadget(s, gadget_mov_i64, rd, rm); + break; + + default: + g_assert_not_reached(); + + } +} + + static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - tcg_debug_assert(ret != arg); + TCGReg w_ret = (ret - TCG_REG_V16); + TCGReg w_arg = (arg - TCG_REG_V16); - if (type == TCG_TYPE_I32) { - tcg_out_binary_gadget(s, gadget_mov_i32, ret, arg); - } else { - tcg_out_binary_gadget(s, gadget_mov_i64, ret, arg); + if (ret == arg) { + return true; } + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + + // If this is a GP to GP register mov, issue our standard MOV. + if (ret < 32 && arg < 32) { + tcg_out_movr(s, type, ret, arg); + break; + } + // If this is a vector register to GP, issue a UMOV. + else if (ret < 32) { + void *gadget = (type == TCG_TYPE_I32) ? gadget_umov_s0 : gadget_umov_d0; + tcg_out_binary_gadget(s, gadget, ret, w_arg); + break; + } + + // If this is a GP to vector move, insert the vealue using INS. + else if (arg < 32) { + void *gadget = (type == TCG_TYPE_I32) ? 
gadget_ins_s0 : gadget_ins_d0; + tcg_out_binary_gadget(s, gadget, w_ret, arg); + break; + } + /* FALLTHRU */ + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_d, w_ret, w_arg, w_arg); + break; + + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_ternary_gadget(s, gadget_or_q, w_ret, w_arg, w_arg); + break; + default: + g_assert_not_reached(); + } return true; } + static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) { bool is_negative = (arg < 0); @@ -613,9 +664,6 @@ static void tcg_out_movi_i32(TCGContext *s, TCGReg t0, tcg_target_long arg) return; } } - else { - - } // Emit the mov and its immediate. tcg_out_unary_gadget(s, gadget_movi_i32, t0); @@ -637,9 +685,6 @@ static void tcg_out_movi_i64(TCGContext *s, TCGReg t0, tcg_target_long arg) return; } } - else { - - } // TODO: optimize the negative case, too? @@ -1466,7 +1511,6 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type, /* Return true if v16 is a valid 16-bit shifted immediate. */ -/* static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) { if (v16 == (v16 & 0xff)) { @@ -1480,31 +1524,14 @@ static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) } return false; } -*/ -/* Return true if v64 is a valid float64 immediate. */ -/* -static bool is_fimm64(uint64_t v64, int *cmode, int *imm8) -{ - if (extract64(v64, 0, 48) == 0 - && (extract64(v64, 54, 9) == 0x100 - || extract64(v64, 54, 9) == 0x0ff)) { - *cmode = 0xf; - *imm8 = (extract64(v64, 63, 1) << 7) - | (extract64(v64, 54, 1) << 6) - | extract64(v64, 48, 6); - return true; - } - return false; -} -*/ /** Core vector operation emission. 
*/ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned vece, const TCGArg args[TCG_MAX_OP_ARGS], const int const_args[TCG_MAX_OP_ARGS]) { TCGType type = vecl + TCG_TYPE_V64; - TCGArg r0, r1, r2, w0, w1, w2; + TCGArg r0, r1, r2, r3, w0, w1, w2, w3; // Typing flags for vector operations. bool is_v128 = (type == TCG_TYPE_V128); @@ -1514,11 +1541,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned r0 = args[0]; r1 = args[1]; r2 = args[2]; + r3 = args[3]; // Offset argument shortcuts; offset to convert register numbers to gadget numberes. w0 = args[0] - TCG_REG_V16; w1 = args[1] - TCG_REG_V16; w2 = args[2] - TCG_REG_V16; + w3 = args[3] - TCG_REG_V16; // Argument shortcuts, as signed. int64_t signed_offset_arg = (int32_t)args[2]; @@ -1600,52 +1629,85 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, unsigned break; case INDEX_op_smax_vec: - TODO(); + tcg_out_ternary_vector_gadget_no64(s, smax, vece, w0, w1, w2); break; case INDEX_op_smin_vec: - TODO(); + tcg_out_ternary_vector_gadget_no64(s, smin, vece, w0, w1, w2); break; case INDEX_op_umax_vec: - TODO(); + tcg_out_ternary_vector_gadget_no64(s, umax, vece, w0, w1, w2); break; case INDEX_op_umin_vec: - TODO(); + tcg_out_ternary_vector_gadget_no64(s, umin, vece, w0, w1, w2); break; case INDEX_op_not_vec: // optional tcg_out_binary_dq_gadget(s, not, is_v128, w0, w1); break; - case INDEX_op_shli_vec: - TODO(); - break; - - case INDEX_op_shri_vec: - TODO(); - break; - - case INDEX_op_sari_vec: - TODO(); - break; - - case INDEX_op_aa64_sli_vec: - TODO(); - break; case INDEX_op_shlv_vec: - TODO(); + tcg_out_ternary_vector_gadget_with_scalar(s, shlv, is_scalar, vece, w0, w1, w2); break; + case INDEX_op_aa64_sshl_vec: - TODO(); + tcg_out_ternary_vector_gadget_with_scalar(s, sshl, is_scalar, vece, w0, w1, w2); break; + case INDEX_op_cmp_vec: - TODO(); + switch (args[3]) { + case TCG_COND_EQ: + tcg_out_ternary_vector_gadget_with_scalar(s, 
cmeq, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_NE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmeq, is_scalar, vece, w0, w1, w2); + tcg_out_binary_dq_gadget(s, not, is_v128, w0, w0); + break; + case TCG_COND_GT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmgt, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GE: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LT: + tcg_out_ternary_vector_gadget_with_scalar(s, cmge, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhi, is_scalar, vece, w0, w2, w1); + break; + case TCG_COND_GEU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w1, w2); + break; + case TCG_COND_LTU: + tcg_out_ternary_vector_gadget_with_scalar(s, cmhs, is_scalar, vece, w0, w2, w1); + break; + default: + g_assert_not_reached(); + } break; + case INDEX_op_bitsel_vec: // optional - TODO(); + { + if (r0 == r3) { + tcg_out_ternary_dq_gadget(s, bit, is_v128, w0, w2, w1); + } else if (r0 == r2) { + tcg_out_ternary_dq_gadget(s, bif, is_v128, w0, w3, w1); + } else { + if (r0 != r1) { + tcg_out_mov(s, type, r0, r1); + } + tcg_out_ternary_dq_gadget(s, bsl, is_v128, w0, w2, w3); + } break; + } case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. 
*/ @@ -1668,16 +1730,13 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_neg_vec: case INDEX_op_abs_vec: case INDEX_op_not_vec: - //case INDEX_op_cmp_vec: - //case INDEX_op_shli_vec: - //case INDEX_op_shri_vec: - //case INDEX_op_sari_vec: + case INDEX_op_cmp_vec: case INDEX_op_ssadd_vec: case INDEX_op_sssub_vec: case INDEX_op_usadd_vec: case INDEX_op_ussub_vec: - //case INDEX_op_shlv_vec: - //case INDEX_op_bitsel_vec: + case INDEX_op_shlv_vec: + case INDEX_op_bitsel_vec: return 1; case INDEX_op_rotli_vec: case INDEX_op_shrv_vec: @@ -1686,10 +1745,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_rotrv_vec: return -1; case INDEX_op_mul_vec: - //case INDEX_op_smax_vec: - //case INDEX_op_smin_vec: - //case INDEX_op_umax_vec: - //case INDEX_op_umin_vec: + case INDEX_op_smax_vec: + case INDEX_op_smin_vec: + case INDEX_op_umax_vec: + case INDEX_op_umin_vec: return vece < MO_64; default: @@ -1712,14 +1771,6 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, va_end(va); switch (opc) { - case INDEX_op_rotli_vec: - t1 = tcg_temp_new_vec(type); - tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1)); - vec_gen_4(INDEX_op_aa64_sli_vec, type, vece, - tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2); - tcg_temp_free_vec(t1); - break; - case INDEX_op_shrv_vec: case INDEX_op_sarv_vec: /* Right shifts are negative left shifts for AArch64. */ @@ -1771,10 +1822,9 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, /* Generate DUPI (move immediate) vector ops. 
*/ -/* static bool tcg_out_optimized_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) { - bool is_v128 = (type == TCG_TYPE_V128); + bool q = (type == TCG_TYPE_V128); int cmode, imm8, i; // If we're copying an 8b immediate, we implicitly have a simple gadget for this, @@ -1871,14 +1921,10 @@ static bool tcg_out_optimized_dupi_vec(TCGContext *s, TCGType type, unsigned vec // return; // } //} - - else if (is_fimm64(v64, &cmode, &imm8)) { - tcg_out_insn(s, 3606, MOVI, q, rd, 1, cmode, imm8); - return true; - } + return false; } -*/ + /* Emits instructions that can load an immediate into a vector. */ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd, int64_t v64) @@ -1887,11 +1933,9 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, TCGReg rd = rd - (TCG_REG_V16); // First, try to create an optimized implementation, if possible. - /* if (tcg_out_optimized_dupi_vec(s, type, vece, rd, v64)) { return; } - */ // If we didn't, we'll need to load the full vector from memory. 
// Emit it into our bytecode stream as an immediate; which we'll then diff --git a/tcg/aarch64-tcti/tcg-target.h b/tcg/aarch64-tcti/tcg-target.h index f48060dcbfeb..b4a3c225002b 100644 --- a/tcg/aarch64-tcti/tcg-target.h +++ b/tcg/aarch64-tcti/tcg-target.h @@ -134,7 +134,7 @@ #define TCG_TARGET_HAS_roti_vec 0 #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 -#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shi_vec 0 #define TCG_TARGET_HAS_shs_vec 0 #define TCG_TARGET_HAS_shv_vec 1 #define TCG_TARGET_HAS_mul_vec 1 diff --git a/tcg/aarch64-tcti/tcg-target.opc.h b/tcg/aarch64-tcti/tcg-target.opc.h index bce30accd936..26bfd9c46093 100644 --- a/tcg/aarch64-tcti/tcg-target.opc.h +++ b/tcg/aarch64-tcti/tcg-target.opc.h @@ -12,4 +12,3 @@ */ DEF(aa64_sshl_vec, 1, 2, 0, IMPLVEC) -DEF(aa64_sli_vec, 1, 2, 1, IMPLVEC) diff --git a/tcg/aarch64-tcti/tcti-gadget-gen.py b/tcg/aarch64-tcti/tcti-gadget-gen.py index 17c76fd82f25..4e127ff8c3be 100755 --- a/tcg/aarch64-tcti/tcti-gadget-gen.py +++ b/tcg/aarch64-tcti/tcti-gadget-gen.py @@ -113,10 +113,6 @@ def simple(name, *lines, export=True): - - - - def with_register_substitutions(name, substitutions, *lines, immediate_range=range(0)): """ Generates a collection of gadgtes with register substitutions. """ @@ -635,6 +631,11 @@ def vector_math_dnm(name, operation): vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", scalar=f"{operation} Dd, Dn, Dm") +def vector_math_dnm_no64(name, operation): + """ Generates a collection of gadgets for vector math instructions. """ + vector_dnm(name, f"{operation} Vd.S, Vn.S, Vm.S", omit_sizes=('2d',)) + + def vector_logic_dn(name, operation): """ Generates a pair of gadgets for vector bitwise logic instructions. 
""" with_dn(f"{name}_d", f"{operation} Vd.8b, Vn.8b") @@ -1056,6 +1057,15 @@ def vector_logic_dnm(name, operation): # General purpose reg -> vec rec loads vector_dn("dup", "dup Vd.S, Wn") +# move vector -> GP reg +with_dn("umov_s0", "umov Wd, Vn.s[0]") +with_dn("umov_d0", "umov Xd, Vn.d[0]") + +# mov GP reg -> vector +with_dn("ins_s0", "ins Vd.s[0], Wn") +with_dn("ins_d0", "ins Vd.d[0], Xn") + + # Memory -> vec reg loads. # The offset of the load is stored in a 64b immediate. @@ -1081,7 +1091,11 @@ def vector_logic_dnm(name, operation): vector_math_dnm("sub", "sub") vector_math_dnm("ussub", "uqsub") vector_math_dnm("sssub", "sqsub") -vector_dnm("mul", "mul Vd.S, Vn.S, Vm.S", omit_sizes=("2d",)) +vector_math_dnm_no64("mul", "mul") +vector_math_dnm_no64("smax", "smax") +vector_math_dnm_no64("smin", "smin") +vector_math_dnm_no64("umax", "umax") +vector_math_dnm_no64("umin", "umin") START_COLLECTION(f"simd_logical") @@ -1093,33 +1107,42 @@ def vector_logic_dnm(name, operation): vector_logic_dn( "not", "not") vector_dn("neg", "neg Vd.S, Vn.S") vector_dn("abs", "abs Vd.S, Vn.S") +vector_logic_dnm( "bit", "bit") +vector_logic_dnm( "bif", "bif") +vector_logic_dnm( "bsl", "bsl") +vector_math_dnm("shlv", "ushl") +vector_math_dnm("sshl", "sshl") -""" -START_COLLECTION(f"simd_dupi_optimizations") +vector_dnm("cmeq", "cmeq Vd.S, Vn.S, Vm.S", scalar="cmeq Dd, Dn, Dm") +vector_dnm("cmgt", "cmgt Vd.S, Vn.S, Vm.S", scalar="cmgt Dd, Dn, Dm") +vector_dnm("cmge", "cmge Vd.S, Vn.S, Vm.S", scalar="cmge Dd, Dn, Dm") +vector_dnm("cmhi", "cmhi Vd.S, Vn.S, Vm.S", scalar="cmhi Dd, Dn, Dm") +vector_dnm("cmhs", "cmhs Vd.S, Vn.S, Vm.S", scalar="cmhs Dd, Dn, Dm") + +START_COLLECTION(f"simd_immediate") # Simple imm8 movs... 
-with_d_immediate("movi_cmode_e_op0_q0", "mov Vd.8b, #Ii", immediate_range=range(256)) -with_d_immediate("movi_cmode_e_op0_q1", "mov Vd.16b, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op0_q0", "movi Vd.8b, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op0_q1", "movi Vd.16b, #Ii", immediate_range=range(256)) # ... all 00/FF movs... -with_d_immediate("movi_cmode_e_op1_q0", "mov Dd, #Si", immediate_range=range(256)) -with_d_immediate("movi_cmode_e_op1_q1", "mov Vd.2d, #Si", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op1_q0", "movi Dd, #Si", immediate_range=range(256)) +with_d_immediate("movi_cmode_e_op1_q1", "movi Vd.2d, #Si", immediate_range=range(256)) # Halfword MOVs. -with_d_immediate("movi_cmode_8_op0_q0", "movi v0.4h, #Ii", immediate_range=range(256)) -with_d_immediate("movi_cmode_8_op0_q1", "movi v0.8h, #Ii", immediate_range=range(256)) -with_d_immediate("movi_cmode_8_op0_q0", "mvni v0.4h, #Ii", immediate_range=range(256)) -with_d_immediate("movi_cmode_8_op0_q1", "mvni v0.8h, #Ii", immediate_range=range(256)) -with_d_immediate("movi_cmode_a_op0_q0", "movi v0.4h, #Ii, lsl #8", immediate_range=range(256)) -with_d_immediate("movi_cmode_a_op0_q1", "movi v0.8h, #Ii, lsl #8", immediate_range=range(256)) -with_d_immediate("movi_cmode_a_op0_q0", "mvni v0.4h, #Ii, lsl #8", immediate_range=range(256)) -with_d_immediate("movi_cmode_a_op0_q1", "mvni v0.8h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q0", "movi Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_8_op0_q1", "movi Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q0", "mvni Vd.4h, #Ii", immediate_range=range(256)) +with_d_immediate("mvni_cmode_8_op0_q1", "mvni Vd.8h, #Ii", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q0", "movi Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("movi_cmode_a_op0_q1", "movi Vd.8h, #Ii, lsl #8", 
immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q0", "mvni Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("mvni_cmode_a_op0_q1", "mvni Vd.8h, #Ii, lsl #8", immediate_range=range(256)) # Halfword ORIs, for building complex MOVs. -with_d_immediate("movi_orr_a_op0_q0", "orr v0.4h, #Ii, lsl #8", immediate_range=range(256)) -with_d_immediate("movi_orr_a_op0_q1", "orr v0.8h, #Ii, lsl #8", immediate_range=range(256)) -""" +with_d_immediate("orr_cmode_a_op0_q0", "orr Vd.4h, #Ii, lsl #8", immediate_range=range(256)) +with_d_immediate("orr_cmode_a_op0_q1", "orr Vd.8h, #Ii, lsl #8", immediate_range=range(256)) # Print a list of output files generated. @@ -1137,4 +1160,4 @@ def vector_logic_dnm(name, operation): print(f"]", file=sys.stderr) # Statistics. -sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions (~{(instructions * 4) // 1024 // 1024} B).\n\n") +sys.stderr.write(f"\nGenerated {gadgets} gadgets with {instructions} instructions (~{(instructions * 4) // 1024 // 1024} MiB).\n\n") From 550af33ef5b7840c4b3419b1119d844f4725e609 Mon Sep 17 00:00:00 2001 From: Katherine Temkin Date: Fri, 2 Sep 2022 15:40:24 -0600 Subject: [PATCH 36/36] meson tweaks for building with and _without_ JIT --- meson.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/meson.build b/meson.build index d87943c90234..47d2f00ff6e2 100644 --- a/meson.build +++ b/meson.build @@ -233,6 +233,8 @@ if not get_option('hax').disabled() endif endif +tcti_gadgets = files() + tcg_arch = config_host['ARCH'] if not get_option('tcg').disabled() if cpu not in supported_cpus