From 66b8f880744cff339c925aeb52a3b8328cf9f300 Mon Sep 17 00:00:00 2001 From: Hugh Sanderson Date: Sun, 6 Oct 2024 13:37:11 +0800 Subject: [PATCH 1/3] Add 'set' with no name to dump current values. Check for undefined values when linking android --- toolchain/android-toolchain-clang.xml | 3 +++ tools/hxcpp/BuildTool.hx | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/toolchain/android-toolchain-clang.xml b/toolchain/android-toolchain-clang.xml index a6583b3b8..6ebbd5001 100644 --- a/toolchain/android-toolchain-clang.xml +++ b/toolchain/android-toolchain-clang.xml @@ -92,6 +92,9 @@ + + + | + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. + * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. 
note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +} + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef TRACY_LZ4_H_98237428734687 +#define TRACY_LZ4_H_98237428734687 + +namespace tracy +{ + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) + typedef int8_t LZ4_i8; + typedef uint8_t LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. +**/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE ((1UL << LZ4_MEMORY_USAGE) + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead +**/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! 
LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. +**/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/*! 
Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. 
Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + +} + +#endif /* LZ4_H_98237428734687 */ diff --git a/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.cpp b/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.cpp new file mode 100644 index 000000000..eec7239e0 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.cpp @@ -0,0 +1,1636 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Copyright (C) 2011-2020, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +/* note : lz4hc is not an independent module, it requires lz4.h/lz4.c for proper compilation */ + + +/* ************************************* +* Tuning Parameter +***************************************/ + +/*! HEAPMODE : + * Select how default compression function will allocate workplace memory, + * in stack (0:fastest), or in heap (1:requires malloc()). + * Since workplace is rather large, heap mode is recommended. 
+**/ +#ifndef LZ4HC_HEAPMODE +# define LZ4HC_HEAPMODE 1 +#endif + + +/*=== Dependency ===*/ +#define LZ4_HC_STATIC_LINKING_ONLY +#include "tracy_lz4hc.hpp" + + +/*=== Common definitions ===*/ +#if defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif +#if defined (__clang__) +# pragma clang diagnostic ignored "-Wunused-function" +#endif + +#define LZ4_COMMONDEFS_ONLY +#ifndef LZ4_SRC_INCLUDED +#include "tracy_lz4.cpp" /* LZ4_count, constants, mem */ +#endif + + +/*=== Enums ===*/ +typedef enum { noDictCtx, usingDictCtxHc } dictCtx_directive; + + +/*=== Constants ===*/ +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_OPT_NUM (1<<12) + + +/*=== Macros ===*/ +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) +#define MAX(a,b) ( (a) > (b) ? (a) : (b) ) +#define HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8)-LZ4HC_HASH_LOG)) +#define DELTANEXTMAXD(p) chainTable[(p) & LZ4HC_MAXD_MASK] /* flexible, LZ4HC_MAXD dependent */ +#define DELTANEXTU16(table, pos) table[(U16)(pos)] /* faster */ +/* Make fields passed to, and updated by LZ4HC_encodeSequence explicit */ +#define UPDATABLE(ip, op, anchor) &ip, &op, &anchor + +namespace tracy +{ + +static U32 LZ4HC_hashPtr(const void* ptr) { return HASH_FUNCTION(LZ4_read32(ptr)); } + + +/************************************** +* HC Compression +**************************************/ +static void LZ4HC_clearTables (LZ4HC_CCtx_internal* hc4) +{ + MEM_INIT(hc4->hashTable, 0, sizeof(hc4->hashTable)); + MEM_INIT(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); +} + +static void LZ4HC_init_internal (LZ4HC_CCtx_internal* hc4, const BYTE* start) +{ + size_t const bufferSize = (size_t)(hc4->end - hc4->prefixStart); + size_t newStartingOffset = bufferSize + hc4->dictLimit; + assert(newStartingOffset >= bufferSize); /* check overflow */ + if (newStartingOffset > 1 GB) { + LZ4HC_clearTables(hc4); + newStartingOffset = 0; + } + newStartingOffset += 64 KB; + hc4->nextToUpdate = (U32)newStartingOffset; + hc4->prefixStart = start; + hc4->end = start; + hc4->dictStart = start; + hc4->dictLimit = (U32)newStartingOffset; + hc4->lowLimit = (U32)newStartingOffset; +} + + +/* Update chains up to ip (excluded) */ +LZ4_FORCE_INLINE void LZ4HC_Insert (LZ4HC_CCtx_internal* hc4, const BYTE* ip) +{ + U16* const chainTable = hc4->chainTable; + U32* const hashTable = hc4->hashTable; + const BYTE* const prefixPtr = hc4->prefixStart; + U32 const prefixIdx = hc4->dictLimit; + U32 const target = (U32)(ip - prefixPtr) + prefixIdx; + U32 idx = hc4->nextToUpdate; + assert(ip >= prefixPtr); + assert(target >= prefixIdx); + + while (idx < target) { + U32 const h = LZ4HC_hashPtr(prefixPtr+idx-prefixIdx); + size_t delta = idx - hashTable[h]; + if (delta>LZ4_DISTANCE_MAX) delta = LZ4_DISTANCE_MAX; + DELTANEXTU16(chainTable, idx) = (U16)delta; + hashTable[h] = idx; + idx++; + } + + hc4->nextToUpdate = target; +} + +/** LZ4HC_countBack() : + * @return : negative value, nb of common bytes before ip/match */ +LZ4_FORCE_INLINE +int LZ4HC_countBack(const BYTE* const ip, const BYTE* const match, + const BYTE* const iMin, const BYTE* const mMin) +{ + int back = 0; + int const min = (int)MAX(iMin - ip, mMin - match); + assert(min <= 0); + assert(ip >= iMin); assert((size_t)(ip-iMin) < (1U<<31)); + assert(match >= mMin); assert((size_t)(match - mMin) < (1U<<31)); + while ( (back > min) + && (ip[back-1] == match[back-1]) ) + back--; + return back; +} + +#if defined(_MSC_VER) +# define LZ4HC_rotl32(x,r) _rotl(x,r) +#else +# define LZ4HC_rotl32(x,r) ((x << r) | (x >> (32 - 
r))) +#endif + + +static U32 LZ4HC_rotatePattern(size_t const rotate, U32 const pattern) +{ + size_t const bitsToRotate = (rotate & (sizeof(pattern) - 1)) << 3; + if (bitsToRotate == 0) return pattern; + return LZ4HC_rotl32(pattern, (int)bitsToRotate); +} + +/* LZ4HC_countPattern() : + * pattern32 must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) */ +static unsigned +LZ4HC_countPattern(const BYTE* ip, const BYTE* const iEnd, U32 const pattern32) +{ + const BYTE* const iStart = ip; + reg_t const pattern = (sizeof(pattern)==8) ? + (reg_t)pattern32 + (((reg_t)pattern32) << (sizeof(pattern)*4)) : pattern32; + + while (likely(ip < iEnd-(sizeof(pattern)-1))) { + reg_t const diff = LZ4_read_ARCH(ip) ^ pattern; + if (!diff) { ip+=sizeof(pattern); continue; } + ip += LZ4_NbCommonBytes(diff); + return (unsigned)(ip - iStart); + } + + if (LZ4_isLittleEndian()) { + reg_t patternByte = pattern; + while ((ip>= 8; + } + } else { /* big endian */ + U32 bitOffset = (sizeof(pattern)*8) - 8; + while (ip < iEnd) { + BYTE const byte = (BYTE)(pattern >> bitOffset); + if (*ip != byte) break; + ip ++; bitOffset -= 8; + } } + + return (unsigned)(ip - iStart); +} + +/* LZ4HC_reverseCountPattern() : + * pattern must be a sample of repetitive pattern of length 1, 2 or 4 (but not 3!) + * read using natural platform endianness */ +static unsigned +LZ4HC_reverseCountPattern(const BYTE* ip, const BYTE* const iLow, U32 pattern) +{ + const BYTE* const iStart = ip; + + while (likely(ip >= iLow+4)) { + if (LZ4_read32(ip-4) != pattern) break; + ip -= 4; + } + { const BYTE* bytePtr = (const BYTE*)(&pattern) + 3; /* works for any endianness */ + while (likely(ip>iLow)) { + if (ip[-1] != *bytePtr) break; + ip--; bytePtr--; + } } + return (unsigned)(iStart - ip); +} + +/* LZ4HC_protectDictEnd() : + * Checks if the match is in the last 3 bytes of the dictionary, so reading the + * 4 byte MINMATCH would overflow. + * @returns true if the match index is okay. + */ +static int LZ4HC_protectDictEnd(U32 const dictLimit, U32 const matchIndex) +{ + return ((U32)((dictLimit - 1) - matchIndex) >= 3); +} + +typedef enum { rep_untested, rep_not, rep_confirmed } repeat_state_e; +typedef enum { favorCompressionRatio=0, favorDecompressionSpeed } HCfavor_e; + +LZ4_FORCE_INLINE int +LZ4HC_InsertAndGetWiderMatch ( + LZ4HC_CCtx_internal* const hc4, + const BYTE* const ip, + const BYTE* const iLowLimit, const BYTE* const iHighLimit, + int longest, + const BYTE** matchpos, + const BYTE** startpos, + const int maxNbAttempts, + const int patternAnalysis, const int chainSwap, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + U16* const chainTable = hc4->chainTable; + U32* const HashTable = hc4->hashTable; + const LZ4HC_CCtx_internal * const dictCtx = hc4->dictCtx; + const BYTE* const prefixPtr = hc4->prefixStart; + const U32 prefixIdx = hc4->dictLimit; + const U32 ipIndex = (U32)(ip - prefixPtr) + prefixIdx; + const int withinStartDistance = (hc4->lowLimit + (LZ4_DISTANCE_MAX + 1) > ipIndex); + const U32 lowestMatchIndex = (withinStartDistance) ? 
hc4->lowLimit : ipIndex - LZ4_DISTANCE_MAX; + const BYTE* const dictStart = hc4->dictStart; + const U32 dictIdx = hc4->lowLimit; + const BYTE* const dictEnd = dictStart + prefixIdx - dictIdx; + int const lookBackLength = (int)(ip-iLowLimit); + int nbAttempts = maxNbAttempts; + U32 matchChainPos = 0; + U32 const pattern = LZ4_read32(ip); + U32 matchIndex; + repeat_state_e repeat = rep_untested; + size_t srcPatternLength = 0; + + DEBUGLOG(7, "LZ4HC_InsertAndGetWiderMatch"); + /* First Match */ + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + DEBUGLOG(7, "First match at index %u / %u (lowestMatchIndex)", + matchIndex, lowestMatchIndex); + + while ((matchIndex>=lowestMatchIndex) && (nbAttempts>0)) { + int matchLength=0; + nbAttempts--; + assert(matchIndex < ipIndex); + if (favorDecSpeed && (ipIndex - matchIndex < 8)) { + /* do nothing */ + } else if (matchIndex >= prefixIdx) { /* within current Prefix */ + const BYTE* const matchPtr = prefixPtr + matchIndex - prefixIdx; + assert(matchPtr < ip); + assert(longest >= 1); + if (LZ4_read16(iLowLimit + longest - 1) == LZ4_read16(matchPtr - lookBackLength + longest - 1)) { + if (LZ4_read32(matchPtr) == pattern) { + int const back = lookBackLength ? LZ4HC_countBack(ip, matchPtr, iLowLimit, prefixPtr) : 0; + matchLength = MINMATCH + (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, iHighLimit); + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + *matchpos = matchPtr + back; + *startpos = ip + back; + } } } + } else { /* lowestMatchIndex <= matchIndex < dictLimit */ + const BYTE* const matchPtr = dictStart + (matchIndex - dictIdx); + assert(matchIndex >= dictIdx); + if ( likely(matchIndex <= prefixIdx - 4) + && (LZ4_read32(matchPtr) == pattern) ) { + int back = 0; + const BYTE* vLimit = ip + (prefixIdx - matchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + matchLength = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + if ((ip+matchLength == vLimit) && (vLimit < iHighLimit)) + matchLength += LZ4_count(ip+matchLength, prefixPtr, iHighLimit); + back = lookBackLength ? 
LZ4HC_countBack(ip, matchPtr, iLowLimit, dictStart) : 0; + matchLength -= back; + if (matchLength > longest) { + longest = matchLength; + *matchpos = prefixPtr - prefixIdx + matchIndex + back; /* virtual pos, relative to ip, to retrieve offset */ + *startpos = ip + back; + } } } + + if (chainSwap && matchLength==longest) { /* better match => select a better chain */ + assert(lookBackLength==0); /* search forward only */ + if (matchIndex + (U32)longest <= ipIndex) { + int const kTrigger = 4; + U32 distanceToNextMatch = 1; + int const end = longest - MINMATCH + 1; + int step = 1; + int accel = 1 << kTrigger; + int pos; + for (pos = 0; pos < end; pos += step) { + U32 const candidateDist = DELTANEXTU16(chainTable, matchIndex + (U32)pos); + step = (accel++ >> kTrigger); + if (candidateDist > distanceToNextMatch) { + distanceToNextMatch = candidateDist; + matchChainPos = (U32)pos; + accel = 1 << kTrigger; + } } + if (distanceToNextMatch > 1) { + if (distanceToNextMatch > matchIndex) break; /* avoid overflow */ + matchIndex -= distanceToNextMatch; + continue; + } } } + + { U32 const distNextMatch = DELTANEXTU16(chainTable, matchIndex); + if (patternAnalysis && distNextMatch==1 && matchChainPos==0) { + U32 const matchCandidateIdx = matchIndex-1; + /* may be a repeated pattern */ + if (repeat == rep_untested) { + if ( ((pattern & 0xFFFF) == (pattern >> 16)) + & ((pattern & 0xFF) == (pattern >> 24)) ) { + repeat = rep_confirmed; + srcPatternLength = LZ4HC_countPattern(ip+sizeof(pattern), iHighLimit, pattern) + sizeof(pattern); + } else { + repeat = rep_not; + } } + if ( (repeat == rep_confirmed) && (matchCandidateIdx >= lowestMatchIndex) + && LZ4HC_protectDictEnd(prefixIdx, matchCandidateIdx) ) { + const int extDict = matchCandidateIdx < prefixIdx; + const BYTE* const matchPtr = (extDict ? dictStart - dictIdx : prefixPtr - prefixIdx) + matchCandidateIdx; + if (LZ4_read32(matchPtr) == pattern) { /* good candidate */ + const BYTE* const iLimit = extDict ? dictEnd : iHighLimit; + size_t forwardPatternLength = LZ4HC_countPattern(matchPtr+sizeof(pattern), iLimit, pattern) + sizeof(pattern); + if (extDict && matchPtr + forwardPatternLength == iLimit) { + U32 const rotatedPattern = LZ4HC_rotatePattern(forwardPatternLength, pattern); + forwardPatternLength += LZ4HC_countPattern(prefixPtr, iHighLimit, rotatedPattern); + } + { const BYTE* const lowestMatchPtr = extDict ? 
dictStart : prefixPtr; + size_t backLength = LZ4HC_reverseCountPattern(matchPtr, lowestMatchPtr, pattern); + size_t currentSegmentLength; + if (!extDict + && matchPtr - backLength == prefixPtr + && dictIdx < prefixIdx) { + U32 const rotatedPattern = LZ4HC_rotatePattern((U32)(-(int)backLength), pattern); + backLength += LZ4HC_reverseCountPattern(dictEnd, dictStart, rotatedPattern); + } + /* Limit backLength not go further than lowestMatchIndex */ + backLength = matchCandidateIdx - MAX(matchCandidateIdx - (U32)backLength, lowestMatchIndex); + assert(matchCandidateIdx - backLength >= lowestMatchIndex); + currentSegmentLength = backLength + forwardPatternLength; + /* Adjust to end of pattern if the source pattern fits, otherwise the beginning of the pattern */ + if ( (currentSegmentLength >= srcPatternLength) /* current pattern segment large enough to contain full srcPatternLength */ + && (forwardPatternLength <= srcPatternLength) ) { /* haven't reached this position yet */ + U32 const newMatchIndex = matchCandidateIdx + (U32)forwardPatternLength - (U32)srcPatternLength; /* best position, full pattern, might be followed by more match */ + if (LZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) + matchIndex = newMatchIndex; + else { + /* Can only happen if started in the prefix */ + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; + } + } else { + U32 const newMatchIndex = matchCandidateIdx - (U32)backLength; /* farthest position in current segment, will find a match of length currentSegmentLength + maybe some back */ + if (!LZ4HC_protectDictEnd(prefixIdx, newMatchIndex)) { + assert(newMatchIndex >= prefixIdx - 3 && newMatchIndex < prefixIdx && !extDict); + matchIndex = prefixIdx; + } else { + matchIndex = newMatchIndex; + if (lookBackLength==0) { /* no back possible */ + size_t const maxML = MIN(currentSegmentLength, srcPatternLength); + if ((size_t)longest < maxML) { + assert(prefixPtr - prefixIdx + matchIndex != ip); + if ((size_t)(ip - prefixPtr) + prefixIdx - matchIndex > LZ4_DISTANCE_MAX) break; + assert(maxML < 2 GB); + longest = (int)maxML; + *matchpos = prefixPtr - prefixIdx + matchIndex; /* virtual pos, relative to ip, to retrieve offset */ + *startpos = ip; + } + { U32 const distToNextPattern = DELTANEXTU16(chainTable, matchIndex); + if (distToNextPattern > matchIndex) break; /* avoid overflow */ + matchIndex -= distToNextPattern; + } } } } } + continue; + } } + } } /* PA optimization */ + + /* follow current chain */ + matchIndex -= DELTANEXTU16(chainTable, matchIndex + matchChainPos); + + } /* while ((matchIndex>=lowestMatchIndex) && (nbAttempts)) */ + + if ( dict == usingDictCtxHc + && nbAttempts > 0 + && ipIndex - lowestMatchIndex < LZ4_DISTANCE_MAX) { + size_t const dictEndOffset = (size_t)(dictCtx->end - dictCtx->prefixStart) + dictCtx->dictLimit; + U32 dictMatchIndex = dictCtx->hashTable[LZ4HC_hashPtr(ip)]; + assert(dictEndOffset <= 1 GB); + matchIndex = dictMatchIndex + lowestMatchIndex - (U32)dictEndOffset; + while (ipIndex - matchIndex <= LZ4_DISTANCE_MAX && nbAttempts--) { + const BYTE* const matchPtr = dictCtx->prefixStart - dictCtx->dictLimit + dictMatchIndex; + + if (LZ4_read32(matchPtr) == pattern) { + int mlt; + int back = 0; + const BYTE* vLimit = ip + (dictEndOffset - dictMatchIndex); + if (vLimit > iHighLimit) vLimit = iHighLimit; + mlt = (int)LZ4_count(ip+MINMATCH, matchPtr+MINMATCH, vLimit) + MINMATCH; + back = lookBackLength ? 
LZ4HC_countBack(ip, matchPtr, iLowLimit, dictCtx->prefixStart) : 0; + mlt -= back; + if (mlt > longest) { + longest = mlt; + *matchpos = prefixPtr - prefixIdx + matchIndex + back; + *startpos = ip + back; + } } + + { U32 const nextOffset = DELTANEXTU16(dictCtx->chainTable, dictMatchIndex); + dictMatchIndex -= nextOffset; + matchIndex -= nextOffset; + } } } + + return longest; +} + +LZ4_FORCE_INLINE int +LZ4HC_InsertAndFindBestMatch(LZ4HC_CCtx_internal* const hc4, /* Index table will be updated */ + const BYTE* const ip, const BYTE* const iLimit, + const BYTE** matchpos, + const int maxNbAttempts, + const int patternAnalysis, + const dictCtx_directive dict) +{ + const BYTE* uselessPtr = ip; + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + return LZ4HC_InsertAndGetWiderMatch(hc4, ip, ip, iLimit, MINMATCH-1, matchpos, &uselessPtr, maxNbAttempts, patternAnalysis, 0 /*chainSwap*/, dict, favorCompressionRatio); +} + +/* LZ4HC_encodeSequence() : + * @return : 0 if ok, + * 1 if buffer issue detected */ +LZ4_FORCE_INLINE int LZ4HC_encodeSequence ( + const BYTE** _ip, + BYTE** _op, + const BYTE** _anchor, + int matchLength, + const BYTE* const match, + limitedOutput_directive limit, + BYTE* oend) +{ +#define ip (*_ip) +#define op (*_op) +#define anchor (*_anchor) + + size_t length; + BYTE* const token = op++; + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 6) + static const BYTE* start = NULL; + static U32 totalCost = 0; + U32 const pos = (start==NULL) ? 0 : (U32)(anchor - start); + U32 const ll = (U32)(ip - anchor); + U32 const llAdd = (ll>=15) ? ((ll-15) / 255) + 1 : 0; + U32 const mlAdd = (matchLength>=19) ? 
((matchLength-19) / 255) + 1 : 0; + U32 const cost = 1 + llAdd + ll + 2 + mlAdd; + if (start==NULL) start = anchor; /* only works for single segment */ + /* g_debuglog_enable = (pos >= 2228) & (pos <= 2262); */ + DEBUGLOG(6, "pos:%7u -- literals:%4u, match:%4i, offset:%5u, cost:%4u + %5u", + pos, + (U32)(ip - anchor), matchLength, (U32)(ip-match), + cost, totalCost); + totalCost += cost; +#endif + + /* Encode Literal length */ + length = (size_t)(ip - anchor); + LZ4_STATIC_ASSERT(notLimited == 0); + /* Check output limit */ + if (limit && ((op + (length / 255) + length + (2 + 1 + LASTLITERALS)) > oend)) { + DEBUGLOG(6, "Not enough room to write %i literals (%i bytes remaining)", + (int)length, (int)(oend - op)); + return 1; + } + if (length >= RUN_MASK) { + size_t len = length - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for(; len >= 255 ; len -= 255) *op++ = 255; + *op++ = (BYTE)len; + } else { + *token = (BYTE)(length << ML_BITS); + } + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + length); + op += length; + + /* Encode Offset */ + assert( (ip - match) <= LZ4_DISTANCE_MAX ); /* note : consider providing offset as a value, rather than as a pointer difference */ + LZ4_writeLE16(op, (U16)(ip - match)); op += 2; + + /* Encode MatchLength */ + assert(matchLength >= MINMATCH); + length = (size_t)matchLength - MINMATCH; + if (limit && (op + (length / 255) + (1 + LASTLITERALS) > oend)) { + DEBUGLOG(6, "Not enough room to write match length"); + return 1; /* Check output limit */ + } + if (length >= ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for(; length >= 510 ; length -= 510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length -= 255; *op++ = 255; } + *op++ = (BYTE)length; + } else { + *token += (BYTE)(length); + } + + /* Prepare next loop */ + ip += matchLength; + anchor = ip; + + return 0; +} +#undef ip +#undef op +#undef anchor + +LZ4_FORCE_INLINE int LZ4HC_compress_hashChain ( + LZ4HC_CCtx_internal* const ctx, + const char* const source, + char* const dest, + int* srcSizePtr, + int const maxOutputSize, + int maxNbAttempts, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + const int inputSize = *srcSizePtr; + const int patternAnalysis = (maxNbAttempts > 128); /* levels 9+ */ + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = (iend - LASTLITERALS); + + BYTE* optr = (BYTE*) dest; + BYTE* op = (BYTE*) dest; + BYTE* oend = op + maxOutputSize; + + int ml0, ml, ml2, ml3; + const BYTE* start0; + const BYTE* ref0; + const BYTE* ref = NULL; + const BYTE* start2 = NULL; + const BYTE* ref2 = NULL; + const BYTE* start3 = NULL; + const BYTE* ref3 = NULL; + + /* init */ + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (inputSize < LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */ + + /* Main Loop */ + while (ip <= mflimit) { + ml = LZ4HC_InsertAndFindBestMatch(ctx, ip, matchlimit, &ref, maxNbAttempts, patternAnalysis, dict); + if (ml encode ML1 */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + continue; + } + + if (start0 < ip) { /* first match was skipped at least once */ + if (start2 < ip + ml0) { /* squeezing ML1 between ML0(original ML1) and ML2 */ + ip = start0; ref = ref0; ml = ml0; /* restore initial ML1 */ + } } + + /* Here, 
start0==ip */ + if ((start2 - ip) < 3) { /* First Match too small : removed */ + ml = ml2; + ip = start2; + ref =ref2; + goto _Search2; + } + +_Search3: + /* At this stage, we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; + if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18) */ + + if (start2 + ml2 <= mflimit) { + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, + start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3, + maxNbAttempts, patternAnalysis, 0, dict, favorCompressionRatio); + } else { + ml3 = ml2; + } + + if (ml3 == ml2) { /* No better match => encode ML1 and ML2 */ + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) ml = (int)(start2 - ip); + /* Now, encode 2 sequences */ + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + ip = start2; + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml2, ref2, limit, oend)) { + ml = ml2; + ref = ref2; + goto _dest_overflow; + } + continue; + } + + if (start3 < ip+ml+3) { /* Not enough space for match 2 : remove it */ + if (start3 >= (ip+ml)) { /* can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1 */ + if (start2 < ip+ml) { + int correction = (int)(ip+ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _Search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _Search3; + } + + /* + * OK, now we have 3 ascending matches; + * let's write the first one ML1. + * ip & ref are known; Now decide ml. 
+ */ + if (start2 < ip+ml) { + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else { + ml = (int)(start2 - ip); + } + } + optr = op; + if (LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, limit, oend)) goto _dest_overflow; + + /* ML2 becomes ML1 */ + ip = start2; ref = ref2; ml = ml2; + + /* ML3 becomes ML2 */ + start2 = start3; ref2 = ref3; ml2 = ml3; + + /* let's find a new ML3 */ + goto _Search3; + } + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) return 0; + /* adapt lastRunSize to fill 'dest' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRunSize); + op += lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + return (int) (((char*)op)-dest); + +_dest_overflow: + if (limit == fillOutput) { + /* Assumption : ip, anchor, ml and ref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing"); + op = optr; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(ml >= 0); + if ((size_t)ml > maxMlSize) ml = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + ml >= MFLIMIT) { + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ref, notLimited, oend); + } } + goto _last_literals; + } + /* compression failed */ + return 0; +} + + +static int LZ4HC_compress_optimal( LZ4HC_CCtx_internal* ctx, + const char* const source, char* dst, + int* srcSizePtr, int dstCapacity, + int const nbSearches, size_t sufficient_len, + const limitedOutput_directive limit, int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed); + + +LZ4_FORCE_INLINE int LZ4HC_compress_generic_internal ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + const limitedOutput_directive limit, + const dictCtx_directive dict + ) +{ + typedef enum { lz4hc, lz4opt } lz4hc_strat_e; + typedef struct { + lz4hc_strat_e strat; + int nbSearches; + U32 targetLength; + } cParams_t; + static const cParams_t 
clTable[LZ4HC_CLEVEL_MAX+1] = { + { lz4hc, 2, 16 }, /* 0, unused */ + { lz4hc, 2, 16 }, /* 1, unused */ + { lz4hc, 2, 16 }, /* 2, unused */ + { lz4hc, 4, 16 }, /* 3 */ + { lz4hc, 8, 16 }, /* 4 */ + { lz4hc, 16, 16 }, /* 5 */ + { lz4hc, 32, 16 }, /* 6 */ + { lz4hc, 64, 16 }, /* 7 */ + { lz4hc, 128, 16 }, /* 8 */ + { lz4hc, 256, 16 }, /* 9 */ + { lz4opt, 96, 64 }, /*10==LZ4HC_CLEVEL_OPT_MIN*/ + { lz4opt, 512,128 }, /*11 */ + { lz4opt,16384,LZ4_OPT_NUM }, /* 12==LZ4HC_CLEVEL_MAX */ + }; + + DEBUGLOG(4, "LZ4HC_compress_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)", + ctx, src, *srcSizePtr, limit); + + if (limit == fillOutput && dstCapacity < 1) return 0; /* Impossible to store anything */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size (too large or negative) */ + + ctx->end += *srcSizePtr; + if (cLevel < 1) cLevel = LZ4HC_CLEVEL_DEFAULT; /* note : convention is different from lz4frame, maybe something to review */ + cLevel = MIN(LZ4HC_CLEVEL_MAX, cLevel); + { cParams_t const cParam = clTable[cLevel]; + HCfavor_e const favor = ctx->favorDecSpeed ? favorDecompressionSpeed : favorCompressionRatio; + int result; + + if (cParam.strat == lz4hc) { + result = LZ4HC_compress_hashChain(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, limit, dict); + } else { + assert(cParam.strat == lz4opt); + result = LZ4HC_compress_optimal(ctx, + src, dst, srcSizePtr, dstCapacity, + cParam.nbSearches, cParam.targetLength, limit, + cLevel == LZ4HC_CLEVEL_MAX, /* ultra mode */ + dict, favor); + } + if (result <= 0) ctx->dirty = 1; + return result; + } +} + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock); + +static int +LZ4HC_compress_generic_noDictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + assert(ctx->dictCtx == NULL); + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, noDictCtx); +} + +static int +LZ4HC_compress_generic_dictCtx ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + const size_t position = (size_t)(ctx->end - ctx->prefixStart) + (ctx->dictLimit - ctx->lowLimit); + assert(ctx->dictCtx != NULL); + if (position >= 64 KB) { + ctx->dictCtx = NULL; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else if (position == 0 && *srcSizePtr > 4 KB) { + LZ4_memcpy(ctx, ctx->dictCtx, sizeof(LZ4HC_CCtx_internal)); + LZ4HC_setExternalDict(ctx, (const BYTE *)src); + ctx->compressionLevel = (short)cLevel; + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_internal(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit, usingDictCtxHc); + } +} + +static int +LZ4HC_compress_generic ( + LZ4HC_CCtx_internal* const ctx, + const char* const src, + char* const dst, + int* const srcSizePtr, + int const dstCapacity, + int cLevel, + limitedOutput_directive limit + ) +{ + if (ctx->dictCtx == NULL) { + return LZ4HC_compress_generic_noDictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } else { + return LZ4HC_compress_generic_dictCtx(ctx, src, dst, srcSizePtr, dstCapacity, cLevel, limit); + } +} + + +int LZ4_sizeofStateHC(void) { return (int)sizeof(LZ4_streamHC_t); } + +static 
size_t LZ4_streamHC_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_streamHC_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_streamHC_t); +#else + return 1; /* effectively disabled */ +#endif +} + +/* state is presumed correctly initialized, + * in which case its size and alignment have already been validate */ +int LZ4_compress_HC_extStateHC_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4HC_CCtx_internal* const ctx = &((LZ4_streamHC_t*)state)->internal_donotuse; + if (!LZ4_isAligned(state, LZ4_streamHC_t_alignment())) return 0; + LZ4_resetStreamHC_fast((LZ4_streamHC_t*)state, compressionLevel); + LZ4HC_init_internal (ctx, (const BYTE*)src); + if (dstCapacity < LZ4_compressBound(srcSize)) + return LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, limitedOutput); + else + return LZ4HC_compress_generic (ctx, src, dst, &srcSize, dstCapacity, compressionLevel, notLimited); +} + +int LZ4_compress_HC_extStateHC (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + return LZ4_compress_HC_extStateHC_fastReset(state, src, dst, srcSize, dstCapacity, compressionLevel); +} + +int LZ4_compress_HC(const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel) +{ + int cSize; +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4_streamHC_t* const statePtr = (LZ4_streamHC_t*)ALLOC(sizeof(LZ4_streamHC_t)); + if (statePtr==NULL) return 0; +#else + LZ4_streamHC_t state; + LZ4_streamHC_t* const statePtr = &state; +#endif + cSize = LZ4_compress_HC_extStateHC(statePtr, src, dst, srcSize, dstCapacity, compressionLevel); +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + FREEMEM(statePtr); +#endif + return cSize; +} + +/* state is presumed sized correctly (>= sizeof(LZ4_streamHC_t)) */ +int LZ4_compress_HC_destSize(void* state, const char* source, char* dest, int* sourceSizePtr, int targetDestSize, int cLevel) +{ + LZ4_streamHC_t* const ctx = LZ4_initStreamHC(state, sizeof(*ctx)); + if (ctx==NULL) return 0; /* init failure */ + LZ4HC_init_internal(&ctx->internal_donotuse, (const BYTE*) source); + LZ4_setCompressionLevel(ctx, cLevel); + return LZ4HC_compress_generic(&ctx->internal_donotuse, source, dest, sourceSizePtr, targetDestSize, cLevel, fillOutput); +} + + + +/************************************** +* Streaming Functions +**************************************/ +/* allocation */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamHC_t* LZ4_createStreamHC(void) +{ + LZ4_streamHC_t* const state = + (LZ4_streamHC_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamHC_t)); + if (state == NULL) return NULL; + LZ4_setCompressionLevel(state, LZ4HC_CLEVEL_DEFAULT); + return state; +} + +int LZ4_freeStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr) +{ + DEBUGLOG(4, "LZ4_freeStreamHC(%p)", LZ4_streamHCPtr); + if (!LZ4_streamHCPtr) return 0; /* support free on NULL */ + FREEMEM(LZ4_streamHCPtr); + return 0; +} +#endif + + +LZ4_streamHC_t* LZ4_initStreamHC (void* buffer, size_t size) +{ + LZ4_streamHC_t* const LZ4_streamHCPtr = (LZ4_streamHC_t*)buffer; + DEBUGLOG(4, "LZ4_initStreamHC(%p, %u)", buffer, (unsigned)size); + /* check conditions */ + if (buffer == NULL) return NULL; + if (size < sizeof(LZ4_streamHC_t)) return NULL; + if (!LZ4_isAligned(buffer, LZ4_streamHC_t_alignment())) return NULL; + /* init */ + { 
LZ4HC_CCtx_internal* const hcstate = &(LZ4_streamHCPtr->internal_donotuse); + MEM_INIT(hcstate, 0, sizeof(*hcstate)); } + LZ4_setCompressionLevel(LZ4_streamHCPtr, LZ4HC_CLEVEL_DEFAULT); + return LZ4_streamHCPtr; +} + +/* just a stub */ +void LZ4_resetStreamHC (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_resetStreamHC_fast (LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + DEBUGLOG(4, "LZ4_resetStreamHC_fast(%p, %d)", LZ4_streamHCPtr, compressionLevel); + if (LZ4_streamHCPtr->internal_donotuse.dirty) { + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + } else { + /* preserve end - prefixStart : can trigger clearTable's threshold */ + if (LZ4_streamHCPtr->internal_donotuse.end != NULL) { + LZ4_streamHCPtr->internal_donotuse.end -= (uptrval)LZ4_streamHCPtr->internal_donotuse.prefixStart; + } else { + assert(LZ4_streamHCPtr->internal_donotuse.prefixStart == NULL); + } + LZ4_streamHCPtr->internal_donotuse.prefixStart = NULL; + LZ4_streamHCPtr->internal_donotuse.dictCtx = NULL; + } + LZ4_setCompressionLevel(LZ4_streamHCPtr, compressionLevel); +} + +void LZ4_setCompressionLevel(LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel) +{ + DEBUGLOG(5, "LZ4_setCompressionLevel(%p, %d)", LZ4_streamHCPtr, compressionLevel); + if (compressionLevel < 1) compressionLevel = LZ4HC_CLEVEL_DEFAULT; + if (compressionLevel > LZ4HC_CLEVEL_MAX) compressionLevel = LZ4HC_CLEVEL_MAX; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (short)compressionLevel; +} + +void LZ4_favorDecompressionSpeed(LZ4_streamHC_t* LZ4_streamHCPtr, int favor) +{ + LZ4_streamHCPtr->internal_donotuse.favorDecSpeed = (favor!=0); +} + +/* LZ4_loadDictHC() : + * LZ4_streamHCPtr is presumed properly initialized */ +int LZ4_loadDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* dictionary, int dictSize) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(4, "LZ4_loadDictHC(ctx:%p, dict:%p, dictSize:%d)", LZ4_streamHCPtr, dictionary, dictSize); + assert(LZ4_streamHCPtr != NULL); + if (dictSize > 64 KB) { + dictionary += (size_t)dictSize - 64 KB; + dictSize = 64 KB; + } + /* need a full initialization, there are bad side-effects when using resetFast() */ + { int const cLevel = ctxPtr->compressionLevel; + LZ4_initStreamHC(LZ4_streamHCPtr, sizeof(*LZ4_streamHCPtr)); + LZ4_setCompressionLevel(LZ4_streamHCPtr, cLevel); + } + LZ4HC_init_internal (ctxPtr, (const BYTE*)dictionary); + ctxPtr->end = (const BYTE*)dictionary + dictSize; + if (dictSize >= 4) LZ4HC_Insert (ctxPtr, ctxPtr->end-3); + return dictSize; +} + +void LZ4_attach_HC_dictionary(LZ4_streamHC_t *working_stream, const LZ4_streamHC_t *dictionary_stream) { + working_stream->internal_donotuse.dictCtx = dictionary_stream != NULL ? 
&(dictionary_stream->internal_donotuse) : NULL; +} + +/* compression */ + +static void LZ4HC_setExternalDict(LZ4HC_CCtx_internal* ctxPtr, const BYTE* newBlock) +{ + DEBUGLOG(4, "LZ4HC_setExternalDict(%p, %p)", ctxPtr, newBlock); + if (ctxPtr->end >= ctxPtr->prefixStart + 4) + LZ4HC_Insert (ctxPtr, ctxPtr->end-3); /* Referencing remaining dictionary content */ + + /* Only one memory segment for extDict, so any previous extDict is lost at this stage */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + ctxPtr->dictLimit += (U32)(ctxPtr->end - ctxPtr->prefixStart); + ctxPtr->prefixStart = newBlock; + ctxPtr->end = newBlock; + ctxPtr->nextToUpdate = ctxPtr->dictLimit; /* match referencing will resume from there */ + + /* cannot reference an extDict and a dictCtx at the same time */ + ctxPtr->dictCtx = NULL; +} + +static int +LZ4_compressHC_continue_generic (LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int dstCapacity, + limitedOutput_directive limit) +{ + LZ4HC_CCtx_internal* const ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + DEBUGLOG(5, "LZ4_compressHC_continue_generic(ctx=%p, src=%p, srcSize=%d, limit=%d)", + LZ4_streamHCPtr, src, *srcSizePtr, limit); + assert(ctxPtr != NULL); + /* auto-init if forgotten */ + if (ctxPtr->prefixStart == NULL) LZ4HC_init_internal (ctxPtr, (const BYTE*) src); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->prefixStart) + ctxPtr->dictLimit > 2 GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->prefixStart); + if (dictSize > 64 KB) dictSize = 64 KB; + LZ4_loadDictHC(LZ4_streamHCPtr, (const char*)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE*)src != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE*)src); + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) src + *srcSizePtr; + const BYTE* const dictBegin = ctxPtr->dictStart; + const BYTE* const dictEnd = ctxPtr->dictStart + (ctxPtr->dictLimit - ctxPtr->lowLimit); + if ((sourceEnd > dictBegin) && ((const BYTE*)src < dictEnd)) { + if (sourceEnd > dictEnd) sourceEnd = dictEnd; + ctxPtr->lowLimit += (U32)(sourceEnd - ctxPtr->dictStart); + ctxPtr->dictStart += (U32)(sourceEnd - ctxPtr->dictStart); + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) { + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictStart = ctxPtr->prefixStart; + } } } + + return LZ4HC_compress_generic (ctxPtr, src, dst, srcSizePtr, dstCapacity, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int srcSize, int dstCapacity) +{ + if (dstCapacity < LZ4_compressBound(srcSize)) + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, limitedOutput); + else + return LZ4_compressHC_continue_generic (LZ4_streamHCPtr, src, dst, &srcSize, dstCapacity, notLimited); +} + +int LZ4_compress_HC_continue_destSize (LZ4_streamHC_t* LZ4_streamHCPtr, const char* src, char* dst, int* srcSizePtr, int targetDestSize) +{ + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, src, dst, srcSizePtr, targetDestSize, fillOutput); +} + + + +/* LZ4_saveDictHC : + * save history content + * into a user-provided buffer + * which is then used to continue compression + */ +int LZ4_saveDictHC (LZ4_streamHC_t* LZ4_streamHCPtr, char* safeBuffer, int dictSize) +{ + LZ4HC_CCtx_internal* const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = 
(int)(streamPtr->end - streamPtr->prefixStart); + DEBUGLOG(5, "LZ4_saveDictHC(%p, %p, %d)", LZ4_streamHCPtr, safeBuffer, dictSize); + assert(prefixSize >= 0); + if (dictSize > 64 KB) dictSize = 64 KB; + if (dictSize < 4) dictSize = 0; + if (dictSize > prefixSize) dictSize = prefixSize; + if (safeBuffer == NULL) assert(dictSize == 0); + if (dictSize > 0) + LZ4_memmove(safeBuffer, streamPtr->end - dictSize, dictSize); + { U32 const endIndex = (U32)(streamPtr->end - streamPtr->prefixStart) + streamPtr->dictLimit; + streamPtr->end = (const BYTE*)safeBuffer + dictSize; + streamPtr->prefixStart = streamPtr->end - dictSize; + streamPtr->dictLimit = endIndex - (U32)dictSize; + streamPtr->lowLimit = endIndex - (U32)dictSize; + streamPtr->dictStart = streamPtr->prefixStart; + if (streamPtr->nextToUpdate < streamPtr->dictLimit) + streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} + + +/*************************************************** +* Deprecated Functions +***************************************************/ + +/* These functions currently generate deprecation warnings */ + +/* Wrappers for deprecated compression functions */ +int LZ4_compressHC(const char* src, char* dst, int srcSize) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2(const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC (src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput(const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC(src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_withStateHC (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, LZ4_compressBound(srcSize), 0); } +int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_extStateHC (state, src, dst, srcSize, maxDstSize, 0); } +int LZ4_compressHC2_withStateHC (void* state, const char* src, char* dst, int srcSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, LZ4_compressBound(srcSize), cLevel); } +int LZ4_compressHC2_limitedOutput_withStateHC (void* state, const char* src, char* dst, int srcSize, int maxDstSize, int cLevel) { return LZ4_compress_HC_extStateHC(state, src, dst, srcSize, maxDstSize, cLevel); } +int LZ4_compressHC_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, LZ4_compressBound(srcSize)); } +int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* ctx, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_HC_continue (ctx, src, dst, srcSize, maxDstSize); } + + +/* Deprecated streaming functions */ +int LZ4_sizeofStreamStateHC(void) { return sizeof(LZ4_streamHC_t); } + +/* state is presumed correctly sized, aka >= sizeof(LZ4_streamHC_t) + * @return : 0 on success, !=0 if error */ +int LZ4_resetStreamStateHC(void* state, char* inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_initStreamHC(state, sizeof(*hc4)); + if (hc4 == NULL) return 1; /* init failed */ + LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_createHC (const char* 
inputBuffer) +{ + LZ4_streamHC_t* const hc4 = LZ4_createStreamHC(); + if (hc4 == NULL) return NULL; /* not enough memory */ + LZ4HC_init_internal (&hc4->internal_donotuse, (const BYTE*)inputBuffer); + return hc4; +} + +int LZ4_freeHC (void* LZ4HC_Data) +{ + if (!LZ4HC_Data) return 0; /* support free on NULL */ + FREEMEM(LZ4HC_Data); + return 0; +} +#endif + +int LZ4_compressHC2_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, 0, cLevel, notLimited); +} + +int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* src, char* dst, int srcSize, int dstCapacity, int cLevel) +{ + return LZ4HC_compress_generic (&((LZ4_streamHC_t*)LZ4HC_Data)->internal_donotuse, src, dst, &srcSize, dstCapacity, cLevel, limitedOutput); +} + +char* LZ4_slideInputBufferHC(void* LZ4HC_Data) +{ + LZ4_streamHC_t* const ctx = (LZ4_streamHC_t*)LZ4HC_Data; + const BYTE* bufferStart = ctx->internal_donotuse.prefixStart - ctx->internal_donotuse.dictLimit + ctx->internal_donotuse.lowLimit; + LZ4_resetStreamHC_fast(ctx, ctx->internal_donotuse.compressionLevel); + /* avoid const char * -> char * conversion warning :( */ + return (char*)(uptrval)bufferStart; +} + + +/* ================================================ + * LZ4 Optimal parser (levels [LZ4HC_CLEVEL_OPT_MIN - LZ4HC_CLEVEL_MAX]) + * ===============================================*/ +typedef struct { + int price; + int off; + int mlen; + int litlen; +} LZ4HC_optimal_t; + +/* price in bytes */ +LZ4_FORCE_INLINE int LZ4HC_literalsPrice(int const litlen) +{ + int price = litlen; + assert(litlen >= 0); + if (litlen >= (int)RUN_MASK) + price += 1 + ((litlen-(int)RUN_MASK) / 255); + return price; +} + + +/* requires mlen >= MINMATCH */ +LZ4_FORCE_INLINE int LZ4HC_sequencePrice(int litlen, int mlen) +{ + int price = 1 + 2 ; /* token + 16-bit offset */ + assert(litlen >= 0); + assert(mlen >= MINMATCH); + + price += LZ4HC_literalsPrice(litlen); + + if (mlen >= (int)(ML_MASK+MINMATCH)) + price += 1 + ((mlen-(int)(ML_MASK+MINMATCH)) / 255); + + return price; +} + + +typedef struct { + int off; + int len; +} LZ4HC_match_t; + +LZ4_FORCE_INLINE LZ4HC_match_t +LZ4HC_FindLongerMatch(LZ4HC_CCtx_internal* const ctx, + const BYTE* ip, const BYTE* const iHighLimit, + int minLen, int nbSearches, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + LZ4HC_match_t match = { 0 , 0 }; + const BYTE* matchPtr = NULL; + /* note : LZ4HC_InsertAndGetWiderMatch() is able to modify the starting position of a match (*startpos), + * but this won't be the case here, as we define iLowLimit==ip, + * so LZ4HC_InsertAndGetWiderMatch() won't be allowed to search past ip */ + int matchLength = LZ4HC_InsertAndGetWiderMatch(ctx, ip, ip, iHighLimit, minLen, &matchPtr, &ip, nbSearches, 1 /*patternAnalysis*/, 1 /*chainSwap*/, dict, favorDecSpeed); + if (matchLength <= minLen) return match; + if (favorDecSpeed) { + if ((matchLength>18) & (matchLength<=36)) matchLength=18; /* favor shortcut */ + } + match.len = matchLength; + match.off = (int)(ip-matchPtr); + return match; +} + + +static int LZ4HC_compress_optimal ( LZ4HC_CCtx_internal* ctx, + const char* const source, + char* dst, + int* srcSizePtr, + int dstCapacity, + int const nbSearches, + size_t sufficient_len, + const limitedOutput_directive limit, + int const fullUpdate, + const dictCtx_directive dict, + const HCfavor_e favorDecSpeed) +{ + int retval = 0; +#define TRAILING_LITERALS 3 +#if 
defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + LZ4HC_optimal_t* const opt = (LZ4HC_optimal_t*)ALLOC(sizeof(LZ4HC_optimal_t) * (LZ4_OPT_NUM + TRAILING_LITERALS)); +#else + LZ4HC_optimal_t opt[LZ4_OPT_NUM + TRAILING_LITERALS]; /* ~64 KB, which is a bit large for stack... */ +#endif + + const BYTE* ip = (const BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const iend = ip + *srcSizePtr; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + BYTE* op = (BYTE*) dst; + BYTE* opSaved = (BYTE*) dst; + BYTE* oend = op + dstCapacity; + int ovml = MINMATCH; /* overflow - last sequence */ + const BYTE* ovref = NULL; + + /* init */ +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + if (opt == NULL) goto _return_label; +#endif + DEBUGLOG(5, "LZ4HC_compress_optimal(dst=%p, dstCapa=%u)", dst, (unsigned)dstCapacity); + *srcSizePtr = 0; + if (limit == fillOutput) oend -= LASTLITERALS; /* Hack for support LZ4 format restriction */ + if (sufficient_len >= LZ4_OPT_NUM) sufficient_len = LZ4_OPT_NUM-1; + + /* Main Loop */ + while (ip <= mflimit) { + int const llen = (int)(ip - anchor); + int best_mlen, best_off; + int cur, last_match_pos = 0; + + LZ4HC_match_t const firstMatch = LZ4HC_FindLongerMatch(ctx, ip, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + if (firstMatch.len==0) { ip++; continue; } + + if ((size_t)firstMatch.len > sufficient_len) { + /* good enough solution : immediate encoding */ + int const firstML = firstMatch.len; + const BYTE* const matchPos = ip - firstMatch.off; + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), firstML, matchPos, limit, oend) ) { /* updates ip, op and anchor */ + ovml = firstML; + ovref = matchPos; + goto _dest_overflow; + } + continue; + } + + /* set prices for first positions (literals) */ + { int rPos; + for (rPos = 0 ; rPos < MINMATCH ; rPos++) { + int const cost = LZ4HC_literalsPrice(llen + rPos); + opt[rPos].mlen = 1; + opt[rPos].off = 0; + opt[rPos].litlen = llen + rPos; + opt[rPos].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + rPos, cost, opt[rPos].litlen); + } } + /* set prices using initial match */ + { int mlen = MINMATCH; + int const matchML = firstMatch.len; /* necessarily < sufficient_len < LZ4_OPT_NUM */ + int const offset = firstMatch.off; + assert(matchML < LZ4_OPT_NUM); + for ( ; mlen <= matchML ; mlen++) { + int const cost = LZ4HC_sequencePrice(llen, mlen); + opt[mlen].mlen = mlen; + opt[mlen].off = offset; + opt[mlen].litlen = llen; + opt[mlen].price = cost; + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i) -- initial setup", + mlen, cost, mlen); + } } + last_match_pos = firstMatch.len; + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i) -- initial setup", + last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + + /* check further positions */ + for (cur = 1; cur < last_match_pos; cur++) { + const BYTE* const curPtr = ip + cur; + LZ4HC_match_t newMatch; + + if (curPtr > mflimit) break; + DEBUGLOG(7, "rPos:%u[%u] vs [%u]%u", + cur, opt[cur].price, opt[cur+1].price, cur+1); + if (fullUpdate) { + /* not useful to search here if next position has same (or lower) cost */ + if ( (opt[cur+1].price <= 
opt[cur].price) + /* in some cases, next position has same cost, but cost rises sharply after, so a small match would still be beneficial */ + && (opt[cur+MINMATCH].price < opt[cur].price + 3/*min seq price*/) ) + continue; + } else { + /* not useful to search here if next position has same (or lower) cost */ + if (opt[cur+1].price <= opt[cur].price) continue; + } + + DEBUGLOG(7, "search at rPos:%u", cur); + if (fullUpdate) + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, MINMATCH-1, nbSearches, dict, favorDecSpeed); + else + /* only test matches of minimum length; slightly faster, but misses a few bytes */ + newMatch = LZ4HC_FindLongerMatch(ctx, curPtr, matchlimit, last_match_pos - cur, nbSearches, dict, favorDecSpeed); + if (!newMatch.len) continue; + + if ( ((size_t)newMatch.len > sufficient_len) + || (newMatch.len + cur >= LZ4_OPT_NUM) ) { + /* immediate encoding */ + best_mlen = newMatch.len; + best_off = newMatch.off; + last_match_pos = cur + 1; + goto encode; + } + + /* before match : set price with literals at beginning */ + { int const baseLitlen = opt[cur].litlen; + int litlen; + for (litlen = 1; litlen < MINMATCH; litlen++) { + int const price = opt[cur].price - LZ4HC_literalsPrice(baseLitlen) + LZ4HC_literalsPrice(baseLitlen+litlen); + int const pos = cur + litlen; + if (price < opt[pos].price) { + opt[pos].mlen = 1; /* literal */ + opt[pos].off = 0; + opt[pos].litlen = baseLitlen+litlen; + opt[pos].price = price; + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", + pos, price, opt[pos].litlen); + } } } + + /* set prices using match at position = cur */ + { int const matchML = newMatch.len; + int ml = MINMATCH; + + assert(cur + newMatch.len < LZ4_OPT_NUM); + for ( ; ml <= matchML ; ml++) { + int const pos = cur + ml; + int const offset = newMatch.off; + int price; + int ll; + DEBUGLOG(7, "testing price rPos %i (last_match_pos=%i)", + pos, last_match_pos); + if (opt[cur].mlen == 1) { + ll = opt[cur].litlen; + price = ((cur > ll) ? 
opt[cur - ll].price : 0) + + LZ4HC_sequencePrice(ll, ml); + } else { + ll = 0; + price = opt[cur].price + LZ4HC_sequencePrice(0, ml); + } + + assert((U32)favorDecSpeed <= 1); + if (pos > last_match_pos+TRAILING_LITERALS + || price <= opt[pos].price - (int)favorDecSpeed) { + DEBUGLOG(7, "rPos:%3i => price:%3i (matchlen=%i)", + pos, price, ml); + assert(pos < LZ4_OPT_NUM); + if ( (ml == matchML) /* last pos of last match */ + && (last_match_pos < pos) ) + last_match_pos = pos; + opt[pos].mlen = ml; + opt[pos].off = offset; + opt[pos].litlen = ll; + opt[pos].price = price; + } } } + /* complete following positions with literals */ + { int addLit; + for (addLit = 1; addLit <= TRAILING_LITERALS; addLit ++) { + opt[last_match_pos+addLit].mlen = 1; /* literal */ + opt[last_match_pos+addLit].off = 0; + opt[last_match_pos+addLit].litlen = addLit; + opt[last_match_pos+addLit].price = opt[last_match_pos].price + LZ4HC_literalsPrice(addLit); + DEBUGLOG(7, "rPos:%3i => price:%3i (litlen=%i)", last_match_pos+addLit, opt[last_match_pos+addLit].price, addLit); + } } + } /* for (cur = 1; cur <= last_match_pos; cur++) */ + + assert(last_match_pos < LZ4_OPT_NUM + TRAILING_LITERALS); + best_mlen = opt[last_match_pos].mlen; + best_off = opt[last_match_pos].off; + cur = last_match_pos - best_mlen; + +encode: /* cur, last_match_pos, best_mlen, best_off must be set */ + assert(cur < LZ4_OPT_NUM); + assert(last_match_pos >= 1); /* == 1 when only one candidate */ + DEBUGLOG(6, "reverse traversal, looking for shortest path (last_match_pos=%i)", last_match_pos); + { int candidate_pos = cur; + int selected_matchLength = best_mlen; + int selected_offset = best_off; + while (1) { /* from end to beginning */ + int const next_matchLength = opt[candidate_pos].mlen; /* can be 1, means literal */ + int const next_offset = opt[candidate_pos].off; + DEBUGLOG(7, "pos %i: sequence length %i", candidate_pos, selected_matchLength); + opt[candidate_pos].mlen = selected_matchLength; + opt[candidate_pos].off = selected_offset; + selected_matchLength = next_matchLength; + selected_offset = next_offset; + if (next_matchLength > candidate_pos) break; /* last match elected, first match to encode */ + assert(next_matchLength > 0); /* can be 1, means literal */ + candidate_pos -= next_matchLength; + } } + + /* encode all recorded sequences in order */ + { int rPos = 0; /* relative position (to ip) */ + while (rPos < last_match_pos) { + int const ml = opt[rPos].mlen; + int const offset = opt[rPos].off; + if (ml == 1) { ip++; rPos++; continue; } /* literal; note: can end up with several literals, in which case, skip them */ + rPos += ml; + assert(ml >= MINMATCH); + assert((offset >= 1) && (offset <= LZ4_DISTANCE_MAX)); + opSaved = op; + if ( LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ml, ip - offset, limit, oend) ) { /* updates ip, op and anchor */ + ovml = ml; + ovref = ip - offset; + goto _dest_overflow; + } } } + } /* while (ip <= mflimit) */ + +_last_literals: + /* Encode Last Literals */ + { size_t lastRunSize = (size_t)(iend - anchor); /* literals */ + size_t llAdd = (lastRunSize + 255 - RUN_MASK) / 255; + size_t const totalSize = 1 + llAdd + lastRunSize; + if (limit == fillOutput) oend += LASTLITERALS; /* restore correct value */ + if (limit && (op + totalSize > oend)) { + if (limit == limitedOutput) { /* Check output limit */ + retval = 0; + goto _return_label; + } + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (size_t)(oend - op) - 1 /*token*/; + llAdd = (lastRunSize + 256 - RUN_MASK) / 256; + lastRunSize -= llAdd; + } 
+ DEBUGLOG(6, "Final literal run : %i literals", (int)lastRunSize); + ip = anchor + lastRunSize; /* can be != iend if limit==fillOutput */ + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + *op++ = (RUN_MASK << ML_BITS); + for(; accumulator >= 255 ; accumulator -= 255) *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRunSize); + op += lastRunSize; + } + + /* End */ + *srcSizePtr = (int) (((const char*)ip) - source); + retval = (int) ((char*)op-dst); + goto _return_label; + +_dest_overflow: +if (limit == fillOutput) { + /* Assumption : ip, anchor, ovml and ovref must be set correctly */ + size_t const ll = (size_t)(ip - anchor); + size_t const ll_addbytes = (ll + 240) / 255; + size_t const ll_totalCost = 1 + ll_addbytes + ll; + BYTE* const maxLitPos = oend - 3; /* 2 for offset, 1 for token */ + DEBUGLOG(6, "Last sequence overflowing (only %i bytes remaining)", (int)(oend-1-opSaved)); + op = opSaved; /* restore correct out pointer */ + if (op + ll_totalCost <= maxLitPos) { + /* ll validated; now adjust match length */ + size_t const bytesLeftForMl = (size_t)(maxLitPos - (op+ll_totalCost)); + size_t const maxMlSize = MINMATCH + (ML_MASK-1) + (bytesLeftForMl * 255); + assert(maxMlSize < INT_MAX); assert(ovml >= 0); + if ((size_t)ovml > maxMlSize) ovml = (int)maxMlSize; + if ((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1 + ovml >= MFLIMIT) { + DEBUGLOG(6, "Space to end : %i + ml (%i)", (int)((oend + LASTLITERALS) - (op + ll_totalCost + 2) - 1), ovml); + DEBUGLOG(6, "Before : ip = %p, anchor = %p", ip, anchor); + LZ4HC_encodeSequence(UPDATABLE(ip, op, anchor), ovml, ovref, notLimited, oend); + DEBUGLOG(6, "After : ip = %p, anchor = %p", ip, anchor); + } } + goto _last_literals; +} +_return_label: +#if defined(LZ4HC_HEAPMODE) && LZ4HC_HEAPMODE==1 + FREEMEM(opt); +#endif + return retval; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.hpp b/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.hpp new file mode 100644 index 000000000..460cbae7f --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/common/tracy_lz4hc.hpp @@ -0,0 +1,405 @@ +/* + LZ4 HC - High Compression Mode of LZ4 + Header File + Copyright (C) 2011-2020, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/lz4/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ +#ifndef TRACY_LZ4_HC_H_19834876238432 +#define TRACY_LZ4_HC_H_19834876238432 + +/* --- Dependency --- */ +/* note : lz4hc requires lz4.h/lz4.c for compilation */ +#include "tracy_lz4.hpp" /* stddef, LZ4LIB_API, LZ4_DEPRECATED */ + + +/* --- Useful constants --- */ +#define LZ4HC_CLEVEL_MIN 3 +#define LZ4HC_CLEVEL_DEFAULT 9 +#define LZ4HC_CLEVEL_OPT_MIN 10 +#define LZ4HC_CLEVEL_MAX 12 + +namespace tracy +{ + +/*-************************************ + * Block Compression + **************************************/ +/*! LZ4_compress_HC() : + * Compress data from `src` into `dst`, using the powerful but slower "HC" algorithm. + * `dst` must be already allocated. + * Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h") + * Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h") + * `compressionLevel` : any value between 1 and LZ4HC_CLEVEL_MAX will work. + * Values > LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX. + * @return : the number of bytes written into 'dst' + * or 0 if compression fails. + */ +LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel); + + +/* Note : + * Decompression functions are provided within "lz4.h" (BSD license) + */ + + +/*! LZ4_compress_HC_extStateHC() : + * Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`. + * `state` size is provided by LZ4_sizeofStateHC(). + * Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() should do properly). + */ +LZ4LIB_API int LZ4_sizeofStateHC(void); +LZ4LIB_API int LZ4_compress_HC_extStateHC(void* stateHC, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel); + + +/*! LZ4_compress_HC_destSize() : v1.9.0+ + * Will compress as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided in 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr is updated to indicate how much bytes were read from `src` + */ +LZ4LIB_API int LZ4_compress_HC_destSize(void* stateHC, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize, + int compressionLevel); + + +/*-************************************ + * Streaming Compression + * Bufferless synchronous API + **************************************/ + typedef union LZ4_streamHC_u LZ4_streamHC_t; /* incomplete type (defined later) */ + +/*! LZ4_createStreamHC() and LZ4_freeStreamHC() : + * These functions create and release memory for LZ4 HC streaming state. + * Newly created states are automatically initialized. + * A same state can be used multiple times consecutively, + * starting with LZ4_resetStreamHC_fast() to start a new stream of blocks. 
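 *
 * A minimal lifecycle sketch (error handling trimmed; identifiers other
 * than the API calls are illustrative only) :
 *
 *     LZ4_streamHC_t* const s = LZ4_createStreamHC();
 *     if (s != NULL) {
 *         LZ4_resetStreamHC_fast(s, LZ4HC_CLEVEL_DEFAULT);  // stream of blocks #1
 *         // ... LZ4_compress_HC_continue() calls for stream #1 ...
 *         LZ4_resetStreamHC_fast(s, LZ4HC_CLEVEL_MAX);      // reuse state for stream #2
 *         // ... LZ4_compress_HC_continue() calls for stream #2 ...
 *         LZ4_freeStreamHC(s);
 *     }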
+ */ +LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void); +LZ4LIB_API int LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr); + +/* + These functions compress data in successive blocks of any size, + using previous blocks as dictionary, to improve compression ratio. + One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks. + There is an exception for ring buffers, which can be smaller than 64 KB. + Ring-buffer scenario is automatically detected and handled within LZ4_compress_HC_continue(). + + Before starting compression, state must be allocated and properly initialized. + LZ4_createStreamHC() does both, though compression level is set to LZ4HC_CLEVEL_DEFAULT. + + Selecting the compression level can be done with LZ4_resetStreamHC_fast() (starts a new stream) + or LZ4_setCompressionLevel() (anytime, between blocks in the same stream) (experimental). + LZ4_resetStreamHC_fast() only works on states which have been properly initialized at least once, + which is automatically the case when state is created using LZ4_createStreamHC(). + + After reset, a first "fictional block" can be designated as initial dictionary, + using LZ4_loadDictHC() (Optional). + + Invoke LZ4_compress_HC_continue() to compress each successive block. + The number of blocks is unlimited. + Previous input blocks, including initial dictionary when present, + must remain accessible and unmodified during compression. + + It's allowed to update compression level anytime between blocks, + using LZ4_setCompressionLevel() (experimental). + + 'dst' buffer should be sized to handle worst case scenarios + (see LZ4_compressBound(), it ensures compression success). + In case of failure, the API does not guarantee recovery, + so the state _must_ be reset. + To ensure compression success + whenever `dst` buffer size cannot be made >= LZ4_compressBound(), + consider using LZ4_compress_HC_continue_destSize(). + + Whenever previous input blocks can't be preserved unmodified in-place during compression of next blocks, + it's possible to copy the last blocks into a more stable memory space, using LZ4_saveDictHC(). + Return value of LZ4_saveDictHC() is the size of dictionary effectively saved into 'safeBuffer' (<= 64 KB) + + After completing a streaming compression, + it's possible to start a new stream of blocks, using the same LZ4_streamHC_t state, + just by resetting it, using LZ4_resetStreamHC_fast(). +*/ + +LZ4LIB_API void LZ4_resetStreamHC_fast(LZ4_streamHC_t* streamHCPtr, int compressionLevel); /* v1.9.0+ */ +LZ4LIB_API int LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize); + +LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr, + const char* src, char* dst, + int srcSize, int maxDstSize); + +/*! LZ4_compress_HC_continue_destSize() : v1.9.0+ + * Similar to LZ4_compress_HC_continue(), + * but will read as much data as possible from `src` + * to fit into `targetDstSize` budget. + * Result is provided into 2 parts : + * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize) + * or 0 if compression fails. + * `srcSizePtr` : on success, *srcSizePtr will be updated to indicate how much bytes were read from `src`. + * Note that this function may not consume the entire input. 
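 *
 * One possible consumption loop (a sketch only : `stream`, `src`, `dst`,
 * `srcSize` and `dstCapacity` are assumed to be set up by the caller, and
 * consume_block() is a hypothetical delivery function, not part of the API) :
 *
 *     int done = 0;
 *     while (done < srcSize) {
 *         int chunk = srcSize - done;
 *         int const written = LZ4_compress_HC_continue_destSize(stream,
 *                                   src + done, dst, &chunk, dstCapacity);
 *         if (written == 0) break;          // compression error
 *         consume_block(dst, written);      // hypothetical: deliver compressed block
 *         done += chunk;                    // chunk now holds bytes actually read
 *     }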
+ */ +LZ4LIB_API int LZ4_compress_HC_continue_destSize(LZ4_streamHC_t* LZ4_streamHCPtr, + const char* src, char* dst, + int* srcSizePtr, int targetDstSize); + +LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize); + + + +/*^********************************************** + * !!!!!! STATIC LINKING ONLY !!!!!! + ***********************************************/ + +/*-****************************************************************** + * PRIVATE DEFINITIONS : + * Do not use these definitions directly. + * They are merely exposed to allow static allocation of `LZ4_streamHC_t`. + * Declare an `LZ4_streamHC_t` directly, rather than any type below. + * Even then, only do so in the context of static linking, as definitions may change between versions. + ********************************************************************/ + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<= LZ4HC_CLEVEL_OPT_MIN. + */ +LZ4LIB_STATIC_API void LZ4_favorDecompressionSpeed( + LZ4_streamHC_t* LZ4_streamHCPtr, int favor); + +/*! LZ4_resetStreamHC_fast() : v1.9.0+ + * When an LZ4_streamHC_t is known to be in a internally coherent state, + * it can often be prepared for a new compression with almost no work, only + * sometimes falling back to the full, expensive reset that is always required + * when the stream is in an indeterminate state (i.e., the reset performed by + * LZ4_resetStreamHC()). + * + * LZ4_streamHCs are guaranteed to be in a valid state when: + * - returned from LZ4_createStreamHC() + * - reset by LZ4_resetStreamHC() + * - memset(stream, 0, sizeof(LZ4_streamHC_t)) + * - the stream was in a valid state and was reset by LZ4_resetStreamHC_fast() + * - the stream was in a valid state and was then used in any compression call + * that returned success + * - the stream was in an indeterminate state and was used in a compression + * call that fully reset the state (LZ4_compress_HC_extStateHC()) and that + * returned success + * + * Note: + * A stream that was last used in a compression call that returned an error + * may be passed to this function. However, it will be fully reset, which will + * clear any existing history and settings from the context. + */ +LZ4LIB_STATIC_API void LZ4_resetStreamHC_fast( + LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel); + +/*! LZ4_compress_HC_extStateHC_fastReset() : + * A variant of LZ4_compress_HC_extStateHC(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStreamHC_fast() for a definition of + * "correctly initialized"). From a high level, the difference is that this + * function initializes the provided state with a call to + * LZ4_resetStreamHC_fast() while LZ4_compress_HC_extStateHC() starts with a + * call to LZ4_resetStreamHC(). + */ +LZ4LIB_STATIC_API int LZ4_compress_HC_extStateHC_fastReset ( + void* state, + const char* src, char* dst, + int srcSize, int dstCapacity, + int compressionLevel); + +/*! LZ4_attach_HC_dictionary() : + * This is an experimental API that allows for the efficient use of a + * static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_streamHC_t into a + * working LZ4_streamHC_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. 
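 *
 * A sketch of the intended pattern (identifiers other than the API calls
 * are illustrative; dictBuffer must stay valid for the whole session) :
 *
 *     LZ4_streamHC_t* const dictStream = LZ4_createStreamHC();
 *     LZ4_loadDictHC(dictStream, dictBuffer, dictSize);      // prepare once
 *     for (size_t i = 0; i < nbJobs; i++) {
 *         LZ4_resetStreamHC_fast(workStream, cLevel);        // fresh history
 *         LZ4_attach_HC_dictionary(workStream, dictStream);  // no-copy attach
 *         // ... LZ4_compress_HC_continue() calls on job i ...
 *     }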
+ * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDictHC() should + * be expected to work. + * + * Alternatively, the provided dictionary stream pointer may be NULL, in which + * case any existing dictionary stream is unset. + * + * A dictionary should only be attached to a stream without any history (i.e., + * a stream that has just been reset). + * + * The dictionary will remain attached to the working stream only for the + * current stream session. Calls to LZ4_resetStreamHC(_fast) will remove the + * dictionary context association from the working stream. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the lifetime of the stream session. + */ +LZ4LIB_STATIC_API void LZ4_attach_HC_dictionary( + LZ4_streamHC_t *working_stream, + const LZ4_streamHC_t *dictionary_stream); + +} + +#endif /* LZ4_HC_SLO_098092834 */ +#endif /* LZ4_HC_STATIC_LINKING_ONLY */ diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/LICENSE b/project/thirdparty/tracy-0.11.1/libbacktrace/LICENSE new file mode 100644 index 000000000..097d2774e --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/LICENSE @@ -0,0 +1,29 @@ +# Copyright (C) 2012-2016 Free Software Foundation, Inc. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# (1) Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# (2) Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. + +# (3) The name of the author may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/alloc.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/alloc.cpp new file mode 100644 index 000000000..a365a4860 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/alloc.cpp @@ -0,0 +1,174 @@ +/* alloc.c -- Memory allocation without mmap. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include + +#include "backtrace.hpp" +#include "internal.hpp" + +#include "../common/TracyAlloc.hpp" + +namespace tracy +{ + +/* Allocation routines to use on systems that do not support anonymous + mmap. This implementation just uses malloc, which means that the + backtrace functions may not be safely invoked from a signal + handler. */ + +/* Allocate memory like malloc. If ERROR_CALLBACK is NULL, don't + report an error. */ + +void * +backtrace_alloc (struct backtrace_state *state ATTRIBUTE_UNUSED, + size_t size, backtrace_error_callback error_callback, + void *data) +{ + void *ret; + + ret = tracy_malloc (size); + if (ret == NULL) + { + if (error_callback) + error_callback (data, "malloc", errno); + } + return ret; +} + +/* Free memory. */ + +void +backtrace_free (struct backtrace_state *state ATTRIBUTE_UNUSED, + void *p, size_t size ATTRIBUTE_UNUSED, + backtrace_error_callback error_callback ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + tracy_free (p); +} + +/* Grow VEC by SIZE bytes. */ + +void * +backtrace_vector_grow (struct backtrace_state *state ATTRIBUTE_UNUSED, + size_t size, backtrace_error_callback error_callback, + void *data, struct backtrace_vector *vec) +{ + void *ret; + + if (size > vec->alc) + { + size_t alc; + void *base; + + if (vec->size == 0) + alc = 32 * size; + else if (vec->size >= 4096) + alc = vec->size + 4096; + else + alc = 2 * vec->size; + + if (alc < vec->size + size) + alc = vec->size + size; + + base = tracy_realloc (vec->base, alc); + if (base == NULL) + { + error_callback (data, "realloc", errno); + return NULL; + } + + vec->base = base; + vec->alc = alc - vec->size; + } + + ret = (char *) vec->base + vec->size; + vec->size += size; + vec->alc -= size; + return ret; +} + +/* Finish the current allocation on VEC. */ + +void * +backtrace_vector_finish (struct backtrace_state *state, + struct backtrace_vector *vec, + backtrace_error_callback error_callback, + void *data) +{ + void *ret; + + /* With this allocator we call realloc in backtrace_vector_grow, + which means we can't easily reuse the memory here. So just + release it. */ + if (!backtrace_vector_release (state, vec, error_callback, data)) + return NULL; + ret = vec->base; + vec->base = NULL; + vec->size = 0; + vec->alc = 0; + return ret; +} + +/* Release any extra space allocated for VEC. 
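   The usual lifecycle is grow, fill, then finish or release.  A sketch,
   assuming `state`, `error_callback`, `data`, `bytes` and `len` are
   provided by the caller :

     struct backtrace_vector vec;
     memset (&vec, 0, sizeof vec);
     void *slot = backtrace_vector_grow (state, len, error_callback,
                                         data, &vec);
     if (slot != NULL)
       {
         memcpy (slot, bytes, len);        // fill the newly reserved space
         void *buf = backtrace_vector_finish (state, &vec, error_callback,
                                              data);
         // buf now holds the bytes; vec has been reset for reuse
       }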
*/ + +int +backtrace_vector_release (struct backtrace_state *state ATTRIBUTE_UNUSED, + struct backtrace_vector *vec, + backtrace_error_callback error_callback, + void *data) +{ + vec->alc = 0; + + if (vec->size == 0) + { + /* As of C17, realloc with size 0 is marked as an obsolescent feature, use + free instead. */ + tracy_free (vec->base); + vec->base = NULL; + return 1; + } + + vec->base = tracy_realloc (vec->base, vec->size); + if (vec->base == NULL) + { + error_callback (data, "realloc", errno); + return 0; + } + + return 1; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/backtrace.hpp b/project/thirdparty/tracy-0.11.1/libbacktrace/backtrace.hpp new file mode 100644 index 000000000..e4be297a9 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/backtrace.hpp @@ -0,0 +1,186 @@ +/* backtrace.h -- Public header file for stack backtrace library. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#ifndef BACKTRACE_H +#define BACKTRACE_H + +#include +#include +#include + +namespace tracy +{ + +/* The backtrace state. This struct is intentionally not defined in + the public interface. */ + +struct backtrace_state; + +/* The type of the error callback argument to backtrace functions. + This function, if not NULL, will be called for certain error cases. + The DATA argument is passed to the function that calls this one. + The MSG argument is an error message. The ERRNUM argument, if + greater than 0, holds an errno value. The MSG buffer may become + invalid after this function returns. + + As a special case, the ERRNUM argument will be passed as -1 if no + debug info can be found for the executable, or if the debug info + exists but has an unsupported version, but the function requires + debug info (e.g., backtrace_full, backtrace_pcinfo). The MSG in + this case will be something along the lines of "no debug info". + Similarly, ERRNUM will be passed as -1 if there is no symbol table, + but the function requires a symbol table (e.g., backtrace_syminfo). + This may be used as a signal that some other approach should be + tried. 
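   For illustration, a callback along these lines (names are examples
   only) distinguishes the cases described above :

     static void
     example_error_cb (void *data, const char *msg, int errnum)
     {
       (void) data;
       if (errnum == -1)
         fprintf (stderr, "no debug info: %s\n", msg);   // try another approach
       else if (errnum > 0)
         fprintf (stderr, "%s (errno %d)\n", msg, errnum);
       else
         fprintf (stderr, "%s\n", msg);
     }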
*/ + +typedef void (*backtrace_error_callback) (void *data, const char *msg, + int errnum); + +/* Create state information for the backtrace routines. This must be + called before any of the other routines, and its return value must + be passed to all of the other routines. FILENAME is the path name + of the executable file; if it is NULL the library will try + system-specific path names. If not NULL, FILENAME must point to a + permanent buffer. If THREADED is non-zero the state may be + accessed by multiple threads simultaneously, and the library will + use appropriate atomic operations. If THREADED is zero the state + may only be accessed by one thread at a time. This returns a state + pointer on success, NULL on error. If an error occurs, this will + call the ERROR_CALLBACK routine. + + Calling this function allocates resources that cannot be freed. + There is no backtrace_free_state function. The state is used to + cache information that is expensive to recompute. Programs are + expected to call this function at most once and to save the return + value for all later calls to backtrace functions. */ + +extern struct backtrace_state *backtrace_create_state ( + const char *filename, int threaded, + backtrace_error_callback error_callback, void *data); + +/* The type of the callback argument to the backtrace_full function. + DATA is the argument passed to backtrace_full. PC is the program + counter. FILENAME is the name of the file containing PC, or NULL + if not available. LINENO is the line number in FILENAME containing + PC, or 0 if not available. FUNCTION is the name of the function + containing PC, or NULL if not available. This should return 0 to + continuing tracing. The FILENAME and FUNCTION buffers may become + invalid after this function returns. */ + +typedef int (*backtrace_full_callback) (void *data, uintptr_t pc, uintptr_t lowaddr, + const char *filename, int lineno, + const char *function); + +/* Get a full stack backtrace. SKIP is the number of frames to skip; + passing 0 will start the trace with the function calling + backtrace_full. DATA is passed to the callback routine. If any + call to CALLBACK returns a non-zero value, the stack backtrace + stops, and backtrace returns that value; this may be used to limit + the number of stack frames desired. If all calls to CALLBACK + return 0, backtrace returns 0. The backtrace_full function will + make at least one call to either CALLBACK or ERROR_CALLBACK. This + function requires debug info for the executable. */ + +extern int backtrace_full (struct backtrace_state *state, int skip, + backtrace_full_callback callback, + backtrace_error_callback error_callback, + void *data); + +/* The type of the callback argument to the backtrace_simple function. + DATA is the argument passed to simple_backtrace. PC is the program + counter. This should return 0 to continue tracing. */ + +typedef int (*backtrace_simple_callback) (void *data, uintptr_t pc); + +/* Get a simple backtrace. SKIP is the number of frames to skip, as + in backtrace. DATA is passed to the callback routine. If any call + to CALLBACK returns a non-zero value, the stack backtrace stops, + and backtrace_simple returns that value. Otherwise + backtrace_simple returns 0. The backtrace_simple function will + make at least one call to either CALLBACK or ERROR_CALLBACK. This + function does not require any debug info for the executable. 
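   For instance, raw PC values can be collected into a fixed-size array
   (a sketch; the struct and names are illustrative) :

     struct pc_list { uintptr_t pc[64]; int count; };

     static int
     collect_pc (void *data, uintptr_t pc)
     {
       struct pc_list *l = (struct pc_list *) data;
       if (l->count >= 64)
         return 1;                        // non-zero stops the trace
       l->pc[l->count++] = pc;
       return 0;                          // keep tracing
     }

     // usage: backtrace_simple (state, 0, collect_pc, error_cb, &list);
     // where error_cb is any backtrace_error_callback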
*/ + +extern int backtrace_simple (struct backtrace_state *state, int skip, + backtrace_simple_callback callback, + backtrace_error_callback error_callback, + void *data); + +/* Print the current backtrace in a user readable format to a FILE. + SKIP is the number of frames to skip, as in backtrace_full. Any + error messages are printed to stderr. This function requires debug + info for the executable. */ + +extern void backtrace_print (struct backtrace_state *state, int skip, FILE *); + +/* Given PC, a program counter in the current program, call the + callback function with filename, line number, and function name + information. This will normally call the callback function exactly + once. However, if the PC happens to describe an inlined call, and + the debugging information contains the necessary information, then + this may call the callback function multiple times. This will make + at least one call to either CALLBACK or ERROR_CALLBACK. This + returns the first non-zero value returned by CALLBACK, or 0. */ + +extern int backtrace_pcinfo (struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, + backtrace_error_callback error_callback, + void *data); + +/* The type of the callback argument to backtrace_syminfo. DATA and + PC are the arguments passed to backtrace_syminfo. SYMNAME is the + name of the symbol for the corresponding code. SYMVAL is the + value and SYMSIZE is the size of the symbol. SYMNAME will be NULL + if no error occurred but the symbol could not be found. */ + +typedef void (*backtrace_syminfo_callback) (void *data, uintptr_t pc, + const char *symname, + uintptr_t symval, + uintptr_t symsize); + +/* Given ADDR, an address or program counter in the current program, + call the callback information with the symbol name and value + describing the function or variable in which ADDR may be found. + This will call either CALLBACK or ERROR_CALLBACK exactly once. + This returns 1 on success, 0 on failure. This function requires + the symbol table but does not require the debug info. Note that if + the symbol table is present but ADDR could not be found in the + table, CALLBACK will be called with a NULL SYMNAME argument. + Returns 1 on success, 0 on error. 
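   A callback for this function might therefore look like this
   (illustrative names; note the NULL SYMNAME case) :

     static void
     print_sym (void *data, uintptr_t pc, const char *symname,
                uintptr_t symval, uintptr_t symsize)
     {
       (void) data; (void) symval;
       if (symname == NULL)
         printf ("%#lx: symbol not found\n", (unsigned long) pc);
       else
         printf ("%#lx: %s (size %lu)\n", (unsigned long) pc,
                 symname, (unsigned long) symsize);
     }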
*/ + +extern int backtrace_syminfo (struct backtrace_state *state, uintptr_t addr, + backtrace_syminfo_callback callback, + backtrace_error_callback error_callback, + void *data); + +} + +#endif diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/config.h b/project/thirdparty/tracy-0.11.1/libbacktrace/config.h new file mode 100644 index 000000000..87e38a95b --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/config.h @@ -0,0 +1,26 @@ +#include +#if defined(__linux__) && !defined(__GLIBC__) && !defined(__WORDSIZE) +// include __WORDSIZE headers for musl +# include +#endif +#if __WORDSIZE == 64 +# define BACKTRACE_ELF_SIZE 64 +#else +# define BACKTRACE_ELF_SIZE 32 +#endif + +#define HAVE_DLFCN_H 1 +#define HAVE_FCNTL 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_LSTAT 1 +#define HAVE_READLINK 1 +#define HAVE_DL_ITERATE_PHDR 1 +#define HAVE_ATOMIC_FUNCTIONS 1 +#define HAVE_DECL_STRNLEN 1 + +#ifdef __APPLE__ +# define HAVE_MACH_O_DYLD_H 1 +#elif defined BSD +# define HAVE_KERN_PROC 1 +# define HAVE_KERN_PROC_ARGS 1 +#endif diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/dwarf.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/dwarf.cpp new file mode 100644 index 000000000..52fa8a8d2 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/dwarf.cpp @@ -0,0 +1,4455 @@ +/* dwarf.c -- Get file/line information from DWARF for backtraces. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include +#include + +#include "filenames.hpp" + +#include "backtrace.hpp" +#include "internal.hpp" + +namespace tracy +{ + +/* DWARF constants. 
*/ + +enum dwarf_tag { + DW_TAG_entry_point = 0x3, + DW_TAG_compile_unit = 0x11, + DW_TAG_inlined_subroutine = 0x1d, + DW_TAG_subprogram = 0x2e, + DW_TAG_skeleton_unit = 0x4a, +}; + +enum dwarf_form { + DW_FORM_addr = 0x01, + DW_FORM_block2 = 0x03, + DW_FORM_block4 = 0x04, + DW_FORM_data2 = 0x05, + DW_FORM_data4 = 0x06, + DW_FORM_data8 = 0x07, + DW_FORM_string = 0x08, + DW_FORM_block = 0x09, + DW_FORM_block1 = 0x0a, + DW_FORM_data1 = 0x0b, + DW_FORM_flag = 0x0c, + DW_FORM_sdata = 0x0d, + DW_FORM_strp = 0x0e, + DW_FORM_udata = 0x0f, + DW_FORM_ref_addr = 0x10, + DW_FORM_ref1 = 0x11, + DW_FORM_ref2 = 0x12, + DW_FORM_ref4 = 0x13, + DW_FORM_ref8 = 0x14, + DW_FORM_ref_udata = 0x15, + DW_FORM_indirect = 0x16, + DW_FORM_sec_offset = 0x17, + DW_FORM_exprloc = 0x18, + DW_FORM_flag_present = 0x19, + DW_FORM_ref_sig8 = 0x20, + DW_FORM_strx = 0x1a, + DW_FORM_addrx = 0x1b, + DW_FORM_ref_sup4 = 0x1c, + DW_FORM_strp_sup = 0x1d, + DW_FORM_data16 = 0x1e, + DW_FORM_line_strp = 0x1f, + DW_FORM_implicit_const = 0x21, + DW_FORM_loclistx = 0x22, + DW_FORM_rnglistx = 0x23, + DW_FORM_ref_sup8 = 0x24, + DW_FORM_strx1 = 0x25, + DW_FORM_strx2 = 0x26, + DW_FORM_strx3 = 0x27, + DW_FORM_strx4 = 0x28, + DW_FORM_addrx1 = 0x29, + DW_FORM_addrx2 = 0x2a, + DW_FORM_addrx3 = 0x2b, + DW_FORM_addrx4 = 0x2c, + DW_FORM_GNU_addr_index = 0x1f01, + DW_FORM_GNU_str_index = 0x1f02, + DW_FORM_GNU_ref_alt = 0x1f20, + DW_FORM_GNU_strp_alt = 0x1f21 +}; + +enum dwarf_attribute { + DW_AT_sibling = 0x01, + DW_AT_location = 0x02, + DW_AT_name = 0x03, + DW_AT_ordering = 0x09, + DW_AT_subscr_data = 0x0a, + DW_AT_byte_size = 0x0b, + DW_AT_bit_offset = 0x0c, + DW_AT_bit_size = 0x0d, + DW_AT_element_list = 0x0f, + DW_AT_stmt_list = 0x10, + DW_AT_low_pc = 0x11, + DW_AT_high_pc = 0x12, + DW_AT_language = 0x13, + DW_AT_member = 0x14, + DW_AT_discr = 0x15, + DW_AT_discr_value = 0x16, + DW_AT_visibility = 0x17, + DW_AT_import = 0x18, + DW_AT_string_length = 0x19, + DW_AT_common_reference = 0x1a, + DW_AT_comp_dir = 0x1b, + DW_AT_const_value = 0x1c, + DW_AT_containing_type = 0x1d, + DW_AT_default_value = 0x1e, + DW_AT_inline = 0x20, + DW_AT_is_optional = 0x21, + DW_AT_lower_bound = 0x22, + DW_AT_producer = 0x25, + DW_AT_prototyped = 0x27, + DW_AT_return_addr = 0x2a, + DW_AT_start_scope = 0x2c, + DW_AT_bit_stride = 0x2e, + DW_AT_upper_bound = 0x2f, + DW_AT_abstract_origin = 0x31, + DW_AT_accessibility = 0x32, + DW_AT_address_class = 0x33, + DW_AT_artificial = 0x34, + DW_AT_base_types = 0x35, + DW_AT_calling_convention = 0x36, + DW_AT_count = 0x37, + DW_AT_data_member_location = 0x38, + DW_AT_decl_column = 0x39, + DW_AT_decl_file = 0x3a, + DW_AT_decl_line = 0x3b, + DW_AT_declaration = 0x3c, + DW_AT_discr_list = 0x3d, + DW_AT_encoding = 0x3e, + DW_AT_external = 0x3f, + DW_AT_frame_base = 0x40, + DW_AT_friend = 0x41, + DW_AT_identifier_case = 0x42, + DW_AT_macro_info = 0x43, + DW_AT_namelist_items = 0x44, + DW_AT_priority = 0x45, + DW_AT_segment = 0x46, + DW_AT_specification = 0x47, + DW_AT_static_link = 0x48, + DW_AT_type = 0x49, + DW_AT_use_location = 0x4a, + DW_AT_variable_parameter = 0x4b, + DW_AT_virtuality = 0x4c, + DW_AT_vtable_elem_location = 0x4d, + DW_AT_allocated = 0x4e, + DW_AT_associated = 0x4f, + DW_AT_data_location = 0x50, + DW_AT_byte_stride = 0x51, + DW_AT_entry_pc = 0x52, + DW_AT_use_UTF8 = 0x53, + DW_AT_extension = 0x54, + DW_AT_ranges = 0x55, + DW_AT_trampoline = 0x56, + DW_AT_call_column = 0x57, + DW_AT_call_file = 0x58, + DW_AT_call_line = 0x59, + DW_AT_description = 0x5a, + DW_AT_binary_scale = 0x5b, + DW_AT_decimal_scale = 0x5c, + 
DW_AT_small = 0x5d, + DW_AT_decimal_sign = 0x5e, + DW_AT_digit_count = 0x5f, + DW_AT_picture_string = 0x60, + DW_AT_mutable = 0x61, + DW_AT_threads_scaled = 0x62, + DW_AT_explicit = 0x63, + DW_AT_object_pointer = 0x64, + DW_AT_endianity = 0x65, + DW_AT_elemental = 0x66, + DW_AT_pure = 0x67, + DW_AT_recursive = 0x68, + DW_AT_signature = 0x69, + DW_AT_main_subprogram = 0x6a, + DW_AT_data_bit_offset = 0x6b, + DW_AT_const_expr = 0x6c, + DW_AT_enum_class = 0x6d, + DW_AT_linkage_name = 0x6e, + DW_AT_string_length_bit_size = 0x6f, + DW_AT_string_length_byte_size = 0x70, + DW_AT_rank = 0x71, + DW_AT_str_offsets_base = 0x72, + DW_AT_addr_base = 0x73, + DW_AT_rnglists_base = 0x74, + DW_AT_dwo_name = 0x76, + DW_AT_reference = 0x77, + DW_AT_rvalue_reference = 0x78, + DW_AT_macros = 0x79, + DW_AT_call_all_calls = 0x7a, + DW_AT_call_all_source_calls = 0x7b, + DW_AT_call_all_tail_calls = 0x7c, + DW_AT_call_return_pc = 0x7d, + DW_AT_call_value = 0x7e, + DW_AT_call_origin = 0x7f, + DW_AT_call_parameter = 0x80, + DW_AT_call_pc = 0x81, + DW_AT_call_tail_call = 0x82, + DW_AT_call_target = 0x83, + DW_AT_call_target_clobbered = 0x84, + DW_AT_call_data_location = 0x85, + DW_AT_call_data_value = 0x86, + DW_AT_noreturn = 0x87, + DW_AT_alignment = 0x88, + DW_AT_export_symbols = 0x89, + DW_AT_deleted = 0x8a, + DW_AT_defaulted = 0x8b, + DW_AT_loclists_base = 0x8c, + DW_AT_lo_user = 0x2000, + DW_AT_hi_user = 0x3fff, + DW_AT_MIPS_fde = 0x2001, + DW_AT_MIPS_loop_begin = 0x2002, + DW_AT_MIPS_tail_loop_begin = 0x2003, + DW_AT_MIPS_epilog_begin = 0x2004, + DW_AT_MIPS_loop_unroll_factor = 0x2005, + DW_AT_MIPS_software_pipeline_depth = 0x2006, + DW_AT_MIPS_linkage_name = 0x2007, + DW_AT_MIPS_stride = 0x2008, + DW_AT_MIPS_abstract_name = 0x2009, + DW_AT_MIPS_clone_origin = 0x200a, + DW_AT_MIPS_has_inlines = 0x200b, + DW_AT_HP_block_index = 0x2000, + DW_AT_HP_unmodifiable = 0x2001, + DW_AT_HP_prologue = 0x2005, + DW_AT_HP_epilogue = 0x2008, + DW_AT_HP_actuals_stmt_list = 0x2010, + DW_AT_HP_proc_per_section = 0x2011, + DW_AT_HP_raw_data_ptr = 0x2012, + DW_AT_HP_pass_by_reference = 0x2013, + DW_AT_HP_opt_level = 0x2014, + DW_AT_HP_prof_version_id = 0x2015, + DW_AT_HP_opt_flags = 0x2016, + DW_AT_HP_cold_region_low_pc = 0x2017, + DW_AT_HP_cold_region_high_pc = 0x2018, + DW_AT_HP_all_variables_modifiable = 0x2019, + DW_AT_HP_linkage_name = 0x201a, + DW_AT_HP_prof_flags = 0x201b, + DW_AT_HP_unit_name = 0x201f, + DW_AT_HP_unit_size = 0x2020, + DW_AT_HP_widened_byte_size = 0x2021, + DW_AT_HP_definition_points = 0x2022, + DW_AT_HP_default_location = 0x2023, + DW_AT_HP_is_result_param = 0x2029, + DW_AT_sf_names = 0x2101, + DW_AT_src_info = 0x2102, + DW_AT_mac_info = 0x2103, + DW_AT_src_coords = 0x2104, + DW_AT_body_begin = 0x2105, + DW_AT_body_end = 0x2106, + DW_AT_GNU_vector = 0x2107, + DW_AT_GNU_guarded_by = 0x2108, + DW_AT_GNU_pt_guarded_by = 0x2109, + DW_AT_GNU_guarded = 0x210a, + DW_AT_GNU_pt_guarded = 0x210b, + DW_AT_GNU_locks_excluded = 0x210c, + DW_AT_GNU_exclusive_locks_required = 0x210d, + DW_AT_GNU_shared_locks_required = 0x210e, + DW_AT_GNU_odr_signature = 0x210f, + DW_AT_GNU_template_name = 0x2110, + DW_AT_GNU_call_site_value = 0x2111, + DW_AT_GNU_call_site_data_value = 0x2112, + DW_AT_GNU_call_site_target = 0x2113, + DW_AT_GNU_call_site_target_clobbered = 0x2114, + DW_AT_GNU_tail_call = 0x2115, + DW_AT_GNU_all_tail_call_sites = 0x2116, + DW_AT_GNU_all_call_sites = 0x2117, + DW_AT_GNU_all_source_call_sites = 0x2118, + DW_AT_GNU_macros = 0x2119, + DW_AT_GNU_deleted = 0x211a, + DW_AT_GNU_dwo_name = 0x2130, + DW_AT_GNU_dwo_id 
= 0x2131, + DW_AT_GNU_ranges_base = 0x2132, + DW_AT_GNU_addr_base = 0x2133, + DW_AT_GNU_pubnames = 0x2134, + DW_AT_GNU_pubtypes = 0x2135, + DW_AT_GNU_discriminator = 0x2136, + DW_AT_GNU_locviews = 0x2137, + DW_AT_GNU_entry_view = 0x2138, + DW_AT_VMS_rtnbeg_pd_address = 0x2201, + DW_AT_use_GNAT_descriptive_type = 0x2301, + DW_AT_GNAT_descriptive_type = 0x2302, + DW_AT_GNU_numerator = 0x2303, + DW_AT_GNU_denominator = 0x2304, + DW_AT_GNU_bias = 0x2305, + DW_AT_upc_threads_scaled = 0x3210, + DW_AT_PGI_lbase = 0x3a00, + DW_AT_PGI_soffset = 0x3a01, + DW_AT_PGI_lstride = 0x3a02, + DW_AT_APPLE_optimized = 0x3fe1, + DW_AT_APPLE_flags = 0x3fe2, + DW_AT_APPLE_isa = 0x3fe3, + DW_AT_APPLE_block = 0x3fe4, + DW_AT_APPLE_major_runtime_vers = 0x3fe5, + DW_AT_APPLE_runtime_class = 0x3fe6, + DW_AT_APPLE_omit_frame_ptr = 0x3fe7, + DW_AT_APPLE_property_name = 0x3fe8, + DW_AT_APPLE_property_getter = 0x3fe9, + DW_AT_APPLE_property_setter = 0x3fea, + DW_AT_APPLE_property_attribute = 0x3feb, + DW_AT_APPLE_objc_complete_type = 0x3fec, + DW_AT_APPLE_property = 0x3fed +}; + +enum dwarf_line_number_op { + DW_LNS_extended_op = 0x0, + DW_LNS_copy = 0x1, + DW_LNS_advance_pc = 0x2, + DW_LNS_advance_line = 0x3, + DW_LNS_set_file = 0x4, + DW_LNS_set_column = 0x5, + DW_LNS_negate_stmt = 0x6, + DW_LNS_set_basic_block = 0x7, + DW_LNS_const_add_pc = 0x8, + DW_LNS_fixed_advance_pc = 0x9, + DW_LNS_set_prologue_end = 0xa, + DW_LNS_set_epilogue_begin = 0xb, + DW_LNS_set_isa = 0xc, +}; + +enum dwarf_extended_line_number_op { + DW_LNE_end_sequence = 0x1, + DW_LNE_set_address = 0x2, + DW_LNE_define_file = 0x3, + DW_LNE_set_discriminator = 0x4, +}; + +enum dwarf_line_number_content_type { + DW_LNCT_path = 0x1, + DW_LNCT_directory_index = 0x2, + DW_LNCT_timestamp = 0x3, + DW_LNCT_size = 0x4, + DW_LNCT_MD5 = 0x5, + DW_LNCT_lo_user = 0x2000, + DW_LNCT_hi_user = 0x3fff +}; + +enum dwarf_range_list_entry { + DW_RLE_end_of_list = 0x00, + DW_RLE_base_addressx = 0x01, + DW_RLE_startx_endx = 0x02, + DW_RLE_startx_length = 0x03, + DW_RLE_offset_pair = 0x04, + DW_RLE_base_address = 0x05, + DW_RLE_start_end = 0x06, + DW_RLE_start_length = 0x07 +}; + +enum dwarf_unit_type { + DW_UT_compile = 0x01, + DW_UT_type = 0x02, + DW_UT_partial = 0x03, + DW_UT_skeleton = 0x04, + DW_UT_split_compile = 0x05, + DW_UT_split_type = 0x06, + DW_UT_lo_user = 0x80, + DW_UT_hi_user = 0xff +}; + +#if !defined(HAVE_DECL_STRNLEN) || !HAVE_DECL_STRNLEN + +/* If strnlen is not declared, provide our own version. */ + +static size_t +xstrnlen (const char *s, size_t maxlen) +{ + size_t i; + + for (i = 0; i < maxlen; ++i) + if (s[i] == '\0') + break; + return i; +} + +#define strnlen xstrnlen + +#endif + +/* A buffer to read DWARF info. */ + +struct dwarf_buf +{ + /* Buffer name for error messages. */ + const char *name; + /* Start of the buffer. */ + const unsigned char *start; + /* Next byte to read. */ + const unsigned char *buf; + /* The number of bytes remaining. */ + size_t left; + /* Whether the data is big-endian. */ + int is_bigendian; + /* Error callback routine. */ + backtrace_error_callback error_callback; + /* Data for error_callback. */ + void *data; + /* Non-zero if we've reported an underflow error. */ + int reported_underflow; +}; + +/* A single attribute in a DWARF abbreviation. */ + +struct attr +{ + /* The attribute name. */ + enum dwarf_attribute name; + /* The attribute form. */ + enum dwarf_form form; + /* The attribute value, for DW_FORM_implicit_const. */ + int64_t val; +}; + +/* A single DWARF abbreviation. 
*/ + +struct abbrev +{ + /* The abbrev code--the number used to refer to the abbrev. */ + uint64_t code; + /* The entry tag. */ + enum dwarf_tag tag; + /* Non-zero if this abbrev has child entries. */ + int has_children; + /* The number of attributes. */ + size_t num_attrs; + /* The attributes. */ + struct attr *attrs; +}; + +/* The DWARF abbreviations for a compilation unit. This structure + only exists while reading the compilation unit. Most DWARF readers + seem to a hash table to map abbrev ID's to abbrev entries. + However, we primarily care about GCC, and GCC simply issues ID's in + numerical order starting at 1. So we simply keep a sorted vector, + and try to just look up the code. */ + +struct abbrevs +{ + /* The number of abbrevs in the vector. */ + size_t num_abbrevs; + /* The abbrevs, sorted by the code field. */ + struct abbrev *abbrevs; +}; + +/* The different kinds of attribute values. */ + +enum attr_val_encoding +{ + /* No attribute value. */ + ATTR_VAL_NONE, + /* An address. */ + ATTR_VAL_ADDRESS, + /* An index into the .debug_addr section, whose value is relative to + the DW_AT_addr_base attribute of the compilation unit. */ + ATTR_VAL_ADDRESS_INDEX, + /* A unsigned integer. */ + ATTR_VAL_UINT, + /* A sigd integer. */ + ATTR_VAL_SINT, + /* A string. */ + ATTR_VAL_STRING, + /* An index into the .debug_str_offsets section. */ + ATTR_VAL_STRING_INDEX, + /* An offset to other data in the containing unit. */ + ATTR_VAL_REF_UNIT, + /* An offset to other data within the .debug_info section. */ + ATTR_VAL_REF_INFO, + /* An offset to other data within the alt .debug_info section. */ + ATTR_VAL_REF_ALT_INFO, + /* An offset to data in some other section. */ + ATTR_VAL_REF_SECTION, + /* A type signature. */ + ATTR_VAL_REF_TYPE, + /* An index into the .debug_rnglists section. */ + ATTR_VAL_RNGLISTS_INDEX, + /* A block of data (not represented). */ + ATTR_VAL_BLOCK, + /* An expression (not represented). */ + ATTR_VAL_EXPR, +}; + +/* An attribute value. */ + +struct attr_val +{ + /* How the value is stored in the field u. */ + enum attr_val_encoding encoding; + union + { + /* ATTR_VAL_ADDRESS*, ATTR_VAL_UINT, ATTR_VAL_REF*. */ + uint64_t uint; + /* ATTR_VAL_SINT. */ + int64_t sint; + /* ATTR_VAL_STRING. */ + const char *string; + /* ATTR_VAL_BLOCK not stored. */ + } u; +}; + +/* The line number program header. */ + +struct line_header +{ + /* The version of the line number information. */ + int version; + /* Address size. */ + int addrsize; + /* The minimum instruction length. */ + unsigned int min_insn_len; + /* The maximum number of ops per instruction. */ + unsigned int max_ops_per_insn; + /* The line base for special opcodes. */ + int line_base; + /* The line range for special opcodes. */ + unsigned int line_range; + /* The opcode base--the first special opcode. */ + unsigned int opcode_base; + /* Opcode lengths, indexed by opcode - 1. */ + const unsigned char *opcode_lengths; + /* The number of directory entries. */ + size_t dirs_count; + /* The directory entries. */ + const char **dirs; + /* The number of filenames. */ + size_t filenames_count; + /* The filenames. */ + const char **filenames; +}; + +/* A format description from a line header. */ + +struct line_header_format +{ + int lnct; /* LNCT code. */ + enum dwarf_form form; /* Form of entry data. */ +}; + +/* Map a single PC value to a file/line. We will keep a vector of + these sorted by PC value. Each file/line will be correct from the + PC up to the PC of the next entry if there is one. 
We allocate one + extra entry at the end so that we can use bsearch. */ + +struct line +{ + /* PC. */ + uintptr_t pc; + /* File name. Many entries in the array are expected to point to + the same file name. */ + const char *filename; + /* Line number. */ + int lineno; + /* Index of the object in the original array read from the DWARF + section, before it has been sorted. The index makes it possible + to use Quicksort and maintain stability. */ + int idx; +}; + +/* A growable vector of line number information. This is used while + reading the line numbers. */ + +struct line_vector +{ + /* Memory. This is an array of struct line. */ + struct backtrace_vector vec; + /* Number of valid mappings. */ + size_t count; +}; + +/* A function described in the debug info. */ + +struct function +{ + /* The name of the function. */ + const char *name; + /* If this is an inlined function, the filename of the call + site. */ + const char *caller_filename; + /* If this is an inlined function, the line number of the call + site. */ + int caller_lineno; + /* Map PC ranges to inlined functions. */ + struct function_addrs *function_addrs; + size_t function_addrs_count; +}; + +/* An address range for a function. This maps a PC value to a + specific function. */ + +struct function_addrs +{ + /* Range is LOW <= PC < HIGH. */ + uintptr_t low; + uintptr_t high; + /* Function for this address range. */ + struct function *function; +}; + +/* A growable vector of function address ranges. */ + +struct function_vector +{ + /* Memory. This is an array of struct function_addrs. */ + struct backtrace_vector vec; + /* Number of address ranges present. */ + size_t count; +}; + +/* A DWARF compilation unit. This only holds the information we need + to map a PC to a file and line. */ + +struct unit +{ + /* The first entry for this compilation unit. */ + const unsigned char *unit_data; + /* The length of the data for this compilation unit. */ + size_t unit_data_len; + /* The offset of UNIT_DATA from the start of the information for + this compilation unit. */ + size_t unit_data_offset; + /* Offset of the start of the compilation unit from the start of the + .debug_info section. */ + size_t low_offset; + /* Offset of the end of the compilation unit from the start of the + .debug_info section. */ + size_t high_offset; + /* DWARF version. */ + int version; + /* Whether unit is DWARF64. */ + int is_dwarf64; + /* Address size. */ + int addrsize; + /* Offset into line number information. */ + off_t lineoff; + /* Offset of compilation unit in .debug_str_offsets. */ + uint64_t str_offsets_base; + /* Offset of compilation unit in .debug_addr. */ + uint64_t addr_base; + /* Offset of compilation unit in .debug_rnglists. */ + uint64_t rnglists_base; + /* Primary source file. */ + const char *filename; + /* Compilation command working directory. */ + const char *comp_dir; + /* Absolute file name, only set if needed. */ + const char *abs_filename; + /* The abbreviations for this unit. */ + struct abbrevs abbrevs; + + /* The fields above this point are read in during initialization and + may be accessed freely. The fields below this point are read in + as needed, and therefore require care, as different threads may + try to initialize them simultaneously. */ + + /* PC to line number mapping. This is NULL if the values have not + been read. This is (struct line *) -1 if there was an error + reading the values. */ + struct line *lines; + /* Number of entries in lines. */ + size_t lines_count; + /* PC ranges to function. 
*/ + struct function_addrs *function_addrs; + size_t function_addrs_count; +}; + +/* An address range for a compilation unit. This maps a PC value to a + specific compilation unit. Note that we invert the representation + in DWARF: instead of listing the units and attaching a list of + ranges, we list the ranges and have each one point to the unit. + This lets us do a binary search to find the unit. */ + +struct unit_addrs +{ + /* Range is LOW <= PC < HIGH. */ + uintptr_t low; + uintptr_t high; + /* Compilation unit for this address range. */ + struct unit *u; +}; + +/* A growable vector of compilation unit address ranges. */ + +struct unit_addrs_vector +{ + /* Memory. This is an array of struct unit_addrs. */ + struct backtrace_vector vec; + /* Number of address ranges present. */ + size_t count; +}; + +/* A growable vector of compilation unit pointers. */ + +struct unit_vector +{ + struct backtrace_vector vec; + size_t count; +}; + +/* The information we need to map a PC to a file and line. */ + +struct dwarf_data +{ + /* The data for the next file we know about. */ + struct dwarf_data *next; + /* The data for .gnu_debugaltlink. */ + struct dwarf_data *altlink; + /* The base address mapping for this file. */ + struct libbacktrace_base_address base_address; + /* A sorted list of address ranges. */ + struct unit_addrs *addrs; + /* Number of address ranges in list. */ + size_t addrs_count; + /* A sorted list of units. */ + struct unit **units; + /* Number of units in the list. */ + size_t units_count; + /* The unparsed DWARF debug data. */ + struct dwarf_sections dwarf_sections; + /* Whether the data is big-endian or not. */ + int is_bigendian; + /* A vector used for function addresses. We keep this here so that + we can grow the vector as we read more functions. */ + struct function_vector fvec; +}; + +/* Report an error for a DWARF buffer. */ + +static void +dwarf_buf_error (struct dwarf_buf *buf, const char *msg, int errnum) +{ + char b[200]; + + snprintf (b, sizeof b, "%s in %s at %d", + msg, buf->name, (int) (buf->buf - buf->start)); + buf->error_callback (buf->data, b, errnum); +} + +/* Require at least COUNT bytes in BUF. Return 1 if all is well, 0 on + error. */ + +static int +require (struct dwarf_buf *buf, size_t count) +{ + if (buf->left >= count) + return 1; + + if (!buf->reported_underflow) + { + dwarf_buf_error (buf, "DWARF underflow", 0); + buf->reported_underflow = 1; + } + + return 0; +} + +/* Advance COUNT bytes in BUF. Return 1 if all is well, 0 on + error. */ + +static int +advance (struct dwarf_buf *buf, size_t count) +{ + if (!require (buf, count)) + return 0; + buf->buf += count; + buf->left -= count; + return 1; +} + +/* Read one zero-terminated string from BUF and advance past the string. */ + +static const char * +read_string (struct dwarf_buf *buf) +{ + const char *p = (const char *)buf->buf; + size_t len = strnlen (p, buf->left); + + /* - If len == left, we ran out of buffer before finding the zero terminator. + Generate an error by advancing len + 1. + - If len < left, advance by len + 1 to skip past the zero terminator. */ + size_t count = len + 1; + + if (!advance (buf, count)) + return NULL; + + return p; +} + +/* Read one byte from BUF and advance 1 byte. */ + +static unsigned char +read_byte (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 1)) + return 0; + return p[0]; +} + +/* Read a signed char from BUF and advance 1 byte. 
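+   The (*p ^ 0x80) - 0x80 expression below sign-extends the byte portably, avoiding the implementation-defined conversion of values above 127 to signed char.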
*/ + +static signed char +read_sbyte (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 1)) + return 0; + return (*p ^ 0x80) - 0x80; +} + +/* Read a uint16 from BUF and advance 2 bytes. */ + +static uint16_t +read_uint16 (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 2)) + return 0; + if (buf->is_bigendian) + return ((uint16_t) p[0] << 8) | (uint16_t) p[1]; + else + return ((uint16_t) p[1] << 8) | (uint16_t) p[0]; +} + +/* Read a 24 bit value from BUF and advance 3 bytes. */ + +static uint32_t +read_uint24 (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 3)) + return 0; + if (buf->is_bigendian) + return (((uint32_t) p[0] << 16) | ((uint32_t) p[1] << 8) + | (uint32_t) p[2]); + else + return (((uint32_t) p[2] << 16) | ((uint32_t) p[1] << 8) + | (uint32_t) p[0]); +} + +/* Read a uint32 from BUF and advance 4 bytes. */ + +static uint32_t +read_uint32 (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 4)) + return 0; + if (buf->is_bigendian) + return (((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) + | ((uint32_t) p[2] << 8) | (uint32_t) p[3]); + else + return (((uint32_t) p[3] << 24) | ((uint32_t) p[2] << 16) + | ((uint32_t) p[1] << 8) | (uint32_t) p[0]); +} + +/* Read a uint64 from BUF and advance 8 bytes. */ + +static uint64_t +read_uint64 (struct dwarf_buf *buf) +{ + const unsigned char *p = buf->buf; + + if (!advance (buf, 8)) + return 0; + if (buf->is_bigendian) + return (((uint64_t) p[0] << 56) | ((uint64_t) p[1] << 48) + | ((uint64_t) p[2] << 40) | ((uint64_t) p[3] << 32) + | ((uint64_t) p[4] << 24) | ((uint64_t) p[5] << 16) + | ((uint64_t) p[6] << 8) | (uint64_t) p[7]); + else + return (((uint64_t) p[7] << 56) | ((uint64_t) p[6] << 48) + | ((uint64_t) p[5] << 40) | ((uint64_t) p[4] << 32) + | ((uint64_t) p[3] << 24) | ((uint64_t) p[2] << 16) + | ((uint64_t) p[1] << 8) | (uint64_t) p[0]); +} + +/* Read an offset from BUF and advance the appropriate number of + bytes. */ + +static uint64_t +read_offset (struct dwarf_buf *buf, int is_dwarf64) +{ + if (is_dwarf64) + return read_uint64 (buf); + else + return read_uint32 (buf); +} + +/* Read an address from BUF and advance the appropriate number of + bytes. */ + +static uint64_t +read_address (struct dwarf_buf *buf, int addrsize) +{ + switch (addrsize) + { + case 1: + return read_byte (buf); + case 2: + return read_uint16 (buf); + case 4: + return read_uint32 (buf); + case 8: + return read_uint64 (buf); + default: + dwarf_buf_error (buf, "unrecognized address size", 0); + return 0; + } +} + +/* Return whether a value is the highest possible address, given the + address size. */ + +static int +is_highest_address (uint64_t address, int addrsize) +{ + switch (addrsize) + { + case 1: + return address == (unsigned char) -1; + case 2: + return address == (uint16_t) -1; + case 4: + return address == (uint32_t) -1; + case 8: + return address == (uint64_t) -1; + default: + return 0; + } +} + +/* Read an unsigned LEB128 number. 
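+   Each byte contributes its low 7 bits, least significant group first, and the high bit of a byte indicates that another byte follows.  For example, the value 300 (0x12c) is encoded as the two bytes 0xac 0x02.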
*/ + +static uint64_t +read_uleb128 (struct dwarf_buf *buf) +{ + uint64_t ret; + unsigned int shift; + int overflow; + unsigned char b; + + ret = 0; + shift = 0; + overflow = 0; + do + { + const unsigned char *p; + + p = buf->buf; + if (!advance (buf, 1)) + return 0; + b = *p; + if (shift < 64) + ret |= ((uint64_t) (b & 0x7f)) << shift; + else if (!overflow) + { + dwarf_buf_error (buf, "LEB128 overflows uint64_t", 0); + overflow = 1; + } + shift += 7; + } + while ((b & 0x80) != 0); + + return ret; +} + +/* Read a signed LEB128 number. */ + +static int64_t +read_sleb128 (struct dwarf_buf *buf) +{ + uint64_t val; + unsigned int shift; + int overflow; + unsigned char b; + + val = 0; + shift = 0; + overflow = 0; + do + { + const unsigned char *p; + + p = buf->buf; + if (!advance (buf, 1)) + return 0; + b = *p; + if (shift < 64) + val |= ((uint64_t) (b & 0x7f)) << shift; + else if (!overflow) + { + dwarf_buf_error (buf, "signed LEB128 overflows uint64_t", 0); + overflow = 1; + } + shift += 7; + } + while ((b & 0x80) != 0); + + if ((b & 0x40) != 0 && shift < 64) + val |= ((uint64_t) -1) << shift; + + return (int64_t) val; +} + +/* Return the length of an LEB128 number. */ + +static size_t +leb128_len (const unsigned char *p) +{ + size_t ret; + + ret = 1; + while ((*p & 0x80) != 0) + { + ++p; + ++ret; + } + return ret; +} + +/* Read initial_length from BUF and advance the appropriate number of bytes. */ + +static uint64_t +read_initial_length (struct dwarf_buf *buf, int *is_dwarf64) +{ + uint64_t len; + + len = read_uint32 (buf); + if (len == 0xffffffff) + { + len = read_uint64 (buf); + *is_dwarf64 = 1; + } + else + *is_dwarf64 = 0; + + return len; +} + +/* Free an abbreviations structure. */ + +static void +free_abbrevs (struct backtrace_state *state, struct abbrevs *abbrevs, + backtrace_error_callback error_callback, void *data) +{ + size_t i; + + for (i = 0; i < abbrevs->num_abbrevs; ++i) + backtrace_free (state, abbrevs->abbrevs[i].attrs, + abbrevs->abbrevs[i].num_attrs * sizeof (struct attr), + error_callback, data); + backtrace_free (state, abbrevs->abbrevs, + abbrevs->num_abbrevs * sizeof (struct abbrev), + error_callback, data); + abbrevs->num_abbrevs = 0; + abbrevs->abbrevs = NULL; +} + +/* Read an attribute value. Returns 1 on success, 0 on failure. + Stores the value and its encoding in *VAL. We don't try to store + the contents of some attribute forms (blocks and expressions), + because we don't care about them. */ + +static int +read_attribute (enum dwarf_form form, uint64_t implicit_val, + struct dwarf_buf *buf, int is_dwarf64, int version, + int addrsize, const struct dwarf_sections *dwarf_sections, + struct dwarf_data *altlink, struct attr_val *val) +{ + /* Avoid warnings that val.u.FIELD may be used uninitialized if + this function is inlined. The warnings aren't valid but can + occur because the different fields are set and used + conditionally. 
*/ + memset (val, 0, sizeof *val); + + switch (form) + { + case DW_FORM_addr: + val->encoding = ATTR_VAL_ADDRESS; + val->u.uint = read_address (buf, addrsize); + return 1; + case DW_FORM_block2: + val->encoding = ATTR_VAL_BLOCK; + return advance (buf, read_uint16 (buf)); + case DW_FORM_block4: + val->encoding = ATTR_VAL_BLOCK; + return advance (buf, read_uint32 (buf)); + case DW_FORM_data2: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_uint16 (buf); + return 1; + case DW_FORM_data4: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_uint32 (buf); + return 1; + case DW_FORM_data8: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_uint64 (buf); + return 1; + case DW_FORM_data16: + val->encoding = ATTR_VAL_BLOCK; + return advance (buf, 16); + case DW_FORM_string: + val->encoding = ATTR_VAL_STRING; + val->u.string = read_string (buf); + return val->u.string == NULL ? 0 : 1; + case DW_FORM_block: + val->encoding = ATTR_VAL_BLOCK; + return advance (buf, read_uleb128 (buf)); + case DW_FORM_block1: + val->encoding = ATTR_VAL_BLOCK; + return advance (buf, read_byte (buf)); + case DW_FORM_data1: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_byte (buf); + return 1; + case DW_FORM_flag: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_byte (buf); + return 1; + case DW_FORM_sdata: + val->encoding = ATTR_VAL_SINT; + val->u.sint = read_sleb128 (buf); + return 1; + case DW_FORM_strp: + { + uint64_t offset; + + offset = read_offset (buf, is_dwarf64); + if (offset >= dwarf_sections->size[DEBUG_STR]) + { + dwarf_buf_error (buf, "DW_FORM_strp out of range", 0); + return 0; + } + val->encoding = ATTR_VAL_STRING; + val->u.string = + (const char *) dwarf_sections->data[DEBUG_STR] + offset; + return 1; + } + case DW_FORM_line_strp: + { + uint64_t offset; + + offset = read_offset (buf, is_dwarf64); + if (offset >= dwarf_sections->size[DEBUG_LINE_STR]) + { + dwarf_buf_error (buf, "DW_FORM_line_strp out of range", 0); + return 0; + } + val->encoding = ATTR_VAL_STRING; + val->u.string = + (const char *) dwarf_sections->data[DEBUG_LINE_STR] + offset; + return 1; + } + case DW_FORM_udata: + val->encoding = ATTR_VAL_UINT; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_ref_addr: + val->encoding = ATTR_VAL_REF_INFO; + if (version == 2) + val->u.uint = read_address (buf, addrsize); + else + val->u.uint = read_offset (buf, is_dwarf64); + return 1; + case DW_FORM_ref1: + val->encoding = ATTR_VAL_REF_UNIT; + val->u.uint = read_byte (buf); + return 1; + case DW_FORM_ref2: + val->encoding = ATTR_VAL_REF_UNIT; + val->u.uint = read_uint16 (buf); + return 1; + case DW_FORM_ref4: + val->encoding = ATTR_VAL_REF_UNIT; + val->u.uint = read_uint32 (buf); + return 1; + case DW_FORM_ref8: + val->encoding = ATTR_VAL_REF_UNIT; + val->u.uint = read_uint64 (buf); + return 1; + case DW_FORM_ref_udata: + val->encoding = ATTR_VAL_REF_UNIT; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_indirect: + { + uint64_t form; + + form = read_uleb128 (buf); + if (form == DW_FORM_implicit_const) + { + dwarf_buf_error (buf, + "DW_FORM_indirect to DW_FORM_implicit_const", + 0); + return 0; + } + return read_attribute ((enum dwarf_form) form, 0, buf, is_dwarf64, + version, addrsize, dwarf_sections, altlink, + val); + } + case DW_FORM_sec_offset: + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_offset (buf, is_dwarf64); + return 1; + case DW_FORM_exprloc: + val->encoding = ATTR_VAL_EXPR; + return advance (buf, read_uleb128 (buf)); + case DW_FORM_flag_present: + val->encoding = 
ATTR_VAL_UINT; + val->u.uint = 1; + return 1; + case DW_FORM_ref_sig8: + val->encoding = ATTR_VAL_REF_TYPE; + val->u.uint = read_uint64 (buf); + return 1; + case DW_FORM_strx: case DW_FORM_strx1: case DW_FORM_strx2: + case DW_FORM_strx3: case DW_FORM_strx4: + { + uint64_t offset; + + switch (form) + { + case DW_FORM_strx: + offset = read_uleb128 (buf); + break; + case DW_FORM_strx1: + offset = read_byte (buf); + break; + case DW_FORM_strx2: + offset = read_uint16 (buf); + break; + case DW_FORM_strx3: + offset = read_uint24 (buf); + break; + case DW_FORM_strx4: + offset = read_uint32 (buf); + break; + default: + /* This case can't happen. */ + return 0; + } + val->encoding = ATTR_VAL_STRING_INDEX; + val->u.uint = offset; + return 1; + } + case DW_FORM_addrx: case DW_FORM_addrx1: case DW_FORM_addrx2: + case DW_FORM_addrx3: case DW_FORM_addrx4: + { + uint64_t offset; + + switch (form) + { + case DW_FORM_addrx: + offset = read_uleb128 (buf); + break; + case DW_FORM_addrx1: + offset = read_byte (buf); + break; + case DW_FORM_addrx2: + offset = read_uint16 (buf); + break; + case DW_FORM_addrx3: + offset = read_uint24 (buf); + break; + case DW_FORM_addrx4: + offset = read_uint32 (buf); + break; + default: + /* This case can't happen. */ + return 0; + } + val->encoding = ATTR_VAL_ADDRESS_INDEX; + val->u.uint = offset; + return 1; + } + case DW_FORM_ref_sup4: + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_uint32 (buf); + return 1; + case DW_FORM_ref_sup8: + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_uint64 (buf); + return 1; + case DW_FORM_implicit_const: + val->encoding = ATTR_VAL_UINT; + val->u.uint = implicit_val; + return 1; + case DW_FORM_loclistx: + /* We don't distinguish this from DW_FORM_sec_offset. It + * shouldn't matter since we don't care about loclists. */ + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_rnglistx: + val->encoding = ATTR_VAL_RNGLISTS_INDEX; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_GNU_addr_index: + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_GNU_str_index: + val->encoding = ATTR_VAL_REF_SECTION; + val->u.uint = read_uleb128 (buf); + return 1; + case DW_FORM_GNU_ref_alt: + val->u.uint = read_offset (buf, is_dwarf64); + if (altlink == NULL) + { + val->encoding = ATTR_VAL_NONE; + return 1; + } + val->encoding = ATTR_VAL_REF_ALT_INFO; + return 1; + case DW_FORM_strp_sup: case DW_FORM_GNU_strp_alt: + { + uint64_t offset; + + offset = read_offset (buf, is_dwarf64); + if (altlink == NULL) + { + val->encoding = ATTR_VAL_NONE; + return 1; + } + if (offset >= altlink->dwarf_sections.size[DEBUG_STR]) + { + dwarf_buf_error (buf, "DW_FORM_strp_sup out of range", 0); + return 0; + } + val->encoding = ATTR_VAL_STRING; + val->u.string = + (const char *) altlink->dwarf_sections.data[DEBUG_STR] + offset; + return 1; + } + default: + dwarf_buf_error (buf, "unrecognized DWARF form", -1); + return 0; + } +} + +/* If we can determine the value of a string attribute, set *STRING to + point to the string. Return 1 on success, 0 on error. If we don't + know the value, we consider that a success, and we don't change + *STRING. An error is only reported for some sort of out of range + offset. 
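+   ATTR_VAL_STRING values are returned directly; ATTR_VAL_STRING_INDEX (DW_FORM_strx) values are resolved through the .debug_str_offsets table using the unit's str_offsets_base; other encodings are left untouched.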
*/ + +static int +resolve_string (const struct dwarf_sections *dwarf_sections, int is_dwarf64, + int is_bigendian, uint64_t str_offsets_base, + const struct attr_val *val, + backtrace_error_callback error_callback, void *data, + const char **string) +{ + switch (val->encoding) + { + case ATTR_VAL_STRING: + *string = val->u.string; + return 1; + + case ATTR_VAL_STRING_INDEX: + { + uint64_t offset; + struct dwarf_buf offset_buf; + + offset = val->u.uint * (is_dwarf64 ? 8 : 4) + str_offsets_base; + if (offset + (is_dwarf64 ? 8 : 4) + > dwarf_sections->size[DEBUG_STR_OFFSETS]) + { + error_callback (data, "DW_FORM_strx value out of range", 0); + return 0; + } + + offset_buf.name = ".debug_str_offsets"; + offset_buf.start = dwarf_sections->data[DEBUG_STR_OFFSETS]; + offset_buf.buf = dwarf_sections->data[DEBUG_STR_OFFSETS] + offset; + offset_buf.left = dwarf_sections->size[DEBUG_STR_OFFSETS] - offset; + offset_buf.is_bigendian = is_bigendian; + offset_buf.error_callback = error_callback; + offset_buf.data = data; + offset_buf.reported_underflow = 0; + + offset = read_offset (&offset_buf, is_dwarf64); + if (offset >= dwarf_sections->size[DEBUG_STR]) + { + dwarf_buf_error (&offset_buf, + "DW_FORM_strx offset out of range", + 0); + return 0; + } + *string = (const char *) dwarf_sections->data[DEBUG_STR] + offset; + return 1; + } + + default: + return 1; + } +} + +/* Set *ADDRESS to the real address for an ATTR_VAL_ADDRESS_INDEX. + Return 1 on success, 0 on error. */ + +static int +resolve_addr_index (const struct dwarf_sections *dwarf_sections, + uint64_t addr_base, int addrsize, int is_bigendian, + uint64_t addr_index, + backtrace_error_callback error_callback, void *data, + uintptr_t *address) +{ + uint64_t offset; + struct dwarf_buf addr_buf; + + offset = addr_index * addrsize + addr_base; + if (offset + addrsize > dwarf_sections->size[DEBUG_ADDR]) + { + error_callback (data, "DW_FORM_addrx value out of range", 0); + return 0; + } + + addr_buf.name = ".debug_addr"; + addr_buf.start = dwarf_sections->data[DEBUG_ADDR]; + addr_buf.buf = dwarf_sections->data[DEBUG_ADDR] + offset; + addr_buf.left = dwarf_sections->size[DEBUG_ADDR] - offset; + addr_buf.is_bigendian = is_bigendian; + addr_buf.error_callback = error_callback; + addr_buf.data = data; + addr_buf.reported_underflow = 0; + + *address = (uintptr_t) read_address (&addr_buf, addrsize); + return 1; +} + +/* Compare a unit offset against a unit for bsearch. */ + +static int +units_search (const void *vkey, const void *ventry) +{ + const size_t *key = (const size_t *) vkey; + const struct unit *entry = *((const struct unit *const *) ventry); + size_t offset; + + offset = *key; + if (offset < entry->low_offset) + return -1; + else if (offset >= entry->high_offset) + return 1; + else + return 0; +} + +/* Find a unit in PU containing OFFSET. */ + +static struct unit * +find_unit (struct unit **pu, size_t units_count, size_t offset) +{ + struct unit **u; + u = (struct unit**)bsearch (&offset, pu, units_count, sizeof (struct unit *), units_search); + return u == NULL ? NULL : *u; +} + +/* Compare function_addrs for qsort. When ranges are nested, make the + smallest one sort last. 
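+   That is, entries are ordered by increasing low address and, for equal low addresses, by decreasing high address, so the outermost range sorts first and the innermost range sorts last.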
*/ + +static int +function_addrs_compare (const void *v1, const void *v2) +{ + const struct function_addrs *a1 = (const struct function_addrs *) v1; + const struct function_addrs *a2 = (const struct function_addrs *) v2; + + if (a1->low < a2->low) + return -1; + if (a1->low > a2->low) + return 1; + if (a1->high < a2->high) + return 1; + if (a1->high > a2->high) + return -1; + return strcmp (a1->function->name, a2->function->name); +} + +/* Compare a PC against a function_addrs for bsearch. We always + allocate an extra entry at the end of the vector, so that this + routine can safely look at the next entry. Note that if there are + multiple ranges containing PC, which one will be returned is + unpredictable. We compensate for that in dwarf_fileline. */ + +static int +function_addrs_search (const void *vkey, const void *ventry) +{ + const uintptr_t *key = (const uintptr_t *) vkey; + const struct function_addrs *entry = (const struct function_addrs *) ventry; + uintptr_t pc; + + pc = *key; + if (pc < entry->low) + return -1; + else if (pc > (entry + 1)->low) + return 1; + else + return 0; +} + +/* Add a new compilation unit address range to a vector. This is + called via add_ranges. Returns 1 on success, 0 on failure. */ + +static int +add_unit_addr (struct backtrace_state *state, void *rdata, + uintptr_t lowpc, uintptr_t highpc, + backtrace_error_callback error_callback, void *data, + void *pvec) +{ + struct unit *u = (struct unit *) rdata; + struct unit_addrs_vector *vec = (struct unit_addrs_vector *) pvec; + struct unit_addrs *p; + + /* Try to merge with the last entry. */ + if (vec->count > 0) + { + p = (struct unit_addrs *) vec->vec.base + (vec->count - 1); + if ((lowpc == p->high || lowpc == p->high + 1) + && u == p->u) + { + if (highpc > p->high) + p->high = highpc; + return 1; + } + } + + p = ((struct unit_addrs *) + backtrace_vector_grow (state, sizeof (struct unit_addrs), + error_callback, data, &vec->vec)); + if (p == NULL) + return 0; + + p->low = lowpc; + p->high = highpc; + p->u = u; + + ++vec->count; + + return 1; +} + +/* Compare unit_addrs for qsort. When ranges are nested, make the + smallest one sort last. */ + +static int +unit_addrs_compare (const void *v1, const void *v2) +{ + const struct unit_addrs *a1 = (const struct unit_addrs *) v1; + const struct unit_addrs *a2 = (const struct unit_addrs *) v2; + + if (a1->low < a2->low) + return -1; + if (a1->low > a2->low) + return 1; + if (a1->high < a2->high) + return 1; + if (a1->high > a2->high) + return -1; + if (a1->u->lineoff < a2->u->lineoff) + return -1; + if (a1->u->lineoff > a2->u->lineoff) + return 1; + return 0; +} + +/* Compare a PC against a unit_addrs for bsearch. We always allocate + an extra entry at the end of the vector, so that this routine can + safely look at the next entry. Note that if there are multiple + ranges containing PC, which one will be returned is unpredictable. + We compensate for that in dwarf_fileline. */ + +static int +unit_addrs_search (const void *vkey, const void *ventry) +{ + const uintptr_t *key = (const uintptr_t *) vkey; + const struct unit_addrs *entry = (const struct unit_addrs *) ventry; + uintptr_t pc; + + pc = *key; + if (pc < entry->low) + return -1; + else if (pc > (entry + 1)->low) + return 1; + else + return 0; +} + +/* Sort the line vector by PC. We want a stable sort here to maintain + the order of lines for the same PC values. Since the sequence is + being sorted in place, their addresses cannot be relied on to + maintain stability. 
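+   Recording each entry's original position and comparing it as a secondary key keeps the sort deterministic even though qsort itself need not be stable.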
That is the purpose of the index member. */ + +static int +line_compare (const void *v1, const void *v2) +{ + const struct line *ln1 = (const struct line *) v1; + const struct line *ln2 = (const struct line *) v2; + + if (ln1->pc < ln2->pc) + return -1; + else if (ln1->pc > ln2->pc) + return 1; + else if (ln1->idx < ln2->idx) + return -1; + else if (ln1->idx > ln2->idx) + return 1; + else + return 0; +} + +/* Find a PC in a line vector. We always allocate an extra entry at + the end of the lines vector, so that this routine can safely look + at the next entry. Note that when there are multiple mappings for + the same PC value, this will return the last one. */ + +static int +line_search (const void *vkey, const void *ventry) +{ + const uintptr_t *key = (const uintptr_t *) vkey; + const struct line *entry = (const struct line *) ventry; + uintptr_t pc; + + pc = *key; + if (pc < entry->pc) + return -1; + else if (pc >= (entry + 1)->pc) + return 1; + else + return 0; +} + +/* Sort the abbrevs by the abbrev code. This function is passed to + both qsort and bsearch. */ + +static int +abbrev_compare (const void *v1, const void *v2) +{ + const struct abbrev *a1 = (const struct abbrev *) v1; + const struct abbrev *a2 = (const struct abbrev *) v2; + + if (a1->code < a2->code) + return -1; + else if (a1->code > a2->code) + return 1; + else + { + /* This really shouldn't happen. It means there are two + different abbrevs with the same code, and that means we don't + know which one lookup_abbrev should return. */ + return 0; + } +} + +/* Read the abbreviation table for a compilation unit. Returns 1 on + success, 0 on failure. */ + +static int +read_abbrevs (struct backtrace_state *state, uint64_t abbrev_offset, + const unsigned char *dwarf_abbrev, size_t dwarf_abbrev_size, + int is_bigendian, backtrace_error_callback error_callback, + void *data, struct abbrevs *abbrevs) +{ + struct dwarf_buf abbrev_buf; + struct dwarf_buf count_buf; + size_t num_abbrevs; + + abbrevs->num_abbrevs = 0; + abbrevs->abbrevs = NULL; + + if (abbrev_offset >= dwarf_abbrev_size) + { + error_callback (data, "abbrev offset out of range", 0); + return 0; + } + + abbrev_buf.name = ".debug_abbrev"; + abbrev_buf.start = dwarf_abbrev; + abbrev_buf.buf = dwarf_abbrev + abbrev_offset; + abbrev_buf.left = dwarf_abbrev_size - abbrev_offset; + abbrev_buf.is_bigendian = is_bigendian; + abbrev_buf.error_callback = error_callback; + abbrev_buf.data = data; + abbrev_buf.reported_underflow = 0; + + /* Count the number of abbrevs in this list. */ + + count_buf = abbrev_buf; + num_abbrevs = 0; + while (read_uleb128 (&count_buf) != 0) + { + if (count_buf.reported_underflow) + return 0; + ++num_abbrevs; + // Skip tag. + read_uleb128 (&count_buf); + // Skip has_children. + read_byte (&count_buf); + // Skip attributes. + while (read_uleb128 (&count_buf) != 0) + { + uint64_t form; + + form = read_uleb128 (&count_buf); + if ((enum dwarf_form) form == DW_FORM_implicit_const) + read_sleb128 (&count_buf); + } + // Skip form of last attribute. 
+ read_uleb128 (&count_buf); + } + + if (count_buf.reported_underflow) + return 0; + + if (num_abbrevs == 0) + return 1; + + abbrevs->abbrevs = ((struct abbrev *) + backtrace_alloc (state, + num_abbrevs * sizeof (struct abbrev), + error_callback, data)); + if (abbrevs->abbrevs == NULL) + return 0; + abbrevs->num_abbrevs = num_abbrevs; + memset (abbrevs->abbrevs, 0, num_abbrevs * sizeof (struct abbrev)); + + num_abbrevs = 0; + while (1) + { + uint64_t code; + struct abbrev a; + size_t num_attrs; + struct attr *attrs; + + if (abbrev_buf.reported_underflow) + goto fail; + + code = read_uleb128 (&abbrev_buf); + if (code == 0) + break; + + a.code = code; + a.tag = (enum dwarf_tag) read_uleb128 (&abbrev_buf); + a.has_children = read_byte (&abbrev_buf); + + count_buf = abbrev_buf; + num_attrs = 0; + while (read_uleb128 (&count_buf) != 0) + { + uint64_t form; + + ++num_attrs; + form = read_uleb128 (&count_buf); + if ((enum dwarf_form) form == DW_FORM_implicit_const) + read_sleb128 (&count_buf); + } + + if (num_attrs == 0) + { + attrs = NULL; + read_uleb128 (&abbrev_buf); + read_uleb128 (&abbrev_buf); + } + else + { + attrs = ((struct attr *) + backtrace_alloc (state, num_attrs * sizeof *attrs, + error_callback, data)); + if (attrs == NULL) + goto fail; + num_attrs = 0; + while (1) + { + uint64_t name; + uint64_t form; + + name = read_uleb128 (&abbrev_buf); + form = read_uleb128 (&abbrev_buf); + if (name == 0) + break; + attrs[num_attrs].name = (enum dwarf_attribute) name; + attrs[num_attrs].form = (enum dwarf_form) form; + if ((enum dwarf_form) form == DW_FORM_implicit_const) + attrs[num_attrs].val = read_sleb128 (&abbrev_buf); + else + attrs[num_attrs].val = 0; + ++num_attrs; + } + } + + a.num_attrs = num_attrs; + a.attrs = attrs; + + abbrevs->abbrevs[num_abbrevs] = a; + ++num_abbrevs; + } + + backtrace_qsort (abbrevs->abbrevs, abbrevs->num_abbrevs, + sizeof (struct abbrev), abbrev_compare); + + return 1; + + fail: + free_abbrevs (state, abbrevs, error_callback, data); + return 0; +} + +/* Return the abbrev information for an abbrev code. */ + +static const struct abbrev * +lookup_abbrev (struct abbrevs *abbrevs, uint64_t code, + backtrace_error_callback error_callback, void *data) +{ + struct abbrev key; + void *p; + + /* With GCC, where abbrevs are simply numbered in order, we should + be able to just look up the entry. */ + if (code - 1 < abbrevs->num_abbrevs + && abbrevs->abbrevs[code - 1].code == code) + return &abbrevs->abbrevs[code - 1]; + + /* Otherwise we have to search. */ + memset (&key, 0, sizeof key); + key.code = code; + p = bsearch (&key, abbrevs->abbrevs, abbrevs->num_abbrevs, + sizeof (struct abbrev), abbrev_compare); + if (p == NULL) + { + error_callback (data, "invalid abbreviation code", 0); + return NULL; + } + return (const struct abbrev *) p; +} + +/* This struct is used to gather address range information while + reading attributes. We use this while building a mapping from + address ranges to compilation units and then again while mapping + from address ranges to function entries. Normally either + lowpc/highpc is set or ranges is set. */ + +struct pcrange { + uintptr_t lowpc; /* The low PC value. */ + int have_lowpc; /* Whether a low PC value was found. */ + int lowpc_is_addr_index; /* Whether lowpc is in .debug_addr. */ + uintptr_t highpc; /* The high PC value. */ + int have_highpc; /* Whether a high PC value was found. */ + int highpc_is_relative; /* Whether highpc is relative to lowpc. */ + int highpc_is_addr_index; /* Whether highpc is in .debug_addr. 
*/ + uint64_t ranges; /* Offset in ranges section. */ + int have_ranges; /* Whether ranges is valid. */ + int ranges_is_index; /* Whether ranges is DW_FORM_rnglistx. */ +}; + +/* Update PCRANGE from an attribute value. */ + +static void +update_pcrange (const struct attr* attr, const struct attr_val* val, + struct pcrange *pcrange) +{ + switch (attr->name) + { + case DW_AT_low_pc: + if (val->encoding == ATTR_VAL_ADDRESS) + { + pcrange->lowpc = (uintptr_t) val->u.uint; + pcrange->have_lowpc = 1; + } + else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) + { + pcrange->lowpc = (uintptr_t) val->u.uint; + pcrange->have_lowpc = 1; + pcrange->lowpc_is_addr_index = 1; + } + break; + + case DW_AT_high_pc: + if (val->encoding == ATTR_VAL_ADDRESS) + { + pcrange->highpc = (uintptr_t) val->u.uint; + pcrange->have_highpc = 1; + } + else if (val->encoding == ATTR_VAL_UINT) + { + pcrange->highpc = (uintptr_t) val->u.uint; + pcrange->have_highpc = 1; + pcrange->highpc_is_relative = 1; + } + else if (val->encoding == ATTR_VAL_ADDRESS_INDEX) + { + pcrange->highpc = (uintptr_t) val->u.uint; + pcrange->have_highpc = 1; + pcrange->highpc_is_addr_index = 1; + } + break; + + case DW_AT_ranges: + if (val->encoding == ATTR_VAL_UINT + || val->encoding == ATTR_VAL_REF_SECTION) + { + pcrange->ranges = val->u.uint; + pcrange->have_ranges = 1; + } + else if (val->encoding == ATTR_VAL_RNGLISTS_INDEX) + { + pcrange->ranges = val->u.uint; + pcrange->have_ranges = 1; + pcrange->ranges_is_index = 1; + } + break; + + default: + break; + } +} + +/* Call ADD_RANGE for a low/high PC pair. Returns 1 on success, 0 on + error. */ + +static int +add_low_high_range (struct backtrace_state *state, + const struct dwarf_sections *dwarf_sections, + struct libbacktrace_base_address base_address, + int is_bigendian, struct unit *u, + const struct pcrange *pcrange, + int (*add_range) (struct backtrace_state *state, + void *rdata, uintptr_t lowpc, + uintptr_t highpc, + backtrace_error_callback error_callback, + void *data, void *vec), + void *rdata, + backtrace_error_callback error_callback, void *data, + void *vec) +{ + uintptr_t lowpc; + uintptr_t highpc; + + lowpc = pcrange->lowpc; + if (pcrange->lowpc_is_addr_index) + { + if (!resolve_addr_index (dwarf_sections, u->addr_base, u->addrsize, + is_bigendian, lowpc, error_callback, data, + &lowpc)) + return 0; + } + + highpc = pcrange->highpc; + if (pcrange->highpc_is_addr_index) + { + if (!resolve_addr_index (dwarf_sections, u->addr_base, u->addrsize, + is_bigendian, highpc, error_callback, data, + &highpc)) + return 0; + } + if (pcrange->highpc_is_relative) + highpc += lowpc; + + /* Add in the base address of the module when recording PC values, + so that we can look up the PC directly. */ + lowpc = libbacktrace_add_base (lowpc, base_address); + highpc = libbacktrace_add_base (highpc, base_address); + + return add_range (state, rdata, lowpc, highpc, error_callback, data, vec); +} + +/* Call ADD_RANGE for each range read from .debug_ranges, as used in + DWARF versions 2 through 4. 
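+   Each entry in that section is a pair of addresses; a pair of zeros ends the list, and an entry whose first value is the largest representable address is a base address selection entry whose second value becomes the base for the entries that follow.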
*/ + +static int +add_ranges_from_ranges ( + struct backtrace_state *state, + const struct dwarf_sections *dwarf_sections, + struct libbacktrace_base_address base_address, int is_bigendian, + struct unit *u, uintptr_t base, + const struct pcrange *pcrange, + int (*add_range) (struct backtrace_state *state, void *rdata, + uintptr_t lowpc, uintptr_t highpc, + backtrace_error_callback error_callback, void *data, + void *vec), + void *rdata, + backtrace_error_callback error_callback, void *data, + void *vec) +{ + struct dwarf_buf ranges_buf; + + if (pcrange->ranges >= dwarf_sections->size[DEBUG_RANGES]) + { + error_callback (data, "ranges offset out of range", 0); + return 0; + } + + ranges_buf.name = ".debug_ranges"; + ranges_buf.start = dwarf_sections->data[DEBUG_RANGES]; + ranges_buf.buf = dwarf_sections->data[DEBUG_RANGES] + pcrange->ranges; + ranges_buf.left = dwarf_sections->size[DEBUG_RANGES] - pcrange->ranges; + ranges_buf.is_bigendian = is_bigendian; + ranges_buf.error_callback = error_callback; + ranges_buf.data = data; + ranges_buf.reported_underflow = 0; + + while (1) + { + uint64_t low; + uint64_t high; + + if (ranges_buf.reported_underflow) + return 0; + + low = read_address (&ranges_buf, u->addrsize); + high = read_address (&ranges_buf, u->addrsize); + + if (low == 0 && high == 0) + break; + + if (is_highest_address (low, u->addrsize)) + base = (uintptr_t) high; + else + { + uintptr_t rl, rh; + + rl = libbacktrace_add_base ((uintptr_t) low + base, base_address); + rh = libbacktrace_add_base ((uintptr_t) high + base, base_address); + if (!add_range (state, rdata, rl, rh, error_callback, data, vec)) + return 0; + } + } + + if (ranges_buf.reported_underflow) + return 0; + + return 1; +} + +/* Call ADD_RANGE for each range read from .debug_rnglists, as used in + DWARF version 5. */ + +static int +add_ranges_from_rnglists ( + struct backtrace_state *state, + const struct dwarf_sections *dwarf_sections, + struct libbacktrace_base_address base_address, int is_bigendian, + struct unit *u, uintptr_t base, + const struct pcrange *pcrange, + int (*add_range) (struct backtrace_state *state, void *rdata, + uintptr_t lowpc, uintptr_t highpc, + backtrace_error_callback error_callback, void *data, + void *vec), + void *rdata, + backtrace_error_callback error_callback, void *data, + void *vec) +{ + uint64_t offset; + struct dwarf_buf rnglists_buf; + + if (!pcrange->ranges_is_index) + offset = pcrange->ranges; + else + offset = u->rnglists_base + pcrange->ranges * (u->is_dwarf64 ? 
8 : 4); + if (offset >= dwarf_sections->size[DEBUG_RNGLISTS]) + { + error_callback (data, "rnglists offset out of range", 0); + return 0; + } + + rnglists_buf.name = ".debug_rnglists"; + rnglists_buf.start = dwarf_sections->data[DEBUG_RNGLISTS]; + rnglists_buf.buf = dwarf_sections->data[DEBUG_RNGLISTS] + offset; + rnglists_buf.left = dwarf_sections->size[DEBUG_RNGLISTS] - offset; + rnglists_buf.is_bigendian = is_bigendian; + rnglists_buf.error_callback = error_callback; + rnglists_buf.data = data; + rnglists_buf.reported_underflow = 0; + + if (pcrange->ranges_is_index) + { + offset = read_offset (&rnglists_buf, u->is_dwarf64); + offset += u->rnglists_base; + if (offset >= dwarf_sections->size[DEBUG_RNGLISTS]) + { + error_callback (data, "rnglists index offset out of range", 0); + return 0; + } + rnglists_buf.buf = dwarf_sections->data[DEBUG_RNGLISTS] + offset; + rnglists_buf.left = dwarf_sections->size[DEBUG_RNGLISTS] - offset; + } + + while (1) + { + unsigned char rle; + + rle = read_byte (&rnglists_buf); + if (rle == DW_RLE_end_of_list) + break; + switch (rle) + { + case DW_RLE_base_addressx: + { + uint64_t index; + + index = read_uleb128 (&rnglists_buf); + if (!resolve_addr_index (dwarf_sections, u->addr_base, + u->addrsize, is_bigendian, index, + error_callback, data, &base)) + return 0; + } + break; + + case DW_RLE_startx_endx: + { + uint64_t index; + uintptr_t low; + uintptr_t high; + + index = read_uleb128 (&rnglists_buf); + if (!resolve_addr_index (dwarf_sections, u->addr_base, + u->addrsize, is_bigendian, index, + error_callback, data, &low)) + return 0; + index = read_uleb128 (&rnglists_buf); + if (!resolve_addr_index (dwarf_sections, u->addr_base, + u->addrsize, is_bigendian, index, + error_callback, data, &high)) + return 0; + if (!add_range (state, rdata, + libbacktrace_add_base (low, base_address), + libbacktrace_add_base (high, base_address), + error_callback, data, vec)) + return 0; + } + break; + + case DW_RLE_startx_length: + { + uint64_t index; + uintptr_t low; + uintptr_t length; + + index = read_uleb128 (&rnglists_buf); + if (!resolve_addr_index (dwarf_sections, u->addr_base, + u->addrsize, is_bigendian, index, + error_callback, data, &low)) + return 0; + length = read_uleb128 (&rnglists_buf); + low = libbacktrace_add_base (low, base_address); + if (!add_range (state, rdata, low, low + length, + error_callback, data, vec)) + return 0; + } + break; + + case DW_RLE_offset_pair: + { + uint64_t low; + uint64_t high; + + low = read_uleb128 (&rnglists_buf); + high = read_uleb128 (&rnglists_buf); + if (!add_range (state, rdata, + libbacktrace_add_base (low + base, base_address), + libbacktrace_add_base (high + base, base_address), + error_callback, data, vec)) + return 0; + } + break; + + case DW_RLE_base_address: + base = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + break; + + case DW_RLE_start_end: + { + uintptr_t low; + uintptr_t high; + + low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + high = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + if (!add_range (state, rdata, + libbacktrace_add_base (low, base_address), + libbacktrace_add_base (high, base_address), + error_callback, data, vec)) + return 0; + } + break; + + case DW_RLE_start_length: + { + uintptr_t low; + uintptr_t length; + + low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); + length = (uintptr_t) read_uleb128 (&rnglists_buf); + low = libbacktrace_add_base (low, base_address); + if (!add_range (state, rdata, low, low + length, + error_callback, data, vec)) + return 
0; + } + break; + + default: + dwarf_buf_error (&rnglists_buf, "unrecognized DW_RLE value", -1); + return 0; + } + } + + if (rnglists_buf.reported_underflow) + return 0; + + return 1; +} + +/* Call ADD_RANGE for each lowpc/highpc pair in PCRANGE. RDATA is + passed to ADD_RANGE, and is either a struct unit * or a struct + function *. VEC is the vector we are adding ranges to, and is + either a struct unit_addrs_vector * or a struct function_vector *. + Returns 1 on success, 0 on error. */ + +static int +add_ranges (struct backtrace_state *state, + const struct dwarf_sections *dwarf_sections, + struct libbacktrace_base_address base_address, int is_bigendian, + struct unit *u, uintptr_t base, const struct pcrange *pcrange, + int (*add_range) (struct backtrace_state *state, void *rdata, + uintptr_t lowpc, uintptr_t highpc, + backtrace_error_callback error_callback, + void *data, void *vec), + void *rdata, + backtrace_error_callback error_callback, void *data, + void *vec) +{ + if (pcrange->have_lowpc && pcrange->have_highpc) + return add_low_high_range (state, dwarf_sections, base_address, + is_bigendian, u, pcrange, add_range, rdata, + error_callback, data, vec); + + if (!pcrange->have_ranges) + { + /* Did not find any address ranges to add. */ + return 1; + } + + if (u->version < 5) + return add_ranges_from_ranges (state, dwarf_sections, base_address, + is_bigendian, u, base, pcrange, add_range, + rdata, error_callback, data, vec); + else + return add_ranges_from_rnglists (state, dwarf_sections, base_address, + is_bigendian, u, base, pcrange, add_range, + rdata, error_callback, data, vec); +} + +/* Find the address range covered by a compilation unit, reading from + UNIT_BUF and adding values to U. Returns 1 if all data could be + read, 0 if there is some error. 
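+   The DIEs of the unit are read in order, recursing into children, so ranges attached to nested DW_TAG_subprogram entries are recorded as well.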
*/ + +static int +find_address_ranges (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + struct dwarf_buf *unit_buf, + const struct dwarf_sections *dwarf_sections, + int is_bigendian, struct dwarf_data *altlink, + backtrace_error_callback error_callback, void *data, + struct unit *u, struct unit_addrs_vector *addrs, + enum dwarf_tag *unit_tag) +{ + while (unit_buf->left > 0) + { + uint64_t code; + const struct abbrev *abbrev; + struct pcrange pcrange; + struct attr_val name_val; + int have_name_val; + struct attr_val comp_dir_val; + int have_comp_dir_val; + size_t i; + + code = read_uleb128 (unit_buf); + if (code == 0) + return 1; + + abbrev = lookup_abbrev (&u->abbrevs, code, error_callback, data); + if (abbrev == NULL) + return 0; + + if (unit_tag != NULL) + *unit_tag = abbrev->tag; + + memset (&pcrange, 0, sizeof pcrange); + memset (&name_val, 0, sizeof name_val); + have_name_val = 0; + memset (&comp_dir_val, 0, sizeof comp_dir_val); + have_comp_dir_val = 0; + for (i = 0; i < abbrev->num_attrs; ++i) + { + struct attr_val val; + + if (!read_attribute (abbrev->attrs[i].form, abbrev->attrs[i].val, + unit_buf, u->is_dwarf64, u->version, + u->addrsize, dwarf_sections, altlink, &val)) + return 0; + + switch (abbrev->attrs[i].name) + { + case DW_AT_low_pc: case DW_AT_high_pc: case DW_AT_ranges: + update_pcrange (&abbrev->attrs[i], &val, &pcrange); + break; + + case DW_AT_stmt_list: + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && (val.encoding == ATTR_VAL_UINT + || val.encoding == ATTR_VAL_REF_SECTION)) + u->lineoff = val.u.uint; + break; + + case DW_AT_name: + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + { + name_val = val; + have_name_val = 1; + } + break; + + case DW_AT_comp_dir: + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + { + comp_dir_val = val; + have_comp_dir_val = 1; + } + break; + + case DW_AT_str_offsets_base: + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && val.encoding == ATTR_VAL_REF_SECTION) + u->str_offsets_base = val.u.uint; + break; + + case DW_AT_addr_base: + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && val.encoding == ATTR_VAL_REF_SECTION) + u->addr_base = val.u.uint; + break; + + case DW_AT_rnglists_base: + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && val.encoding == ATTR_VAL_REF_SECTION) + u->rnglists_base = val.u.uint; + break; + + default: + break; + } + } + + // Resolve strings after we're sure that we have seen + // DW_AT_str_offsets_base. + if (have_name_val) + { + if (!resolve_string (dwarf_sections, u->is_dwarf64, is_bigendian, + u->str_offsets_base, &name_val, + error_callback, data, &u->filename)) + return 0; + } + if (have_comp_dir_val) + { + if (!resolve_string (dwarf_sections, u->is_dwarf64, is_bigendian, + u->str_offsets_base, &comp_dir_val, + error_callback, data, &u->comp_dir)) + return 0; + } + + if (abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_subprogram + || abbrev->tag == DW_TAG_skeleton_unit) + { + if (!add_ranges (state, dwarf_sections, base_address, + is_bigendian, u, pcrange.lowpc, &pcrange, + add_unit_addr, (void *) u, error_callback, data, + (void *) addrs)) + return 0; + + /* If we found the PC range in the DW_TAG_compile_unit or + DW_TAG_skeleton_unit, we can stop now. 
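+   The remaining DIEs are not needed to build the unit address map; per-function ranges are gathered later, as needed, along with the unit's line information.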
*/ + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && (pcrange.have_ranges + || (pcrange.have_lowpc && pcrange.have_highpc))) + return 1; + } + + if (abbrev->has_children) + { + if (!find_address_ranges (state, base_address, unit_buf, + dwarf_sections, is_bigendian, altlink, + error_callback, data, u, addrs, NULL)) + return 0; + } + } + + return 1; +} + +/* Build a mapping from address ranges to the compilation units where + the line number information for that range can be found. Returns 1 + on success, 0 on failure. */ + +static int +build_address_map (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + const struct dwarf_sections *dwarf_sections, + int is_bigendian, struct dwarf_data *altlink, + backtrace_error_callback error_callback, void *data, + struct unit_addrs_vector *addrs, + struct unit_vector *unit_vec) +{ + struct dwarf_buf info; + struct backtrace_vector units; + size_t units_count; + size_t i; + struct unit **pu; + size_t unit_offset = 0; + struct unit_addrs *pa; + + memset (&addrs->vec, 0, sizeof addrs->vec); + memset (&unit_vec->vec, 0, sizeof unit_vec->vec); + addrs->count = 0; + unit_vec->count = 0; + + /* Read through the .debug_info section. FIXME: Should we use the + .debug_aranges section? gdb and addr2line don't use it, but I'm + not sure why. */ + + info.name = ".debug_info"; + info.start = dwarf_sections->data[DEBUG_INFO]; + info.buf = info.start; + info.left = dwarf_sections->size[DEBUG_INFO]; + info.is_bigendian = is_bigendian; + info.error_callback = error_callback; + info.data = data; + info.reported_underflow = 0; + + memset (&units, 0, sizeof units); + units_count = 0; + + while (info.left > 0) + { + const unsigned char *unit_data_start; + uint64_t len; + int is_dwarf64; + struct dwarf_buf unit_buf; + int version; + int unit_type; + uint64_t abbrev_offset; + int addrsize; + struct unit *u; + enum dwarf_tag unit_tag; + + if (info.reported_underflow) + goto fail; + + unit_data_start = info.buf; + + len = read_initial_length (&info, &is_dwarf64); + unit_buf = info; + unit_buf.left = len; + + if (!advance (&info, len)) + goto fail; + + version = read_uint16 (&unit_buf); + if (version < 2 || version > 5) + { + dwarf_buf_error (&unit_buf, "unrecognized DWARF version", -1); + goto fail; + } + + if (version < 5) + unit_type = 0; + else + { + unit_type = read_byte (&unit_buf); + if (unit_type == DW_UT_type || unit_type == DW_UT_split_type) + { + /* This unit doesn't have anything we need. */ + continue; + } + } + + pu = ((struct unit **) + backtrace_vector_grow (state, sizeof (struct unit *), + error_callback, data, &units)); + if (pu == NULL) + goto fail; + + u = ((struct unit *) + backtrace_alloc (state, sizeof *u, error_callback, data)); + if (u == NULL) + goto fail; + + *pu = u; + ++units_count; + + if (version < 5) + addrsize = 0; /* Set below. 
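+   In DWARF 2 through 4 the address size field follows the abbreviation offset in the unit header, whereas in DWARF 5 it precedes it.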
*/ + else + addrsize = read_byte (&unit_buf); + + memset (&u->abbrevs, 0, sizeof u->abbrevs); + abbrev_offset = read_offset (&unit_buf, is_dwarf64); + if (!read_abbrevs (state, abbrev_offset, + dwarf_sections->data[DEBUG_ABBREV], + dwarf_sections->size[DEBUG_ABBREV], + is_bigendian, error_callback, data, &u->abbrevs)) + goto fail; + + if (version < 5) + addrsize = read_byte (&unit_buf); + + switch (unit_type) + { + case 0: + break; + case DW_UT_compile: case DW_UT_partial: + break; + case DW_UT_skeleton: case DW_UT_split_compile: + read_uint64 (&unit_buf); /* dwo_id */ + break; + default: + break; + } + + u->low_offset = unit_offset; + unit_offset += len + (is_dwarf64 ? 12 : 4); + u->high_offset = unit_offset; + u->unit_data = unit_buf.buf; + u->unit_data_len = unit_buf.left; + u->unit_data_offset = unit_buf.buf - unit_data_start; + u->version = version; + u->is_dwarf64 = is_dwarf64; + u->addrsize = addrsize; + u->filename = NULL; + u->comp_dir = NULL; + u->abs_filename = NULL; + u->lineoff = 0; + u->str_offsets_base = 0; + u->addr_base = 0; + u->rnglists_base = 0; + + /* The actual line number mappings will be read as needed. */ + u->lines = NULL; + u->lines_count = 0; + u->function_addrs = NULL; + u->function_addrs_count = 0; + + if (!find_address_ranges (state, base_address, &unit_buf, dwarf_sections, + is_bigendian, altlink, error_callback, data, + u, addrs, &unit_tag)) + goto fail; + + if (unit_buf.reported_underflow) + goto fail; + } + if (info.reported_underflow) + goto fail; + + /* Add a trailing addrs entry, but don't include it in addrs->count. */ + pa = ((struct unit_addrs *) + backtrace_vector_grow (state, sizeof (struct unit_addrs), + error_callback, data, &addrs->vec)); + if (pa == NULL) + goto fail; + pa->low = 0; + --pa->low; + pa->high = pa->low; + pa->u = NULL; + + unit_vec->vec = units; + unit_vec->count = units_count; + return 1; + + fail: + if (units_count > 0) + { + pu = (struct unit **) units.base; + for (i = 0; i < units_count; i++) + { + free_abbrevs (state, &pu[i]->abbrevs, error_callback, data); + backtrace_free (state, pu[i], sizeof **pu, error_callback, data); + } + backtrace_vector_free (state, &units, error_callback, data); + } + if (addrs->count > 0) + { + backtrace_vector_free (state, &addrs->vec, error_callback, data); + addrs->count = 0; + } + return 0; +} + +/* Add a new mapping to the vector of line mappings that we are + building. Returns 1 on success, 0 on failure. */ + +static int +add_line (struct backtrace_state *state, struct dwarf_data *ddata, + uintptr_t pc, const char *filename, int lineno, + backtrace_error_callback error_callback, void *data, + struct line_vector *vec) +{ + struct line *ln; + + /* If we are adding the same mapping, ignore it. This can happen + when using discriminators. */ + if (vec->count > 0) + { + ln = (struct line *) vec->vec.base + (vec->count - 1); + if (pc == ln->pc && filename == ln->filename && lineno == ln->lineno) + return 1; + } + + ln = ((struct line *) + backtrace_vector_grow (state, sizeof (struct line), error_callback, + data, &vec->vec)); + if (ln == NULL) + return 0; + + /* Add in the base address here, so that we can look up the PC + directly. */ + ln->pc = libbacktrace_add_base (pc, ddata->base_address); + + ln->filename = filename; + ln->lineno = lineno; + ln->idx = vec->count; + + ++vec->count; + + return 1; +} + +/* Free the line header information. 
*/ + +static void +free_line_header (struct backtrace_state *state, struct line_header *hdr, + backtrace_error_callback error_callback, void *data) +{ + if (hdr->dirs_count != 0) + backtrace_free (state, hdr->dirs, hdr->dirs_count * sizeof (const char *), + error_callback, data); + backtrace_free (state, hdr->filenames, + hdr->filenames_count * sizeof (char *), + error_callback, data); +} + +/* Read the directories and file names for a line header for version + 2, setting fields in HDR. Return 1 on success, 0 on failure. */ + +static int +read_v2_paths (struct backtrace_state *state, struct unit *u, + struct dwarf_buf *hdr_buf, struct line_header *hdr) +{ + const unsigned char *p; + const unsigned char *pend; + size_t i; + + /* Count the number of directory entries. */ + hdr->dirs_count = 0; + p = hdr_buf->buf; + pend = p + hdr_buf->left; + while (p < pend && *p != '\0') + { + p += strnlen((const char *) p, pend - p) + 1; + ++hdr->dirs_count; + } + + /* The index of the first entry in the list of directories is 1. Index 0 is + used for the current directory of the compilation. To simplify index + handling, we set entry 0 to the compilation unit directory. */ + ++hdr->dirs_count; + hdr->dirs = ((const char **) + backtrace_alloc (state, + hdr->dirs_count * sizeof (const char *), + hdr_buf->error_callback, + hdr_buf->data)); + if (hdr->dirs == NULL) + return 0; + + hdr->dirs[0] = u->comp_dir; + i = 1; + while (*hdr_buf->buf != '\0') + { + if (hdr_buf->reported_underflow) + return 0; + + hdr->dirs[i] = read_string (hdr_buf); + if (hdr->dirs[i] == NULL) + return 0; + ++i; + } + if (!advance (hdr_buf, 1)) + return 0; + + /* Count the number of file entries. */ + hdr->filenames_count = 0; + p = hdr_buf->buf; + pend = p + hdr_buf->left; + while (p < pend && *p != '\0') + { + p += strnlen ((const char *) p, pend - p) + 1; + p += leb128_len (p); + p += leb128_len (p); + p += leb128_len (p); + ++hdr->filenames_count; + } + + /* The index of the first entry in the list of file names is 1. Index 0 is + used for the DW_AT_name of the compilation unit. To simplify index + handling, we set entry 0 to the compilation unit file name. */ + ++hdr->filenames_count; + hdr->filenames = ((const char **) + backtrace_alloc (state, + hdr->filenames_count * sizeof (char *), + hdr_buf->error_callback, + hdr_buf->data)); + if (hdr->filenames == NULL) + return 0; + hdr->filenames[0] = u->filename; + i = 1; + while (*hdr_buf->buf != '\0') + { + const char *filename; + uint64_t dir_index; + + if (hdr_buf->reported_underflow) + return 0; + + filename = read_string (hdr_buf); + if (filename == NULL) + return 0; + dir_index = read_uleb128 (hdr_buf); + if (IS_ABSOLUTE_PATH (filename) + || (dir_index < hdr->dirs_count && hdr->dirs[dir_index] == NULL)) + hdr->filenames[i] = filename; + else + { + const char *dir; + size_t dir_len; + size_t filename_len; + char *s; + + if (dir_index < hdr->dirs_count) + dir = hdr->dirs[dir_index]; + else + { + dwarf_buf_error (hdr_buf, + ("invalid directory index in " + "line number program header"), + 0); + return 0; + } + dir_len = strlen (dir); + filename_len = strlen (filename); + s = ((char *) backtrace_alloc (state, dir_len + filename_len + 2, + hdr_buf->error_callback, + hdr_buf->data)); + if (s == NULL) + return 0; + memcpy (s, dir, dir_len); + /* FIXME: If we are on a DOS-based file system, and the + directory or the file name use backslashes, then we + should use a backslash here. 
*/ + s[dir_len] = '/'; + memcpy (s + dir_len + 1, filename, filename_len + 1); + hdr->filenames[i] = s; + } + + /* Ignore the modification time and size. */ + read_uleb128 (hdr_buf); + read_uleb128 (hdr_buf); + + ++i; + } + + return 1; +} + +/* Read a single version 5 LNCT entry for a directory or file name in a + line header. Sets *STRING to the resulting name, ignoring other + data. Return 1 on success, 0 on failure. */ + +static int +read_lnct (struct backtrace_state *state, struct dwarf_data *ddata, + struct unit *u, struct dwarf_buf *hdr_buf, + const struct line_header *hdr, size_t formats_count, + const struct line_header_format *formats, const char **string) +{ + size_t i; + const char *dir; + const char *path; + + dir = NULL; + path = NULL; + for (i = 0; i < formats_count; i++) + { + struct attr_val val; + + if (!read_attribute (formats[i].form, 0, hdr_buf, u->is_dwarf64, + u->version, hdr->addrsize, &ddata->dwarf_sections, + ddata->altlink, &val)) + return 0; + switch (formats[i].lnct) + { + case DW_LNCT_path: + if (!resolve_string (&ddata->dwarf_sections, u->is_dwarf64, + ddata->is_bigendian, u->str_offsets_base, + &val, hdr_buf->error_callback, hdr_buf->data, + &path)) + return 0; + break; + case DW_LNCT_directory_index: + if (val.encoding == ATTR_VAL_UINT) + { + if (val.u.uint >= hdr->dirs_count) + { + dwarf_buf_error (hdr_buf, + ("invalid directory index in " + "line number program header"), + 0); + return 0; + } + dir = hdr->dirs[val.u.uint]; + } + break; + default: + /* We don't care about timestamps or sizes or hashes. */ + break; + } + } + + if (path == NULL) + { + dwarf_buf_error (hdr_buf, + "missing file name in line number program header", + 0); + return 0; + } + + if (dir == NULL) + *string = path; + else + { + size_t dir_len; + size_t path_len; + char *s; + + dir_len = strlen (dir); + path_len = strlen (path); + s = (char *) backtrace_alloc (state, dir_len + path_len + 2, + hdr_buf->error_callback, hdr_buf->data); + if (s == NULL) + return 0; + memcpy (s, dir, dir_len); + /* FIXME: If we are on a DOS-based file system, and the + directory or the path name use backslashes, then we should + use a backslash here. */ + s[dir_len] = '/'; + memcpy (s + dir_len + 1, path, path_len + 1); + *string = s; + } + + return 1; +} + +/* Read a set of DWARF 5 line header format entries, setting *PCOUNT + and *PPATHS. Return 1 on success, 0 on failure. 
*/ + +static int +read_line_header_format_entries (struct backtrace_state *state, + struct dwarf_data *ddata, + struct unit *u, + struct dwarf_buf *hdr_buf, + struct line_header *hdr, + size_t *pcount, + const char ***ppaths) +{ + size_t formats_count; + struct line_header_format *formats; + size_t paths_count; + const char **paths; + size_t i; + int ret; + + formats_count = read_byte (hdr_buf); + if (formats_count == 0) + formats = NULL; + else + { + formats = ((struct line_header_format *) + backtrace_alloc (state, + (formats_count + * sizeof (struct line_header_format)), + hdr_buf->error_callback, + hdr_buf->data)); + if (formats == NULL) + return 0; + + for (i = 0; i < formats_count; i++) + { + formats[i].lnct = (int) read_uleb128(hdr_buf); + formats[i].form = (enum dwarf_form) read_uleb128 (hdr_buf); + } + } + + paths_count = read_uleb128 (hdr_buf); + if (paths_count == 0) + { + *pcount = 0; + *ppaths = NULL; + ret = 1; + goto exit; + } + + paths = ((const char **) + backtrace_alloc (state, paths_count * sizeof (const char *), + hdr_buf->error_callback, hdr_buf->data)); + if (paths == NULL) + { + ret = 0; + goto exit; + } + for (i = 0; i < paths_count; i++) + { + if (!read_lnct (state, ddata, u, hdr_buf, hdr, formats_count, + formats, &paths[i])) + { + backtrace_free (state, paths, + paths_count * sizeof (const char *), + hdr_buf->error_callback, hdr_buf->data); + ret = 0; + goto exit; + } + } + + *pcount = paths_count; + *ppaths = paths; + + ret = 1; + + exit: + if (formats != NULL) + backtrace_free (state, formats, + formats_count * sizeof (struct line_header_format), + hdr_buf->error_callback, hdr_buf->data); + + return ret; +} + +/* Read the line header. Return 1 on success, 0 on failure. */ + +static int +read_line_header (struct backtrace_state *state, struct dwarf_data *ddata, + struct unit *u, int is_dwarf64, struct dwarf_buf *line_buf, + struct line_header *hdr) +{ + uint64_t hdrlen; + struct dwarf_buf hdr_buf; + + hdr->version = read_uint16 (line_buf); + if (hdr->version < 2 || hdr->version > 5) + { + dwarf_buf_error (line_buf, "unsupported line number version", -1); + return 0; + } + + if (hdr->version < 5) + hdr->addrsize = u->addrsize; + else + { + hdr->addrsize = read_byte (line_buf); + /* We could support a non-zero segment_selector_size but I doubt + we'll ever see it. */ + if (read_byte (line_buf) != 0) + { + dwarf_buf_error (line_buf, + "non-zero segment_selector_size not supported", + -1); + return 0; + } + } + + hdrlen = read_offset (line_buf, is_dwarf64); + + hdr_buf = *line_buf; + hdr_buf.left = hdrlen; + + if (!advance (line_buf, hdrlen)) + return 0; + + hdr->min_insn_len = read_byte (&hdr_buf); + if (hdr->version < 4) + hdr->max_ops_per_insn = 1; + else + hdr->max_ops_per_insn = read_byte (&hdr_buf); + + /* We don't care about default_is_stmt. 
*/ + read_byte (&hdr_buf); + + hdr->line_base = read_sbyte (&hdr_buf); + hdr->line_range = read_byte (&hdr_buf); + + hdr->opcode_base = read_byte (&hdr_buf); + hdr->opcode_lengths = hdr_buf.buf; + if (!advance (&hdr_buf, hdr->opcode_base - 1)) + return 0; + + if (hdr->version < 5) + { + if (!read_v2_paths (state, u, &hdr_buf, hdr)) + return 0; + } + else + { + if (!read_line_header_format_entries (state, ddata, u, &hdr_buf, hdr, + &hdr->dirs_count, + &hdr->dirs)) + return 0; + if (!read_line_header_format_entries (state, ddata, u, &hdr_buf, hdr, + &hdr->filenames_count, + &hdr->filenames)) + return 0; + } + + if (hdr_buf.reported_underflow) + return 0; + + return 1; +} + +/* Read the line program, adding line mappings to VEC. Return 1 on + success, 0 on failure. */ + +static int +read_line_program (struct backtrace_state *state, struct dwarf_data *ddata, + const struct line_header *hdr, struct dwarf_buf *line_buf, + struct line_vector *vec) +{ + uint64_t address; + unsigned int op_index; + const char *reset_filename; + const char *filename; + int lineno; + + address = 0; + op_index = 0; + if (hdr->filenames_count > 1) + reset_filename = hdr->filenames[1]; + else + reset_filename = ""; + filename = reset_filename; + lineno = 1; + while (line_buf->left > 0) + { + unsigned int op; + + op = read_byte (line_buf); + if (op >= hdr->opcode_base) + { + unsigned int advance; + + /* Special opcode. */ + op -= hdr->opcode_base; + advance = op / hdr->line_range; + address += (hdr->min_insn_len * (op_index + advance) + / hdr->max_ops_per_insn); + op_index = (op_index + advance) % hdr->max_ops_per_insn; + lineno += hdr->line_base + (int) (op % hdr->line_range); + add_line (state, ddata, address, filename, lineno, + line_buf->error_callback, line_buf->data, vec); + } + else if (op == DW_LNS_extended_op) + { + uint64_t len; + + len = read_uleb128 (line_buf); + op = read_byte (line_buf); + switch (op) + { + case DW_LNE_end_sequence: + /* FIXME: Should we mark the high PC here? It seems + that we already have that information from the + compilation unit. */ + address = 0; + op_index = 0; + filename = reset_filename; + lineno = 1; + break; + case DW_LNE_set_address: + address = read_address (line_buf, hdr->addrsize); + break; + case DW_LNE_define_file: + { + const char *f; + unsigned int dir_index; + + f = read_string (line_buf); + if (f == NULL) + return 0; + dir_index = read_uleb128 (line_buf); + /* Ignore that time and length. */ + read_uleb128 (line_buf); + read_uleb128 (line_buf); + if (IS_ABSOLUTE_PATH (f)) + filename = f; + else + { + const char *dir; + size_t dir_len; + size_t f_len; + char *p; + + if (dir_index < hdr->dirs_count) + dir = hdr->dirs[dir_index]; + else + { + dwarf_buf_error (line_buf, + ("invalid directory index " + "in line number program"), + 0); + return 0; + } + dir_len = strlen (dir); + f_len = strlen (f); + p = ((char *) + backtrace_alloc (state, dir_len + f_len + 2, + line_buf->error_callback, + line_buf->data)); + if (p == NULL) + return 0; + memcpy (p, dir, dir_len); + /* FIXME: If we are on a DOS-based file system, + and the directory or the file name use + backslashes, then we should use a backslash + here. */ + p[dir_len] = '/'; + memcpy (p + dir_len + 1, f, f_len + 1); + filename = p; + } + } + break; + case DW_LNE_set_discriminator: + /* We don't care about discriminators. 
*/ + read_uleb128 (line_buf); + break; + default: + if (!advance (line_buf, len - 1)) + return 0; + break; + } + } + else + { + switch (op) + { + case DW_LNS_copy: + add_line (state, ddata, address, filename, lineno, + line_buf->error_callback, line_buf->data, vec); + break; + case DW_LNS_advance_pc: + { + uint64_t advance; + + advance = read_uleb128 (line_buf); + address += (hdr->min_insn_len * (op_index + advance) + / hdr->max_ops_per_insn); + op_index = (op_index + advance) % hdr->max_ops_per_insn; + } + break; + case DW_LNS_advance_line: + lineno += (int) read_sleb128 (line_buf); + break; + case DW_LNS_set_file: + { + uint64_t fileno; + + fileno = read_uleb128 (line_buf); + if (fileno >= hdr->filenames_count) + { + dwarf_buf_error (line_buf, + ("invalid file number in " + "line number program"), + 0); + return 0; + } + filename = hdr->filenames[fileno]; + } + break; + case DW_LNS_set_column: + read_uleb128 (line_buf); + break; + case DW_LNS_negate_stmt: + break; + case DW_LNS_set_basic_block: + break; + case DW_LNS_const_add_pc: + { + unsigned int advance; + + op = 255 - hdr->opcode_base; + advance = op / hdr->line_range; + address += (hdr->min_insn_len * (op_index + advance) + / hdr->max_ops_per_insn); + op_index = (op_index + advance) % hdr->max_ops_per_insn; + } + break; + case DW_LNS_fixed_advance_pc: + address += read_uint16 (line_buf); + op_index = 0; + break; + case DW_LNS_set_prologue_end: + break; + case DW_LNS_set_epilogue_begin: + break; + case DW_LNS_set_isa: + read_uleb128 (line_buf); + break; + default: + { + unsigned int i; + + for (i = hdr->opcode_lengths[op - 1]; i > 0; --i) + read_uleb128 (line_buf); + } + break; + } + } + } + + return 1; +} + +/* Read the line number information for a compilation unit. Returns 1 + on success, 0 on failure. */ + +static int +read_line_info (struct backtrace_state *state, struct dwarf_data *ddata, + backtrace_error_callback error_callback, void *data, + struct unit *u, struct line_header *hdr, struct line **lines, + size_t *lines_count) +{ + struct line_vector vec; + struct dwarf_buf line_buf; + uint64_t len; + int is_dwarf64; + struct line *ln; + + memset (&vec.vec, 0, sizeof vec.vec); + vec.count = 0; + + memset (hdr, 0, sizeof *hdr); + + if (u->lineoff != (off_t) (size_t) u->lineoff + || (size_t) u->lineoff >= ddata->dwarf_sections.size[DEBUG_LINE]) + { + error_callback (data, "unit line offset out of range", 0); + goto fail; + } + + line_buf.name = ".debug_line"; + line_buf.start = ddata->dwarf_sections.data[DEBUG_LINE]; + line_buf.buf = ddata->dwarf_sections.data[DEBUG_LINE] + u->lineoff; + line_buf.left = ddata->dwarf_sections.size[DEBUG_LINE] - u->lineoff; + line_buf.is_bigendian = ddata->is_bigendian; + line_buf.error_callback = error_callback; + line_buf.data = data; + line_buf.reported_underflow = 0; + + len = read_initial_length (&line_buf, &is_dwarf64); + line_buf.left = len; + + if (!read_line_header (state, ddata, u, is_dwarf64, &line_buf, hdr)) + goto fail; + + if (!read_line_program (state, ddata, hdr, &line_buf, &vec)) + goto fail; + + if (line_buf.reported_underflow) + goto fail; + + if (vec.count == 0) + { + /* This is not a failure in the sense of a generating an error, + but it is a failure in that sense that we have no useful + information. */ + goto fail; + } + + /* Allocate one extra entry at the end. 
*/ + ln = ((struct line *) + backtrace_vector_grow (state, sizeof (struct line), error_callback, + data, &vec.vec)); + if (ln == NULL) + goto fail; + ln->pc = (uintptr_t) -1; + ln->filename = NULL; + ln->lineno = 0; + ln->idx = 0; + + if (!backtrace_vector_release (state, &vec.vec, error_callback, data)) + goto fail; + + ln = (struct line *) vec.vec.base; + backtrace_qsort (ln, vec.count, sizeof (struct line), line_compare); + + *lines = ln; + *lines_count = vec.count; + + return 1; + + fail: + backtrace_vector_free (state, &vec.vec, error_callback, data); + free_line_header (state, hdr, error_callback, data); + *lines = (struct line *) (uintptr_t) -1; + *lines_count = 0; + return 0; +} + +static const char *read_referenced_name (struct dwarf_data *, struct unit *, + uint64_t, backtrace_error_callback, + void *); + +/* Read the name of a function from a DIE referenced by ATTR with VAL. */ + +static const char * +read_referenced_name_from_attr (struct dwarf_data *ddata, struct unit *u, + struct attr *attr, struct attr_val *val, + backtrace_error_callback error_callback, + void *data) +{ + switch (attr->name) + { + case DW_AT_abstract_origin: + case DW_AT_specification: + break; + default: + return NULL; + } + + if (attr->form == DW_FORM_ref_sig8) + return NULL; + + if (val->encoding == ATTR_VAL_REF_INFO) + { + struct unit *unit + = find_unit (ddata->units, ddata->units_count, + val->u.uint); + if (unit == NULL) + return NULL; + + uint64_t offset = val->u.uint - unit->low_offset; + return read_referenced_name (ddata, unit, offset, error_callback, data); + } + + if (val->encoding == ATTR_VAL_UINT + || val->encoding == ATTR_VAL_REF_UNIT) + return read_referenced_name (ddata, u, val->u.uint, error_callback, data); + + if (val->encoding == ATTR_VAL_REF_ALT_INFO) + { + struct unit *alt_unit + = find_unit (ddata->altlink->units, ddata->altlink->units_count, + val->u.uint); + if (alt_unit == NULL) + return NULL; + + uint64_t offset = val->u.uint - alt_unit->low_offset; + return read_referenced_name (ddata->altlink, alt_unit, offset, + error_callback, data); + } + + return NULL; +} + +/* Read the name of a function from a DIE referenced by a + DW_AT_abstract_origin or DW_AT_specification tag. OFFSET is within + the same compilation unit. */ + +static const char * +read_referenced_name (struct dwarf_data *ddata, struct unit *u, + uint64_t offset, backtrace_error_callback error_callback, + void *data) +{ + struct dwarf_buf unit_buf; + uint64_t code; + const struct abbrev *abbrev; + const char *ret; + size_t i; + + /* OFFSET is from the start of the data for this compilation unit. + U->unit_data is the data, but it starts U->unit_data_offset bytes + from the beginning. 
*/ + + if (offset < u->unit_data_offset + || offset - u->unit_data_offset >= u->unit_data_len) + { + error_callback (data, + "abstract origin or specification out of range", + 0); + return NULL; + } + + offset -= u->unit_data_offset; + + unit_buf.name = ".debug_info"; + unit_buf.start = ddata->dwarf_sections.data[DEBUG_INFO]; + unit_buf.buf = u->unit_data + offset; + unit_buf.left = u->unit_data_len - offset; + unit_buf.is_bigendian = ddata->is_bigendian; + unit_buf.error_callback = error_callback; + unit_buf.data = data; + unit_buf.reported_underflow = 0; + + code = read_uleb128 (&unit_buf); + if (code == 0) + { + dwarf_buf_error (&unit_buf, + "invalid abstract origin or specification", + 0); + return NULL; + } + + abbrev = lookup_abbrev (&u->abbrevs, code, error_callback, data); + if (abbrev == NULL) + return NULL; + + ret = NULL; + for (i = 0; i < abbrev->num_attrs; ++i) + { + struct attr_val val; + + if (!read_attribute (abbrev->attrs[i].form, abbrev->attrs[i].val, + &unit_buf, u->is_dwarf64, u->version, u->addrsize, + &ddata->dwarf_sections, ddata->altlink, &val)) + return NULL; + + switch (abbrev->attrs[i].name) + { + case DW_AT_name: + /* Third name preference: don't override. A name we found in some + other way, will normally be more useful -- e.g., this name is + normally not mangled. */ + if (ret != NULL) + break; + if (!resolve_string (&ddata->dwarf_sections, u->is_dwarf64, + ddata->is_bigendian, u->str_offsets_base, + &val, error_callback, data, &ret)) + return NULL; + break; + + case DW_AT_linkage_name: + case DW_AT_MIPS_linkage_name: + /* First name preference: override all. */ + { + const char *s; + + s = NULL; + if (!resolve_string (&ddata->dwarf_sections, u->is_dwarf64, + ddata->is_bigendian, u->str_offsets_base, + &val, error_callback, data, &s)) + return NULL; + if (s != NULL) + return s; + } + break; + + case DW_AT_specification: + /* Second name preference: override DW_AT_name, don't override + DW_AT_linkage_name. */ + { + const char *name; + + name = read_referenced_name_from_attr (ddata, u, &abbrev->attrs[i], + &val, error_callback, data); + if (name != NULL) + ret = name; + } + break; + + default: + break; + } + } + + return ret; +} + +/* Add a range to a unit that maps to a function. This is called via + add_ranges. Returns 1 on success, 0 on error. */ + +static int +add_function_range (struct backtrace_state *state, void *rdata, + uintptr_t lowpc, uintptr_t highpc, + backtrace_error_callback error_callback, void *data, + void *pvec) +{ + struct function *function = (struct function *) rdata; + struct function_vector *vec = (struct function_vector *) pvec; + struct function_addrs *p; + + if (vec->count > 0) + { + p = (struct function_addrs *) vec->vec.base + (vec->count - 1); + if ((lowpc == p->high || lowpc == p->high + 1) + && function == p->function) + { + if (highpc > p->high) + p->high = highpc; + return 1; + } + } + + p = ((struct function_addrs *) + backtrace_vector_grow (state, sizeof (struct function_addrs), + error_callback, data, &vec->vec)); + if (p == NULL) + return 0; + + p->low = lowpc; + p->high = highpc; + p->function = function; + + ++vec->count; + + return 1; +} + +/* Read one entry plus all its children. Add function addresses to + VEC. Returns 1 on success, 0 on error. 
*/ + +static int +read_function_entry (struct backtrace_state *state, struct dwarf_data *ddata, + struct unit *u, uintptr_t base, struct dwarf_buf *unit_buf, + const struct line_header *lhdr, + backtrace_error_callback error_callback, void *data, + struct function_vector *vec_function, + struct function_vector *vec_inlined) +{ + while (unit_buf->left > 0) + { + uint64_t code; + const struct abbrev *abbrev; + int is_function; + struct function *function; + struct function_vector *vec; + size_t i; + struct pcrange pcrange; + int have_linkage_name; + + code = read_uleb128 (unit_buf); + if (code == 0) + return 1; + + abbrev = lookup_abbrev (&u->abbrevs, code, error_callback, data); + if (abbrev == NULL) + return 0; + + is_function = (abbrev->tag == DW_TAG_subprogram + || abbrev->tag == DW_TAG_entry_point + || abbrev->tag == DW_TAG_inlined_subroutine); + + if (abbrev->tag == DW_TAG_inlined_subroutine) + vec = vec_inlined; + else + vec = vec_function; + + function = NULL; + if (is_function) + { + function = ((struct function *) + backtrace_alloc (state, sizeof *function, + error_callback, data)); + if (function == NULL) + return 0; + memset (function, 0, sizeof *function); + } + + memset (&pcrange, 0, sizeof pcrange); + have_linkage_name = 0; + for (i = 0; i < abbrev->num_attrs; ++i) + { + struct attr_val val; + + if (!read_attribute (abbrev->attrs[i].form, abbrev->attrs[i].val, + unit_buf, u->is_dwarf64, u->version, + u->addrsize, &ddata->dwarf_sections, + ddata->altlink, &val)) + return 0; + + /* The compile unit sets the base address for any address + ranges in the function entries. */ + if ((abbrev->tag == DW_TAG_compile_unit + || abbrev->tag == DW_TAG_skeleton_unit) + && abbrev->attrs[i].name == DW_AT_low_pc) + { + if (val.encoding == ATTR_VAL_ADDRESS) + base = (uintptr_t) val.u.uint; + else if (val.encoding == ATTR_VAL_ADDRESS_INDEX) + { + if (!resolve_addr_index (&ddata->dwarf_sections, + u->addr_base, u->addrsize, + ddata->is_bigendian, val.u.uint, + error_callback, data, &base)) + return 0; + } + } + + if (is_function) + { + switch (abbrev->attrs[i].name) + { + case DW_AT_call_file: + if (val.encoding == ATTR_VAL_UINT) + { + if (val.u.uint >= lhdr->filenames_count) + { + dwarf_buf_error (unit_buf, + ("invalid file number in " + "DW_AT_call_file attribute"), + 0); + return 0; + } + function->caller_filename = lhdr->filenames[val.u.uint]; + } + break; + + case DW_AT_call_line: + if (val.encoding == ATTR_VAL_UINT) + function->caller_lineno = val.u.uint; + break; + + case DW_AT_abstract_origin: + case DW_AT_specification: + /* Second name preference: override DW_AT_name, don't override + DW_AT_linkage_name. */ + if (have_linkage_name) + break; + { + const char *name; + + name + = read_referenced_name_from_attr (ddata, u, + &abbrev->attrs[i], &val, + error_callback, data); + if (name != NULL) + function->name = name; + } + break; + + case DW_AT_name: + /* Third name preference: don't override. */ + if (function->name != NULL) + break; + if (!resolve_string (&ddata->dwarf_sections, u->is_dwarf64, + ddata->is_bigendian, + u->str_offsets_base, &val, + error_callback, data, &function->name)) + return 0; + break; + + case DW_AT_linkage_name: + case DW_AT_MIPS_linkage_name: + /* First name preference: override all. 
*/ + { + const char *s; + + s = NULL; + if (!resolve_string (&ddata->dwarf_sections, u->is_dwarf64, + ddata->is_bigendian, + u->str_offsets_base, &val, + error_callback, data, &s)) + return 0; + if (s != NULL) + { + function->name = s; + have_linkage_name = 1; + } + } + break; + + case DW_AT_low_pc: case DW_AT_high_pc: case DW_AT_ranges: + update_pcrange (&abbrev->attrs[i], &val, &pcrange); + break; + + default: + break; + } + } + } + + /* If we couldn't find a name for the function, we have no use + for it. */ + if (is_function && function->name == NULL) + { + backtrace_free (state, function, sizeof *function, + error_callback, data); + is_function = 0; + } + + if (is_function) + { + if (pcrange.have_ranges + || (pcrange.have_lowpc && pcrange.have_highpc)) + { + if (!add_ranges (state, &ddata->dwarf_sections, + ddata->base_address, ddata->is_bigendian, + u, base, &pcrange, add_function_range, + (void *) function, error_callback, data, + (void *) vec)) + return 0; + } + else + { + backtrace_free (state, function, sizeof *function, + error_callback, data); + is_function = 0; + } + } + + if (abbrev->has_children) + { + if (!is_function) + { + if (!read_function_entry (state, ddata, u, base, unit_buf, lhdr, + error_callback, data, vec_function, + vec_inlined)) + return 0; + } + else + { + struct function_vector fvec; + + /* Gather any information for inlined functions in + FVEC. */ + + memset (&fvec, 0, sizeof fvec); + + if (!read_function_entry (state, ddata, u, base, unit_buf, lhdr, + error_callback, data, vec_function, + &fvec)) + return 0; + + if (fvec.count > 0) + { + struct function_addrs *p; + struct function_addrs *faddrs; + + /* Allocate a trailing entry, but don't include it + in fvec.count. */ + p = ((struct function_addrs *) + backtrace_vector_grow (state, + sizeof (struct function_addrs), + error_callback, data, + &fvec.vec)); + if (p == NULL) + return 0; + p->low = 0; + --p->low; + p->high = p->low; + p->function = NULL; + + if (!backtrace_vector_release (state, &fvec.vec, + error_callback, data)) + return 0; + + faddrs = (struct function_addrs *) fvec.vec.base; + backtrace_qsort (faddrs, fvec.count, + sizeof (struct function_addrs), + function_addrs_compare); + + function->function_addrs = faddrs; + function->function_addrs_count = fvec.count; + } + } + } + } + + return 1; +} + +/* Read function name information for a compilation unit. We look + through the whole unit looking for function tags. */ + +static void +read_function_info (struct backtrace_state *state, struct dwarf_data *ddata, + const struct line_header *lhdr, + backtrace_error_callback error_callback, void *data, + struct unit *u, struct function_vector *fvec, + struct function_addrs **ret_addrs, + size_t *ret_addrs_count) +{ + struct function_vector lvec; + struct function_vector *pfvec; + struct dwarf_buf unit_buf; + struct function_addrs *p; + struct function_addrs *addrs; + size_t addrs_count; + + /* Use FVEC if it is not NULL. Otherwise use our own vector. 
*/ + if (fvec != NULL) + pfvec = fvec; + else + { + memset (&lvec, 0, sizeof lvec); + pfvec = &lvec; + } + + unit_buf.name = ".debug_info"; + unit_buf.start = ddata->dwarf_sections.data[DEBUG_INFO]; + unit_buf.buf = u->unit_data; + unit_buf.left = u->unit_data_len; + unit_buf.is_bigendian = ddata->is_bigendian; + unit_buf.error_callback = error_callback; + unit_buf.data = data; + unit_buf.reported_underflow = 0; + + while (unit_buf.left > 0) + { + if (!read_function_entry (state, ddata, u, 0, &unit_buf, lhdr, + error_callback, data, pfvec, pfvec)) + return; + } + + if (pfvec->count == 0) + return; + + /* Allocate a trailing entry, but don't include it in + pfvec->count. */ + p = ((struct function_addrs *) + backtrace_vector_grow (state, sizeof (struct function_addrs), + error_callback, data, &pfvec->vec)); + if (p == NULL) + return; + p->low = 0; + --p->low; + p->high = p->low; + p->function = NULL; + + addrs_count = pfvec->count; + + if (fvec == NULL) + { + if (!backtrace_vector_release (state, &lvec.vec, error_callback, data)) + return; + addrs = (struct function_addrs *) pfvec->vec.base; + } + else + { + /* Finish this list of addresses, but leave the remaining space in + the vector available for the next function unit. */ + addrs = ((struct function_addrs *) + backtrace_vector_finish (state, &fvec->vec, + error_callback, data)); + if (addrs == NULL) + return; + fvec->count = 0; + } + + backtrace_qsort (addrs, addrs_count, sizeof (struct function_addrs), + function_addrs_compare); + + *ret_addrs = addrs; + *ret_addrs_count = addrs_count; +} + +/* See if PC is inlined in FUNCTION. If it is, print out the inlined + information, and update FILENAME and LINENO for the caller. + Returns whatever CALLBACK returns, or 0 to keep going. */ + +static int +report_inlined_functions (uintptr_t pc, struct function *function, const char* comp_dir, + backtrace_full_callback callback, void *data, + const char **filename, int *lineno) +{ + struct function_addrs *p; + struct function_addrs *match; + struct function *inlined; + int ret; + + if (function->function_addrs_count == 0) + return 0; + + /* Our search isn't safe if pc == -1, as that is the sentinel + value. */ + if (pc + 1 == 0) + return 0; + + p = ((struct function_addrs *) + bsearch (&pc, function->function_addrs, + function->function_addrs_count, + sizeof (struct function_addrs), + function_addrs_search)); + if (p == NULL) + return 0; + + /* Here pc >= p->low && pc < (p + 1)->low. The function_addrs are + sorted by low, so if pc > p->low we are at the end of a range of + function_addrs with the same low value. If pc == p->low walk + forward to the end of the range with that low value. Then walk + backward and use the first range that includes pc. */ + while (pc == (p + 1)->low) + ++p; + match = NULL; + while (1) + { + if (pc < p->high) + { + match = p; + break; + } + if (p == function->function_addrs) + break; + if ((p - 1)->low < p->low) + break; + --p; + } + if (match == NULL) + return 0; + + /* We found an inlined call. */ + + inlined = match->function; + + /* Report any calls inlined into this one. */ + ret = report_inlined_functions (pc, inlined, comp_dir, callback, data, + filename, lineno); + if (ret != 0) + return ret; + + /* Report this inlined call. 
*/ + if (*filename[0] != '/' && comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", comp_dir, *filename); + ret = callback (data, pc, match->low, buf, *lineno, inlined->name); + } + else + { + ret = callback (data, pc, match->low, *filename, *lineno, inlined->name); + } + if (ret != 0) + return ret; + + /* Our caller will report the caller of the inlined function; tell + it the appropriate filename and line number. */ + *filename = inlined->caller_filename; + *lineno = inlined->caller_lineno; + + return 0; +} + +/* Look for a PC in the DWARF mapping for one module. On success, + call CALLBACK and return whatever it returns. On error, call + ERROR_CALLBACK and return 0. Sets *FOUND to 1 if the PC is found, + 0 if not. */ + +static int +dwarf_lookup_pc (struct backtrace_state *state, struct dwarf_data *ddata, + uintptr_t pc, backtrace_full_callback callback, + backtrace_error_callback error_callback, void *data, + int *found) +{ + struct unit_addrs *entry; + int found_entry; + struct unit *u; + int new_data; + struct line *lines; + struct line *ln; + struct function_addrs *p; + struct function_addrs *fmatch; + struct function *function; + const char *filename; + int lineno; + int ret; + + *found = 1; + + /* Find an address range that includes PC. Our search isn't safe if + PC == -1, as we use that as a sentinel value, so skip the search + in that case. */ + entry = (ddata->addrs_count == 0 || pc + 1 == 0 + ? NULL + : (struct unit_addrs*)bsearch (&pc, ddata->addrs, ddata->addrs_count, + sizeof (struct unit_addrs), unit_addrs_search)); + + if (entry == NULL) + { + *found = 0; + return 0; + } + + /* Here pc >= entry->low && pc < (entry + 1)->low. The unit_addrs + are sorted by low, so if pc > p->low we are at the end of a range + of unit_addrs with the same low value. If pc == p->low walk + forward to the end of the range with that low value. Then walk + backward and use the first range that includes pc. */ + while (pc == (entry + 1)->low) + ++entry; + found_entry = 0; + while (1) + { + if (pc < entry->high) + { + found_entry = 1; + break; + } + if (entry == ddata->addrs) + break; + if ((entry - 1)->low < entry->low) + break; + --entry; + } + if (!found_entry) + { + *found = 0; + return 0; + } + + /* We need the lines, lines_count, function_addrs, + function_addrs_count fields of u. If they are not set, we need + to set them. When running in threaded mode, we need to allow for + the possibility that some other thread is setting them + simultaneously. */ + + u = entry->u; + lines = u->lines; + + /* Skip units with no useful line number information by walking + backward. Useless line number information is marked by setting + lines == -1. */ + while (entry > ddata->addrs + && pc >= (entry - 1)->low + && pc < (entry - 1)->high) + { + if (state->threaded) + lines = (struct line *) backtrace_atomic_load_pointer (&u->lines); + + if (lines != (struct line *) (uintptr_t) -1) + break; + + --entry; + + u = entry->u; + lines = u->lines; + } + + if (state->threaded) + lines = backtrace_atomic_load_pointer (&u->lines); + + new_data = 0; + if (lines == NULL) + { + struct function_addrs *function_addrs; + size_t function_addrs_count; + struct line_header lhdr; + size_t count; + + /* We have never read the line information for this unit. Read + it now. 
*/ + + function_addrs = NULL; + function_addrs_count = 0; + if (read_line_info (state, ddata, error_callback, data, entry->u, &lhdr, + &lines, &count)) + { + struct function_vector *pfvec; + + /* If not threaded, reuse DDATA->FVEC for better memory + consumption. */ + if (state->threaded) + pfvec = NULL; + else + pfvec = &ddata->fvec; + read_function_info (state, ddata, &lhdr, error_callback, data, + entry->u, pfvec, &function_addrs, + &function_addrs_count); + free_line_header (state, &lhdr, error_callback, data); + new_data = 1; + } + + /* Atomically store the information we just read into the unit. + If another thread is simultaneously writing, it presumably + read the same information, and we don't care which one we + wind up with; we just leak the other one. We do have to + write the lines field last, so that the acquire-loads above + ensure that the other fields are set. */ + + if (!state->threaded) + { + u->lines_count = count; + u->function_addrs = function_addrs; + u->function_addrs_count = function_addrs_count; + u->lines = lines; + } + else + { + backtrace_atomic_store_size_t (&u->lines_count, count); + backtrace_atomic_store_pointer (&u->function_addrs, function_addrs); + backtrace_atomic_store_size_t (&u->function_addrs_count, + function_addrs_count); + backtrace_atomic_store_pointer (&u->lines, lines); + } + } + + /* Now all fields of U have been initialized. */ + + if (lines == (struct line *) (uintptr_t) -1) + { + /* If reading the line number information failed in some way, + try again to see if there is a better compilation unit for + this PC. */ + if (new_data) + return dwarf_lookup_pc (state, ddata, pc, callback, error_callback, + data, found); + return callback (data, pc, 0, NULL, 0, NULL); + } + + /* Search for PC within this unit. */ + + ln = (struct line *) bsearch (&pc, lines, entry->u->lines_count, + sizeof (struct line), line_search); + if (ln == NULL) + { + /* The PC is between the low_pc and high_pc attributes of the + compilation unit, but no entry in the line table covers it. + This implies that the start of the compilation unit has no + line number information. */ + + if (entry->u->abs_filename == NULL) + { + const char *filename; + + filename = entry->u->filename; + if (filename != NULL + && !IS_ABSOLUTE_PATH (filename) + && entry->u->comp_dir != NULL) + { + size_t filename_len; + const char *dir; + size_t dir_len; + char *s; + + filename_len = strlen (filename); + dir = entry->u->comp_dir; + dir_len = strlen (dir); + s = (char *) backtrace_alloc (state, dir_len + filename_len + 2, + error_callback, data); + if (s == NULL) + { + *found = 0; + return 0; + } + memcpy (s, dir, dir_len); + /* FIXME: Should use backslash if DOS file system. */ + s[dir_len] = '/'; + memcpy (s + dir_len + 1, filename, filename_len + 1); + filename = s; + } + entry->u->abs_filename = filename; + } + + return callback (data, pc, 0, entry->u->abs_filename, 0, NULL); + } + + /* Search for function name within this unit. */ + + if (entry->u->function_addrs_count == 0) + return callback (data, pc, 0, ln->filename, ln->lineno, NULL); + + p = ((struct function_addrs *) + bsearch (&pc, entry->u->function_addrs, + entry->u->function_addrs_count, + sizeof (struct function_addrs), + function_addrs_search)); + if (p == NULL) + return callback (data, pc, 0, ln->filename, ln->lineno, NULL); + + /* Here pc >= p->low && pc < (p + 1)->low. The function_addrs are + sorted by low, so if pc > p->low we are at the end of a range of + function_addrs with the same low value. 
If pc == p->low walk + forward to the end of the range with that low value. Then walk + backward and use the first range that includes pc. */ + while (pc == (p + 1)->low) + ++p; + fmatch = NULL; + while (1) + { + if (pc < p->high) + { + fmatch = p; + break; + } + if (p == entry->u->function_addrs) + break; + if ((p - 1)->low < p->low) + break; + --p; + } + if (fmatch == NULL) + return callback (data, pc, 0, ln->filename, ln->lineno, NULL); + + function = fmatch->function; + + filename = ln->filename; + lineno = ln->lineno; + + ret = report_inlined_functions (pc, function, entry->u->comp_dir, callback, data, + &filename, &lineno); + if (ret != 0) + return ret; + + if (filename[0] != '/' && entry->u->comp_dir) + { + char buf[1024]; + snprintf (buf, 1024, "%s/%s", entry->u->comp_dir, filename); + return callback (data, pc, fmatch->low, buf, lineno, function->name); + } + else + { + return callback (data, pc, fmatch->low, filename, lineno, function->name); + } +} + +bool dwarf_fileline_dwarf_lookup_pc_in_all_entries(struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, backtrace_error_callback error_callback, void *data, + int& found, int ret) +{ + for (struct dwarf_data* ddata = (struct dwarf_data *)state->fileline_data; + ddata != NULL; + ddata = ddata->next) + { + ret = dwarf_lookup_pc(state, ddata, pc, callback, error_callback, data, &found); + if (ret != 0 || found) return true; + } + return false; +} + +/* Return the file/line information for a PC using the DWARF mapping + we built earlier. */ + +static int +dwarf_fileline (struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, + backtrace_error_callback error_callback, void *data) +{ + struct dwarf_data *ddata; + int found; + int ret = 0; + + if (!state->threaded) + { + if (dwarf_fileline_dwarf_lookup_pc_in_all_entries(state, pc, callback, error_callback, data, found, ret)) + { + return ret; + } + + // if we failed to obtain an entry in range, it can mean that the address map has been changed and new entries + // have been loaded in the meantime. Request a refresh and try again. + if (state->request_known_address_ranges_refresh_fn) + { + int new_range_count = state->request_known_address_ranges_refresh_fn(state, pc); + if (new_range_count > 0) + { + if (dwarf_fileline_dwarf_lookup_pc_in_all_entries(state, pc, callback, error_callback, data, found, ret)) + { + return ret; + } + } + } + + } + else + { + struct dwarf_data **pp; + + pp = (struct dwarf_data **) (void *) &state->fileline_data; + while (1) + { + ddata = backtrace_atomic_load_pointer (pp); + if (ddata == NULL) + break; + + ret = dwarf_lookup_pc (state, ddata, pc, callback, error_callback, + data, &found); + if (ret != 0 || found) + return ret; + + pp = &ddata->next; + } + } + + /* FIXME: See if any libraries have been dlopen'ed. */ + + return callback (data, pc, 0, NULL, 0, NULL); +} + +/* Initialize our data structures from the DWARF debug info for a + file. Return NULL on failure. 
*/ + +static struct dwarf_data * +build_dwarf_data (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + const struct dwarf_sections *dwarf_sections, + int is_bigendian, + struct dwarf_data *altlink, + backtrace_error_callback error_callback, + void *data) +{ + struct unit_addrs_vector addrs_vec; + struct unit_addrs *addrs; + size_t addrs_count; + struct unit_vector units_vec; + struct unit **units; + size_t units_count; + struct dwarf_data *fdata; + + if (!build_address_map (state, base_address, dwarf_sections, is_bigendian, + altlink, error_callback, data, &addrs_vec, + &units_vec)) + return NULL; + + if (!backtrace_vector_release (state, &addrs_vec.vec, error_callback, data)) + return NULL; + if (!backtrace_vector_release (state, &units_vec.vec, error_callback, data)) + return NULL; + addrs = (struct unit_addrs *) addrs_vec.vec.base; + units = (struct unit **) units_vec.vec.base; + addrs_count = addrs_vec.count; + units_count = units_vec.count; + backtrace_qsort (addrs, addrs_count, sizeof (struct unit_addrs), + unit_addrs_compare); + /* No qsort for units required, already sorted. */ + + fdata = ((struct dwarf_data *) + backtrace_alloc (state, sizeof (struct dwarf_data), + error_callback, data)); + if (fdata == NULL) + return NULL; + + fdata->next = NULL; + fdata->altlink = altlink; + fdata->base_address = base_address; + fdata->addrs = addrs; + fdata->addrs_count = addrs_count; + fdata->units = units; + fdata->units_count = units_count; + fdata->dwarf_sections = *dwarf_sections; + fdata->is_bigendian = is_bigendian; + memset (&fdata->fvec, 0, sizeof fdata->fvec); + + return fdata; +} + +/* Build our data structures from the DWARF sections for a module. + Set FILELINE_FN and STATE->FILELINE_DATA. Return 1 on success, 0 + on failure. */ + +int +backtrace_dwarf_add (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + const struct dwarf_sections *dwarf_sections, + int is_bigendian, + struct dwarf_data *fileline_altlink, + backtrace_error_callback error_callback, + void *data, fileline *fileline_fn, + struct dwarf_data **fileline_entry) +{ + struct dwarf_data *fdata; + + fdata = build_dwarf_data (state, base_address, dwarf_sections, is_bigendian, + fileline_altlink, error_callback, data); + if (fdata == NULL) + return 0; + + if (fileline_entry != NULL) + *fileline_entry = fdata; + + if (!state->threaded) + { + struct dwarf_data **pp; + + for (pp = (struct dwarf_data **) (void *) &state->fileline_data; + *pp != NULL; + pp = &(*pp)->next) + ; + *pp = fdata; + } + else + { + while (1) + { + struct dwarf_data **pp; + + pp = (struct dwarf_data **) (void *) &state->fileline_data; + + while (1) + { + struct dwarf_data *p; + + p = backtrace_atomic_load_pointer (pp); + + if (p == NULL) + break; + + pp = &p->next; + } + + if (__sync_bool_compare_and_swap (pp, NULL, fdata)) + break; + } + } + + *fileline_fn = dwarf_fileline; + + return 1; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/elf.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/elf.cpp new file mode 100644 index 000000000..ffe8d7024 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/elf.cpp @@ -0,0 +1,7605 @@ +/* elf.c -- Get debug data from an ELF file for backtraces. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_DL_ITERATE_PHDR +#include +#endif + +#include "backtrace.hpp" +#include "internal.hpp" + +#include "../client/TracyFastVector.hpp" +#include "../common/TracyAlloc.hpp" + +#ifndef S_ISLNK + #ifndef S_IFLNK + #define S_IFLNK 0120000 + #endif + #ifndef S_IFMT + #define S_IFMT 0170000 + #endif + #define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) +#endif + +#ifndef __GNUC__ +#define __builtin_prefetch(p, r, l) +#ifndef unlikely +#define unlikely(x) (x) +#endif +#else +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#endif + +namespace tracy +{ + +#ifdef TRACY_DEBUGINFOD +int GetDebugInfoDescriptor( const char* buildid_data, size_t buildid_size ); +#endif + +#if !defined(HAVE_DECL_STRNLEN) || !HAVE_DECL_STRNLEN + +/* If strnlen is not declared, provide our own version. */ + +static size_t +xstrnlen (const char *s, size_t maxlen) +{ + size_t i; + + for (i = 0; i < maxlen; ++i) + if (s[i] == '\0') + break; + return i; +} + +#define strnlen xstrnlen + +#endif + +#ifndef HAVE_LSTAT + +/* Dummy version of lstat for systems that don't have it. */ + +static int +xlstat (const char *path ATTRIBUTE_UNUSED, struct stat *st ATTRIBUTE_UNUSED) +{ + return -1; +} + +#define lstat xlstat + +#endif + +#ifndef HAVE_READLINK + +/* Dummy version of readlink for systems that don't have it. */ + +static ssize_t +xreadlink (const char *path ATTRIBUTE_UNUSED, char *buf ATTRIBUTE_UNUSED, + size_t bufsz ATTRIBUTE_UNUSED) +{ + return -1; +} + +#define readlink xreadlink + +#endif + +#ifndef HAVE_DL_ITERATE_PHDR + +/* Dummy version of dl_iterate_phdr for systems that don't have it. */ + +#define dl_phdr_info x_dl_phdr_info +#define dl_iterate_phdr x_dl_iterate_phdr + +struct dl_phdr_info +{ + uintptr_t dlpi_addr; + const char *dlpi_name; +}; + +static int +dl_iterate_phdr (int (*callback) (struct dl_phdr_info *, + size_t, void *) ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + return 0; +} + +#endif /* ! defined (HAVE_DL_ITERATE_PHDR) */ + +/* The configure script must tell us whether we are 32-bit or 64-bit + ELF. 
We could make this code test and support either possibility,
+ but there is no point. This code only works for the currently
+ running executable, which means that we know the ELF mode at
+ configure time. */
+
+#if BACKTRACE_ELF_SIZE != 32 && BACKTRACE_ELF_SIZE != 64
+#error "Unknown BACKTRACE_ELF_SIZE"
+#endif
+
+/* <link.h> might #include <elf.h> which might define our constants
+ with slightly different values. Undefine them to be safe. */
+
+#undef EI_NIDENT
+#undef EI_MAG0
+#undef EI_MAG1
+#undef EI_MAG2
+#undef EI_MAG3
+#undef EI_CLASS
+#undef EI_DATA
+#undef EI_VERSION
+#undef ELF_MAG0
+#undef ELF_MAG1
+#undef ELF_MAG2
+#undef ELF_MAG3
+#undef ELFCLASS32
+#undef ELFCLASS64
+#undef ELFDATA2LSB
+#undef ELFDATA2MSB
+#undef EV_CURRENT
+#undef ET_DYN
+#undef EM_PPC64
+#undef EF_PPC64_ABI
+#undef SHN_LORESERVE
+#undef SHN_XINDEX
+#undef SHN_UNDEF
+#undef SHT_PROGBITS
+#undef SHT_SYMTAB
+#undef SHT_STRTAB
+#undef SHT_DYNSYM
+#undef SHF_COMPRESSED
+#undef STT_OBJECT
+#undef STT_FUNC
+#undef NT_GNU_BUILD_ID
+#undef ELFCOMPRESS_ZLIB
+#undef ELFCOMPRESS_ZSTD
+
+/* Basic types. */
+
+typedef uint16_t b_elf_half; /* Elf_Half. */
+typedef uint32_t b_elf_word; /* Elf_Word. */
+typedef int32_t b_elf_sword; /* Elf_Sword. */
+
+#if BACKTRACE_ELF_SIZE == 32
+
+typedef uint32_t b_elf_addr; /* Elf_Addr. */
+typedef uint32_t b_elf_off; /* Elf_Off. */
+
+typedef uint32_t b_elf_wxword; /* 32-bit Elf_Word, 64-bit ELF_Xword. */
+
+#else
+
+typedef uint64_t b_elf_addr; /* Elf_Addr. */
+typedef uint64_t b_elf_off; /* Elf_Off. */
+typedef uint64_t b_elf_xword; /* Elf_Xword. */
+typedef int64_t b_elf_sxword; /* Elf_Sxword. */
+
+typedef uint64_t b_elf_wxword; /* 32-bit Elf_Word, 64-bit ELF_Xword. */
+
+#endif
+
+/* Data structures and associated constants. */
+
+#define EI_NIDENT 16
+
+typedef struct {
+ unsigned char e_ident[EI_NIDENT]; /* ELF "magic number" */
+ b_elf_half e_type; /* Identifies object file type */
+ b_elf_half e_machine; /* Specifies required architecture */
+ b_elf_word e_version; /* Identifies object file version */
+ b_elf_addr e_entry; /* Entry point virtual address */
+ b_elf_off e_phoff; /* Program header table file offset */
+ b_elf_off e_shoff; /* Section header table file offset */
+ b_elf_word e_flags; /* Processor-specific flags */
+ b_elf_half e_ehsize; /* ELF header size in bytes */
+ b_elf_half e_phentsize; /* Program header table entry size */
+ b_elf_half e_phnum; /* Program header table entry count */
+ b_elf_half e_shentsize; /* Section header table entry size */
+ b_elf_half e_shnum; /* Section header table entry count */
+ b_elf_half e_shstrndx; /* Section header string table index */
+} b_elf_ehdr; /* Elf_Ehdr. 
*/ + +#define EI_MAG0 0 +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 + +#define ELFMAG0 0x7f +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' + +#define ELFCLASS32 1 +#define ELFCLASS64 2 + +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_CURRENT 1 + +#define ET_DYN 3 + +#define EM_PPC64 21 +#define EF_PPC64_ABI 3 + +typedef struct { + b_elf_word sh_name; /* Section name, index in string tbl */ + b_elf_word sh_type; /* Type of section */ + b_elf_wxword sh_flags; /* Miscellaneous section attributes */ + b_elf_addr sh_addr; /* Section virtual addr at execution */ + b_elf_off sh_offset; /* Section file offset */ + b_elf_wxword sh_size; /* Size of section in bytes */ + b_elf_word sh_link; /* Index of another section */ + b_elf_word sh_info; /* Additional section information */ + b_elf_wxword sh_addralign; /* Section alignment */ + b_elf_wxword sh_entsize; /* Entry size if section holds table */ +} b_elf_shdr; /* Elf_Shdr. */ + +#define SHN_UNDEF 0x0000 /* Undefined section */ +#define SHN_LORESERVE 0xFF00 /* Begin range of reserved indices */ +#define SHN_XINDEX 0xFFFF /* Section index is held elsewhere */ + +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_DYNSYM 11 + +#define SHF_COMPRESSED 0x800 + +#if BACKTRACE_ELF_SIZE == 32 + +typedef struct +{ + b_elf_word st_name; /* Symbol name, index in string tbl */ + b_elf_addr st_value; /* Symbol value */ + b_elf_word st_size; /* Symbol size */ + unsigned char st_info; /* Symbol binding and type */ + unsigned char st_other; /* Visibility and other data */ + b_elf_half st_shndx; /* Symbol section index */ +} b_elf_sym; /* Elf_Sym. */ + +#else /* BACKTRACE_ELF_SIZE != 32 */ + +typedef struct +{ + b_elf_word st_name; /* Symbol name, index in string tbl */ + unsigned char st_info; /* Symbol binding and type */ + unsigned char st_other; /* Visibility and other data */ + b_elf_half st_shndx; /* Symbol section index */ + b_elf_addr st_value; /* Symbol value */ + b_elf_xword st_size; /* Symbol size */ +} b_elf_sym; /* Elf_Sym. */ + +#endif /* BACKTRACE_ELF_SIZE != 32 */ + +#define STT_OBJECT 1 +#define STT_FUNC 2 + +typedef struct +{ + uint32_t namesz; + uint32_t descsz; + uint32_t type; + char name[1]; +} b_elf_note; + +#define NT_GNU_BUILD_ID 3 + +#if BACKTRACE_ELF_SIZE == 32 + +typedef struct +{ + b_elf_word ch_type; /* Compresstion algorithm */ + b_elf_word ch_size; /* Uncompressed size */ + b_elf_word ch_addralign; /* Alignment for uncompressed data */ +} b_elf_chdr; /* Elf_Chdr */ + +#else /* BACKTRACE_ELF_SIZE != 32 */ + +typedef struct +{ + b_elf_word ch_type; /* Compression algorithm */ + b_elf_word ch_reserved; /* Reserved */ + b_elf_xword ch_size; /* Uncompressed size */ + b_elf_xword ch_addralign; /* Alignment for uncompressed data */ +} b_elf_chdr; /* Elf_Chdr */ + +#endif /* BACKTRACE_ELF_SIZE != 32 */ + +#define ELFCOMPRESS_ZLIB 1 +#define ELFCOMPRESS_ZSTD 2 + +/* Names of sections, indexed by enum dwarf_section in internal.h. */ + +static const char * const dwarf_section_names[DEBUG_MAX] = +{ + ".debug_info", + ".debug_line", + ".debug_abbrev", + ".debug_ranges", + ".debug_str", + ".debug_addr", + ".debug_str_offsets", + ".debug_line_str", + ".debug_rnglists" +}; + +/* Information we gather for the sections we care about. */ + +struct debug_section_info +{ + /* Section file offset. */ + off_t offset; + /* Section size. */ + size_t size; + /* Section contents, after read from file. 
*/ + const unsigned char *data; + /* Whether the SHF_COMPRESSED flag is set for the section. */ + int compressed; +}; + +/* Information we keep for an ELF symbol. */ + +struct elf_symbol +{ + /* The name of the symbol. */ + const char *name; + /* The address of the symbol. */ + uintptr_t address; + /* The size of the symbol. */ + size_t size; +}; + +/* Information to pass to elf_syminfo. */ + +struct elf_syminfo_data +{ + /* Symbols for the next module. */ + struct elf_syminfo_data *next; + /* The ELF symbols, sorted by address. */ + struct elf_symbol *symbols; + /* The number of symbols. */ + size_t count; +}; + +/* A view that works for either a file or memory. */ + +struct elf_view +{ + struct backtrace_view view; + int release; /* If non-zero, must call backtrace_release_view. */ +}; + +/* Information about PowerPC64 ELFv1 .opd section. */ + +struct elf_ppc64_opd_data +{ + /* Address of the .opd section. */ + b_elf_addr addr; + /* Section data. */ + const char *data; + /* Size of the .opd section. */ + size_t size; + /* Corresponding section view. */ + struct elf_view view; +}; + +/* Create a view of SIZE bytes from DESCRIPTOR/MEMORY at OFFSET. */ + +static int +elf_get_view (struct backtrace_state *state, int descriptor, + const unsigned char *memory, size_t memory_size, off_t offset, + uint64_t size, backtrace_error_callback error_callback, + void *data, struct elf_view *view) +{ + if (memory == NULL) + { + view->release = 1; + return backtrace_get_view (state, descriptor, offset, size, + error_callback, data, &view->view); + } + else + { + if ((uint64_t) offset + size > (uint64_t) memory_size) + { + error_callback (data, "out of range for in-memory file", 0); + return 0; + } + view->view.data = (const void *) (memory + offset); + view->view.base = NULL; + view->view.len = size; + view->release = 0; + return 1; + } +} + +/* Release a view read by elf_get_view. */ + +static void +elf_release_view (struct backtrace_state *state, struct elf_view *view, + backtrace_error_callback error_callback, void *data) +{ + if (view->release) + backtrace_release_view (state, &view->view, error_callback, data); +} + +/* Compute the CRC-32 of BUF/LEN. This uses the CRC used for + .gnu_debuglink files. 
*/ + +static uint32_t +elf_crc32 (uint32_t crc, const unsigned char *buf, size_t len) +{ + static const uint32_t crc32_table[256] = + { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d + }; + const unsigned char *end; + + crc = ~crc; + for (end = buf + len; buf < end; ++ buf) + crc = crc32_table[(crc ^ *buf) & 0xff] ^ (crc >> 8); + return ~crc; +} + +/* Return the CRC-32 of the entire file open at DESCRIPTOR. 
*/ + +static uint32_t +elf_crc32_file (struct backtrace_state *state, int descriptor, + backtrace_error_callback error_callback, void *data) +{ + struct stat st; + struct backtrace_view file_view; + uint32_t ret; + + if (fstat (descriptor, &st) < 0) + { + error_callback (data, "fstat", errno); + return 0; + } + + if (!backtrace_get_view (state, descriptor, 0, st.st_size, error_callback, + data, &file_view)) + return 0; + + ret = elf_crc32 (0, (const unsigned char *) file_view.data, st.st_size); + + backtrace_release_view (state, &file_view, error_callback, data); + + return ret; +} + +/* A dummy callback function used when we can't find a symbol + table. */ + +static void +elf_nosyms (struct backtrace_state *state ATTRIBUTE_UNUSED, + uintptr_t addr ATTRIBUTE_UNUSED, + backtrace_syminfo_callback callback ATTRIBUTE_UNUSED, + backtrace_error_callback error_callback, void *data) +{ + error_callback (data, "no symbol table in ELF executable", -1); +} + +/* A callback function used when we can't find any debug info. */ + +static int +elf_nodebug (struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, + backtrace_error_callback error_callback, void *data) +{ + if (state->syminfo_fn != NULL && state->syminfo_fn != elf_nosyms) + { + struct backtrace_call_full bdata; + + /* Fetch symbol information so that we can least get the + function name. */ + + bdata.full_callback = callback; + bdata.full_error_callback = error_callback; + bdata.full_data = data; + bdata.ret = 0; + state->syminfo_fn (state, pc, backtrace_syminfo_to_full_callback, + backtrace_syminfo_to_full_error_callback, &bdata); + return bdata.ret; + } + + error_callback (data, "no debug info in ELF executable", -1); + return 0; +} + +/* Compare struct elf_symbol for qsort. */ + +static int +elf_symbol_compare (const void *v1, const void *v2) +{ + const struct elf_symbol *e1 = (const struct elf_symbol *) v1; + const struct elf_symbol *e2 = (const struct elf_symbol *) v2; + + if (e1->address < e2->address) + return -1; + else if (e1->address > e2->address) + return 1; + else + return 0; +} + +/* Compare an ADDR against an elf_symbol for bsearch. We allocate one + extra entry in the array so that this can look safely at the next + entry. */ + +static int +elf_symbol_search (const void *vkey, const void *ventry) +{ + const uintptr_t *key = (const uintptr_t *) vkey; + const struct elf_symbol *entry = (const struct elf_symbol *) ventry; + uintptr_t addr; + + addr = *key; + if (addr < entry->address) + return -1; + else if (addr >= entry->address + entry->size) + return 1; + else + return 0; +} + +/* Initialize the symbol table info for elf_syminfo. */ + +static int +elf_initialize_syminfo (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + const unsigned char *symtab_data, size_t symtab_size, + const unsigned char *strtab, size_t strtab_size, + backtrace_error_callback error_callback, + void *data, struct elf_syminfo_data *sdata, + struct elf_ppc64_opd_data *opd) +{ + size_t sym_count; + const b_elf_sym *sym; + size_t elf_symbol_count; + size_t elf_symbol_size; + struct elf_symbol *elf_symbols; + size_t i; + unsigned int j; + + sym_count = symtab_size / sizeof (b_elf_sym); + + /* We only care about function symbols. Count them. 
*/ + sym = (const b_elf_sym *) symtab_data; + elf_symbol_count = 0; + for (i = 0; i < sym_count; ++i, ++sym) + { + int info; + + info = sym->st_info & 0xf; + if ((info == STT_FUNC || info == STT_OBJECT) + && sym->st_shndx != SHN_UNDEF) + ++elf_symbol_count; + } + + elf_symbol_size = elf_symbol_count * sizeof (struct elf_symbol); + elf_symbols = ((struct elf_symbol *) + backtrace_alloc (state, elf_symbol_size, error_callback, + data)); + if (elf_symbols == NULL) + return 0; + + sym = (const b_elf_sym *) symtab_data; + j = 0; + for (i = 0; i < sym_count; ++i, ++sym) + { + int info; + + info = sym->st_info & 0xf; + if (info != STT_FUNC && info != STT_OBJECT) + continue; + if (sym->st_shndx == SHN_UNDEF) + continue; + if (sym->st_name >= strtab_size) + { + error_callback (data, "symbol string index out of range", 0); + backtrace_free (state, elf_symbols, elf_symbol_size, error_callback, + data); + return 0; + } + elf_symbols[j].name = (const char *) strtab + sym->st_name; + /* Special case PowerPC64 ELFv1 symbols in .opd section, if the symbol + is a function descriptor, read the actual code address from the + descriptor. */ + if (opd + && sym->st_value >= opd->addr + && sym->st_value < opd->addr + opd->size) + elf_symbols[j].address + = *(const b_elf_addr *) (opd->data + (sym->st_value - opd->addr)); + else + elf_symbols[j].address = sym->st_value; + elf_symbols[j].address = + libbacktrace_add_base (elf_symbols[j].address, base_address); + elf_symbols[j].size = sym->st_size; + ++j; + } + + backtrace_qsort (elf_symbols, elf_symbol_count, sizeof (struct elf_symbol), + elf_symbol_compare); + + sdata->next = NULL; + sdata->symbols = elf_symbols; + sdata->count = elf_symbol_count; + + return 1; +} + +/* Add EDATA to the list in STATE. */ + +static void +elf_add_syminfo_data (struct backtrace_state *state, + struct elf_syminfo_data *edata) +{ + if (!state->threaded) + { + struct elf_syminfo_data **pp; + + for (pp = (struct elf_syminfo_data **) (void *) &state->syminfo_data; + *pp != NULL; + pp = &(*pp)->next) + ; + *pp = edata; + } + else + { + while (1) + { + struct elf_syminfo_data **pp; + + pp = (struct elf_syminfo_data **) (void *) &state->syminfo_data; + + while (1) + { + struct elf_syminfo_data *p; + + p = backtrace_atomic_load_pointer (pp); + + if (p == NULL) + break; + + pp = &p->next; + } + + if (__sync_bool_compare_and_swap (pp, NULL, edata)) + break; + } + } +} + +/* Return the symbol name and value for an ADDR. 
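
   (Illustrative aside, not from the original source: elf_syminfo walks the
   syminfo_data list that elf_add_syminfo_data above appends to.  In the
   threaded case the writer publishes each node with a compare-and-swap on
   the trailing next pointer, so the reader only needs atomic loads.  A
   condensed sketch of that append pattern, using the same GCC builtin:

     struct node { struct node *next; };

     void append (struct node **head, struct node *n)   // n->next == NULL
     {
       while (1)
         {
           struct node **pp = head;
           struct node *p;
           while ((p = __atomic_load_n (pp, __ATOMIC_ACQUIRE)) != NULL)
             pp = &p->next;                     // find the current tail
           if (__sync_bool_compare_and_swap (pp, NULL, n))
             break;                             // published
           // lost the race: another thread appended first, so rescan
         }
     }

   The code above uses backtrace_atomic_load_pointer rather than
   __atomic_load_n, but the shape is the same.)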
*/ + +static void +elf_syminfo (struct backtrace_state *state, uintptr_t addr, + backtrace_syminfo_callback callback, + backtrace_error_callback error_callback ATTRIBUTE_UNUSED, + void *data) +{ + struct elf_syminfo_data *edata; + struct elf_symbol *sym = NULL; + + if (!state->threaded) + { + for (edata = (struct elf_syminfo_data *) state->syminfo_data; + edata != NULL; + edata = edata->next) + { + sym = ((struct elf_symbol *) + bsearch (&addr, edata->symbols, edata->count, + sizeof (struct elf_symbol), elf_symbol_search)); + if (sym != NULL) + break; + } + } + else + { + struct elf_syminfo_data **pp; + + pp = (struct elf_syminfo_data **) (void *) &state->syminfo_data; + while (1) + { + edata = backtrace_atomic_load_pointer (pp); + if (edata == NULL) + break; + + sym = ((struct elf_symbol *) + bsearch (&addr, edata->symbols, edata->count, + sizeof (struct elf_symbol), elf_symbol_search)); + if (sym != NULL) + break; + + pp = &edata->next; + } + } + + if (sym == NULL) + callback (data, addr, NULL, 0, 0); + else + callback (data, addr, sym->name, sym->address, sym->size); +} + +/* Return whether FILENAME is a symlink. */ + +static int +elf_is_symlink (const char *filename) +{ + struct stat st; + + if (lstat (filename, &st) < 0) + return 0; + return S_ISLNK (st.st_mode); +} + +/* Return the results of reading the symlink FILENAME in a buffer + allocated by backtrace_alloc. Return the length of the buffer in + *LEN. */ + +static char * +elf_readlink (struct backtrace_state *state, const char *filename, + backtrace_error_callback error_callback, void *data, + size_t *plen) +{ + size_t len; + char *buf; + + len = 128; + while (1) + { + ssize_t rl; + + buf = (char*)backtrace_alloc (state, len, error_callback, data); + if (buf == NULL) + return NULL; + rl = readlink (filename, buf, len); + if (rl < 0) + { + backtrace_free (state, buf, len, error_callback, data); + return NULL; + } + if ((size_t) rl < len - 1) + { + buf[rl] = '\0'; + *plen = len; + return buf; + } + backtrace_free (state, buf, len, error_callback, data); + len *= 2; + } +} + +#define SYSTEM_BUILD_ID_DIR "/usr/lib/debug/.build-id/" + +/* Open a separate debug info file, using the build ID to find it. + Returns an open file descriptor, or -1. + + The GDB manual says that the only place gdb looks for a debug file + when the build ID is known is in /usr/lib/debug/.build-id. */ + +static int +elf_open_debugfile_by_buildid (struct backtrace_state *state, + const char *buildid_data, size_t buildid_size, + const char *filename, + backtrace_error_callback error_callback, + void *data) +{ + const char * const prefix = SYSTEM_BUILD_ID_DIR; + const size_t prefix_len = strlen (prefix); + const char * const suffix = ".debug"; + const size_t suffix_len = strlen (suffix); + size_t len; + char *bd_filename; + char *t; + size_t i; + int ret; + int does_not_exist; + + len = prefix_len + buildid_size * 2 + suffix_len + 2; + bd_filename = (char*)backtrace_alloc (state, len, error_callback, data); + if (bd_filename == NULL) + return -1; + + t = bd_filename; + memcpy (t, prefix, prefix_len); + t += prefix_len; + for (i = 0; i < buildid_size; i++) + { + unsigned char b; + unsigned char nib; + + b = (unsigned char) buildid_data[i]; + nib = (b & 0xf0) >> 4; + *t++ = nib < 10 ? '0' + nib : 'a' + nib - 10; + nib = b & 0x0f; + *t++ = nib < 10 ? 
'0' + nib : 'a' + nib - 10; + if (i == 0) + *t++ = '/'; + } + memcpy (t, suffix, suffix_len); + t[suffix_len] = '\0'; + + ret = backtrace_open (bd_filename, error_callback, data, &does_not_exist); + + backtrace_free (state, bd_filename, len, error_callback, data); + + /* gdb checks that the debuginfo file has the same build ID note. + That seems kind of pointless to me--why would it have the right + name but not the right build ID?--so skipping the check. */ + +#ifdef TRACY_DEBUGINFOD + if (ret == -1) + return GetDebugInfoDescriptor( buildid_data, buildid_size, filename ); + else + return ret; +#else + return ret; +#endif +} + +/* Try to open a file whose name is PREFIX (length PREFIX_LEN) + concatenated with PREFIX2 (length PREFIX2_LEN) concatenated with + DEBUGLINK_NAME. Returns an open file descriptor, or -1. */ + +static int +elf_try_debugfile (struct backtrace_state *state, const char *prefix, + size_t prefix_len, const char *prefix2, size_t prefix2_len, + const char *debuglink_name, + backtrace_error_callback error_callback, void *data) +{ + size_t debuglink_len; + size_t try_len; + char *Try; + int does_not_exist; + int ret; + + debuglink_len = strlen (debuglink_name); + try_len = prefix_len + prefix2_len + debuglink_len + 1; + Try = (char*)backtrace_alloc (state, try_len, error_callback, data); + if (Try == NULL) + return -1; + + memcpy (Try, prefix, prefix_len); + memcpy (Try + prefix_len, prefix2, prefix2_len); + memcpy (Try + prefix_len + prefix2_len, debuglink_name, debuglink_len); + Try[prefix_len + prefix2_len + debuglink_len] = '\0'; + + ret = backtrace_open (Try, error_callback, data, &does_not_exist); + + backtrace_free (state, Try, try_len, error_callback, data); + + return ret; +} + +/* Find a separate debug info file, using the debuglink section data + to find it. Returns an open file descriptor, or -1. */ + +static int +elf_find_debugfile_by_debuglink (struct backtrace_state *state, + const char *filename, + const char *debuglink_name, + backtrace_error_callback error_callback, + void *data) +{ + int ret; + char *alc; + size_t alc_len; + const char *slash; + int ddescriptor; + const char *prefix; + size_t prefix_len; + + /* Resolve symlinks in FILENAME. Since FILENAME is fairly likely to + be /proc/self/exe, symlinks are common. We don't try to resolve + the whole path name, just the base name. */ + ret = -1; + alc = NULL; + alc_len = 0; + while (elf_is_symlink (filename)) + { + char *new_buf; + size_t new_len; + + new_buf = elf_readlink (state, filename, error_callback, data, &new_len); + if (new_buf == NULL) + break; + + if (new_buf[0] == '/') + filename = new_buf; + else + { + slash = strrchr (filename, '/'); + if (slash == NULL) + filename = new_buf; + else + { + size_t clen; + char *c; + + slash++; + clen = slash - filename + strlen (new_buf) + 1; + c = (char*)backtrace_alloc (state, clen, error_callback, data); + if (c == NULL) + goto done; + + memcpy (c, filename, slash - filename); + memcpy (c + (slash - filename), new_buf, strlen (new_buf)); + c[slash - filename + strlen (new_buf)] = '\0'; + backtrace_free (state, new_buf, new_len, error_callback, data); + filename = c; + new_buf = c; + new_len = clen; + } + } + + if (alc != NULL) + backtrace_free (state, alc, alc_len, error_callback, data); + alc = new_buf; + alc_len = new_len; + } + + /* Look for DEBUGLINK_NAME in the same directory as FILENAME. 
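
   (Illustrative example, not from the original source: with FILENAME
   "/usr/bin/foo" and a debuglink name of "foo.debug", the candidates tried
   below are, in order,

     /usr/bin/foo.debug
     /usr/bin/.debug/foo.debug
     /usr/lib/debug//usr/bin/foo.debug   (the doubled slash is harmless)

   For comparison, the build-id lookup above maps an id whose bytes start
   ab cd ef ... to "/usr/lib/debug/.build-id/ab/cdef....debug".)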
*/ + + slash = strrchr (filename, '/'); + if (slash == NULL) + { + prefix = ""; + prefix_len = 0; + } + else + { + slash++; + prefix = filename; + prefix_len = slash - filename; + } + + ddescriptor = elf_try_debugfile (state, prefix, prefix_len, "", 0, + debuglink_name, error_callback, data); + if (ddescriptor >= 0) + { + ret = ddescriptor; + goto done; + } + + /* Look for DEBUGLINK_NAME in a .debug subdirectory of FILENAME. */ + + ddescriptor = elf_try_debugfile (state, prefix, prefix_len, ".debug/", + strlen (".debug/"), debuglink_name, + error_callback, data); + if (ddescriptor >= 0) + { + ret = ddescriptor; + goto done; + } + + /* Look for DEBUGLINK_NAME in /usr/lib/debug. */ + + ddescriptor = elf_try_debugfile (state, "/usr/lib/debug/", + strlen ("/usr/lib/debug/"), prefix, + prefix_len, debuglink_name, + error_callback, data); + if (ddescriptor >= 0) + ret = ddescriptor; + + done: + if (alc != NULL && alc_len > 0) + backtrace_free (state, alc, alc_len, error_callback, data); + return ret; +} + +/* Open a separate debug info file, using the debuglink section data + to find it. Returns an open file descriptor, or -1. */ + +static int +elf_open_debugfile_by_debuglink (struct backtrace_state *state, + const char *filename, + const char *debuglink_name, + uint32_t debuglink_crc, + backtrace_error_callback error_callback, + void *data) +{ + int ddescriptor; + + ddescriptor = elf_find_debugfile_by_debuglink (state, filename, + debuglink_name, + error_callback, data); + if (ddescriptor < 0) + return -1; + + if (debuglink_crc != 0) + { + uint32_t got_crc; + + got_crc = elf_crc32_file (state, ddescriptor, error_callback, data); + if (got_crc != debuglink_crc) + { + backtrace_close (ddescriptor, error_callback, data); + return -1; + } + } + + return ddescriptor; +} + +/* A function useful for setting a breakpoint for an inflation failure + when this code is compiled with -g. */ + +static void +elf_uncompress_failed(void) +{ +} + +/* *PVAL is the current value being read from the stream, and *PBITS + is the number of valid bits. Ensure that *PVAL holds at least 15 + bits by reading additional bits from *PPIN, up to PINEND, as + needed. Updates *PPIN, *PVAL and *PBITS. Returns 1 on success, 0 + on error. */ + +static int +elf_fetch_bits (const unsigned char **ppin, const unsigned char *pinend, + uint64_t *pval, unsigned int *pbits) +{ + unsigned int bits; + const unsigned char *pin; + uint64_t val; + uint32_t next; + + bits = *pbits; + if (bits >= 15) + return 1; + pin = *ppin; + val = *pval; + + if (unlikely (pinend - pin < 4)) + { + elf_uncompress_failed (); + return 0; + } + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) \ + && defined(__ORDER_BIG_ENDIAN__) \ + && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ \ + || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + /* We've ensured that PIN is aligned. */ + next = *(const uint32_t *)pin; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + next = __builtin_bswap32 (next); +#endif +#else + next = pin[0] | (pin[1] << 8) | (pin[2] << 16) | (pin[3] << 24); +#endif + + val |= (uint64_t)next << bits; + bits += 32; + pin += 4; + + /* We will need the next four bytes soon. */ + __builtin_prefetch (pin, 0, 0); + + *ppin = pin; + *pval = val; + *pbits = bits; + return 1; +} + +/* This is like elf_fetch_bits, but it fetchs the bits backward, and ensures at + least 16 bits. This is for zstd. 
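
   (Illustrative aside, not from the original source: zstd FSE and Huffman
   bit streams are laid out so that they are decoded from the last byte of
   the buffer toward the first, and the final byte carries a single 1 bit
   marking where the padding ends, roughly:

     |<---------------- compressed stream ---------------->|
     [ ....... payload bits ....... ][ 1 ][ 0-7 zero bits ]
                                       read from this end <--

   elf_fetch_backward_init below locates that marker bit with __builtin_clz
   and discards it before normal backward fetching begins.)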
*/ + +static int +elf_fetch_bits_backward (const unsigned char **ppin, + const unsigned char *pinend, + uint64_t *pval, unsigned int *pbits) +{ + unsigned int bits; + const unsigned char *pin; + uint64_t val; + uint32_t next; + + bits = *pbits; + if (bits >= 16) + return 1; + pin = *ppin; + val = *pval; + + if (unlikely (pin <= pinend)) + return 1; + + pin -= 4; + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) \ + && defined(__ORDER_BIG_ENDIAN__) \ + && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ \ + || __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + /* We've ensured that PIN is aligned. */ + next = *(const uint32_t *)pin; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + next = __builtin_bswap32 (next); +#endif +#else + next = pin[0] | (pin[1] << 8) | (pin[2] << 16) | (pin[3] << 24); +#endif + + val <<= 32; + val |= next; + bits += 32; + + if (unlikely (pin < pinend)) + { + val >>= (pinend - pin) * 8; + bits -= (pinend - pin) * 8; + } + + *ppin = pin; + *pval = val; + *pbits = bits; + return 1; +} + +/* Initialize backward fetching when the bitstream starts with a 1 bit in the + last byte in memory (which is the first one that we read). This is used by + zstd decompression. Returns 1 on success, 0 on error. */ + +static int +elf_fetch_backward_init (const unsigned char **ppin, + const unsigned char *pinend, + uint64_t *pval, unsigned int *pbits) +{ + const unsigned char *pin; + unsigned int stream_start; + uint64_t val; + unsigned int bits; + + pin = *ppin; + stream_start = (unsigned int)*pin; + if (unlikely (stream_start == 0)) + { + elf_uncompress_failed (); + return 0; + } + val = 0; + bits = 0; + + /* Align to a 32-bit boundary. */ + while ((((uintptr_t)pin) & 3) != 0) + { + val <<= 8; + val |= (uint64_t)*pin; + bits += 8; + --pin; + } + + val <<= 8; + val |= (uint64_t)*pin; + bits += 8; + + *ppin = pin; + *pval = val; + *pbits = bits; + if (!elf_fetch_bits_backward (ppin, pinend, pval, pbits)) + return 0; + + *pbits -= __builtin_clz (stream_start) - (sizeof (unsigned int) - 1) * 8 + 1; + + if (!elf_fetch_bits_backward (ppin, pinend, pval, pbits)) + return 0; + + return 1; +} + +/* Huffman code tables, like the rest of the zlib format, are defined + by RFC 1951. We store a Huffman code table as a series of tables + stored sequentially in memory. Each entry in a table is 16 bits. + The first, main, table has 256 entries. It is followed by a set of + secondary tables of length 2 to 128 entries. The maximum length of + a code sequence in the deflate format is 15 bits, so that is all we + need. Each secondary table has an index, which is the offset of + the table in the overall memory storage. + + The deflate format says that all codes of a given bit length are + lexicographically consecutive. Perhaps we could have 130 values + that require a 15-bit code, perhaps requiring three secondary + tables of size 128. I don't know if this is actually possible, but + it suggests that the maximum size required for secondary tables is + 3 * 128 + 3 * 64 ... == 768. The zlib enough program reports 660 + as the maximum. We permit 768, since in addition to the 256 for + the primary table, with two bytes per entry, and with the two + tables we need, that gives us a page. + + A single table entry needs to store a value or (for the main table + only) the index and size of a secondary table. Values range from 0 + to 285, inclusive. Secondary table indexes, per above, range from + 0 to 510. 
For a value we need to store the number of bits we need + to determine that value (one value may appear multiple times in the + table), which is 1 to 8. For a secondary table we need to store + the number of bits used to index into the table, which is 1 to 7. + And of course we need 1 bit to decide whether we have a value or a + secondary table index. So each entry needs 9 bits for value/table + index, 3 bits for size, 1 bit what it is. For simplicity we use 16 + bits per entry. */ + +/* Number of entries we allocate to for one code table. We get a page + for the two code tables we need. */ + +#define ZLIB_HUFFMAN_TABLE_SIZE (1024) + +/* Bit masks and shifts for the values in the table. */ + +#define ZLIB_HUFFMAN_VALUE_MASK 0x01ff +#define ZLIB_HUFFMAN_BITS_SHIFT 9 +#define ZLIB_HUFFMAN_BITS_MASK 0x7 +#define ZLIB_HUFFMAN_SECONDARY_SHIFT 12 + +/* For working memory while inflating we need two code tables, we need + an array of code lengths (max value 15, so we use unsigned char), + and an array of unsigned shorts used while building a table. The + latter two arrays must be large enough to hold the maximum number + of code lengths, which RFC 1951 defines as 286 + 30. */ + +#define ZLIB_TABLE_SIZE \ + (2 * ZLIB_HUFFMAN_TABLE_SIZE * sizeof (uint16_t) \ + + (286 + 30) * sizeof (uint16_t) \ + + (286 + 30) * sizeof (unsigned char)) + +#define ZLIB_TABLE_CODELEN_OFFSET \ + (2 * ZLIB_HUFFMAN_TABLE_SIZE * sizeof (uint16_t) \ + + (286 + 30) * sizeof (uint16_t)) + +#define ZLIB_TABLE_WORK_OFFSET \ + (2 * ZLIB_HUFFMAN_TABLE_SIZE * sizeof (uint16_t)) + +#ifdef BACKTRACE_GENERATE_FIXED_HUFFMAN_TABLE + +/* Used by the main function that generates the fixed table to learn + the table size. */ +static size_t final_next_secondary; + +#endif + +/* Build a Huffman code table from an array of lengths in CODES of + length CODES_LEN. The table is stored into *TABLE. ZDEBUG_TABLE + is the same as for elf_zlib_inflate, used to find some work space. + Returns 1 on success, 0 on error. */ + +static int +elf_zlib_inflate_table (unsigned char *codes, size_t codes_len, + uint16_t *zdebug_table, uint16_t *table) +{ + uint16_t count[16]; + uint16_t start[16]; + uint16_t prev[16]; + uint16_t firstcode[7]; + uint16_t *next; + size_t i; + size_t j; + unsigned int code; + size_t next_secondary; + + /* Count the number of code of each length. Set NEXT[val] to be the + next value after VAL with the same bit length. */ + + next = (uint16_t *) (((unsigned char *) zdebug_table) + + ZLIB_TABLE_WORK_OFFSET); + + memset (&count[0], 0, 16 * sizeof (uint16_t)); + for (i = 0; i < codes_len; ++i) + { + if (unlikely (codes[i] >= 16)) + { + elf_uncompress_failed (); + return 0; + } + + if (count[codes[i]] == 0) + { + start[codes[i]] = i; + prev[codes[i]] = i; + } + else + { + next[prev[codes[i]]] = i; + prev[codes[i]] = i; + } + + ++count[codes[i]]; + } + + /* For each length, fill in the table for the codes of that + length. */ + + memset (table, 0, ZLIB_HUFFMAN_TABLE_SIZE * sizeof (uint16_t)); + + /* Handle the values that do not require a secondary table. */ + + code = 0; + for (j = 1; j <= 8; ++j) + { + unsigned int jcnt; + unsigned int val; + + jcnt = count[j]; + if (jcnt == 0) + continue; + + if (unlikely (jcnt > (1U << j))) + { + elf_uncompress_failed (); + return 0; + } + + /* There are JCNT values that have this length, the values + starting from START[j] continuing through NEXT[VAL]. Those + values are assigned consecutive values starting at CODE. 
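
   (Illustrative aside, not from the original source: once the table is
   built, a primary-table entry is consumed roughly like this, using the
   masks defined above:

     uint16_t t = table[val & 0xff];
     if ((t & (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)) == 0)
       {
         unsigned int sym  = t & ZLIB_HUFFMAN_VALUE_MASK;          // 9 bits
         unsigned int used = ((t >> ZLIB_HUFFMAN_BITS_SHIFT)
                              & ZLIB_HUFFMAN_BITS_MASK) + 1;       // 1..8
       }
     else
       {
         // low 9 bits: offset of the secondary table (relative to +0x100),
         // bits 9-11: how many further input bits index that table
       }

   The filling loop below stores (j - 1) in the bits field of value entries,
   which is why the consumer adds 1 back.)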
*/ + + val = start[j]; + for (i = 0; i < jcnt; ++i) + { + uint16_t tval; + size_t ind; + unsigned int incr; + + /* In the compressed bit stream, the value VAL is encoded as + J bits with the value C. */ + + if (unlikely ((val & ~ZLIB_HUFFMAN_VALUE_MASK) != 0)) + { + elf_uncompress_failed (); + return 0; + } + + tval = val | ((j - 1) << ZLIB_HUFFMAN_BITS_SHIFT); + + /* The table lookup uses 8 bits. If J is less than 8, we + don't know what the other bits will be. We need to fill + in all possibilities in the table. Since the Huffman + code is unambiguous, those entries can't be used for any + other code. */ + + for (ind = code; ind < 0x100; ind += 1 << j) + { + if (unlikely (table[ind] != 0)) + { + elf_uncompress_failed (); + return 0; + } + table[ind] = tval; + } + + /* Advance to the next value with this length. */ + if (i + 1 < jcnt) + val = next[val]; + + /* The Huffman codes are stored in the bitstream with the + most significant bit first, as is required to make them + unambiguous. The effect is that when we read them from + the bitstream we see the bit sequence in reverse order: + the most significant bit of the Huffman code is the least + significant bit of the value we read from the bitstream. + That means that to make our table lookups work, we need + to reverse the bits of CODE. Since reversing bits is + tedious and in general requires using a table, we instead + increment CODE in reverse order. That is, if the number + of bits we are currently using, here named J, is 3, we + count as 000, 100, 010, 110, 001, 101, 011, 111, which is + to say the numbers from 0 to 7 but with the bits + reversed. Going to more bits, aka incrementing J, + effectively just adds more zero bits as the beginning, + and as such does not change the numeric value of CODE. + + To increment CODE of length J in reverse order, find the + most significant zero bit and set it to one while + clearing all higher bits. In other words, add 1 modulo + 2^J, only reversed. */ + + incr = 1U << (j - 1); + while ((code & incr) != 0) + incr >>= 1; + if (incr == 0) + code = 0; + else + { + code &= incr - 1; + code += incr; + } + } + } + + /* Handle the values that require a secondary table. */ + + /* Set FIRSTCODE, the number at which the codes start, for each + length. */ + + for (j = 9; j < 16; j++) + { + unsigned int jcnt; + unsigned int k; + + jcnt = count[j]; + if (jcnt == 0) + continue; + + /* There are JCNT values that have this length, the values + starting from START[j]. Those values are assigned + consecutive values starting at CODE. */ + + firstcode[j - 9] = code; + + /* Reverse add JCNT to CODE modulo 2^J. */ + for (k = 0; k < j; ++k) + { + if ((jcnt & (1U << k)) != 0) + { + unsigned int m; + unsigned int bit; + + bit = 1U << (j - k - 1); + for (m = 0; m < j - k; ++m, bit >>= 1) + { + if ((code & bit) == 0) + { + code += bit; + break; + } + code &= ~bit; + } + jcnt &= ~(1U << k); + } + } + if (unlikely (jcnt != 0)) + { + elf_uncompress_failed (); + return 0; + } + } + + /* For J from 9 to 15, inclusive, we store COUNT[J] consecutive + values starting at START[J] with consecutive codes starting at + FIRSTCODE[J - 9]. In the primary table we need to point to the + secondary table, and the secondary table will be indexed by J - 9 + bits. We count down from 15 so that we install the larger + secondary tables first, as the smaller ones may be embedded in + the larger ones. */ + + next_secondary = 0; /* Index of next secondary table (after primary). 
*/ + for (j = 15; j >= 9; j--) + { + unsigned int jcnt; + unsigned int val; + size_t primary; /* Current primary index. */ + size_t secondary; /* Offset to current secondary table. */ + size_t secondary_bits; /* Bit size of current secondary table. */ + + jcnt = count[j]; + if (jcnt == 0) + continue; + + val = start[j]; + code = firstcode[j - 9]; + primary = 0x100; + secondary = 0; + secondary_bits = 0; + for (i = 0; i < jcnt; ++i) + { + uint16_t tval; + size_t ind; + unsigned int incr; + + if ((code & 0xff) != primary) + { + uint16_t tprimary; + + /* Fill in a new primary table entry. */ + + primary = code & 0xff; + + tprimary = table[primary]; + if (tprimary == 0) + { + /* Start a new secondary table. */ + + if (unlikely ((next_secondary & ZLIB_HUFFMAN_VALUE_MASK) + != next_secondary)) + { + elf_uncompress_failed (); + return 0; + } + + secondary = next_secondary; + secondary_bits = j - 8; + next_secondary += 1 << secondary_bits; + table[primary] = (secondary + + ((j - 8) << ZLIB_HUFFMAN_BITS_SHIFT) + + (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)); + } + else + { + /* There is an existing entry. It had better be a + secondary table with enough bits. */ + if (unlikely ((tprimary + & (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)) + == 0)) + { + elf_uncompress_failed (); + return 0; + } + secondary = tprimary & ZLIB_HUFFMAN_VALUE_MASK; + secondary_bits = ((tprimary >> ZLIB_HUFFMAN_BITS_SHIFT) + & ZLIB_HUFFMAN_BITS_MASK); + if (unlikely (secondary_bits < j - 8)) + { + elf_uncompress_failed (); + return 0; + } + } + } + + /* Fill in secondary table entries. */ + + tval = val | ((j - 8) << ZLIB_HUFFMAN_BITS_SHIFT); + + for (ind = code >> 8; + ind < (1U << secondary_bits); + ind += 1U << (j - 8)) + { + if (unlikely (table[secondary + 0x100 + ind] != 0)) + { + elf_uncompress_failed (); + return 0; + } + table[secondary + 0x100 + ind] = tval; + } + + if (i + 1 < jcnt) + val = next[val]; + + incr = 1U << (j - 1); + while ((code & incr) != 0) + incr >>= 1; + if (incr == 0) + code = 0; + else + { + code &= incr - 1; + code += incr; + } + } + } + +#ifdef BACKTRACE_GENERATE_FIXED_HUFFMAN_TABLE + final_next_secondary = next_secondary; +#endif + + return 1; +} + +#ifdef BACKTRACE_GENERATE_FIXED_HUFFMAN_TABLE + +/* Used to generate the fixed Huffman table for block type 1. 
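
   (Aside, not part of the library build: this main function is only
   compiled when BACKTRACE_GENERATE_FIXED_HUFFMAN_TABLE is defined, as a
   stand-alone generator whose printf output is what appears below as
   elf_zlib_default_table and elf_zlib_default_dist_table; it therefore
   depends on stdio.  The fixed code lengths it feeds to
   elf_zlib_inflate_table are the RFC 1951 block-type-1 assignment:

     literal/length symbols   0..143  ->  8 bits
                            144..255  ->  9 bits
                            256..279  ->  7 bits
                            280..287  ->  8 bits
     distance symbols         0..31   ->  5 bits

   The analogous zstd FSE table generator later in the file works the same
   way.)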
*/ + +#include + +static uint16_t table[ZLIB_TABLE_SIZE]; +static unsigned char codes[288]; + +int +main () +{ + size_t i; + + for (i = 0; i <= 143; ++i) + codes[i] = 8; + for (i = 144; i <= 255; ++i) + codes[i] = 9; + for (i = 256; i <= 279; ++i) + codes[i] = 7; + for (i = 280; i <= 287; ++i) + codes[i] = 8; + if (!elf_zlib_inflate_table (&codes[0], 288, &table[0], &table[0])) + { + fprintf (stderr, "elf_zlib_inflate_table failed\n"); + exit (EXIT_FAILURE); + } + + printf ("static const uint16_t elf_zlib_default_table[%#zx] =\n", + final_next_secondary + 0x100); + printf ("{\n"); + for (i = 0; i < final_next_secondary + 0x100; i += 8) + { + size_t j; + + printf (" "); + for (j = i; j < final_next_secondary + 0x100 && j < i + 8; ++j) + printf (" %#x,", table[j]); + printf ("\n"); + } + printf ("};\n"); + printf ("\n"); + + for (i = 0; i < 32; ++i) + codes[i] = 5; + if (!elf_zlib_inflate_table (&codes[0], 32, &table[0], &table[0])) + { + fprintf (stderr, "elf_zlib_inflate_table failed\n"); + exit (EXIT_FAILURE); + } + + printf ("static const uint16_t elf_zlib_default_dist_table[%#zx] =\n", + final_next_secondary + 0x100); + printf ("{\n"); + for (i = 0; i < final_next_secondary + 0x100; i += 8) + { + size_t j; + + printf (" "); + for (j = i; j < final_next_secondary + 0x100 && j < i + 8; ++j) + printf (" %#x,", table[j]); + printf ("\n"); + } + printf ("};\n"); + + return 0; +} + +#endif + +/* The fixed tables generated by the #ifdef'ed out main function + above. */ + +static const uint16_t elf_zlib_default_table[0x170] = +{ + 0xd00, 0xe50, 0xe10, 0xf18, 0xd10, 0xe70, 0xe30, 0x1230, + 0xd08, 0xe60, 0xe20, 0x1210, 0xe00, 0xe80, 0xe40, 0x1250, + 0xd04, 0xe58, 0xe18, 0x1200, 0xd14, 0xe78, 0xe38, 0x1240, + 0xd0c, 0xe68, 0xe28, 0x1220, 0xe08, 0xe88, 0xe48, 0x1260, + 0xd02, 0xe54, 0xe14, 0xf1c, 0xd12, 0xe74, 0xe34, 0x1238, + 0xd0a, 0xe64, 0xe24, 0x1218, 0xe04, 0xe84, 0xe44, 0x1258, + 0xd06, 0xe5c, 0xe1c, 0x1208, 0xd16, 0xe7c, 0xe3c, 0x1248, + 0xd0e, 0xe6c, 0xe2c, 0x1228, 0xe0c, 0xe8c, 0xe4c, 0x1268, + 0xd01, 0xe52, 0xe12, 0xf1a, 0xd11, 0xe72, 0xe32, 0x1234, + 0xd09, 0xe62, 0xe22, 0x1214, 0xe02, 0xe82, 0xe42, 0x1254, + 0xd05, 0xe5a, 0xe1a, 0x1204, 0xd15, 0xe7a, 0xe3a, 0x1244, + 0xd0d, 0xe6a, 0xe2a, 0x1224, 0xe0a, 0xe8a, 0xe4a, 0x1264, + 0xd03, 0xe56, 0xe16, 0xf1e, 0xd13, 0xe76, 0xe36, 0x123c, + 0xd0b, 0xe66, 0xe26, 0x121c, 0xe06, 0xe86, 0xe46, 0x125c, + 0xd07, 0xe5e, 0xe1e, 0x120c, 0xd17, 0xe7e, 0xe3e, 0x124c, + 0xd0f, 0xe6e, 0xe2e, 0x122c, 0xe0e, 0xe8e, 0xe4e, 0x126c, + 0xd00, 0xe51, 0xe11, 0xf19, 0xd10, 0xe71, 0xe31, 0x1232, + 0xd08, 0xe61, 0xe21, 0x1212, 0xe01, 0xe81, 0xe41, 0x1252, + 0xd04, 0xe59, 0xe19, 0x1202, 0xd14, 0xe79, 0xe39, 0x1242, + 0xd0c, 0xe69, 0xe29, 0x1222, 0xe09, 0xe89, 0xe49, 0x1262, + 0xd02, 0xe55, 0xe15, 0xf1d, 0xd12, 0xe75, 0xe35, 0x123a, + 0xd0a, 0xe65, 0xe25, 0x121a, 0xe05, 0xe85, 0xe45, 0x125a, + 0xd06, 0xe5d, 0xe1d, 0x120a, 0xd16, 0xe7d, 0xe3d, 0x124a, + 0xd0e, 0xe6d, 0xe2d, 0x122a, 0xe0d, 0xe8d, 0xe4d, 0x126a, + 0xd01, 0xe53, 0xe13, 0xf1b, 0xd11, 0xe73, 0xe33, 0x1236, + 0xd09, 0xe63, 0xe23, 0x1216, 0xe03, 0xe83, 0xe43, 0x1256, + 0xd05, 0xe5b, 0xe1b, 0x1206, 0xd15, 0xe7b, 0xe3b, 0x1246, + 0xd0d, 0xe6b, 0xe2b, 0x1226, 0xe0b, 0xe8b, 0xe4b, 0x1266, + 0xd03, 0xe57, 0xe17, 0xf1f, 0xd13, 0xe77, 0xe37, 0x123e, + 0xd0b, 0xe67, 0xe27, 0x121e, 0xe07, 0xe87, 0xe47, 0x125e, + 0xd07, 0xe5f, 0xe1f, 0x120e, 0xd17, 0xe7f, 0xe3f, 0x124e, + 0xd0f, 0xe6f, 0xe2f, 0x122e, 0xe0f, 0xe8f, 0xe4f, 0x126e, + 0x290, 0x291, 0x292, 0x293, 0x294, 0x295, 0x296, 0x297, + 0x298, 0x299, 0x29a, 0x29b, 
0x29c, 0x29d, 0x29e, 0x29f, + 0x2a0, 0x2a1, 0x2a2, 0x2a3, 0x2a4, 0x2a5, 0x2a6, 0x2a7, + 0x2a8, 0x2a9, 0x2aa, 0x2ab, 0x2ac, 0x2ad, 0x2ae, 0x2af, + 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, + 0x2b8, 0x2b9, 0x2ba, 0x2bb, 0x2bc, 0x2bd, 0x2be, 0x2bf, + 0x2c0, 0x2c1, 0x2c2, 0x2c3, 0x2c4, 0x2c5, 0x2c6, 0x2c7, + 0x2c8, 0x2c9, 0x2ca, 0x2cb, 0x2cc, 0x2cd, 0x2ce, 0x2cf, + 0x2d0, 0x2d1, 0x2d2, 0x2d3, 0x2d4, 0x2d5, 0x2d6, 0x2d7, + 0x2d8, 0x2d9, 0x2da, 0x2db, 0x2dc, 0x2dd, 0x2de, 0x2df, + 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x2e5, 0x2e6, 0x2e7, + 0x2e8, 0x2e9, 0x2ea, 0x2eb, 0x2ec, 0x2ed, 0x2ee, 0x2ef, + 0x2f0, 0x2f1, 0x2f2, 0x2f3, 0x2f4, 0x2f5, 0x2f6, 0x2f7, + 0x2f8, 0x2f9, 0x2fa, 0x2fb, 0x2fc, 0x2fd, 0x2fe, 0x2ff, +}; + +static const uint16_t elf_zlib_default_dist_table[0x100] = +{ + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, + 0x800, 0x810, 0x808, 0x818, 0x804, 0x814, 0x80c, 0x81c, + 0x802, 0x812, 0x80a, 0x81a, 0x806, 0x816, 0x80e, 0x81e, + 0x801, 0x811, 0x809, 0x819, 0x805, 0x815, 0x80d, 0x81d, + 0x803, 0x813, 0x80b, 0x81b, 0x807, 0x817, 0x80f, 0x81f, +}; + +/* Inflate a zlib stream from PIN/SIN to POUT/SOUT. Return 1 on + success, 0 on some error parsing the stream. */ + +static int +elf_zlib_inflate (const unsigned char *pin, size_t sin, uint16_t *zdebug_table, + unsigned char *pout, size_t sout) +{ + unsigned char *porigout; + const unsigned char *pinend; + unsigned char *poutend; + + /* We can apparently see multiple zlib streams concatenated + together, so keep going as long as there is something to read. + The last 4 bytes are the checksum. */ + porigout = pout; + pinend = pin + sin; + poutend = pout + sout; + while ((pinend - pin) > 4) + { + uint64_t val; + unsigned int bits; + int last; + + /* Read the two byte zlib header. */ + + if (unlikely ((pin[0] & 0xf) != 8)) /* 8 is zlib encoding. */ + { + /* Unknown compression method. 
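
   (Illustrative summary, not from the original source: the two header bytes
   checked here are the RFC 1950 CMF and FLG bytes:

     cmf = pin[0];  flg = pin[1];
     (cmf & 0x0f) == 8              deflate compression method
     (cmf >> 4)   <= 7              log2(window size) - 8, window <= 32 KiB
     (flg & 0x20) == 0              no preset dictionary (FDICT clear)
     ((cmf << 8) | flg) % 31 == 0   header check

   which is exactly the sequence of tests that follows.)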
*/ + elf_uncompress_failed (); + return 0; + } + if (unlikely ((pin[0] >> 4) > 7)) + { + /* Window size too large. Other than this check, we don't + care about the window size. */ + elf_uncompress_failed (); + return 0; + } + if (unlikely ((pin[1] & 0x20) != 0)) + { + /* Stream expects a predefined dictionary, but we have no + dictionary. */ + elf_uncompress_failed (); + return 0; + } + val = (pin[0] << 8) | pin[1]; + if (unlikely (val % 31 != 0)) + { + /* Header check failure. */ + elf_uncompress_failed (); + return 0; + } + pin += 2; + + /* Align PIN to a 32-bit boundary. */ + + val = 0; + bits = 0; + while ((((uintptr_t) pin) & 3) != 0) + { + val |= (uint64_t)*pin << bits; + bits += 8; + ++pin; + } + + /* Read blocks until one is marked last. */ + + last = 0; + + while (!last) + { + unsigned int type; + const uint16_t *tlit; + const uint16_t *tdist; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + last = val & 1; + type = (val >> 1) & 3; + val >>= 3; + bits -= 3; + + if (unlikely (type == 3)) + { + /* Invalid block type. */ + elf_uncompress_failed (); + return 0; + } + + if (type == 0) + { + uint16_t len; + uint16_t lenc; + + /* An uncompressed block. */ + + /* If we've read ahead more than a byte, back up. */ + while (bits >= 8) + { + --pin; + bits -= 8; + } + + val = 0; + bits = 0; + if (unlikely ((pinend - pin) < 4)) + { + /* Missing length. */ + elf_uncompress_failed (); + return 0; + } + len = pin[0] | (pin[1] << 8); + lenc = pin[2] | (pin[3] << 8); + pin += 4; + lenc = ~lenc; + if (unlikely (len != lenc)) + { + /* Corrupt data. */ + elf_uncompress_failed (); + return 0; + } + if (unlikely (len > (unsigned int) (pinend - pin) + || len > (unsigned int) (poutend - pout))) + { + /* Not enough space in buffers. */ + elf_uncompress_failed (); + return 0; + } + memcpy (pout, pin, len); + pout += len; + pin += len; + + /* Align PIN. */ + while ((((uintptr_t) pin) & 3) != 0) + { + val |= (uint64_t)*pin << bits; + bits += 8; + ++pin; + } + + /* Go around to read the next block. */ + continue; + } + + if (type == 1) + { + tlit = elf_zlib_default_table; + tdist = elf_zlib_default_dist_table; + } + else + { + unsigned int nlit; + unsigned int ndist; + unsigned int nclen; + unsigned char codebits[19]; + unsigned char *plenbase; + unsigned char *plen; + unsigned char *plenend; + + /* Read a Huffman encoding table. The various magic + numbers here are from RFC 1951. */ + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + nlit = (val & 0x1f) + 257; + val >>= 5; + ndist = (val & 0x1f) + 1; + val >>= 5; + nclen = (val & 0xf) + 4; + val >>= 4; + bits -= 14; + if (unlikely (nlit > 286 || ndist > 30)) + { + /* Values out of range. */ + elf_uncompress_failed (); + return 0; + } + + /* Read and build the table used to compress the + literal, length, and distance codes. */ + + memset(&codebits[0], 0, 19); + + /* There are always at least 4 elements in the + table. 
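
   (Illustrative aside, not from the original source: the scrambled-looking
   assignment order in the reads that follow is the fixed code-length-code
   order from RFC 1951 section 3.2.7; written as an array it would be

     static const unsigned char cl_order[19] =
       { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };

   so codebits[16] is read first, then codebits[17], and so on.  Only the
   first NCLEN entries of that order appear in the stream, which is why the
   reads bail out through codebitsdone once NCLEN values have been
   consumed.)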
*/ + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + codebits[16] = val & 7; + codebits[17] = (val >> 3) & 7; + codebits[18] = (val >> 6) & 7; + codebits[0] = (val >> 9) & 7; + val >>= 12; + bits -= 12; + + if (nclen == 4) + goto codebitsdone; + + codebits[8] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 5) + goto codebitsdone; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + codebits[7] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 6) + goto codebitsdone; + + codebits[9] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 7) + goto codebitsdone; + + codebits[6] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 8) + goto codebitsdone; + + codebits[10] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 9) + goto codebitsdone; + + codebits[5] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 10) + goto codebitsdone; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + codebits[11] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 11) + goto codebitsdone; + + codebits[4] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 12) + goto codebitsdone; + + codebits[12] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 13) + goto codebitsdone; + + codebits[3] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 14) + goto codebitsdone; + + codebits[13] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 15) + goto codebitsdone; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + codebits[2] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 16) + goto codebitsdone; + + codebits[14] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 17) + goto codebitsdone; + + codebits[1] = val & 7; + val >>= 3; + bits -= 3; + + if (nclen == 18) + goto codebitsdone; + + codebits[15] = val & 7; + val >>= 3; + bits -= 3; + + codebitsdone: + + if (!elf_zlib_inflate_table (codebits, 19, zdebug_table, + zdebug_table)) + return 0; + + /* Read the compressed bit lengths of the literal, + length, and distance codes. We have allocated space + at the end of zdebug_table to hold them. */ + + plenbase = (((unsigned char *) zdebug_table) + + ZLIB_TABLE_CODELEN_OFFSET); + plen = plenbase; + plenend = plen + nlit + ndist; + while (plen < plenend) + { + uint16_t t; + unsigned int b; + uint16_t v; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + t = zdebug_table[val & 0xff]; + + /* The compression here uses bit lengths up to 7, so + a secondary table is never necessary. */ + if (unlikely ((t & (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)) + != 0)) + { + elf_uncompress_failed (); + return 0; + } + + b = (t >> ZLIB_HUFFMAN_BITS_SHIFT) & ZLIB_HUFFMAN_BITS_MASK; + val >>= b + 1; + bits -= b + 1; + + v = t & ZLIB_HUFFMAN_VALUE_MASK; + if (v < 16) + *plen++ = v; + else if (v == 16) + { + unsigned int c; + unsigned int prev; + + /* Copy previous entry 3 to 6 times. */ + + if (unlikely (plen == plenbase)) + { + elf_uncompress_failed (); + return 0; + } + + /* We used up to 7 bits since the last + elf_fetch_bits, so we have at least 8 bits + available here. 
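
   (Illustrative aside, not from the original source: the three repeat codes
   handled in this loop are defined by RFC 1951 section 3.2.7 as

     16: repeat the previous length  3..6   times (2 extra bits)
     17: emit length 0               3..10  times (3 extra bits)
     18: emit length 0              11..138 times (7 extra bits)

   which is why this branch and the two below read 2, 3 and 7 extra bits
   respectively.)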
*/ + + c = 3 + (val & 0x3); + val >>= 2; + bits -= 2; + if (unlikely ((unsigned int) (plenend - plen) < c)) + { + elf_uncompress_failed (); + return 0; + } + + prev = plen[-1]; + switch (c) + { + case 6: + *plen++ = prev; + ATTRIBUTE_FALLTHROUGH; + case 5: + *plen++ = prev; + ATTRIBUTE_FALLTHROUGH; + case 4: + *plen++ = prev; + } + *plen++ = prev; + *plen++ = prev; + *plen++ = prev; + } + else if (v == 17) + { + unsigned int c; + + /* Store zero 3 to 10 times. */ + + /* We used up to 7 bits since the last + elf_fetch_bits, so we have at least 8 bits + available here. */ + + c = 3 + (val & 0x7); + val >>= 3; + bits -= 3; + if (unlikely ((unsigned int) (plenend - plen) < c)) + { + elf_uncompress_failed (); + return 0; + } + + switch (c) + { + case 10: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 9: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 8: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 7: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 6: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 5: + *plen++ = 0; + ATTRIBUTE_FALLTHROUGH; + case 4: + *plen++ = 0; + } + *plen++ = 0; + *plen++ = 0; + *plen++ = 0; + } + else if (v == 18) + { + unsigned int c; + + /* Store zero 11 to 138 times. */ + + /* We used up to 7 bits since the last + elf_fetch_bits, so we have at least 8 bits + available here. */ + + c = 11 + (val & 0x7f); + val >>= 7; + bits -= 7; + if (unlikely ((unsigned int) (plenend - plen) < c)) + { + elf_uncompress_failed (); + return 0; + } + + memset (plen, 0, c); + plen += c; + } + else + { + elf_uncompress_failed (); + return 0; + } + } + + /* Make sure that the stop code can appear. */ + + plen = plenbase; + if (unlikely (plen[256] == 0)) + { + elf_uncompress_failed (); + return 0; + } + + /* Build the decompression tables. */ + + if (!elf_zlib_inflate_table (plen, nlit, zdebug_table, + zdebug_table)) + return 0; + if (!elf_zlib_inflate_table (plen + nlit, ndist, zdebug_table, + (zdebug_table + + ZLIB_HUFFMAN_TABLE_SIZE))) + return 0; + tlit = zdebug_table; + tdist = zdebug_table + ZLIB_HUFFMAN_TABLE_SIZE; + } + + /* Inflate values until the end of the block. This is the + main loop of the inflation code. */ + + while (1) + { + uint16_t t; + unsigned int b; + uint16_t v; + unsigned int lit; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + t = tlit[val & 0xff]; + b = (t >> ZLIB_HUFFMAN_BITS_SHIFT) & ZLIB_HUFFMAN_BITS_MASK; + v = t & ZLIB_HUFFMAN_VALUE_MASK; + + if ((t & (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)) == 0) + { + lit = v; + val >>= b + 1; + bits -= b + 1; + } + else + { + t = tlit[v + 0x100 + ((val >> 8) & ((1U << b) - 1))]; + b = (t >> ZLIB_HUFFMAN_BITS_SHIFT) & ZLIB_HUFFMAN_BITS_MASK; + lit = t & ZLIB_HUFFMAN_VALUE_MASK; + val >>= b + 8; + bits -= b + 8; + } + + if (lit < 256) + { + if (unlikely (pout == poutend)) + { + elf_uncompress_failed (); + return 0; + } + + *pout++ = lit; + + /* We will need to write the next byte soon. We ask + for high temporal locality because we will write + to the whole cache line soon. */ + __builtin_prefetch (pout, 1, 3); + } + else if (lit == 256) + { + /* The end of the block. */ + break; + } + else + { + unsigned int dist; + unsigned int len; + + /* Convert lit into a length. */ + + if (lit < 265) + len = lit - 257 + 3; + else if (lit == 285) + len = 258; + else if (unlikely (lit > 285)) + { + elf_uncompress_failed (); + return 0; + } + else + { + unsigned int extra; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + /* This is an expression for the table of length + codes in RFC 1951 3.2.5. 
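
   (Worked example, not from the original source: for code 270 the
   subtraction gives lit = 5, so

     extra = (5 >> 2) + 1             = 2
     len   = ((5 & 3) << 2)           = 4
     len  += 11                      -> 15
     len  += ((1 << 1) - 1) << 3     -> 23

   and the 2 extra bits read from the stream give lengths 23..26, matching
   the RFC 1951 length-code table.)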
*/ + lit -= 265; + extra = (lit >> 2) + 1; + len = (lit & 3) << extra; + len += 11; + len += ((1U << (extra - 1)) - 1) << 3; + len += val & ((1U << extra) - 1); + val >>= extra; + bits -= extra; + } + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + t = tdist[val & 0xff]; + b = (t >> ZLIB_HUFFMAN_BITS_SHIFT) & ZLIB_HUFFMAN_BITS_MASK; + v = t & ZLIB_HUFFMAN_VALUE_MASK; + + if ((t & (1U << ZLIB_HUFFMAN_SECONDARY_SHIFT)) == 0) + { + dist = v; + val >>= b + 1; + bits -= b + 1; + } + else + { + t = tdist[v + 0x100 + ((val >> 8) & ((1U << b) - 1))]; + b = ((t >> ZLIB_HUFFMAN_BITS_SHIFT) + & ZLIB_HUFFMAN_BITS_MASK); + dist = t & ZLIB_HUFFMAN_VALUE_MASK; + val >>= b + 8; + bits -= b + 8; + } + + /* Convert dist to a distance. */ + + if (dist == 0) + { + /* A distance of 1. A common case, meaning + repeat the last character LEN times. */ + + if (unlikely (pout == porigout)) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely ((unsigned int) (poutend - pout) < len)) + { + elf_uncompress_failed (); + return 0; + } + + memset (pout, pout[-1], len); + pout += len; + } + else if (unlikely (dist > 29)) + { + elf_uncompress_failed (); + return 0; + } + else + { + if (dist < 4) + dist = dist + 1; + else + { + unsigned int extra; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + /* This is an expression for the table of + distance codes in RFC 1951 3.2.5. */ + dist -= 4; + extra = (dist >> 1) + 1; + dist = (dist & 1) << extra; + dist += 5; + dist += ((1U << (extra - 1)) - 1) << 2; + dist += val & ((1U << extra) - 1); + val >>= extra; + bits -= extra; + } + + /* Go back dist bytes, and copy len bytes from + there. */ + + if (unlikely ((unsigned int) (pout - porigout) < dist)) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely ((unsigned int) (poutend - pout) < len)) + { + elf_uncompress_failed (); + return 0; + } + + if (dist >= len) + { + memcpy (pout, pout - dist, len); + pout += len; + } + else + { + while (len > 0) + { + unsigned int copy; + + copy = len < dist ? len : dist; + memcpy (pout, pout - dist, copy); + len -= copy; + pout += copy; + } + } + } + } + } + } + } + + /* We should have filled the output buffer. */ + if (unlikely (pout != poutend)) + { + elf_uncompress_failed (); + return 0; + } + + return 1; +} + +/* Verify the zlib checksum. The checksum is in the 4 bytes at + CHECKBYTES, and the uncompressed data is at UNCOMPRESSED / + UNCOMPRESSED_SIZE. Returns 1 on success, 0 on failure. */ + +static int +elf_zlib_verify_checksum (const unsigned char *checkbytes, + const unsigned char *uncompressed, + size_t uncompressed_size) +{ + unsigned int i; + unsigned int cksum; + const unsigned char *p; + uint32_t s1; + uint32_t s2; + size_t hsz; + + cksum = 0; + for (i = 0; i < 4; i++) + cksum = (cksum << 8) | checkbytes[i]; + + s1 = 1; + s2 = 0; + + /* Minimize modulo operations. */ + + p = uncompressed; + hsz = uncompressed_size; + while (hsz >= 5552) + { + for (i = 0; i < 5552; i += 16) + { + /* Manually unroll loop 16 times. 
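
   (Illustrative aside, not from the original source: these sums compute the
   zlib Adler-32 checksum.  s1 is 1 plus the sum of all bytes and s2 is the
   sum of the successive s1 values, both modulo 65521, and the stored
   checksum is (s2 << 16) | s1.  5552 is the usual zlib NMAX constant: the
   longest run of 0xff bytes that can be accumulated without s2 overflowing
   an unsigned 32-bit value, so the expensive modulo only has to be taken
   once per 5552-byte chunk.)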
*/ + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + } + hsz -= 5552; + s1 %= 65521; + s2 %= 65521; + } + + while (hsz >= 16) + { + /* Manually unroll loop 16 times. */ + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + s1 = s1 + *p++; + s2 = s2 + s1; + + hsz -= 16; + } + + for (i = 0; i < hsz; ++i) + { + s1 = s1 + *p++; + s2 = s2 + s1; + } + + s1 %= 65521; + s2 %= 65521; + + if (unlikely ((s2 << 16) + s1 != cksum)) + { + elf_uncompress_failed (); + return 0; + } + + return 1; +} + +/* Inflate a zlib stream from PIN/SIN to POUT/SOUT, and verify the + checksum. Return 1 on success, 0 on error. */ + +static int +elf_zlib_inflate_and_verify (const unsigned char *pin, size_t sin, + uint16_t *zdebug_table, unsigned char *pout, + size_t sout) +{ + if (!elf_zlib_inflate (pin, sin, zdebug_table, pout, sout)) + return 0; + if (!elf_zlib_verify_checksum (pin + sin - 4, pout, sout)) + return 0; + return 1; +} + +/* For working memory during zstd compression, we need + - a literal length FSE table: 512 64-bit values == 4096 bytes + - a match length FSE table: 512 64-bit values == 4096 bytes + - a offset FSE table: 256 64-bit values == 2048 bytes + - a Huffman tree: 2048 uint16_t values == 4096 bytes + - scratch space, one of + - to build an FSE table: 512 uint16_t values == 1024 bytes + - to build a Huffman tree: 512 uint16_t + 256 uint32_t == 2048 bytes +*/ + +#define ZSTD_TABLE_SIZE \ + (2 * 512 * sizeof (struct elf_zstd_fse_baseline_entry) \ + + 256 * sizeof (struct elf_zstd_fse_baseline_entry) \ + + 2048 * sizeof (uint16_t) \ + + 512 * sizeof (uint16_t) + 256 * sizeof (uint32_t)) + +#define ZSTD_TABLE_LITERAL_FSE_OFFSET (0) + +#define ZSTD_TABLE_MATCH_FSE_OFFSET \ + (512 * sizeof (struct elf_zstd_fse_baseline_entry)) + +#define ZSTD_TABLE_OFFSET_FSE_OFFSET \ + (ZSTD_TABLE_MATCH_FSE_OFFSET \ + + 512 * sizeof (struct elf_zstd_fse_baseline_entry)) + +#define ZSTD_TABLE_HUFFMAN_OFFSET \ + (ZSTD_TABLE_OFFSET_FSE_OFFSET \ + + 256 * sizeof (struct elf_zstd_fse_baseline_entry)) + +#define ZSTD_TABLE_WORK_OFFSET \ + (ZSTD_TABLE_HUFFMAN_OFFSET + 2048 * sizeof (uint16_t)) + +/* An entry in a zstd FSE table. */ + +struct elf_zstd_fse_entry +{ + /* The value that this FSE entry represents. */ + unsigned char symbol; + /* The number of bits to read to determine the next state. */ + unsigned char bits; + /* Add the bits to this base to get the next state. 
*/ + uint16_t base; +}; + +static int +elf_zstd_build_fse (const int16_t *, int, uint16_t *, int, + struct elf_zstd_fse_entry *); + +/* Read a zstd FSE table and build the decoding table in *TABLE, updating *PPIN + as it reads. ZDEBUG_TABLE is scratch space; it must be enough for 512 + uint16_t values (1024 bytes). MAXIDX is the maximum number of symbols + permitted. *TABLE_BITS is the maximum number of bits for symbols in the + table: the size of *TABLE is at least 1 << *TABLE_BITS. This updates + *TABLE_BITS to the actual number of bits. Returns 1 on success, 0 on + error. */ + +static int +elf_zstd_read_fse (const unsigned char **ppin, const unsigned char *pinend, + uint16_t *zdebug_table, int maxidx, + struct elf_zstd_fse_entry *table, int *table_bits) +{ + const unsigned char *pin; + int16_t *norm; + uint16_t *next; + uint64_t val; + unsigned int bits; + int accuracy_log; + uint32_t remaining; + uint32_t threshold; + int bits_needed; + int idx; + int prev0; + + pin = *ppin; + + norm = (int16_t *) zdebug_table; + next = zdebug_table + 256; + + if (unlikely (pin + 3 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + + /* Align PIN to a 32-bit boundary. */ + + val = 0; + bits = 0; + while ((((uintptr_t) pin) & 3) != 0) + { + val |= (uint64_t)*pin << bits; + bits += 8; + ++pin; + } + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + accuracy_log = (val & 0xf) + 5; + if (accuracy_log > *table_bits) + { + elf_uncompress_failed (); + return 0; + } + *table_bits = accuracy_log; + val >>= 4; + bits -= 4; + + /* This code is mostly copied from the reference implementation. */ + + /* The number of remaining probabilities, plus 1. This sets the number of + bits that need to be read for the next value. */ + remaining = (1 << accuracy_log) + 1; + + /* The current difference between small and large values, which depends on + the number of remaining values. Small values use one less bit. */ + threshold = 1 << accuracy_log; + + /* The number of bits used to compute threshold. */ + bits_needed = accuracy_log + 1; + + /* The next character value. */ + idx = 0; + + /* Whether the last count was 0. */ + prev0 = 0; + + while (remaining > 1 && idx <= maxidx) + { + uint32_t max; + int32_t count; + + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + + if (prev0) + { + int zidx; + + /* Previous count was 0, so there is a 2-bit repeat flag. If the + 2-bit flag is 0b11, it adds 3 and then there is another repeat + flag. */ + zidx = idx; + while ((val & 0xfff) == 0xfff) + { + zidx += 3 * 6; + val >>= 12; + bits -= 12; + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + } + while ((val & 3) == 3) + { + zidx += 3; + val >>= 2; + bits -= 2; + if (!elf_fetch_bits (&pin, pinend, &val, &bits)) + return 0; + } + /* We have at least 13 bits here, don't need to fetch. */ + zidx += val & 3; + val >>= 2; + bits -= 2; + + if (unlikely (zidx > maxidx)) + { + elf_uncompress_failed (); + return 0; + } + + for (; idx < zidx; idx++) + norm[idx] = 0; + + prev0 = 0; + continue; + } + + max = (2 * threshold - 1) - remaining; + if ((val & (threshold - 1)) < max) + { + /* A small value. */ + count = (int32_t) ((uint32_t) val & (threshold - 1)); + val >>= bits_needed - 1; + bits -= bits_needed - 1; + } + else + { + /* A large value. 
*/ + count = (int32_t) ((uint32_t) val & (2 * threshold - 1)); + if (count >= (int32_t) threshold) + count -= (int32_t) max; + val >>= bits_needed; + bits -= bits_needed; + } + + count--; + if (count >= 0) + remaining -= count; + else + remaining--; + if (unlikely (idx >= 256)) + { + elf_uncompress_failed (); + return 0; + } + norm[idx] = (int16_t) count; + ++idx; + + prev0 = count == 0; + + while (remaining < threshold) + { + bits_needed--; + threshold >>= 1; + } + } + + if (unlikely (remaining != 1)) + { + elf_uncompress_failed (); + return 0; + } + + /* If we've read ahead more than a byte, back up. */ + while (bits >= 8) + { + --pin; + bits -= 8; + } + + *ppin = pin; + + for (; idx <= maxidx; idx++) + norm[idx] = 0; + + return elf_zstd_build_fse (norm, idx, next, *table_bits, table); +} + +/* Build the FSE decoding table from a list of probabilities. This reads from + NORM of length IDX, uses NEXT as scratch space, and writes to *TABLE, whose + size is TABLE_BITS. */ + +static int +elf_zstd_build_fse (const int16_t *norm, int idx, uint16_t *next, + int table_bits, struct elf_zstd_fse_entry *table) +{ + int table_size; + int high_threshold; + int i; + int pos; + int step; + int mask; + + table_size = 1 << table_bits; + high_threshold = table_size - 1; + for (i = 0; i < idx; i++) + { + int16_t n; + + n = norm[i]; + if (n >= 0) + next[i] = (uint16_t) n; + else + { + table[high_threshold].symbol = (unsigned char) i; + high_threshold--; + next[i] = 1; + } + } + + pos = 0; + step = (table_size >> 1) + (table_size >> 3) + 3; + mask = table_size - 1; + for (i = 0; i < idx; i++) + { + int n; + int j; + + n = (int) norm[i]; + for (j = 0; j < n; j++) + { + table[pos].symbol = (unsigned char) i; + pos = (pos + step) & mask; + while (unlikely (pos > high_threshold)) + pos = (pos + step) & mask; + } + } + if (unlikely (pos != 0)) + { + elf_uncompress_failed (); + return 0; + } + + for (i = 0; i < table_size; i++) + { + unsigned char sym; + uint16_t next_state; + int high_bit; + int bits; + + sym = table[i].symbol; + next_state = next[sym]; + ++next[sym]; + + if (next_state == 0) + { + elf_uncompress_failed (); + return 0; + } + high_bit = 31 - __builtin_clz (next_state); + + bits = table_bits - high_bit; + table[i].bits = (unsigned char) bits; + table[i].base = (uint16_t) ((next_state << bits) - table_size); + } + + return 1; +} + +/* Encode the baseline and bits into a single 32-bit value. */ + +#define ZSTD_ENCODE_BASELINE_BITS(baseline, basebits) \ + ((uint32_t)(baseline) | ((uint32_t)(basebits) << 24)) + +#define ZSTD_DECODE_BASELINE(baseline_basebits) \ + ((uint32_t)(baseline_basebits) & 0xffffff) + +#define ZSTD_DECODE_BASEBITS(baseline_basebits) \ + ((uint32_t)(baseline_basebits) >> 24) + +/* Given a literal length code, we need to read a number of bits and add that + to a baseline. For states 0 to 15 the baseline is the state and the number + of bits is zero. 
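
   (Illustrative aside, not from the original source: taking the first entry
   of the table below, ZSTD_ENCODE_BASELINE_BITS(16, 1) packs baseline 16
   into the low 24 bits and the extra-bit count 1 into the high 8 bits, so
   decoding literal-length code 16 means reading one extra bit b and using
   length 16 + b, i.e. 16 or 17:

     uint32_t packed   = ZSTD_ENCODE_BASELINE_BITS (16, 1);
     uint32_t baseline = ZSTD_DECODE_BASELINE (packed);   // 16
     uint32_t nbits    = ZSTD_DECODE_BASEBITS (packed);   // 1

   The same packing is used for the match-length table further on.)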
*/ + +#define ZSTD_LITERAL_LENGTH_BASELINE_OFFSET (16) + +static const uint32_t elf_zstd_literal_length_base[] = +{ + ZSTD_ENCODE_BASELINE_BITS(16, 1), + ZSTD_ENCODE_BASELINE_BITS(18, 1), + ZSTD_ENCODE_BASELINE_BITS(20, 1), + ZSTD_ENCODE_BASELINE_BITS(22, 1), + ZSTD_ENCODE_BASELINE_BITS(24, 2), + ZSTD_ENCODE_BASELINE_BITS(28, 2), + ZSTD_ENCODE_BASELINE_BITS(32, 3), + ZSTD_ENCODE_BASELINE_BITS(40, 3), + ZSTD_ENCODE_BASELINE_BITS(48, 4), + ZSTD_ENCODE_BASELINE_BITS(64, 6), + ZSTD_ENCODE_BASELINE_BITS(128, 7), + ZSTD_ENCODE_BASELINE_BITS(256, 8), + ZSTD_ENCODE_BASELINE_BITS(512, 9), + ZSTD_ENCODE_BASELINE_BITS(1024, 10), + ZSTD_ENCODE_BASELINE_BITS(2048, 11), + ZSTD_ENCODE_BASELINE_BITS(4096, 12), + ZSTD_ENCODE_BASELINE_BITS(8192, 13), + ZSTD_ENCODE_BASELINE_BITS(16384, 14), + ZSTD_ENCODE_BASELINE_BITS(32768, 15), + ZSTD_ENCODE_BASELINE_BITS(65536, 16) +}; + +/* The same applies to match length codes. For states 0 to 31 the baseline is + the state + 3 and the number of bits is zero. */ + +#define ZSTD_MATCH_LENGTH_BASELINE_OFFSET (32) + +static const uint32_t elf_zstd_match_length_base[] = +{ + ZSTD_ENCODE_BASELINE_BITS(35, 1), + ZSTD_ENCODE_BASELINE_BITS(37, 1), + ZSTD_ENCODE_BASELINE_BITS(39, 1), + ZSTD_ENCODE_BASELINE_BITS(41, 1), + ZSTD_ENCODE_BASELINE_BITS(43, 2), + ZSTD_ENCODE_BASELINE_BITS(47, 2), + ZSTD_ENCODE_BASELINE_BITS(51, 3), + ZSTD_ENCODE_BASELINE_BITS(59, 3), + ZSTD_ENCODE_BASELINE_BITS(67, 4), + ZSTD_ENCODE_BASELINE_BITS(83, 4), + ZSTD_ENCODE_BASELINE_BITS(99, 5), + ZSTD_ENCODE_BASELINE_BITS(131, 7), + ZSTD_ENCODE_BASELINE_BITS(259, 8), + ZSTD_ENCODE_BASELINE_BITS(515, 9), + ZSTD_ENCODE_BASELINE_BITS(1027, 10), + ZSTD_ENCODE_BASELINE_BITS(2051, 11), + ZSTD_ENCODE_BASELINE_BITS(4099, 12), + ZSTD_ENCODE_BASELINE_BITS(8195, 13), + ZSTD_ENCODE_BASELINE_BITS(16387, 14), + ZSTD_ENCODE_BASELINE_BITS(32771, 15), + ZSTD_ENCODE_BASELINE_BITS(65539, 16) +}; + +/* An entry in an FSE table used for literal/match/length values. For these we + have to map the symbol to a baseline value, and we have to read zero or more + bits and add that value to the baseline value. Rather than look the values + up in a separate table, we grow the FSE table so that we get better memory + caching. */ + +struct elf_zstd_fse_baseline_entry +{ + /* The baseline for the value that this FSE entry represents.. */ + uint32_t baseline; + /* The number of bits to read to add to the baseline. */ + unsigned char basebits; + /* The number of bits to read to determine the next state. */ + unsigned char bits; + /* Add the bits to this base to get the next state. */ + uint16_t base; +}; + +/* Convert the literal length FSE table FSE_TABLE to an FSE baseline table at + BASELINE_TABLE. Note that FSE_TABLE and BASELINE_TABLE will overlap. */ + +static int +elf_zstd_make_literal_baseline_fse ( + const struct elf_zstd_fse_entry *fse_table, + int table_bits, + struct elf_zstd_fse_baseline_entry *baseline_table) +{ + size_t count; + const struct elf_zstd_fse_entry *pfse; + struct elf_zstd_fse_baseline_entry *pbaseline; + + /* Convert backward to avoid overlap. 
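
   (Aside, not from the original source: the backward walk matters because,
   per the table-size macros earlier in this section, the 4-byte
   elf_zstd_fse_entry values and the 8-byte elf_zstd_fse_baseline_entry
   values occupy the same buffer; converting from the last entry toward the
   first means each source entry is read before the wider destination entry
   can overwrite it.)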
*/ + + count = 1U << table_bits; + pfse = fse_table + count; + pbaseline = baseline_table + count; + while (pfse > fse_table) + { + unsigned char symbol; + unsigned char bits; + uint16_t base; + + --pfse; + --pbaseline; + symbol = pfse->symbol; + bits = pfse->bits; + base = pfse->base; + if (symbol < ZSTD_LITERAL_LENGTH_BASELINE_OFFSET) + { + pbaseline->baseline = (uint32_t)symbol; + pbaseline->basebits = 0; + } + else + { + unsigned int idx; + uint32_t basebits; + + if (unlikely (symbol > 35)) + { + elf_uncompress_failed (); + return 0; + } + idx = symbol - ZSTD_LITERAL_LENGTH_BASELINE_OFFSET; + basebits = elf_zstd_literal_length_base[idx]; + pbaseline->baseline = ZSTD_DECODE_BASELINE(basebits); + pbaseline->basebits = ZSTD_DECODE_BASEBITS(basebits); + } + pbaseline->bits = bits; + pbaseline->base = base; + } + + return 1; +} + +/* Convert the offset length FSE table FSE_TABLE to an FSE baseline table at + BASELINE_TABLE. Note that FSE_TABLE and BASELINE_TABLE will overlap. */ + +static int +elf_zstd_make_offset_baseline_fse ( + const struct elf_zstd_fse_entry *fse_table, + int table_bits, + struct elf_zstd_fse_baseline_entry *baseline_table) +{ + size_t count; + const struct elf_zstd_fse_entry *pfse; + struct elf_zstd_fse_baseline_entry *pbaseline; + + /* Convert backward to avoid overlap. */ + + count = 1U << table_bits; + pfse = fse_table + count; + pbaseline = baseline_table + count; + while (pfse > fse_table) + { + unsigned char symbol; + unsigned char bits; + uint16_t base; + + --pfse; + --pbaseline; + symbol = pfse->symbol; + bits = pfse->bits; + base = pfse->base; + if (unlikely (symbol > 31)) + { + elf_uncompress_failed (); + return 0; + } + + /* The simple way to write this is + + pbaseline->baseline = (uint32_t)1 << symbol; + pbaseline->basebits = symbol; + + That will give us an offset value that corresponds to the one + described in the RFC. However, for offset values > 3, we have to + subtract 3. And for offset values 1, 2, 3 we use a repeated offset. + The baseline is always a power of 2, and is never 0, so for these low + values we will see one entry that is baseline 1, basebits 0, and one + entry that is baseline 2, basebits 1. All other entries will have + baseline >= 4 and basebits >= 2. + + So we can check for RFC offset <= 3 by checking for basebits <= 1. + And that means that we can subtract 3 here and not worry about doing + it in the hot loop. */ + + pbaseline->baseline = (uint32_t)1 << symbol; + if (symbol >= 2) + pbaseline->baseline -= 3; + pbaseline->basebits = symbol; + pbaseline->bits = bits; + pbaseline->base = base; + } + + return 1; +} + +/* Convert the match length FSE table FSE_TABLE to an FSE baseline table at + BASELINE_TABLE. Note that FSE_TABLE and BASELINE_TABLE will overlap. */ + +static int +elf_zstd_make_match_baseline_fse ( + const struct elf_zstd_fse_entry *fse_table, + int table_bits, + struct elf_zstd_fse_baseline_entry *baseline_table) +{ + size_t count; + const struct elf_zstd_fse_entry *pfse; + struct elf_zstd_fse_baseline_entry *pbaseline; + + /* Convert backward to avoid overlap. 
*/ + + count = 1U << table_bits; + pfse = fse_table + count; + pbaseline = baseline_table + count; + while (pfse > fse_table) + { + unsigned char symbol; + unsigned char bits; + uint16_t base; + + --pfse; + --pbaseline; + symbol = pfse->symbol; + bits = pfse->bits; + base = pfse->base; + if (symbol < ZSTD_MATCH_LENGTH_BASELINE_OFFSET) + { + pbaseline->baseline = (uint32_t)symbol + 3; + pbaseline->basebits = 0; + } + else + { + unsigned int idx; + uint32_t basebits; + + if (unlikely (symbol > 52)) + { + elf_uncompress_failed (); + return 0; + } + idx = symbol - ZSTD_MATCH_LENGTH_BASELINE_OFFSET; + basebits = elf_zstd_match_length_base[idx]; + pbaseline->baseline = ZSTD_DECODE_BASELINE(basebits); + pbaseline->basebits = ZSTD_DECODE_BASEBITS(basebits); + } + pbaseline->bits = bits; + pbaseline->base = base; + } + + return 1; +} + +#ifdef BACKTRACE_GENERATE_ZSTD_FSE_TABLES + +/* Used to generate the predefined FSE decoding tables for zstd. */ + +#include + +/* These values are straight from RFC 8878. */ + +static int16_t lit[36] = +{ + 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, + -1,-1,-1,-1 +}; + +static int16_t match[53] = +{ + 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1, + -1,-1,-1,-1,-1 +}; + +static int16_t offset[29] = +{ + 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 +}; + +static uint16_t next[256]; + +static void +print_table (const struct elf_zstd_fse_baseline_entry *table, size_t size) +{ + size_t i; + + printf ("{\n"); + for (i = 0; i < size; i += 3) + { + int j; + + printf (" "); + for (j = 0; j < 3 && i + j < size; ++j) + printf (" { %u, %d, %d, %d },", table[i + j].baseline, + table[i + j].basebits, table[i + j].bits, + table[i + j].base); + printf ("\n"); + } + printf ("};\n"); +} + +int +main () +{ + struct elf_zstd_fse_entry lit_table[64]; + struct elf_zstd_fse_baseline_entry lit_baseline[64]; + struct elf_zstd_fse_entry match_table[64]; + struct elf_zstd_fse_baseline_entry match_baseline[64]; + struct elf_zstd_fse_entry offset_table[32]; + struct elf_zstd_fse_baseline_entry offset_baseline[32]; + + if (!elf_zstd_build_fse (lit, sizeof lit / sizeof lit[0], next, + 6, lit_table)) + { + fprintf (stderr, "elf_zstd_build_fse failed\n"); + exit (EXIT_FAILURE); + } + + if (!elf_zstd_make_literal_baseline_fse (lit_table, 6, lit_baseline)) + { + fprintf (stderr, "elf_zstd_make_literal_baseline_fse failed\n"); + exit (EXIT_FAILURE); + } + + printf ("static const struct elf_zstd_fse_baseline_entry " + "elf_zstd_lit_table[64] =\n"); + print_table (lit_baseline, + sizeof lit_baseline / sizeof lit_baseline[0]); + printf ("\n"); + + if (!elf_zstd_build_fse (match, sizeof match / sizeof match[0], next, + 6, match_table)) + { + fprintf (stderr, "elf_zstd_build_fse failed\n"); + exit (EXIT_FAILURE); + } + + if (!elf_zstd_make_match_baseline_fse (match_table, 6, match_baseline)) + { + fprintf (stderr, "elf_zstd_make_match_baseline_fse failed\n"); + exit (EXIT_FAILURE); + } + + printf ("static const struct elf_zstd_fse_baseline_entry " + "elf_zstd_match_table[64] =\n"); + print_table (match_baseline, + sizeof match_baseline / sizeof match_baseline[0]); + printf ("\n"); + + if (!elf_zstd_build_fse (offset, sizeof offset / sizeof offset[0], next, + 5, offset_table)) + { + fprintf (stderr, "elf_zstd_build_fse failed\n"); + exit (EXIT_FAILURE); + } + + if (!elf_zstd_make_offset_baseline_fse 
(offset_table, 5, offset_baseline)) + { + fprintf (stderr, "elf_zstd_make_offset_baseline_fse failed\n"); + exit (EXIT_FAILURE); + } + + printf ("static const struct elf_zstd_fse_baseline_entry " + "elf_zstd_offset_table[32] =\n"); + print_table (offset_baseline, + sizeof offset_baseline / sizeof offset_baseline[0]); + printf ("\n"); + + return 0; +} + +#endif + +/* The fixed tables generated by the #ifdef'ed out main function + above. */ + +static const struct elf_zstd_fse_baseline_entry elf_zstd_lit_table[64] = +{ + { 0, 0, 4, 0 }, { 0, 0, 4, 16 }, { 1, 0, 5, 32 }, + { 3, 0, 5, 0 }, { 4, 0, 5, 0 }, { 6, 0, 5, 0 }, + { 7, 0, 5, 0 }, { 9, 0, 5, 0 }, { 10, 0, 5, 0 }, + { 12, 0, 5, 0 }, { 14, 0, 6, 0 }, { 16, 1, 5, 0 }, + { 20, 1, 5, 0 }, { 22, 1, 5, 0 }, { 28, 2, 5, 0 }, + { 32, 3, 5, 0 }, { 48, 4, 5, 0 }, { 64, 6, 5, 32 }, + { 128, 7, 5, 0 }, { 256, 8, 6, 0 }, { 1024, 10, 6, 0 }, + { 4096, 12, 6, 0 }, { 0, 0, 4, 32 }, { 1, 0, 4, 0 }, + { 2, 0, 5, 0 }, { 4, 0, 5, 32 }, { 5, 0, 5, 0 }, + { 7, 0, 5, 32 }, { 8, 0, 5, 0 }, { 10, 0, 5, 32 }, + { 11, 0, 5, 0 }, { 13, 0, 6, 0 }, { 16, 1, 5, 32 }, + { 18, 1, 5, 0 }, { 22, 1, 5, 32 }, { 24, 2, 5, 0 }, + { 32, 3, 5, 32 }, { 40, 3, 5, 0 }, { 64, 6, 4, 0 }, + { 64, 6, 4, 16 }, { 128, 7, 5, 32 }, { 512, 9, 6, 0 }, + { 2048, 11, 6, 0 }, { 0, 0, 4, 48 }, { 1, 0, 4, 16 }, + { 2, 0, 5, 32 }, { 3, 0, 5, 32 }, { 5, 0, 5, 32 }, + { 6, 0, 5, 32 }, { 8, 0, 5, 32 }, { 9, 0, 5, 32 }, + { 11, 0, 5, 32 }, { 12, 0, 5, 32 }, { 15, 0, 6, 0 }, + { 18, 1, 5, 32 }, { 20, 1, 5, 32 }, { 24, 2, 5, 32 }, + { 28, 2, 5, 32 }, { 40, 3, 5, 32 }, { 48, 4, 5, 32 }, + { 65536, 16, 6, 0 }, { 32768, 15, 6, 0 }, { 16384, 14, 6, 0 }, + { 8192, 13, 6, 0 }, +}; + +static const struct elf_zstd_fse_baseline_entry elf_zstd_match_table[64] = +{ + { 3, 0, 6, 0 }, { 4, 0, 4, 0 }, { 5, 0, 5, 32 }, + { 6, 0, 5, 0 }, { 8, 0, 5, 0 }, { 9, 0, 5, 0 }, + { 11, 0, 5, 0 }, { 13, 0, 6, 0 }, { 16, 0, 6, 0 }, + { 19, 0, 6, 0 }, { 22, 0, 6, 0 }, { 25, 0, 6, 0 }, + { 28, 0, 6, 0 }, { 31, 0, 6, 0 }, { 34, 0, 6, 0 }, + { 37, 1, 6, 0 }, { 41, 1, 6, 0 }, { 47, 2, 6, 0 }, + { 59, 3, 6, 0 }, { 83, 4, 6, 0 }, { 131, 7, 6, 0 }, + { 515, 9, 6, 0 }, { 4, 0, 4, 16 }, { 5, 0, 4, 0 }, + { 6, 0, 5, 32 }, { 7, 0, 5, 0 }, { 9, 0, 5, 32 }, + { 10, 0, 5, 0 }, { 12, 0, 6, 0 }, { 15, 0, 6, 0 }, + { 18, 0, 6, 0 }, { 21, 0, 6, 0 }, { 24, 0, 6, 0 }, + { 27, 0, 6, 0 }, { 30, 0, 6, 0 }, { 33, 0, 6, 0 }, + { 35, 1, 6, 0 }, { 39, 1, 6, 0 }, { 43, 2, 6, 0 }, + { 51, 3, 6, 0 }, { 67, 4, 6, 0 }, { 99, 5, 6, 0 }, + { 259, 8, 6, 0 }, { 4, 0, 4, 32 }, { 4, 0, 4, 48 }, + { 5, 0, 4, 16 }, { 7, 0, 5, 32 }, { 8, 0, 5, 32 }, + { 10, 0, 5, 32 }, { 11, 0, 5, 32 }, { 14, 0, 6, 0 }, + { 17, 0, 6, 0 }, { 20, 0, 6, 0 }, { 23, 0, 6, 0 }, + { 26, 0, 6, 0 }, { 29, 0, 6, 0 }, { 32, 0, 6, 0 }, + { 65539, 16, 6, 0 }, { 32771, 15, 6, 0 }, { 16387, 14, 6, 0 }, + { 8195, 13, 6, 0 }, { 4099, 12, 6, 0 }, { 2051, 11, 6, 0 }, + { 1027, 10, 6, 0 }, +}; + +static const struct elf_zstd_fse_baseline_entry elf_zstd_offset_table[32] = +{ + { 1, 0, 5, 0 }, { 61, 6, 4, 0 }, { 509, 9, 5, 0 }, + { 32765, 15, 5, 0 }, { 2097149, 21, 5, 0 }, { 5, 3, 5, 0 }, + { 125, 7, 4, 0 }, { 4093, 12, 5, 0 }, { 262141, 18, 5, 0 }, + { 8388605, 23, 5, 0 }, { 29, 5, 5, 0 }, { 253, 8, 4, 0 }, + { 16381, 14, 5, 0 }, { 1048573, 20, 5, 0 }, { 1, 2, 5, 0 }, + { 125, 7, 4, 16 }, { 2045, 11, 5, 0 }, { 131069, 17, 5, 0 }, + { 4194301, 22, 5, 0 }, { 13, 4, 5, 0 }, { 253, 8, 4, 16 }, + { 8189, 13, 5, 0 }, { 524285, 19, 5, 0 }, { 2, 1, 5, 0 }, + { 61, 6, 4, 16 }, { 1021, 10, 5, 0 }, { 65533, 16, 5, 0 
}, + { 268435453, 28, 5, 0 }, { 134217725, 27, 5, 0 }, { 67108861, 26, 5, 0 }, + { 33554429, 25, 5, 0 }, { 16777213, 24, 5, 0 }, +}; + +/* Read a zstd Huffman table and build the decoding table in *TABLE, reading + and updating *PPIN. This sets *PTABLE_BITS to the number of bits of the + table, such that the table length is 1 << *TABLE_BITS. ZDEBUG_TABLE is + scratch space; it must be enough for 512 uint16_t values + 256 32-bit values + (2048 bytes). Returns 1 on success, 0 on error. */ + +static int +elf_zstd_read_huff (const unsigned char **ppin, const unsigned char *pinend, + uint16_t *zdebug_table, uint16_t *table, int *ptable_bits) +{ + const unsigned char *pin; + unsigned char hdr; + unsigned char *weights; + size_t count; + uint32_t *weight_mark; + size_t i; + uint32_t weight_mask; + size_t table_bits; + + pin = *ppin; + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + hdr = *pin; + ++pin; + + weights = (unsigned char *) zdebug_table; + + if (hdr < 128) + { + /* Table is compressed using FSE. */ + + struct elf_zstd_fse_entry *fse_table; + int fse_table_bits; + uint16_t *scratch; + const unsigned char *pfse; + const unsigned char *pback; + uint64_t val; + unsigned int bits; + unsigned int state1, state2; + + /* SCRATCH is used temporarily by elf_zstd_read_fse. It overlaps + WEIGHTS. */ + scratch = zdebug_table; + fse_table = (struct elf_zstd_fse_entry *) (scratch + 512); + fse_table_bits = 6; + + pfse = pin; + if (!elf_zstd_read_fse (&pfse, pinend, scratch, 255, fse_table, + &fse_table_bits)) + return 0; + + if (unlikely (pin + hdr > pinend)) + { + elf_uncompress_failed (); + return 0; + } + + /* We no longer need SCRATCH. Start recording weights. We need up to + 256 bytes of weights and 64 bytes of rank counts, so it won't overlap + FSE_TABLE. */ + + pback = pin + hdr - 1; + + if (!elf_fetch_backward_init (&pback, pfse, &val, &bits)) + return 0; + + bits -= fse_table_bits; + state1 = (val >> bits) & ((1U << fse_table_bits) - 1); + bits -= fse_table_bits; + state2 = (val >> bits) & ((1U << fse_table_bits) - 1); + + /* There are two independent FSE streams, tracked by STATE1 and STATE2. + We decode them alternately. 
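+	 Each state emits one Huffman weight per step: the entry for the
+	 current state names the weight, and the next few bits of the
+	 backward stream pick the following state.  When the stream runs out
+	 of bits, the loop below finishes by emitting the weights named by
+	 the two states it is currently in.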
*/ + + count = 0; + while (1) + { + struct elf_zstd_fse_entry *pt; + uint64_t v; + + pt = &fse_table[state1]; + + if (unlikely (pin < pinend) && bits < pt->bits) + { + if (unlikely (count >= 254)) + { + elf_uncompress_failed (); + return 0; + } + weights[count] = (unsigned char) pt->symbol; + weights[count + 1] = (unsigned char) fse_table[state2].symbol; + count += 2; + break; + } + + if (unlikely (pt->bits == 0)) + v = 0; + else + { + if (!elf_fetch_bits_backward (&pback, pfse, &val, &bits)) + return 0; + + bits -= pt->bits; + v = (val >> bits) & (((uint64_t)1 << pt->bits) - 1); + } + + state1 = pt->base + v; + + if (unlikely (count >= 255)) + { + elf_uncompress_failed (); + return 0; + } + + weights[count] = pt->symbol; + ++count; + + pt = &fse_table[state2]; + + if (unlikely (pin < pinend && bits < pt->bits)) + { + if (unlikely (count >= 254)) + { + elf_uncompress_failed (); + return 0; + } + weights[count] = (unsigned char) pt->symbol; + weights[count + 1] = (unsigned char) fse_table[state1].symbol; + count += 2; + break; + } + + if (unlikely (pt->bits == 0)) + v = 0; + else + { + if (!elf_fetch_bits_backward (&pback, pfse, &val, &bits)) + return 0; + + bits -= pt->bits; + v = (val >> bits) & (((uint64_t)1 << pt->bits) - 1); + } + + state2 = pt->base + v; + + if (unlikely (count >= 255)) + { + elf_uncompress_failed (); + return 0; + } + + weights[count] = pt->symbol; + ++count; + } + + pin += hdr; + } + else + { + /* Table is not compressed. Each weight is 4 bits. */ + + count = hdr - 127; + if (unlikely (pin + ((count + 1) / 2) >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + for (i = 0; i < count; i += 2) + { + unsigned char b; + + b = *pin; + ++pin; + weights[i] = b >> 4; + weights[i + 1] = b & 0xf; + } + } + + weight_mark = (uint32_t *) (weights + 256); + memset (weight_mark, 0, 13 * sizeof (uint32_t)); + weight_mask = 0; + for (i = 0; i < count; ++i) + { + unsigned char w; + + w = weights[i]; + if (unlikely (w > 12)) + { + elf_uncompress_failed (); + return 0; + } + ++weight_mark[w]; + if (w > 0) + weight_mask += 1U << (w - 1); + } + if (unlikely (weight_mask == 0)) + { + elf_uncompress_failed (); + return 0; + } + + table_bits = 32 - __builtin_clz (weight_mask); + if (unlikely (table_bits > 11)) + { + elf_uncompress_failed (); + return 0; + } + + /* Work out the last weight value, which is omitted because the weights must + sum to a power of two. */ + { + uint32_t left; + uint32_t high_bit; + + left = ((uint32_t)1 << table_bits) - weight_mask; + if (left == 0) + { + elf_uncompress_failed (); + return 0; + } + high_bit = 31 - __builtin_clz (left); + if (((uint32_t)1 << high_bit) != left) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely (count >= 256)) + { + elf_uncompress_failed (); + return 0; + } + + weights[count] = high_bit + 1; + ++count; + ++weight_mark[high_bit + 1]; + } + + if (weight_mark[1] < 2 || (weight_mark[1] & 1) != 0) + { + elf_uncompress_failed (); + return 0; + } + + /* Change WEIGHT_MARK from a count of weights to the index of the first + symbol for that weight. We shift the indexes to also store how many we + have seen so far, below. 
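+
+   As a small illustrative case (not taken from the zstd spec): with
+   TABLE_BITS of 3 and weight counts { weight 1: 2, weight 2: 1,
+   weight 3: 1 }, the slots used are 2*1 + 1*2 + 1*4 = 8 = 1 << 3, and this
+   loop turns WEIGHT_MARK into { [1] = 0, [2] = 2, [3] = 4 }: the table
+   index at which the entries for each weight begin, since a symbol of
+   weight W fills 1 << (W - 1) consecutive table slots.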
*/ + { + uint32_t next; + + next = 0; + for (i = 0; i < table_bits; ++i) + { + uint32_t cur; + + cur = next; + next += weight_mark[i + 1] << i; + weight_mark[i + 1] = cur; + } + } + + for (i = 0; i < count; ++i) + { + unsigned char weight; + uint32_t length; + uint16_t tval; + size_t start; + uint32_t j; + + weight = weights[i]; + if (weight == 0) + continue; + + length = 1U << (weight - 1); + tval = (i << 8) | (table_bits + 1 - weight); + start = weight_mark[weight]; + for (j = 0; j < length; ++j) + table[start + j] = tval; + weight_mark[weight] += length; + } + + *ppin = pin; + *ptable_bits = (int)table_bits; + + return 1; +} + +/* Read and decompress the literals and store them ending at POUTEND. This + works because we are going to use all the literals in the output, so they + must fit into the output buffer. HUFFMAN_TABLE, and PHUFFMAN_TABLE_BITS + store the Huffman table across calls. SCRATCH is used to read a Huffman + table. Store the start of the decompressed literals in *PPLIT. Update + *PPIN. Return 1 on success, 0 on error. */ + +static int +elf_zstd_read_literals (const unsigned char **ppin, + const unsigned char *pinend, + unsigned char *pout, + unsigned char *poutend, + uint16_t *scratch, + uint16_t *huffman_table, + int *phuffman_table_bits, + unsigned char **pplit) +{ + const unsigned char *pin; + unsigned char *plit; + unsigned char hdr; + uint32_t regenerated_size; + uint32_t compressed_size; + int streams; + uint32_t total_streams_size; + unsigned int huffman_table_bits; + uint64_t huffman_mask; + + pin = *ppin; + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + hdr = *pin; + ++pin; + + if ((hdr & 3) == 0 || (hdr & 3) == 1) + { + int raw; + + /* Raw_Literals_Block or RLE_Literals_Block */ + + raw = (hdr & 3) == 0; + + switch ((hdr >> 2) & 3) + { + case 0: case 2: + regenerated_size = hdr >> 3; + break; + case 1: + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + regenerated_size = (hdr >> 4) + ((uint32_t)(*pin) << 4); + ++pin; + break; + case 3: + if (unlikely (pin + 1 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + regenerated_size = ((hdr >> 4) + + ((uint32_t)*pin << 4) + + ((uint32_t)pin[1] << 12)); + pin += 2; + break; + default: + elf_uncompress_failed (); + return 0; + } + + if (unlikely ((size_t)(poutend - pout) < regenerated_size)) + { + elf_uncompress_failed (); + return 0; + } + + plit = poutend - regenerated_size; + + if (raw) + { + if (unlikely (pin + regenerated_size >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + memcpy (plit, pin, regenerated_size); + pin += regenerated_size; + } + else + { + if (pin >= pinend) + { + elf_uncompress_failed (); + return 0; + } + memset (plit, *pin, regenerated_size); + ++pin; + } + + *ppin = pin; + *pplit = plit; + + return 1; + } + + /* Compressed_Literals_Block or Treeless_Literals_Block */ + + switch ((hdr >> 2) & 3) + { + case 0: case 1: + if (unlikely (pin + 1 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + regenerated_size = (hdr >> 4) | ((uint32_t)(*pin & 0x3f) << 4); + compressed_size = (uint32_t)*pin >> 6 | ((uint32_t)pin[1] << 2); + pin += 2; + streams = ((hdr >> 2) & 3) == 0 ? 
1 : 4; + break; + case 2: + if (unlikely (pin + 2 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + regenerated_size = (((uint32_t)hdr >> 4) + | ((uint32_t)*pin << 4) + | (((uint32_t)pin[1] & 3) << 12)); + compressed_size = (((uint32_t)pin[1] >> 2) + | ((uint32_t)pin[2] << 6)); + pin += 3; + streams = 4; + break; + case 3: + if (unlikely (pin + 3 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + regenerated_size = (((uint32_t)hdr >> 4) + | ((uint32_t)*pin << 4) + | (((uint32_t)pin[1] & 0x3f) << 12)); + compressed_size = (((uint32_t)pin[1] >> 6) + | ((uint32_t)pin[2] << 2) + | ((uint32_t)pin[3] << 10)); + pin += 4; + streams = 4; + break; + default: + elf_uncompress_failed (); + return 0; + } + + if (unlikely (pin + compressed_size > pinend)) + { + elf_uncompress_failed (); + return 0; + } + + pinend = pin + compressed_size; + *ppin = pinend; + + if (unlikely ((size_t)(poutend - pout) < regenerated_size)) + { + elf_uncompress_failed (); + return 0; + } + + plit = poutend - regenerated_size; + + *pplit = plit; + + total_streams_size = compressed_size; + if ((hdr & 3) == 2) + { + const unsigned char *ptable; + + /* Compressed_Literals_Block. Read Huffman tree. */ + + ptable = pin; + if (!elf_zstd_read_huff (&ptable, pinend, scratch, huffman_table, + phuffman_table_bits)) + return 0; + + if (unlikely (total_streams_size < (size_t)(ptable - pin))) + { + elf_uncompress_failed (); + return 0; + } + + total_streams_size -= ptable - pin; + pin = ptable; + } + else + { + /* Treeless_Literals_Block. Reuse previous Huffman tree. */ + if (unlikely (*phuffman_table_bits == 0)) + { + elf_uncompress_failed (); + return 0; + } + } + + /* Decompress COMPRESSED_SIZE bytes of data at PIN using the huffman table, + storing REGENERATED_SIZE bytes of decompressed data at PLIT. */ + + huffman_table_bits = (unsigned int)*phuffman_table_bits; + huffman_mask = ((uint64_t)1 << huffman_table_bits) - 1; + + if (streams == 1) + { + const unsigned char *pback; + const unsigned char *pbackend; + uint64_t val; + unsigned int bits; + uint32_t i; + + pback = pin + total_streams_size - 1; + pbackend = pin; + if (!elf_fetch_backward_init (&pback, pbackend, &val, &bits)) + return 0; + + /* This is one of the inner loops of the decompression algorithm, so we + put some effort into optimization. We can't get more than 64 bytes + from a single call to elf_fetch_bits_backward, and we can't subtract + more than 11 bits at a time. 
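+	 Each table lookup needs at most HUFFMAN_TABLE_BITS bits, and
+	 HUFFMAN_TABLE_BITS is at most 11, so with at least 33 bits buffered
+	 the unrolled loop below can safely decode three symbols before
+	 rechecking, and with more than 11 bits it can decode one more.
+	 Every symbol consumes at least one bit, so a single 64-bit refill
+	 can never produce more than 64 literal bytes, which is why the fast
+	 path stops 64 bytes short of the end of the output.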
*/ + + if (regenerated_size >= 64) + { + unsigned char *plitstart; + unsigned char *plitstop; + + plitstart = plit; + plitstop = plit + regenerated_size - 64; + while (plit < plitstop) + { + uint16_t t; + + if (!elf_fetch_bits_backward (&pback, pbackend, &val, &bits)) + return 0; + + if (bits < 16) + break; + + while (bits >= 33) + { + t = huffman_table[(val >> (bits - huffman_table_bits)) + & huffman_mask]; + *plit = t >> 8; + ++plit; + bits -= t & 0xff; + + t = huffman_table[(val >> (bits - huffman_table_bits)) + & huffman_mask]; + *plit = t >> 8; + ++plit; + bits -= t & 0xff; + + t = huffman_table[(val >> (bits - huffman_table_bits)) + & huffman_mask]; + *plit = t >> 8; + ++plit; + bits -= t & 0xff; + } + + while (bits > 11) + { + t = huffman_table[(val >> (bits - huffman_table_bits)) + & huffman_mask]; + *plit = t >> 8; + ++plit; + bits -= t & 0xff; + } + } + + regenerated_size -= plit - plitstart; + } + + for (i = 0; i < regenerated_size; ++i) + { + uint16_t t; + + if (!elf_fetch_bits_backward (&pback, pbackend, &val, &bits)) + return 0; + + if (unlikely (bits < huffman_table_bits)) + { + t = huffman_table[(val << (huffman_table_bits - bits)) + & huffman_mask]; + if (unlikely (bits < (t & 0xff))) + { + elf_uncompress_failed (); + return 0; + } + } + else + t = huffman_table[(val >> (bits - huffman_table_bits)) + & huffman_mask]; + + *plit = t >> 8; + ++plit; + bits -= t & 0xff; + } + + return 1; + } + + { + uint32_t stream_size1, stream_size2, stream_size3, stream_size4; + uint32_t tot; + const unsigned char *pback1, *pback2, *pback3, *pback4; + const unsigned char *pbackend1, *pbackend2, *pbackend3, *pbackend4; + uint64_t val1, val2, val3, val4; + unsigned int bits1, bits2, bits3, bits4; + unsigned char *plit1, *plit2, *plit3, *plit4; + uint32_t regenerated_stream_size; + uint32_t regenerated_stream_size4; + uint16_t t1, t2, t3, t4; + uint32_t i; + uint32_t limit; + + /* Read jump table. */ + if (unlikely (pin + 5 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + stream_size1 = (uint32_t)*pin | ((uint32_t)pin[1] << 8); + pin += 2; + stream_size2 = (uint32_t)*pin | ((uint32_t)pin[1] << 8); + pin += 2; + stream_size3 = (uint32_t)*pin | ((uint32_t)pin[1] << 8); + pin += 2; + tot = stream_size1 + stream_size2 + stream_size3; + if (unlikely (tot > total_streams_size - 6)) + { + elf_uncompress_failed (); + return 0; + } + stream_size4 = total_streams_size - 6 - tot; + + pback1 = pin + stream_size1 - 1; + pbackend1 = pin; + + pback2 = pback1 + stream_size2; + pbackend2 = pback1 + 1; + + pback3 = pback2 + stream_size3; + pbackend3 = pback2 + 1; + + pback4 = pback3 + stream_size4; + pbackend4 = pback3 + 1; + + if (!elf_fetch_backward_init (&pback1, pbackend1, &val1, &bits1)) + return 0; + if (!elf_fetch_backward_init (&pback2, pbackend2, &val2, &bits2)) + return 0; + if (!elf_fetch_backward_init (&pback3, pbackend3, &val3, &bits3)) + return 0; + if (!elf_fetch_backward_init (&pback4, pbackend4, &val4, &bits4)) + return 0; + + regenerated_stream_size = (regenerated_size + 3) / 4; + + plit1 = plit; + plit2 = plit1 + regenerated_stream_size; + plit3 = plit2 + regenerated_stream_size; + plit4 = plit3 + regenerated_stream_size; + + regenerated_stream_size4 = regenerated_size - regenerated_stream_size * 3; + + /* We can't get more than 64 literal bytes from a single call to + elf_fetch_bits_backward. The fourth stream can be up to 3 bytes less, + so use as the limit. */ + + limit = regenerated_stream_size4 <= 64 ? 
0 : regenerated_stream_size4 - 64; + i = 0; + while (i < limit) + { + if (!elf_fetch_bits_backward (&pback1, pbackend1, &val1, &bits1)) + return 0; + if (!elf_fetch_bits_backward (&pback2, pbackend2, &val2, &bits2)) + return 0; + if (!elf_fetch_bits_backward (&pback3, pbackend3, &val3, &bits3)) + return 0; + if (!elf_fetch_bits_backward (&pback4, pbackend4, &val4, &bits4)) + return 0; + + /* We can't subtract more than 11 bits at a time. */ + + do + { + t1 = huffman_table[(val1 >> (bits1 - huffman_table_bits)) + & huffman_mask]; + t2 = huffman_table[(val2 >> (bits2 - huffman_table_bits)) + & huffman_mask]; + t3 = huffman_table[(val3 >> (bits3 - huffman_table_bits)) + & huffman_mask]; + t4 = huffman_table[(val4 >> (bits4 - huffman_table_bits)) + & huffman_mask]; + + *plit1 = t1 >> 8; + ++plit1; + bits1 -= t1 & 0xff; + + *plit2 = t2 >> 8; + ++plit2; + bits2 -= t2 & 0xff; + + *plit3 = t3 >> 8; + ++plit3; + bits3 -= t3 & 0xff; + + *plit4 = t4 >> 8; + ++plit4; + bits4 -= t4 & 0xff; + + ++i; + } + while (bits1 > 11 && bits2 > 11 && bits3 > 11 && bits4 > 11); + } + + while (i < regenerated_stream_size) + { + int use4; + + use4 = i < regenerated_stream_size4; + + if (!elf_fetch_bits_backward (&pback1, pbackend1, &val1, &bits1)) + return 0; + if (!elf_fetch_bits_backward (&pback2, pbackend2, &val2, &bits2)) + return 0; + if (!elf_fetch_bits_backward (&pback3, pbackend3, &val3, &bits3)) + return 0; + if (use4) + { + if (!elf_fetch_bits_backward (&pback4, pbackend4, &val4, &bits4)) + return 0; + } + + if (unlikely (bits1 < huffman_table_bits)) + { + t1 = huffman_table[(val1 << (huffman_table_bits - bits1)) + & huffman_mask]; + if (unlikely (bits1 < (t1 & 0xff))) + { + elf_uncompress_failed (); + return 0; + } + } + else + t1 = huffman_table[(val1 >> (bits1 - huffman_table_bits)) + & huffman_mask]; + + if (unlikely (bits2 < huffman_table_bits)) + { + t2 = huffman_table[(val2 << (huffman_table_bits - bits2)) + & huffman_mask]; + if (unlikely (bits2 < (t2 & 0xff))) + { + elf_uncompress_failed (); + return 0; + } + } + else + t2 = huffman_table[(val2 >> (bits2 - huffman_table_bits)) + & huffman_mask]; + + if (unlikely (bits3 < huffman_table_bits)) + { + t3 = huffman_table[(val3 << (huffman_table_bits - bits3)) + & huffman_mask]; + if (unlikely (bits3 < (t3 & 0xff))) + { + elf_uncompress_failed (); + return 0; + } + } + else + t3 = huffman_table[(val3 >> (bits3 - huffman_table_bits)) + & huffman_mask]; + + if (use4) + { + if (unlikely (bits4 < huffman_table_bits)) + { + t4 = huffman_table[(val4 << (huffman_table_bits - bits4)) + & huffman_mask]; + if (unlikely (bits4 < (t4 & 0xff))) + { + elf_uncompress_failed (); + return 0; + } + } + else + t4 = huffman_table[(val4 >> (bits4 - huffman_table_bits)) + & huffman_mask]; + + *plit4 = t4 >> 8; + ++plit4; + bits4 -= t4 & 0xff; + } + + *plit1 = t1 >> 8; + ++plit1; + bits1 -= t1 & 0xff; + + *plit2 = t2 >> 8; + ++plit2; + bits2 -= t2 & 0xff; + + *plit3 = t3 >> 8; + ++plit3; + bits3 -= t3 & 0xff; + + ++i; + } + } + + return 1; +} + +/* The information used to decompress a sequence code, which can be a literal + length, an offset, or a match length. */ + +struct elf_zstd_seq_decode +{ + const struct elf_zstd_fse_baseline_entry *table; + int table_bits; +}; + +/* Unpack a sequence code compression mode. 
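+   MODE is the two bit compression mode field from the sequences section
+   header.  Following RFC 8878 the values are: 0 predefined (use the fixed
+   table passed in as PREDEF), 1 RLE (a single symbol byte follows), 2 FSE
+   compressed (a full FSE table description follows), and 3 repeat (keep
+   using the table from the previous block).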
*/ + +static int +elf_zstd_unpack_seq_decode (int mode, + const unsigned char **ppin, + const unsigned char *pinend, + const struct elf_zstd_fse_baseline_entry *predef, + int predef_bits, + uint16_t *scratch, + int maxidx, + struct elf_zstd_fse_baseline_entry *table, + int table_bits, + int (*conv)(const struct elf_zstd_fse_entry *, + int, + struct elf_zstd_fse_baseline_entry *), + struct elf_zstd_seq_decode *decode) +{ + switch (mode) + { + case 0: + decode->table = predef; + decode->table_bits = predef_bits; + break; + + case 1: + { + struct elf_zstd_fse_entry entry; + + if (unlikely (*ppin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + entry.symbol = **ppin; + ++*ppin; + entry.bits = 0; + entry.base = 0; + decode->table_bits = 0; + if (!conv (&entry, 0, table)) + return 0; + } + break; + + case 2: + { + struct elf_zstd_fse_entry *fse_table; + + /* We use the same space for the simple FSE table and the baseline + table. */ + fse_table = (struct elf_zstd_fse_entry *)table; + decode->table_bits = table_bits; + if (!elf_zstd_read_fse (ppin, pinend, scratch, maxidx, fse_table, + &decode->table_bits)) + return 0; + if (!conv (fse_table, decode->table_bits, table)) + return 0; + decode->table = table; + } + break; + + case 3: + if (unlikely (decode->table_bits == -1)) + { + elf_uncompress_failed (); + return 0; + } + break; + + default: + elf_uncompress_failed (); + return 0; + } + + return 1; +} + +/* Decompress a zstd stream from PIN/SIN to POUT/SOUT. Code based on RFC 8878. + Return 1 on success, 0 on error. */ + +static int +elf_zstd_decompress (const unsigned char *pin, size_t sin, + unsigned char *zdebug_table, unsigned char *pout, + size_t sout) +{ + const unsigned char *pinend; + unsigned char *poutstart; + unsigned char *poutend; + struct elf_zstd_seq_decode literal_decode; + struct elf_zstd_fse_baseline_entry *literal_fse_table; + struct elf_zstd_seq_decode match_decode; + struct elf_zstd_fse_baseline_entry *match_fse_table; + struct elf_zstd_seq_decode offset_decode; + struct elf_zstd_fse_baseline_entry *offset_fse_table; + uint16_t *huffman_table; + int huffman_table_bits; + uint32_t repeated_offset1; + uint32_t repeated_offset2; + uint32_t repeated_offset3; + uint16_t *scratch; + unsigned char hdr; + int has_checksum; + uint64_t content_size; + int last_block; + + pinend = pin + sin; + poutstart = pout; + poutend = pout + sout; + + literal_decode.table = NULL; + literal_decode.table_bits = -1; + literal_fse_table = ((struct elf_zstd_fse_baseline_entry *) + (zdebug_table + ZSTD_TABLE_LITERAL_FSE_OFFSET)); + + match_decode.table = NULL; + match_decode.table_bits = -1; + match_fse_table = ((struct elf_zstd_fse_baseline_entry *) + (zdebug_table + ZSTD_TABLE_MATCH_FSE_OFFSET)); + + offset_decode.table = NULL; + offset_decode.table_bits = -1; + offset_fse_table = ((struct elf_zstd_fse_baseline_entry *) + (zdebug_table + ZSTD_TABLE_OFFSET_FSE_OFFSET)); + huffman_table = ((uint16_t *) + (zdebug_table + ZSTD_TABLE_HUFFMAN_OFFSET)); + huffman_table_bits = 0; + scratch = ((uint16_t *) + (zdebug_table + ZSTD_TABLE_WORK_OFFSET)); + + repeated_offset1 = 1; + repeated_offset2 = 4; + repeated_offset3 = 8; + + if (unlikely (sin < 4)) + { + elf_uncompress_failed (); + return 0; + } + + /* These values are the zstd magic number. 
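+     The zstd frame magic number is 0xFD2FB528, stored little-endian, which
+     is why the bytes are checked in the order 0x28, 0xb5, 0x2f, 0xfd.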
*/ + if (unlikely (pin[0] != 0x28 + || pin[1] != 0xb5 + || pin[2] != 0x2f + || pin[3] != 0xfd)) + { + elf_uncompress_failed (); + return 0; + } + + pin += 4; + + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + + hdr = *pin++; + + /* We expect a single frame. */ + if (unlikely ((hdr & (1 << 5)) == 0)) + { + elf_uncompress_failed (); + return 0; + } + /* Reserved bit must be zero. */ + if (unlikely ((hdr & (1 << 3)) != 0)) + { + elf_uncompress_failed (); + return 0; + } + /* We do not expect a dictionary. */ + if (unlikely ((hdr & 3) != 0)) + { + elf_uncompress_failed (); + return 0; + } + has_checksum = (hdr & (1 << 2)) != 0; + switch (hdr >> 6) + { + case 0: + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + content_size = (uint64_t) *pin++; + break; + case 1: + if (unlikely (pin + 1 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + content_size = (((uint64_t) pin[0]) | (((uint64_t) pin[1]) << 8)) + 256; + pin += 2; + break; + case 2: + if (unlikely (pin + 3 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + content_size = ((uint64_t) pin[0] + | (((uint64_t) pin[1]) << 8) + | (((uint64_t) pin[2]) << 16) + | (((uint64_t) pin[3]) << 24)); + pin += 4; + break; + case 3: + if (unlikely (pin + 7 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + content_size = ((uint64_t) pin[0] + | (((uint64_t) pin[1]) << 8) + | (((uint64_t) pin[2]) << 16) + | (((uint64_t) pin[3]) << 24) + | (((uint64_t) pin[4]) << 32) + | (((uint64_t) pin[5]) << 40) + | (((uint64_t) pin[6]) << 48) + | (((uint64_t) pin[7]) << 56)); + pin += 8; + break; + default: + elf_uncompress_failed (); + return 0; + } + + if (unlikely (content_size != (size_t) content_size + || (size_t) content_size != sout)) + { + elf_uncompress_failed (); + return 0; + } + + last_block = 0; + while (!last_block) + { + uint32_t block_hdr; + int block_type; + uint32_t block_size; + + if (unlikely (pin + 2 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + block_hdr = ((uint32_t) pin[0] + | (((uint32_t) pin[1]) << 8) + | (((uint32_t) pin[2]) << 16)); + pin += 3; + + last_block = block_hdr & 1; + block_type = (block_hdr >> 1) & 3; + block_size = block_hdr >> 3; + + switch (block_type) + { + case 0: + /* Raw_Block */ + if (unlikely ((size_t) block_size > (size_t) (pinend - pin))) + { + elf_uncompress_failed (); + return 0; + } + if (unlikely ((size_t) block_size > (size_t) (poutend - pout))) + { + elf_uncompress_failed (); + return 0; + } + memcpy (pout, pin, block_size); + pout += block_size; + pin += block_size; + break; + + case 1: + /* RLE_Block */ + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + if (unlikely ((size_t) block_size > (size_t) (poutend - pout))) + { + elf_uncompress_failed (); + return 0; + } + memset (pout, *pin, block_size); + pout += block_size; + pin++; + break; + + case 2: + { + const unsigned char *pblockend; + unsigned char *plitstack; + unsigned char *plit; + uint32_t literal_count; + unsigned char seq_hdr; + size_t seq_count; + size_t seq; + const unsigned char *pback; + uint64_t val; + unsigned int bits; + unsigned int literal_state; + unsigned int offset_state; + unsigned int match_state; + + /* Compressed_Block */ + if (unlikely ((size_t) block_size > (size_t) (pinend - pin))) + { + elf_uncompress_failed (); + return 0; + } + + pblockend = pin + block_size; + + /* Read the literals into the end of the output space, and leave + PLIT pointing at them. 
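+
+	     During a compressed block the output buffer is used from both
+	     ends, roughly:
+
+	       pout ...(sequences write here)... plit ...literals... poutend
+
+	     The sequence loop consumes literals from PLIT while writing at
+	     POUT; POUT can never pass PLIT, because everything still ahead
+	     of PLIT is a literal that has yet to be copied out, but the two
+	     can get close, which is why the literal copy below is done in
+	     chunks when the regions overlap.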
*/ + + if (!elf_zstd_read_literals (&pin, pblockend, pout, poutend, + scratch, huffman_table, + &huffman_table_bits, + &plitstack)) + return 0; + plit = plitstack; + literal_count = poutend - plit; + + seq_hdr = *pin; + pin++; + if (seq_hdr < 128) + seq_count = seq_hdr; + else if (seq_hdr < 255) + { + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + seq_count = ((seq_hdr - 128) << 8) + *pin; + pin++; + } + else + { + if (unlikely (pin + 1 >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + seq_count = *pin + (pin[1] << 8) + 0x7f00; + pin += 2; + } + + if (seq_count > 0) + { + int (*pfn)(const struct elf_zstd_fse_entry *, + int, struct elf_zstd_fse_baseline_entry *); + + if (unlikely (pin >= pinend)) + { + elf_uncompress_failed (); + return 0; + } + seq_hdr = *pin; + ++pin; + + pfn = elf_zstd_make_literal_baseline_fse; + if (!elf_zstd_unpack_seq_decode ((seq_hdr >> 6) & 3, + &pin, pinend, + &elf_zstd_lit_table[0], 6, + scratch, 35, + literal_fse_table, 9, pfn, + &literal_decode)) + return 0; + + pfn = elf_zstd_make_offset_baseline_fse; + if (!elf_zstd_unpack_seq_decode ((seq_hdr >> 4) & 3, + &pin, pinend, + &elf_zstd_offset_table[0], 5, + scratch, 31, + offset_fse_table, 8, pfn, + &offset_decode)) + return 0; + + pfn = elf_zstd_make_match_baseline_fse; + if (!elf_zstd_unpack_seq_decode ((seq_hdr >> 2) & 3, + &pin, pinend, + &elf_zstd_match_table[0], 6, + scratch, 52, + match_fse_table, 9, pfn, + &match_decode)) + return 0; + } + + pback = pblockend - 1; + if (!elf_fetch_backward_init (&pback, pin, &val, &bits)) + return 0; + + bits -= literal_decode.table_bits; + literal_state = ((val >> bits) + & ((1U << literal_decode.table_bits) - 1)); + + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= offset_decode.table_bits; + offset_state = ((val >> bits) + & ((1U << offset_decode.table_bits) - 1)); + + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= match_decode.table_bits; + match_state = ((val >> bits) + & ((1U << match_decode.table_bits) - 1)); + + seq = 0; + while (1) + { + const struct elf_zstd_fse_baseline_entry *pt; + uint32_t offset_basebits; + uint32_t offset_baseline; + uint32_t offset_bits; + uint32_t offset_base; + uint32_t offset; + uint32_t match_baseline; + uint32_t match_bits; + uint32_t match_base; + uint32_t match; + uint32_t literal_baseline; + uint32_t literal_bits; + uint32_t literal_base; + uint32_t literal; + uint32_t need; + uint32_t add; + + pt = &offset_decode.table[offset_state]; + offset_basebits = pt->basebits; + offset_baseline = pt->baseline; + offset_bits = pt->bits; + offset_base = pt->base; + + /* This case can be more than 16 bits, which is all that + elf_fetch_bits_backward promises. 
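+	     For example, with 23 extra offset bits to read, the code below
+	     first takes 16 bits, shifts them left by the remaining 7, then
+	     takes those 7 bits and adds them in, so it reconstructs the full
+	     23 bit value without ever asking for more than 16 bits at once.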
*/ + need = offset_basebits; + add = 0; + if (unlikely (need > 16)) + { + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= 16; + add = (val >> bits) & ((1U << 16) - 1); + need -= 16; + add <<= need; + } + if (need > 0) + { + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= need; + add += (val >> bits) & ((1U << need) - 1); + } + + offset = offset_baseline + add; + + pt = &match_decode.table[match_state]; + need = pt->basebits; + match_baseline = pt->baseline; + match_bits = pt->bits; + match_base = pt->base; + + add = 0; + if (need > 0) + { + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= need; + add = (val >> bits) & ((1U << need) - 1); + } + + match = match_baseline + add; + + pt = &literal_decode.table[literal_state]; + need = pt->basebits; + literal_baseline = pt->baseline; + literal_bits = pt->bits; + literal_base = pt->base; + + add = 0; + if (need > 0) + { + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + bits -= need; + add = (val >> bits) & ((1U << need) - 1); + } + + literal = literal_baseline + add; + + /* See the comment in elf_zstd_make_offset_baseline_fse. */ + if (offset_basebits > 1) + { + repeated_offset3 = repeated_offset2; + repeated_offset2 = repeated_offset1; + repeated_offset1 = offset; + } + else + { + if (unlikely (literal == 0)) + ++offset; + switch (offset) + { + case 1: + offset = repeated_offset1; + break; + case 2: + offset = repeated_offset2; + repeated_offset2 = repeated_offset1; + repeated_offset1 = offset; + break; + case 3: + offset = repeated_offset3; + repeated_offset3 = repeated_offset2; + repeated_offset2 = repeated_offset1; + repeated_offset1 = offset; + break; + case 4: + offset = repeated_offset1 - 1; + repeated_offset3 = repeated_offset2; + repeated_offset2 = repeated_offset1; + repeated_offset1 = offset; + break; + } + } + + ++seq; + if (seq < seq_count) + { + uint32_t v; + + /* Update the three states. */ + + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + + need = literal_bits; + bits -= need; + v = (val >> bits) & (((uint32_t)1 << need) - 1); + + literal_state = literal_base + v; + + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + + need = match_bits; + bits -= need; + v = (val >> bits) & (((uint32_t)1 << need) - 1); + + match_state = match_base + v; + + if (!elf_fetch_bits_backward (&pback, pin, &val, &bits)) + return 0; + + need = offset_bits; + bits -= need; + v = (val >> bits) & (((uint32_t)1 << need) - 1); + + offset_state = offset_base + v; + } + + /* The next sequence is now in LITERAL, OFFSET, MATCH. */ + + /* Copy LITERAL bytes from the literals. */ + + if (unlikely ((size_t)(poutend - pout) < literal)) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely (literal_count < literal)) + { + elf_uncompress_failed (); + return 0; + } + + literal_count -= literal; + + /* Often LITERAL is small, so handle small cases quickly. 
*/ + switch (literal) + { + case 8: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 7: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 6: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 5: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 4: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 3: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 2: + *pout++ = *plit++; + /* FALLTHROUGH */ + case 1: + *pout++ = *plit++; + break; + + case 0: + break; + + default: + if (unlikely ((size_t)(plit - pout) < literal)) + { + uint32_t move; + + move = plit - pout; + while (literal > move) + { + memcpy (pout, plit, move); + pout += move; + plit += move; + literal -= move; + } + } + + memcpy (pout, plit, literal); + pout += literal; + plit += literal; + } + + if (match > 0) + { + /* Copy MATCH bytes from the decoded output at OFFSET. */ + + if (unlikely ((size_t)(poutend - pout) < match)) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely ((size_t)(pout - poutstart) < offset)) + { + elf_uncompress_failed (); + return 0; + } + + if (offset >= match) + { + memcpy (pout, pout - offset, match); + pout += match; + } + else + { + while (match > 0) + { + uint32_t copy; + + copy = match < offset ? match : offset; + memcpy (pout, pout - offset, copy); + match -= copy; + pout += copy; + } + } + } + + if (unlikely (seq >= seq_count)) + { + /* Copy remaining literals. */ + if (literal_count > 0 && plit != pout) + { + if (unlikely ((size_t)(poutend - pout) + < literal_count)) + { + elf_uncompress_failed (); + return 0; + } + + if ((size_t)(plit - pout) < literal_count) + { + uint32_t move; + + move = plit - pout; + while (literal_count > move) + { + memcpy (pout, plit, move); + pout += move; + plit += move; + literal_count -= move; + } + } + + memcpy (pout, plit, literal_count); + } + + pout += literal_count; + + break; + } + } + + pin = pblockend; + } + break; + + case 3: + default: + elf_uncompress_failed (); + return 0; + } + } + + if (has_checksum) + { + if (unlikely (pin + 4 > pinend)) + { + elf_uncompress_failed (); + return 0; + } + + /* We don't currently verify the checksum. Currently running GNU ld with + --compress-debug-sections=zstd does not seem to generate a + checksum. */ + + pin += 4; + } + + if (pin != pinend) + { + elf_uncompress_failed (); + return 0; + } + + return 1; +} + +#define ZDEBUG_TABLE_SIZE \ + (ZLIB_TABLE_SIZE > ZSTD_TABLE_SIZE ? ZLIB_TABLE_SIZE : ZSTD_TABLE_SIZE) + +/* Uncompress the old compressed debug format, the one emitted by + --compress-debug-sections=zlib-gnu. The compressed data is in + COMPRESSED / COMPRESSED_SIZE, and the function writes to + *UNCOMPRESSED / *UNCOMPRESSED_SIZE. ZDEBUG_TABLE is work space to + hold Huffman tables. Returns 0 on error, 1 on successful + decompression or if something goes wrong. In general we try to + carry on, by returning 1, even if we can't decompress. */ + +static int +elf_uncompress_zdebug (struct backtrace_state *state, + const unsigned char *compressed, size_t compressed_size, + uint16_t *zdebug_table, + backtrace_error_callback error_callback, void *data, + unsigned char **uncompressed, size_t *uncompressed_size) +{ + size_t sz; + size_t i; + unsigned char *po; + + *uncompressed = NULL; + *uncompressed_size = 0; + + /* The format starts with the four bytes ZLIB, followed by the 8 + byte length of the uncompressed data in big-endian order, + followed by a zlib stream. 
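+
+     offset  0: the four bytes 'Z' 'L' 'I' 'B'
+     offset  4: uncompressed size, 8 bytes, big-endian
+     offset 12: the zlib stream itself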
*/ + + if (compressed_size < 12 || memcmp (compressed, "ZLIB", 4) != 0) + return 1; + + sz = 0; + for (i = 0; i < 8; i++) + sz = (sz << 8) | compressed[i + 4]; + + if (*uncompressed != NULL && *uncompressed_size >= sz) + po = *uncompressed; + else + { + po = (unsigned char *) backtrace_alloc (state, sz, error_callback, data); + if (po == NULL) + return 0; + } + + if (!elf_zlib_inflate_and_verify (compressed + 12, compressed_size - 12, + zdebug_table, po, sz)) + return 1; + + *uncompressed = po; + *uncompressed_size = sz; + + return 1; +} + +/* Uncompress the new compressed debug format, the official standard + ELF approach emitted by --compress-debug-sections=zlib-gabi. The + compressed data is in COMPRESSED / COMPRESSED_SIZE, and the + function writes to *UNCOMPRESSED / *UNCOMPRESSED_SIZE. + ZDEBUG_TABLE is work space as for elf_uncompress_zdebug. Returns 0 + on error, 1 on successful decompression or if something goes wrong. + In general we try to carry on, by returning 1, even if we can't + decompress. */ + +static int +elf_uncompress_chdr (struct backtrace_state *state, + const unsigned char *compressed, size_t compressed_size, + uint16_t *zdebug_table, + backtrace_error_callback error_callback, void *data, + unsigned char **uncompressed, size_t *uncompressed_size) +{ + b_elf_chdr chdr; + char *alc; + size_t alc_len; + unsigned char *po; + + *uncompressed = NULL; + *uncompressed_size = 0; + + /* The format starts with an ELF compression header. */ + if (compressed_size < sizeof (b_elf_chdr)) + return 1; + + /* The lld linker can misalign a compressed section, so we can't safely read + the fields directly as we can for other ELF sections. See + https://github.com/ianlancetaylor/libbacktrace/pull/120. */ + memcpy (&chdr, compressed, sizeof (b_elf_chdr)); + + alc = NULL; + alc_len = 0; + if (*uncompressed != NULL && *uncompressed_size >= chdr.ch_size) + po = *uncompressed; + else + { + alc_len = chdr.ch_size; + alc = (char*)backtrace_alloc (state, alc_len, error_callback, data); + if (alc == NULL) + return 0; + po = (unsigned char *) alc; + } + + switch (chdr.ch_type) + { + case ELFCOMPRESS_ZLIB: + if (!elf_zlib_inflate_and_verify (compressed + sizeof (b_elf_chdr), + compressed_size - sizeof (b_elf_chdr), + zdebug_table, po, chdr.ch_size)) + goto skip; + break; + + case ELFCOMPRESS_ZSTD: + if (!elf_zstd_decompress (compressed + sizeof (b_elf_chdr), + compressed_size - sizeof (b_elf_chdr), + (unsigned char *)zdebug_table, po, + chdr.ch_size)) + goto skip; + break; + + default: + /* Unsupported compression algorithm. */ + goto skip; + } + + *uncompressed = po; + *uncompressed_size = chdr.ch_size; + + return 1; + + skip: + if (alc != NULL && alc_len > 0) + backtrace_free (state, alc, alc_len, error_callback, data); + return 1; +} + +/* This function is a hook for testing the zlib support. It is only + used by tests. 
*/ + +int +backtrace_uncompress_zdebug (struct backtrace_state *state, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback error_callback, + void *data, unsigned char **uncompressed, + size_t *uncompressed_size) +{ + uint16_t *zdebug_table; + int ret; + + zdebug_table = ((uint16_t *) backtrace_alloc (state, ZDEBUG_TABLE_SIZE, + error_callback, data)); + if (zdebug_table == NULL) + return 0; + ret = elf_uncompress_zdebug (state, compressed, compressed_size, + zdebug_table, error_callback, data, + uncompressed, uncompressed_size); + backtrace_free (state, zdebug_table, ZDEBUG_TABLE_SIZE, + error_callback, data); + return ret; +} + +/* This function is a hook for testing the zstd support. It is only used by + tests. */ + +int +backtrace_uncompress_zstd (struct backtrace_state *state, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback error_callback, + void *data, unsigned char *uncompressed, + size_t uncompressed_size) +{ + unsigned char *zdebug_table; + int ret; + + zdebug_table = ((unsigned char *) backtrace_alloc (state, ZDEBUG_TABLE_SIZE, + error_callback, data)); + if (zdebug_table == NULL) + return 0; + ret = elf_zstd_decompress (compressed, compressed_size, + zdebug_table, uncompressed, uncompressed_size); + backtrace_free (state, zdebug_table, ZDEBUG_TABLE_SIZE, + error_callback, data); + return ret; +} + +/* Number of LZMA states. */ +#define LZMA_STATES (12) + +/* Number of LZMA position states. The pb value of the property byte + is the number of bits to include in these states, and the maximum + value of pb is 4. */ +#define LZMA_POS_STATES (16) + +/* Number of LZMA distance states. These are used match distances + with a short match length: up to 4 bytes. */ +#define LZMA_DIST_STATES (4) + +/* Number of LZMA distance slots. LZMA uses six bits to encode larger + match lengths, so 1 << 6 possible probabilities. */ +#define LZMA_DIST_SLOTS (64) + +/* LZMA distances 0 to 3 are encoded directly, larger values use a + probability model. */ +#define LZMA_DIST_MODEL_START (4) + +/* The LZMA probability model ends at 14. */ +#define LZMA_DIST_MODEL_END (14) + +/* LZMA distance slots for distances less than 127. */ +#define LZMA_FULL_DISTANCES (128) + +/* LZMA uses four alignment bits. */ +#define LZMA_ALIGN_SIZE (16) + +/* LZMA match length is encoded with 4, 5, or 10 bits, some of which + are already known. */ +#define LZMA_LEN_LOW_SYMBOLS (8) +#define LZMA_LEN_MID_SYMBOLS (8) +#define LZMA_LEN_HIGH_SYMBOLS (256) + +/* LZMA literal encoding. */ +#define LZMA_LITERAL_CODERS_MAX (16) +#define LZMA_LITERAL_CODER_SIZE (0x300) + +/* LZMA is based on a large set of probabilities, each managed + independently. Each probability is an 11 bit number that we store + in a uint16_t. We use a single large array of probabilities. */ + +/* Lengths of entries in the LZMA probabilities array. The names used + here are copied from the Linux kernel implementation. 
*/ + +#define LZMA_PROB_IS_MATCH_LEN (LZMA_STATES * LZMA_POS_STATES) +#define LZMA_PROB_IS_REP_LEN LZMA_STATES +#define LZMA_PROB_IS_REP0_LEN LZMA_STATES +#define LZMA_PROB_IS_REP1_LEN LZMA_STATES +#define LZMA_PROB_IS_REP2_LEN LZMA_STATES +#define LZMA_PROB_IS_REP0_LONG_LEN (LZMA_STATES * LZMA_POS_STATES) +#define LZMA_PROB_DIST_SLOT_LEN (LZMA_DIST_STATES * LZMA_DIST_SLOTS) +#define LZMA_PROB_DIST_SPECIAL_LEN (LZMA_FULL_DISTANCES - LZMA_DIST_MODEL_END) +#define LZMA_PROB_DIST_ALIGN_LEN LZMA_ALIGN_SIZE +#define LZMA_PROB_MATCH_LEN_CHOICE_LEN 1 +#define LZMA_PROB_MATCH_LEN_CHOICE2_LEN 1 +#define LZMA_PROB_MATCH_LEN_LOW_LEN (LZMA_POS_STATES * LZMA_LEN_LOW_SYMBOLS) +#define LZMA_PROB_MATCH_LEN_MID_LEN (LZMA_POS_STATES * LZMA_LEN_MID_SYMBOLS) +#define LZMA_PROB_MATCH_LEN_HIGH_LEN LZMA_LEN_HIGH_SYMBOLS +#define LZMA_PROB_REP_LEN_CHOICE_LEN 1 +#define LZMA_PROB_REP_LEN_CHOICE2_LEN 1 +#define LZMA_PROB_REP_LEN_LOW_LEN (LZMA_POS_STATES * LZMA_LEN_LOW_SYMBOLS) +#define LZMA_PROB_REP_LEN_MID_LEN (LZMA_POS_STATES * LZMA_LEN_MID_SYMBOLS) +#define LZMA_PROB_REP_LEN_HIGH_LEN LZMA_LEN_HIGH_SYMBOLS +#define LZMA_PROB_LITERAL_LEN \ + (LZMA_LITERAL_CODERS_MAX * LZMA_LITERAL_CODER_SIZE) + +/* Offsets into the LZMA probabilities array. This is mechanically + generated from the above lengths. */ + +#define LZMA_PROB_IS_MATCH_OFFSET 0 +#define LZMA_PROB_IS_REP_OFFSET \ + (LZMA_PROB_IS_MATCH_OFFSET + LZMA_PROB_IS_MATCH_LEN) +#define LZMA_PROB_IS_REP0_OFFSET \ + (LZMA_PROB_IS_REP_OFFSET + LZMA_PROB_IS_REP_LEN) +#define LZMA_PROB_IS_REP1_OFFSET \ + (LZMA_PROB_IS_REP0_OFFSET + LZMA_PROB_IS_REP0_LEN) +#define LZMA_PROB_IS_REP2_OFFSET \ + (LZMA_PROB_IS_REP1_OFFSET + LZMA_PROB_IS_REP1_LEN) +#define LZMA_PROB_IS_REP0_LONG_OFFSET \ + (LZMA_PROB_IS_REP2_OFFSET + LZMA_PROB_IS_REP2_LEN) +#define LZMA_PROB_DIST_SLOT_OFFSET \ + (LZMA_PROB_IS_REP0_LONG_OFFSET + LZMA_PROB_IS_REP0_LONG_LEN) +#define LZMA_PROB_DIST_SPECIAL_OFFSET \ + (LZMA_PROB_DIST_SLOT_OFFSET + LZMA_PROB_DIST_SLOT_LEN) +#define LZMA_PROB_DIST_ALIGN_OFFSET \ + (LZMA_PROB_DIST_SPECIAL_OFFSET + LZMA_PROB_DIST_SPECIAL_LEN) +#define LZMA_PROB_MATCH_LEN_CHOICE_OFFSET \ + (LZMA_PROB_DIST_ALIGN_OFFSET + LZMA_PROB_DIST_ALIGN_LEN) +#define LZMA_PROB_MATCH_LEN_CHOICE2_OFFSET \ + (LZMA_PROB_MATCH_LEN_CHOICE_OFFSET + LZMA_PROB_MATCH_LEN_CHOICE_LEN) +#define LZMA_PROB_MATCH_LEN_LOW_OFFSET \ + (LZMA_PROB_MATCH_LEN_CHOICE2_OFFSET + LZMA_PROB_MATCH_LEN_CHOICE2_LEN) +#define LZMA_PROB_MATCH_LEN_MID_OFFSET \ + (LZMA_PROB_MATCH_LEN_LOW_OFFSET + LZMA_PROB_MATCH_LEN_LOW_LEN) +#define LZMA_PROB_MATCH_LEN_HIGH_OFFSET \ + (LZMA_PROB_MATCH_LEN_MID_OFFSET + LZMA_PROB_MATCH_LEN_MID_LEN) +#define LZMA_PROB_REP_LEN_CHOICE_OFFSET \ + (LZMA_PROB_MATCH_LEN_HIGH_OFFSET + LZMA_PROB_MATCH_LEN_HIGH_LEN) +#define LZMA_PROB_REP_LEN_CHOICE2_OFFSET \ + (LZMA_PROB_REP_LEN_CHOICE_OFFSET + LZMA_PROB_REP_LEN_CHOICE_LEN) +#define LZMA_PROB_REP_LEN_LOW_OFFSET \ + (LZMA_PROB_REP_LEN_CHOICE2_OFFSET + LZMA_PROB_REP_LEN_CHOICE2_LEN) +#define LZMA_PROB_REP_LEN_MID_OFFSET \ + (LZMA_PROB_REP_LEN_LOW_OFFSET + LZMA_PROB_REP_LEN_LOW_LEN) +#define LZMA_PROB_REP_LEN_HIGH_OFFSET \ + (LZMA_PROB_REP_LEN_MID_OFFSET + LZMA_PROB_REP_LEN_MID_LEN) +#define LZMA_PROB_LITERAL_OFFSET \ + (LZMA_PROB_REP_LEN_HIGH_OFFSET + LZMA_PROB_REP_LEN_HIGH_LEN) + +#define LZMA_PROB_TOTAL_COUNT \ + (LZMA_PROB_LITERAL_OFFSET + LZMA_PROB_LITERAL_LEN) + +/* Check that the number of LZMA probabilities is the same as the + Linux kernel implementation. 
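+
+   (1846 is the sum of the non-literal lengths above, and (1 << 4) * 0x300
+   is LZMA_LITERAL_CODERS_MAX * LZMA_LITERAL_CODER_SIZE, the space used by
+   the literal coders.)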
*/ + +#if LZMA_PROB_TOTAL_COUNT != 1846 + (1 << 4) * 0x300 + #error Wrong number of LZMA probabilities +#endif + +/* Expressions for the offset in the LZMA probabilities array of a + specific probability. */ + +#define LZMA_IS_MATCH(state, pos) \ + (LZMA_PROB_IS_MATCH_OFFSET + (state) * LZMA_POS_STATES + (pos)) +#define LZMA_IS_REP(state) \ + (LZMA_PROB_IS_REP_OFFSET + (state)) +#define LZMA_IS_REP0(state) \ + (LZMA_PROB_IS_REP0_OFFSET + (state)) +#define LZMA_IS_REP1(state) \ + (LZMA_PROB_IS_REP1_OFFSET + (state)) +#define LZMA_IS_REP2(state) \ + (LZMA_PROB_IS_REP2_OFFSET + (state)) +#define LZMA_IS_REP0_LONG(state, pos) \ + (LZMA_PROB_IS_REP0_LONG_OFFSET + (state) * LZMA_POS_STATES + (pos)) +#define LZMA_DIST_SLOT(dist, slot) \ + (LZMA_PROB_DIST_SLOT_OFFSET + (dist) * LZMA_DIST_SLOTS + (slot)) +#define LZMA_DIST_SPECIAL(dist) \ + (LZMA_PROB_DIST_SPECIAL_OFFSET + (dist)) +#define LZMA_DIST_ALIGN(dist) \ + (LZMA_PROB_DIST_ALIGN_OFFSET + (dist)) +#define LZMA_MATCH_LEN_CHOICE \ + LZMA_PROB_MATCH_LEN_CHOICE_OFFSET +#define LZMA_MATCH_LEN_CHOICE2 \ + LZMA_PROB_MATCH_LEN_CHOICE2_OFFSET +#define LZMA_MATCH_LEN_LOW(pos, sym) \ + (LZMA_PROB_MATCH_LEN_LOW_OFFSET + (pos) * LZMA_LEN_LOW_SYMBOLS + (sym)) +#define LZMA_MATCH_LEN_MID(pos, sym) \ + (LZMA_PROB_MATCH_LEN_MID_OFFSET + (pos) * LZMA_LEN_MID_SYMBOLS + (sym)) +#define LZMA_MATCH_LEN_HIGH(sym) \ + (LZMA_PROB_MATCH_LEN_HIGH_OFFSET + (sym)) +#define LZMA_REP_LEN_CHOICE \ + LZMA_PROB_REP_LEN_CHOICE_OFFSET +#define LZMA_REP_LEN_CHOICE2 \ + LZMA_PROB_REP_LEN_CHOICE2_OFFSET +#define LZMA_REP_LEN_LOW(pos, sym) \ + (LZMA_PROB_REP_LEN_LOW_OFFSET + (pos) * LZMA_LEN_LOW_SYMBOLS + (sym)) +#define LZMA_REP_LEN_MID(pos, sym) \ + (LZMA_PROB_REP_LEN_MID_OFFSET + (pos) * LZMA_LEN_MID_SYMBOLS + (sym)) +#define LZMA_REP_LEN_HIGH(sym) \ + (LZMA_PROB_REP_LEN_HIGH_OFFSET + (sym)) +#define LZMA_LITERAL(code, size) \ + (LZMA_PROB_LITERAL_OFFSET + (code) * LZMA_LITERAL_CODER_SIZE + (size)) + +/* Read an LZMA varint from BUF, reading and updating *POFFSET, + setting *VAL. Returns 0 on error, 1 on success. */ + +static int +elf_lzma_varint (const unsigned char *compressed, size_t compressed_size, + size_t *poffset, uint64_t *val) +{ + size_t off; + int i; + uint64_t v; + unsigned char b; + + off = *poffset; + i = 0; + v = 0; + while (1) + { + if (unlikely (off >= compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + b = compressed[off]; + v |= (b & 0x7f) << (i * 7); + ++off; + if ((b & 0x80) == 0) + { + *poffset = off; + *val = v; + return 1; + } + ++i; + if (unlikely (i >= 9)) + { + elf_uncompress_failed (); + return 0; + } + } +} + +/* Normalize the LZMA range decoder, pulling in an extra input byte if + needed. */ + +static void +elf_lzma_range_normalize (const unsigned char *compressed, + size_t compressed_size, size_t *poffset, + uint32_t *prange, uint32_t *pcode) +{ + if (*prange < (1U << 24)) + { + if (unlikely (*poffset >= compressed_size)) + { + /* We assume this will be caught elsewhere. */ + elf_uncompress_failed (); + return; + } + *prange <<= 8; + *pcode <<= 8; + *pcode += compressed[*poffset]; + ++*poffset; + } +} + +/* Read and return a single bit from the LZMA stream, reading and + updating *PROB. Each bit comes from the range coder. 
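+
+   *PROB is an 11 bit estimate (out of 1 << 11) of the chance that the next
+   bit is 0.  The current range is split in that proportion: if CODE falls
+   in the low part the bit is 0 and *PROB moves up by 1/32 of its distance
+   from 1 << 11, otherwise the bit is 1 and *PROB moves down by 1/32 of its
+   value.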
*/ + +static int +elf_lzma_bit (const unsigned char *compressed, size_t compressed_size, + uint16_t *prob, size_t *poffset, uint32_t *prange, + uint32_t *pcode) +{ + uint32_t bound; + + elf_lzma_range_normalize (compressed, compressed_size, poffset, + prange, pcode); + bound = (*prange >> 11) * (uint32_t) *prob; + if (*pcode < bound) + { + *prange = bound; + *prob += ((1U << 11) - *prob) >> 5; + return 0; + } + else + { + *prange -= bound; + *pcode -= bound; + *prob -= *prob >> 5; + return 1; + } +} + +/* Read an integer of size BITS from the LZMA stream, most significant + bit first. The bits are predicted using PROBS. */ + +static uint32_t +elf_lzma_integer (const unsigned char *compressed, size_t compressed_size, + uint16_t *probs, uint32_t bits, size_t *poffset, + uint32_t *prange, uint32_t *pcode) +{ + uint32_t sym; + uint32_t i; + + sym = 1; + for (i = 0; i < bits; i++) + { + int bit; + + bit = elf_lzma_bit (compressed, compressed_size, probs + sym, poffset, + prange, pcode); + sym <<= 1; + sym += bit; + } + return sym - (1 << bits); +} + +/* Read an integer of size BITS from the LZMA stream, least + significant bit first. The bits are predicted using PROBS. */ + +static uint32_t +elf_lzma_reverse_integer (const unsigned char *compressed, + size_t compressed_size, uint16_t *probs, + uint32_t bits, size_t *poffset, uint32_t *prange, + uint32_t *pcode) +{ + uint32_t sym; + uint32_t val; + uint32_t i; + + sym = 1; + val = 0; + for (i = 0; i < bits; i++) + { + int bit; + + bit = elf_lzma_bit (compressed, compressed_size, probs + sym, poffset, + prange, pcode); + sym <<= 1; + sym += bit; + val += bit << i; + } + return val; +} + +/* Read a length from the LZMA stream. IS_REP picks either LZMA_MATCH + or LZMA_REP probabilities. */ + +static uint32_t +elf_lzma_len (const unsigned char *compressed, size_t compressed_size, + uint16_t *probs, int is_rep, unsigned int pos_state, + size_t *poffset, uint32_t *prange, uint32_t *pcode) +{ + uint16_t *probs_choice; + uint16_t *probs_sym; + uint32_t bits; + uint32_t len; + + probs_choice = probs + (is_rep + ? LZMA_REP_LEN_CHOICE + : LZMA_MATCH_LEN_CHOICE); + if (elf_lzma_bit (compressed, compressed_size, probs_choice, poffset, + prange, pcode)) + { + probs_choice = probs + (is_rep + ? LZMA_REP_LEN_CHOICE2 + : LZMA_MATCH_LEN_CHOICE2); + if (elf_lzma_bit (compressed, compressed_size, probs_choice, + poffset, prange, pcode)) + { + probs_sym = probs + (is_rep + ? LZMA_REP_LEN_HIGH (0) + : LZMA_MATCH_LEN_HIGH (0)); + bits = 8; + len = 2 + 8 + 8; + } + else + { + probs_sym = probs + (is_rep + ? LZMA_REP_LEN_MID (pos_state, 0) + : LZMA_MATCH_LEN_MID (pos_state, 0)); + bits = 3; + len = 2 + 8; + } + } + else + { + probs_sym = probs + (is_rep + ? LZMA_REP_LEN_LOW (pos_state, 0) + : LZMA_MATCH_LEN_LOW (pos_state, 0)); + bits = 3; + len = 2; + } + + len += elf_lzma_integer (compressed, compressed_size, probs_sym, bits, + poffset, prange, pcode); + return len; +} + +/* Uncompress one LZMA block from a minidebug file. The compressed + data is at COMPRESSED + *POFFSET. Update *POFFSET. Store the data + into the memory at UNCOMPRESSED, size UNCOMPRESSED_SIZE. CHECK is + the stream flag from the xz header. Return 1 on successful + decompression. 
*/ + +static int +elf_uncompress_lzma_block (const unsigned char *compressed, + size_t compressed_size, unsigned char check, + uint16_t *probs, unsigned char *uncompressed, + size_t uncompressed_size, size_t *poffset) +{ + size_t off; + size_t block_header_offset; + size_t block_header_size; + unsigned char block_flags; + uint64_t header_compressed_size; + uint64_t header_uncompressed_size; + unsigned char lzma2_properties; + size_t crc_offset; + uint32_t computed_crc; + uint32_t stream_crc; + size_t uncompressed_offset; + size_t dict_start_offset; + unsigned int lc; + unsigned int lp; + unsigned int pb; + uint32_t range; + uint32_t code; + uint32_t lstate; + uint32_t dist[4]; + + off = *poffset; + block_header_offset = off; + + /* Block header size is a single byte. */ + if (unlikely (off >= compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + block_header_size = (compressed[off] + 1) * 4; + if (unlikely (off + block_header_size > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + /* Block flags. */ + block_flags = compressed[off + 1]; + if (unlikely ((block_flags & 0x3c) != 0)) + { + elf_uncompress_failed (); + return 0; + } + + off += 2; + + /* Optional compressed size. */ + header_compressed_size = 0; + if ((block_flags & 0x40) != 0) + { + *poffset = off; + if (!elf_lzma_varint (compressed, compressed_size, poffset, + &header_compressed_size)) + return 0; + off = *poffset; + } + + /* Optional uncompressed size. */ + header_uncompressed_size = 0; + if ((block_flags & 0x80) != 0) + { + *poffset = off; + if (!elf_lzma_varint (compressed, compressed_size, poffset, + &header_uncompressed_size)) + return 0; + off = *poffset; + } + + /* The recipe for creating a minidebug file is to run the xz program + with no arguments, so we expect exactly one filter: lzma2. */ + + if (unlikely ((block_flags & 0x3) != 0)) + { + elf_uncompress_failed (); + return 0; + } + + if (unlikely (off + 2 >= block_header_offset + block_header_size)) + { + elf_uncompress_failed (); + return 0; + } + + /* The filter ID for LZMA2 is 0x21. */ + if (unlikely (compressed[off] != 0x21)) + { + elf_uncompress_failed (); + return 0; + } + ++off; + + /* The size of the filter properties for LZMA2 is 1. */ + if (unlikely (compressed[off] != 1)) + { + elf_uncompress_failed (); + return 0; + } + ++off; + + lzma2_properties = compressed[off]; + ++off; + + if (unlikely (lzma2_properties > 40)) + { + elf_uncompress_failed (); + return 0; + } + + /* The properties describe the dictionary size, but we don't care + what that is. */ + + /* Skip to just before CRC, verifying zero bytes in between. */ + crc_offset = block_header_offset + block_header_size - 4; + if (unlikely (crc_offset + 4 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + for (; off < crc_offset; off++) + { + if (compressed[off] != 0) + { + elf_uncompress_failed (); + return 0; + } + } + + /* Block header CRC. */ + computed_crc = elf_crc32 (0, compressed + block_header_offset, + block_header_size - 4); + stream_crc = ((uint32_t)compressed[off] + | ((uint32_t)compressed[off + 1] << 8) + | ((uint32_t)compressed[off + 2] << 16) + | ((uint32_t)compressed[off + 3] << 24)); + if (unlikely (computed_crc != stream_crc)) + { + elf_uncompress_failed (); + return 0; + } + off += 4; + + /* Read a sequence of LZMA2 packets. 
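+     A condensed view of the control bytes handled below:
+       0x00          end of packets
+       0x01, 0x02    uncompressed chunk (0x01 also resets the dictionary)
+       0x80..0xff    LZMA chunk; bits 0-4 hold the high bits of the
+                     unpacked size, bits 5-6 select how much state to
+                     reset (0xe0 and above also resets the dictionary)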
*/ + + uncompressed_offset = 0; + dict_start_offset = 0; + lc = 0; + lp = 0; + pb = 0; + lstate = 0; + while (off < compressed_size) + { + unsigned char control; + + range = 0xffffffff; + code = 0; + + control = compressed[off]; + ++off; + if (unlikely (control == 0)) + { + /* End of packets. */ + break; + } + + if (control == 1 || control >= 0xe0) + { + /* Reset dictionary to empty. */ + dict_start_offset = uncompressed_offset; + } + + if (control < 0x80) + { + size_t chunk_size; + + /* The only valid values here are 1 or 2. A 1 means to + reset the dictionary (done above). Then we see an + uncompressed chunk. */ + + if (unlikely (control > 2)) + { + elf_uncompress_failed (); + return 0; + } + + /* An uncompressed chunk is a two byte size followed by + data. */ + + if (unlikely (off + 2 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + chunk_size = compressed[off] << 8; + chunk_size += compressed[off + 1]; + ++chunk_size; + + off += 2; + + if (unlikely (off + chunk_size > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + if (unlikely (uncompressed_offset + chunk_size > uncompressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + memcpy (uncompressed + uncompressed_offset, compressed + off, + chunk_size); + uncompressed_offset += chunk_size; + off += chunk_size; + } + else + { + size_t uncompressed_chunk_start; + size_t uncompressed_chunk_size; + size_t compressed_chunk_size; + size_t limit; + + /* An LZMA chunk. This starts with an uncompressed size and + a compressed size. */ + + if (unlikely (off + 4 >= compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + uncompressed_chunk_start = uncompressed_offset; + + uncompressed_chunk_size = (control & 0x1f) << 16; + uncompressed_chunk_size += compressed[off] << 8; + uncompressed_chunk_size += compressed[off + 1]; + ++uncompressed_chunk_size; + + compressed_chunk_size = compressed[off + 2] << 8; + compressed_chunk_size += compressed[off + 3]; + ++compressed_chunk_size; + + off += 4; + + /* Bit 7 (0x80) is set. + Bits 6 and 5 (0x40 and 0x20) are as follows: + 0: don't reset anything + 1: reset state + 2: reset state, read properties + 3: reset state, read properties, reset dictionary (done above) */ + + if (control >= 0xc0) + { + unsigned char props; + + /* Bit 6 is set, read properties. */ + + if (unlikely (off >= compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + props = compressed[off]; + ++off; + if (unlikely (props > (4 * 5 + 4) * 9 + 8)) + { + elf_uncompress_failed (); + return 0; + } + pb = 0; + while (props >= 9 * 5) + { + props -= 9 * 5; + ++pb; + } + lp = 0; + while (props > 9) + { + props -= 9; + ++lp; + } + lc = props; + if (unlikely (lc + lp > 4)) + { + elf_uncompress_failed (); + return 0; + } + } + + if (control >= 0xa0) + { + size_t i; + + /* Bit 5 or 6 is set, reset LZMA state. */ + + lstate = 0; + memset (&dist, 0, sizeof dist); + for (i = 0; i < LZMA_PROB_TOTAL_COUNT; i++) + probs[i] = 1 << 10; + range = 0xffffffff; + code = 0; + } + + /* Read the range code. */ + + if (unlikely (off + 5 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + /* The byte at compressed[off] is ignored for some + reason. */ + + code = ((compressed[off + 1] << 24) + + (compressed[off + 2] << 16) + + (compressed[off + 3] << 8) + + compressed[off + 4]); + off += 5; + + /* This is the main LZMA decode loop. 
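+             Each pass decodes one symbol:
+               is_match = 0              -> a single literal byte
+               is_match = 1, is_rep = 0  -> match with a newly coded distance
+               is_match = 1, is_rep = 1  -> match reusing one of the last
+                                            four distances (possibly length 1)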
*/ + + limit = off + compressed_chunk_size; + *poffset = off; + while (*poffset < limit) + { + unsigned int pos_state; + + if (unlikely (uncompressed_offset + == (uncompressed_chunk_start + + uncompressed_chunk_size))) + { + /* We've decompressed all the expected bytes. */ + break; + } + + pos_state = ((uncompressed_offset - dict_start_offset) + & ((1 << pb) - 1)); + + if (elf_lzma_bit (compressed, compressed_size, + probs + LZMA_IS_MATCH (lstate, pos_state), + poffset, &range, &code)) + { + uint32_t len; + + if (elf_lzma_bit (compressed, compressed_size, + probs + LZMA_IS_REP (lstate), + poffset, &range, &code)) + { + int short_rep; + uint32_t next_dist; + + /* Repeated match. */ + + short_rep = 0; + if (elf_lzma_bit (compressed, compressed_size, + probs + LZMA_IS_REP0 (lstate), + poffset, &range, &code)) + { + if (elf_lzma_bit (compressed, compressed_size, + probs + LZMA_IS_REP1 (lstate), + poffset, &range, &code)) + { + if (elf_lzma_bit (compressed, compressed_size, + probs + LZMA_IS_REP2 (lstate), + poffset, &range, &code)) + { + next_dist = dist[3]; + dist[3] = dist[2]; + } + else + { + next_dist = dist[2]; + } + dist[2] = dist[1]; + } + else + { + next_dist = dist[1]; + } + + dist[1] = dist[0]; + dist[0] = next_dist; + } + else + { + if (!elf_lzma_bit (compressed, compressed_size, + (probs + + LZMA_IS_REP0_LONG (lstate, + pos_state)), + poffset, &range, &code)) + short_rep = 1; + } + + if (lstate < 7) + lstate = short_rep ? 9 : 8; + else + lstate = 11; + + if (short_rep) + len = 1; + else + len = elf_lzma_len (compressed, compressed_size, + probs, 1, pos_state, poffset, + &range, &code); + } + else + { + uint32_t dist_state; + uint32_t dist_slot; + uint16_t *probs_dist; + + /* Match. */ + + if (lstate < 7) + lstate = 7; + else + lstate = 10; + dist[3] = dist[2]; + dist[2] = dist[1]; + dist[1] = dist[0]; + len = elf_lzma_len (compressed, compressed_size, + probs, 0, pos_state, poffset, + &range, &code); + + if (len < 4 + 2) + dist_state = len - 2; + else + dist_state = 3; + probs_dist = probs + LZMA_DIST_SLOT (dist_state, 0); + dist_slot = elf_lzma_integer (compressed, + compressed_size, + probs_dist, 6, + poffset, &range, + &code); + if (dist_slot < LZMA_DIST_MODEL_START) + dist[0] = dist_slot; + else + { + uint32_t limit; + + limit = (dist_slot >> 1) - 1; + dist[0] = 2 + (dist_slot & 1); + if (dist_slot < LZMA_DIST_MODEL_END) + { + dist[0] <<= limit; + probs_dist = (probs + + LZMA_DIST_SPECIAL(dist[0] + - dist_slot + - 1)); + dist[0] += + elf_lzma_reverse_integer (compressed, + compressed_size, + probs_dist, + limit, poffset, + &range, &code); + } + else + { + uint32_t dist0; + uint32_t i; + + dist0 = dist[0]; + for (i = 0; i < limit - 4; i++) + { + uint32_t mask; + + elf_lzma_range_normalize (compressed, + compressed_size, + poffset, + &range, &code); + range >>= 1; + code -= range; + mask = -(code >> 31); + code += range & mask; + dist0 <<= 1; + dist0 += mask + 1; + } + dist0 <<= 4; + probs_dist = probs + LZMA_DIST_ALIGN (0); + dist0 += + elf_lzma_reverse_integer (compressed, + compressed_size, + probs_dist, 4, + poffset, + &range, &code); + dist[0] = dist0; + } + } + } + + if (unlikely (uncompressed_offset + - dict_start_offset < dist[0] + 1)) + { + elf_uncompress_failed (); + return 0; + } + if (unlikely (uncompressed_offset + len > uncompressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + if (dist[0] == 0) + { + /* A common case, meaning repeat the last + character LEN times. 
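+                     (dist[0] holds the match distance minus one, so zero
+                      means a distance of one byte: the match is a run of
+                      the previous byte and a memset is sufficient)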
*/ + memset (uncompressed + uncompressed_offset, + uncompressed[uncompressed_offset - 1], + len); + uncompressed_offset += len; + } + else if (dist[0] + 1 >= len) + { + memcpy (uncompressed + uncompressed_offset, + uncompressed + uncompressed_offset - dist[0] - 1, + len); + uncompressed_offset += len; + } + else + { + while (len > 0) + { + uint32_t copy; + + copy = len < dist[0] + 1 ? len : dist[0] + 1; + memcpy (uncompressed + uncompressed_offset, + (uncompressed + uncompressed_offset + - dist[0] - 1), + copy); + len -= copy; + uncompressed_offset += copy; + } + } + } + else + { + unsigned char prev; + unsigned char low; + size_t high; + uint16_t *lit_probs; + unsigned int sym; + + /* Literal value. */ + + if (uncompressed_offset > 0) + prev = uncompressed[uncompressed_offset - 1]; + else + prev = 0; + low = prev >> (8 - lc); + high = (((uncompressed_offset - dict_start_offset) + & ((1 << lp) - 1)) + << lc); + lit_probs = probs + LZMA_LITERAL (low + high, 0); + if (lstate < 7) + sym = elf_lzma_integer (compressed, compressed_size, + lit_probs, 8, poffset, &range, + &code); + else + { + unsigned int match; + unsigned int bit; + unsigned int match_bit; + unsigned int idx; + + sym = 1; + if (uncompressed_offset >= dist[0] + 1) + match = uncompressed[uncompressed_offset - dist[0] - 1]; + else + match = 0; + match <<= 1; + bit = 0x100; + do + { + match_bit = match & bit; + match <<= 1; + idx = bit + match_bit + sym; + sym <<= 1; + if (elf_lzma_bit (compressed, compressed_size, + lit_probs + idx, poffset, + &range, &code)) + { + ++sym; + bit &= match_bit; + } + else + { + bit &= ~ match_bit; + } + } + while (sym < 0x100); + } + + if (unlikely (uncompressed_offset >= uncompressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + uncompressed[uncompressed_offset] = (unsigned char) sym; + ++uncompressed_offset; + if (lstate <= 3) + lstate = 0; + else if (lstate <= 9) + lstate -= 3; + else + lstate -= 6; + } + } + + elf_lzma_range_normalize (compressed, compressed_size, poffset, + &range, &code); + + off = *poffset; + } + } + + /* We have reached the end of the block. Pad to four byte + boundary. */ + off = (off + 3) &~ (size_t) 3; + if (unlikely (off > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + + switch (check) + { + case 0: + /* No check. */ + break; + + case 1: + /* CRC32 */ + if (unlikely (off + 4 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + computed_crc = elf_crc32 (0, uncompressed, uncompressed_offset); + stream_crc = ((uint32_t)compressed[off] + | ((uint32_t)compressed[off + 1] << 8) + | ((uint32_t)compressed[off + 2] << 16) + | ((uint32_t)compressed[off + 3] << 24)); + if (computed_crc != stream_crc) + { + elf_uncompress_failed (); + return 0; + } + off += 4; + break; + + case 4: + /* CRC64. We don't bother computing a CRC64 checksum. */ + if (unlikely (off + 8 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + off += 8; + break; + + case 10: + /* SHA. We don't bother computing a SHA checksum. */ + if (unlikely (off + 32 > compressed_size)) + { + elf_uncompress_failed (); + return 0; + } + off += 32; + break; + + default: + elf_uncompress_failed (); + return 0; + } + + *poffset = off; + + return 1; +} + +/* Uncompress LZMA data found in a minidebug file. The minidebug + format is described at + https://sourceware.org/gdb/current/onlinedocs/gdb/MiniDebugInfo.html. + Returns 0 on error, 1 on successful decompression. 
For this + function we return 0 on failure to decompress, as the calling code + will carry on in that case. */ + +static int +elf_uncompress_lzma (struct backtrace_state *state, + const unsigned char *compressed, size_t compressed_size, + backtrace_error_callback error_callback, void *data, + unsigned char **uncompressed, size_t *uncompressed_size) +{ + size_t header_size; + size_t footer_size; + unsigned char check; + uint32_t computed_crc; + uint32_t stream_crc; + size_t offset; + size_t index_size; + size_t footer_offset; + size_t index_offset; + uint64_t index_compressed_size; + uint64_t index_uncompressed_size; + unsigned char *mem; + uint16_t *probs; + size_t compressed_block_size; + + /* The format starts with a stream header and ends with a stream + footer. */ + header_size = 12; + footer_size = 12; + if (unlikely (compressed_size < header_size + footer_size)) + { + elf_uncompress_failed (); + return 0; + } + + /* The stream header starts with a magic string. */ + if (unlikely (memcmp (compressed, "\375" "7zXZ\0", 6) != 0)) + { + elf_uncompress_failed (); + return 0; + } + + /* Next come stream flags. The first byte is zero, the second byte + is the check. */ + if (unlikely (compressed[6] != 0)) + { + elf_uncompress_failed (); + return 0; + } + check = compressed[7]; + if (unlikely ((check & 0xf8) != 0)) + { + elf_uncompress_failed (); + return 0; + } + + /* Next comes a CRC of the stream flags. */ + computed_crc = elf_crc32 (0, compressed + 6, 2); + stream_crc = ((uint32_t)compressed[8] + | ((uint32_t)compressed[9] << 8) + | ((uint32_t)compressed[10] << 16) + | ((uint32_t)compressed[11] << 24)); + if (unlikely (computed_crc != stream_crc)) + { + elf_uncompress_failed (); + return 0; + } + + /* Now that we've parsed the header, parse the footer, so that we + can get the uncompressed size. */ + + /* The footer ends with two magic bytes. */ + + offset = compressed_size; + if (unlikely (memcmp (compressed + offset - 2, "YZ", 2) != 0)) + { + elf_uncompress_failed (); + return 0; + } + offset -= 2; + + /* Before that are the stream flags, which should be the same as the + flags in the header. */ + if (unlikely (compressed[offset - 2] != 0 + || compressed[offset - 1] != check)) + { + elf_uncompress_failed (); + return 0; + } + offset -= 2; + + /* Before that is the size of the index field, which precedes the + footer. */ + index_size = (compressed[offset - 4] + | (compressed[offset - 3] << 8) + | (compressed[offset - 2] << 16) + | (compressed[offset - 1] << 24)); + index_size = (index_size + 1) * 4; + offset -= 4; + + /* Before that is a footer CRC. */ + computed_crc = elf_crc32 (0, compressed + offset, 6); + stream_crc = ((uint32_t)compressed[offset - 4] + | ((uint32_t)compressed[offset - 3] << 8) + | ((uint32_t)compressed[offset - 2] << 16) + | ((uint32_t)compressed[offset - 1] << 24)); + if (unlikely (computed_crc != stream_crc)) + { + elf_uncompress_failed (); + return 0; + } + offset -= 4; + + /* The index comes just before the footer. */ + if (unlikely (offset < index_size + header_size)) + { + elf_uncompress_failed (); + return 0; + } + + footer_offset = offset; + offset -= index_size; + index_offset = offset; + + /* The index starts with a zero byte. */ + if (unlikely (compressed[offset] != 0)) + { + elf_uncompress_failed (); + return 0; + } + ++offset; + + /* Next is the number of blocks. We expect zero blocks for an empty + stream, and otherwise a single block. 
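+     (The record count is stored as a varint, but both 0 and 1 fit in a
+     single byte with the continuation bit clear, so a plain byte
+     comparison is enough here.)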
*/ + if (unlikely (compressed[offset] == 0)) + { + *uncompressed = NULL; + *uncompressed_size = 0; + return 1; + } + if (unlikely (compressed[offset] != 1)) + { + elf_uncompress_failed (); + return 0; + } + ++offset; + + /* Next is the compressed size and the uncompressed size. */ + if (!elf_lzma_varint (compressed, compressed_size, &offset, + &index_compressed_size)) + return 0; + if (!elf_lzma_varint (compressed, compressed_size, &offset, + &index_uncompressed_size)) + return 0; + + /* Pad to a four byte boundary. */ + offset = (offset + 3) &~ (size_t) 3; + + /* Next is a CRC of the index. */ + computed_crc = elf_crc32 (0, compressed + index_offset, + offset - index_offset); + stream_crc = ((uint32_t)compressed[offset] + | ((uint32_t)compressed[offset + 1] << 8) + | ((uint32_t)compressed[offset + 2] << 16) + | ((uint32_t)compressed[offset + 3] << 24)); + if (unlikely (computed_crc != stream_crc)) + { + elf_uncompress_failed (); + return 0; + } + offset += 4; + + /* We should now be back at the footer. */ + if (unlikely (offset != footer_offset)) + { + elf_uncompress_failed (); + return 0; + } + + /* Allocate space to hold the uncompressed data. If we succeed in + uncompressing the LZMA data, we never free this memory. */ + mem = (unsigned char *) backtrace_alloc (state, index_uncompressed_size, + error_callback, data); + if (unlikely (mem == NULL)) + return 0; + *uncompressed = mem; + *uncompressed_size = index_uncompressed_size; + + /* Allocate space for probabilities. */ + probs = ((uint16_t *) + backtrace_alloc (state, + LZMA_PROB_TOTAL_COUNT * sizeof (uint16_t), + error_callback, data)); + if (unlikely (probs == NULL)) + { + backtrace_free (state, mem, index_uncompressed_size, error_callback, + data); + return 0; + } + + /* Uncompress the block, which follows the header. */ + offset = 12; + if (!elf_uncompress_lzma_block (compressed, compressed_size, check, probs, + mem, index_uncompressed_size, &offset)) + { + backtrace_free (state, mem, index_uncompressed_size, error_callback, + data); + return 0; + } + + compressed_block_size = offset - 12; + if (unlikely (compressed_block_size + != ((index_compressed_size + 3) &~ (size_t) 3))) + { + elf_uncompress_failed (); + backtrace_free (state, mem, index_uncompressed_size, error_callback, + data); + return 0; + } + + offset = (offset + 3) &~ (size_t) 3; + if (unlikely (offset != index_offset)) + { + elf_uncompress_failed (); + backtrace_free (state, mem, index_uncompressed_size, error_callback, + data); + return 0; + } + + return 1; +} + +/* This function is a hook for testing the LZMA support. It is only + used by tests. */ + +int +backtrace_uncompress_lzma (struct backtrace_state *state, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback error_callback, + void *data, unsigned char **uncompressed, + size_t *uncompressed_size) +{ + return elf_uncompress_lzma (state, compressed, compressed_size, + error_callback, data, uncompressed, + uncompressed_size); +} + +/* Add the backtrace data for one ELF file. Returns 1 on success, + 0 on failure (in both cases descriptor is closed) or -1 if exe + is non-zero and the ELF file is ET_DYN, which tells the caller that + elf_add will need to be called on the descriptor again after + base_address is determined. 
*/ + +static int +elf_add (struct backtrace_state *state, const char *filename, int descriptor, + const unsigned char *memory, size_t memory_size, + struct libbacktrace_base_address base_address, + struct elf_ppc64_opd_data *caller_opd, + backtrace_error_callback error_callback, void *data, + fileline *fileline_fn, int *found_sym, int *found_dwarf, + struct dwarf_data **fileline_entry, int exe, int debuginfo, + const char *with_buildid_data, uint32_t with_buildid_size) +{ + struct elf_view ehdr_view; + b_elf_ehdr ehdr; + off_t shoff; + unsigned int shnum; + unsigned int shstrndx; + struct elf_view shdrs_view; + int shdrs_view_valid; + const b_elf_shdr *shdrs; + const b_elf_shdr *shstrhdr; + size_t shstr_size; + off_t shstr_off; + struct elf_view names_view; + int names_view_valid; + const char *names; + unsigned int symtab_shndx; + unsigned int dynsym_shndx; + unsigned int i; + struct debug_section_info sections[DEBUG_MAX]; + struct debug_section_info zsections[DEBUG_MAX]; + struct elf_view symtab_view; + int symtab_view_valid; + struct elf_view strtab_view; + int strtab_view_valid; + struct elf_view buildid_view; + int buildid_view_valid; + const char *buildid_data; + uint32_t buildid_size; + struct elf_view debuglink_view; + int debuglink_view_valid; + const char *debuglink_name; + uint32_t debuglink_crc; + struct elf_view debugaltlink_view; + int debugaltlink_view_valid; + const char *debugaltlink_name; + const char *debugaltlink_buildid_data; + uint32_t debugaltlink_buildid_size; + struct elf_view gnu_debugdata_view; + int gnu_debugdata_view_valid; + size_t gnu_debugdata_size; + unsigned char *gnu_debugdata_uncompressed; + size_t gnu_debugdata_uncompressed_size; + off_t min_offset; + off_t max_offset; + off_t debug_size; + struct elf_view debug_view; + int debug_view_valid; + unsigned int using_debug_view; + uint16_t *zdebug_table; + struct elf_view split_debug_view[DEBUG_MAX]; + unsigned char split_debug_view_valid[DEBUG_MAX]; + struct elf_ppc64_opd_data opd_data, *opd; + int opd_view_valid; + struct dwarf_sections dwarf_sections; + struct dwarf_data *fileline_altlink = NULL; + + if (!debuginfo) + { + *found_sym = 0; + *found_dwarf = 0; + } + + shdrs_view_valid = 0; + names_view_valid = 0; + symtab_view_valid = 0; + strtab_view_valid = 0; + buildid_view_valid = 0; + buildid_data = NULL; + buildid_size = 0; + debuglink_view_valid = 0; + debuglink_name = NULL; + debuglink_crc = 0; + debugaltlink_view_valid = 0; + debugaltlink_name = NULL; + debugaltlink_buildid_data = NULL; + debugaltlink_buildid_size = 0; + gnu_debugdata_view_valid = 0; + gnu_debugdata_size = 0; + debug_view_valid = 0; + memset (&split_debug_view_valid[0], 0, sizeof split_debug_view_valid); + opd = NULL; + opd_view_valid = 0; + + if (!elf_get_view (state, descriptor, memory, memory_size, 0, sizeof ehdr, + error_callback, data, &ehdr_view)) + goto fail; + + memcpy (&ehdr, ehdr_view.view.data, sizeof ehdr); + + elf_release_view (state, &ehdr_view, error_callback, data); + + if (ehdr.e_ident[EI_MAG0] != ELFMAG0 + || ehdr.e_ident[EI_MAG1] != ELFMAG1 + || ehdr.e_ident[EI_MAG2] != ELFMAG2 + || ehdr.e_ident[EI_MAG3] != ELFMAG3) + { + error_callback (data, "executable file is not ELF", 0); + goto fail; + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) + { + error_callback (data, "executable file is unrecognized ELF version", 0); + goto fail; + } + +#if BACKTRACE_ELF_SIZE == 32 +#define BACKTRACE_ELFCLASS ELFCLASS32 +#else +#define BACKTRACE_ELFCLASS ELFCLASS64 +#endif + + if (ehdr.e_ident[EI_CLASS] != BACKTRACE_ELFCLASS) + { + 
error_callback (data, "executable file is unexpected ELF class", 0); + goto fail; + } + + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB + && ehdr.e_ident[EI_DATA] != ELFDATA2MSB) + { + error_callback (data, "executable file has unknown endianness", 0); + goto fail; + } + + /* If the executable is ET_DYN, it is either a PIE, or we are running + directly a shared library with .interp. We need to wait for + dl_iterate_phdr in that case to determine the actual base_address. */ + if (exe && ehdr.e_type == ET_DYN) + return -1; + + shoff = ehdr.e_shoff; + shnum = ehdr.e_shnum; + shstrndx = ehdr.e_shstrndx; + + if ((shnum == 0 || shstrndx == SHN_XINDEX) + && shoff != 0) + { + struct elf_view shdr_view; + const b_elf_shdr *shdr; + + if (!elf_get_view (state, descriptor, memory, memory_size, shoff, + sizeof shdr, error_callback, data, &shdr_view)) + goto fail; + + shdr = (const b_elf_shdr *) shdr_view.view.data; + + if (shnum == 0) + shnum = shdr->sh_size; + + if (shstrndx == SHN_XINDEX) + { + shstrndx = shdr->sh_link; + + /* Versions of the GNU binutils between 2.12 and 2.18 did + not handle objects with more than SHN_LORESERVE sections + correctly. All large section indexes were offset by + 0x100. There is more information at + http://sourceware.org/bugzilla/show_bug.cgi?id-5900 . + Fortunately these object files are easy to detect, as the + GNU binutils always put the section header string table + near the end of the list of sections. Thus if the + section header string table index is larger than the + number of sections, then we know we have to subtract + 0x100 to get the real section index. */ + if (shstrndx >= shnum && shstrndx >= SHN_LORESERVE + 0x100) + shstrndx -= 0x100; + } + + elf_release_view (state, &shdr_view, error_callback, data); + } + + if (shnum == 0 || shstrndx == 0) + goto fail; + + /* To translate PC to file/line when using DWARF, we need to find + the .debug_info and .debug_line sections. */ + + /* Read the section headers, skipping the first one. */ + + if (!elf_get_view (state, descriptor, memory, memory_size, + shoff + sizeof (b_elf_shdr), + (shnum - 1) * sizeof (b_elf_shdr), + error_callback, data, &shdrs_view)) + goto fail; + shdrs_view_valid = 1; + shdrs = (const b_elf_shdr *) shdrs_view.view.data; + + /* Read the section names. */ + + shstrhdr = &shdrs[shstrndx - 1]; + shstr_size = shstrhdr->sh_size; + shstr_off = shstrhdr->sh_offset; + + if (!elf_get_view (state, descriptor, memory, memory_size, shstr_off, + shstrhdr->sh_size, error_callback, data, &names_view)) + goto fail; + names_view_valid = 1; + names = (const char *) names_view.view.data; + + symtab_shndx = 0; + dynsym_shndx = 0; + + memset (sections, 0, sizeof sections); + memset (zsections, 0, sizeof zsections); + + /* Look for the symbol table. */ + for (i = 1; i < shnum; ++i) + { + const b_elf_shdr *shdr; + unsigned int sh_name; + const char *name; + int j; + + shdr = &shdrs[i - 1]; + + if (shdr->sh_type == SHT_SYMTAB) + symtab_shndx = i; + else if (shdr->sh_type == SHT_DYNSYM) + dynsym_shndx = i; + + sh_name = shdr->sh_name; + if (sh_name >= shstr_size) + { + error_callback (data, "ELF section name out of range", 0); + goto fail; + } + + name = names + sh_name; + + for (j = 0; j < (int) DEBUG_MAX; ++j) + { + if (strcmp (name, dwarf_section_names[j]) == 0) + { + sections[j].offset = shdr->sh_offset; + sections[j].size = shdr->sh_size; + sections[j].compressed = (shdr->sh_flags & SHF_COMPRESSED) != 0; + break; + } + } + + if (name[0] == '.' 
&& name[1] == 'z') + { + for (j = 0; j < (int) DEBUG_MAX; ++j) + { + if (strcmp (name + 2, dwarf_section_names[j] + 1) == 0) + { + zsections[j].offset = shdr->sh_offset; + zsections[j].size = shdr->sh_size; + break; + } + } + } + + /* Read the build ID if present. This could check for any + SHT_NOTE section with the right note name and type, but gdb + looks for a specific section name. */ + if ((!debuginfo || with_buildid_data != NULL) + && !buildid_view_valid + && strcmp (name, ".note.gnu.build-id") == 0) + { + const b_elf_note *note; + + if (!elf_get_view (state, descriptor, memory, memory_size, + shdr->sh_offset, shdr->sh_size, error_callback, + data, &buildid_view)) + goto fail; + + buildid_view_valid = 1; + note = (const b_elf_note *) buildid_view.view.data; + if (note->type == NT_GNU_BUILD_ID + && note->namesz == 4 + && strncmp (note->name, "GNU", 4) == 0 + && shdr->sh_size <= 12 + ((note->namesz + 3) & ~ 3) + note->descsz) + { + buildid_data = &note->name[0] + ((note->namesz + 3) & ~ 3); + buildid_size = note->descsz; + } + + if (with_buildid_size != 0) + { + if (buildid_size != with_buildid_size) + goto fail; + + if (memcmp (buildid_data, with_buildid_data, buildid_size) != 0) + goto fail; + } + } + + /* Read the debuglink file if present. */ + if (!debuginfo + && !debuglink_view_valid + && strcmp (name, ".gnu_debuglink") == 0) + { + const char *debuglink_data; + size_t crc_offset; + + if (!elf_get_view (state, descriptor, memory, memory_size, + shdr->sh_offset, shdr->sh_size, error_callback, + data, &debuglink_view)) + goto fail; + + debuglink_view_valid = 1; + debuglink_data = (const char *) debuglink_view.view.data; + crc_offset = strnlen (debuglink_data, shdr->sh_size); + crc_offset = (crc_offset + 3) & ~3; + if (crc_offset + 4 <= shdr->sh_size) + { + debuglink_name = debuglink_data; + debuglink_crc = *(const uint32_t*)(debuglink_data + crc_offset); + } + } + + if (!debugaltlink_view_valid + && strcmp (name, ".gnu_debugaltlink") == 0) + { + const char *debugaltlink_data; + size_t debugaltlink_name_len; + + if (!elf_get_view (state, descriptor, memory, memory_size, + shdr->sh_offset, shdr->sh_size, error_callback, + data, &debugaltlink_view)) + goto fail; + + debugaltlink_view_valid = 1; + debugaltlink_data = (const char *) debugaltlink_view.view.data; + debugaltlink_name = debugaltlink_data; + debugaltlink_name_len = strnlen (debugaltlink_data, shdr->sh_size); + if (debugaltlink_name_len < shdr->sh_size) + { + /* Include terminating zero. */ + debugaltlink_name_len += 1; + + debugaltlink_buildid_data + = debugaltlink_data + debugaltlink_name_len; + debugaltlink_buildid_size = shdr->sh_size - debugaltlink_name_len; + } + } + + if (!debuginfo + && !gnu_debugdata_view_valid + && strcmp (name, ".gnu_debugdata") == 0) + { + if (!elf_get_view (state, descriptor, memory, memory_size, + shdr->sh_offset, shdr->sh_size, error_callback, + data, &gnu_debugdata_view)) + goto fail; + + gnu_debugdata_size = shdr->sh_size; + gnu_debugdata_view_valid = 1; + } + + /* Read the .opd section on PowerPC64 ELFv1. 
*/ + if (ehdr.e_machine == EM_PPC64 + && (ehdr.e_flags & EF_PPC64_ABI) < 2 + && shdr->sh_type == SHT_PROGBITS + && strcmp (name, ".opd") == 0) + { + if (!elf_get_view (state, descriptor, memory, memory_size, + shdr->sh_offset, shdr->sh_size, error_callback, + data, &opd_data.view)) + goto fail; + + opd = &opd_data; + opd->addr = shdr->sh_addr; + opd->data = (const char *) opd_data.view.view.data; + opd->size = shdr->sh_size; + opd_view_valid = 1; + } + } + + /* A debuginfo file may not have a useful .opd section, but we can use the + one from the original executable. */ + if (opd == NULL) + opd = caller_opd; + + if (symtab_shndx == 0) + symtab_shndx = dynsym_shndx; + if (symtab_shndx != 0) + { + const b_elf_shdr *symtab_shdr; + unsigned int strtab_shndx; + const b_elf_shdr *strtab_shdr; + struct elf_syminfo_data *sdata; + + symtab_shdr = &shdrs[symtab_shndx - 1]; + strtab_shndx = symtab_shdr->sh_link; + if (strtab_shndx >= shnum) + { + error_callback (data, + "ELF symbol table strtab link out of range", 0); + goto fail; + } + strtab_shdr = &shdrs[strtab_shndx - 1]; + + if (!elf_get_view (state, descriptor, memory, memory_size, + symtab_shdr->sh_offset, symtab_shdr->sh_size, + error_callback, data, &symtab_view)) + goto fail; + symtab_view_valid = 1; + + if (!elf_get_view (state, descriptor, memory, memory_size, + strtab_shdr->sh_offset, strtab_shdr->sh_size, + error_callback, data, &strtab_view)) + goto fail; + strtab_view_valid = 1; + + sdata = ((struct elf_syminfo_data *) + backtrace_alloc (state, sizeof *sdata, error_callback, data)); + if (sdata == NULL) + goto fail; + + if (!elf_initialize_syminfo (state, base_address, + (const unsigned char*)symtab_view.view.data, symtab_shdr->sh_size, + (const unsigned char*)strtab_view.view.data, strtab_shdr->sh_size, + error_callback, data, sdata, opd)) + { + backtrace_free (state, sdata, sizeof *sdata, error_callback, data); + goto fail; + } + + /* We no longer need the symbol table, but we hold on to the + string table permanently. */ + elf_release_view (state, &symtab_view, error_callback, data); + symtab_view_valid = 0; + strtab_view_valid = 0; + + *found_sym = 1; + + elf_add_syminfo_data (state, sdata); + } + + elf_release_view (state, &shdrs_view, error_callback, data); + shdrs_view_valid = 0; + elf_release_view (state, &names_view, error_callback, data); + names_view_valid = 0; + + /* If the debug info is in a separate file, read that one instead. 
*/ + + if (buildid_data != NULL) + { + int d; + + d = elf_open_debugfile_by_buildid (state, buildid_data, buildid_size, + filename, error_callback, data); + if (d >= 0) + { + int ret; + + elf_release_view (state, &buildid_view, error_callback, data); + if (debuglink_view_valid) + elf_release_view (state, &debuglink_view, error_callback, data); + if (debugaltlink_view_valid) + elf_release_view (state, &debugaltlink_view, error_callback, data); + ret = elf_add (state, "", d, NULL, 0, base_address, opd, + error_callback, data, fileline_fn, found_sym, + found_dwarf, NULL, 0, 1, NULL, 0); + if (ret < 0) + backtrace_close (d, error_callback, data); + else if (descriptor >= 0) + backtrace_close (descriptor, error_callback, data); + return ret; + } + } + + if (buildid_view_valid) + { + elf_release_view (state, &buildid_view, error_callback, data); + buildid_view_valid = 0; + } + + if (debuglink_name != NULL) + { + int d; + + d = elf_open_debugfile_by_debuglink (state, filename, debuglink_name, + debuglink_crc, error_callback, + data); + if (d >= 0) + { + int ret; + + elf_release_view (state, &debuglink_view, error_callback, data); + if (debugaltlink_view_valid) + elf_release_view (state, &debugaltlink_view, error_callback, data); + ret = elf_add (state, "", d, NULL, 0, base_address, opd, + error_callback, data, fileline_fn, found_sym, + found_dwarf, NULL, 0, 1, NULL, 0); + if (ret < 0) + backtrace_close (d, error_callback, data); + else if (descriptor >= 0) + backtrace_close(descriptor, error_callback, data); + return ret; + } + } + + if (debuglink_view_valid) + { + elf_release_view (state, &debuglink_view, error_callback, data); + debuglink_view_valid = 0; + } + + if (debugaltlink_name != NULL) + { + int d; + + d = elf_open_debugfile_by_debuglink (state, filename, debugaltlink_name, + 0, error_callback, data); + if (d >= 0) + { + int ret; + + ret = elf_add (state, filename, d, NULL, 0, base_address, opd, + error_callback, data, fileline_fn, found_sym, + found_dwarf, &fileline_altlink, 0, 1, + debugaltlink_buildid_data, debugaltlink_buildid_size); + elf_release_view (state, &debugaltlink_view, error_callback, data); + debugaltlink_view_valid = 0; + if (ret < 0) + { + backtrace_close (d, error_callback, data); + return ret; + } + } + } + + if (debugaltlink_view_valid) + { + elf_release_view (state, &debugaltlink_view, error_callback, data); + debugaltlink_view_valid = 0; + } + + if (gnu_debugdata_view_valid) + { + int ret; + + ret = elf_uncompress_lzma (state, + ((const unsigned char *) + gnu_debugdata_view.view.data), + gnu_debugdata_size, error_callback, data, + &gnu_debugdata_uncompressed, + &gnu_debugdata_uncompressed_size); + + elf_release_view (state, &gnu_debugdata_view, error_callback, data); + gnu_debugdata_view_valid = 0; + + if (ret) + { + ret = elf_add (state, filename, -1, gnu_debugdata_uncompressed, + gnu_debugdata_uncompressed_size, base_address, opd, + error_callback, data, fileline_fn, found_sym, + found_dwarf, NULL, 0, 0, NULL, 0); + if (ret >= 0 && descriptor >= 0) + backtrace_close(descriptor, error_callback, data); + return ret; + } + } + + if (opd_view_valid) + { + elf_release_view (state, &opd->view, error_callback, data); + opd_view_valid = 0; + opd = NULL; + } + + /* Read all the debug sections in a single view, since they are + probably adjacent in the file. If any of sections are + uncompressed, we never release this view. 
*/ + + min_offset = 0; + max_offset = 0; + debug_size = 0; + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + off_t end; + + if (sections[i].size != 0) + { + if (min_offset == 0 || sections[i].offset < min_offset) + min_offset = sections[i].offset; + end = sections[i].offset + sections[i].size; + if (end > max_offset) + max_offset = end; + debug_size += sections[i].size; + } + if (zsections[i].size != 0) + { + if (min_offset == 0 || zsections[i].offset < min_offset) + min_offset = zsections[i].offset; + end = zsections[i].offset + zsections[i].size; + if (end > max_offset) + max_offset = end; + debug_size += zsections[i].size; + } + } + if (min_offset == 0 || max_offset == 0) + { + if (descriptor >= 0) + { + if (!backtrace_close (descriptor, error_callback, data)) + goto fail; + } + return 1; + } + + /* If the total debug section size is large, assume that there are + gaps between the sections, and read them individually. */ + + if (max_offset - min_offset < 0x20000000 + || max_offset - min_offset < debug_size + 0x10000) + { + if (!elf_get_view (state, descriptor, memory, memory_size, min_offset, + max_offset - min_offset, error_callback, data, + &debug_view)) + goto fail; + debug_view_valid = 1; + } + else + { + memset (&split_debug_view[0], 0, sizeof split_debug_view); + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + struct debug_section_info *dsec; + + if (sections[i].size != 0) + dsec = &sections[i]; + else if (zsections[i].size != 0) + dsec = &zsections[i]; + else + continue; + + if (!elf_get_view (state, descriptor, memory, memory_size, + dsec->offset, dsec->size, error_callback, data, + &split_debug_view[i])) + goto fail; + split_debug_view_valid[i] = 1; + + if (sections[i].size != 0) + sections[i].data = ((const unsigned char *) + split_debug_view[i].view.data); + else + zsections[i].data = ((const unsigned char *) + split_debug_view[i].view.data); + } + } + + /* We've read all we need from the executable. */ + if (descriptor >= 0) + { + if (!backtrace_close (descriptor, error_callback, data)) + goto fail; + descriptor = -1; + } + + using_debug_view = 0; + if (debug_view_valid) + { + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + if (sections[i].size == 0) + sections[i].data = NULL; + else + { + sections[i].data = ((const unsigned char *) debug_view.view.data + + (sections[i].offset - min_offset)); + ++using_debug_view; + } + + if (zsections[i].size == 0) + zsections[i].data = NULL; + else + zsections[i].data = ((const unsigned char *) debug_view.view.data + + (zsections[i].offset - min_offset)); + } + } + + /* Uncompress the old format (--compress-debug-sections=zlib-gnu). 
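+     In that format the section keeps its ".debug_*" contents
+     zlib-compressed under a ".z" name (".zdebug_info" rather than
+     ".debug_info"), prefixed with the 4 bytes "ZLIB" and an 8-byte
+     big-endian uncompressed size, which is what the
+     elf_uncompress_zdebug call below expects.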
*/ + + zdebug_table = NULL; + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + if (sections[i].size == 0 && zsections[i].size > 0) + { + unsigned char *uncompressed_data; + size_t uncompressed_size; + + if (zdebug_table == NULL) + { + zdebug_table = ((uint16_t *) + backtrace_alloc (state, ZLIB_TABLE_SIZE, + error_callback, data)); + if (zdebug_table == NULL) + goto fail; + } + + uncompressed_data = NULL; + uncompressed_size = 0; + if (!elf_uncompress_zdebug (state, zsections[i].data, + zsections[i].size, zdebug_table, + error_callback, data, + &uncompressed_data, &uncompressed_size)) + goto fail; + sections[i].data = uncompressed_data; + sections[i].size = uncompressed_size; + sections[i].compressed = 0; + + if (split_debug_view_valid[i]) + { + elf_release_view (state, &split_debug_view[i], + error_callback, data); + split_debug_view_valid[i] = 0; + } + } + } + + if (zdebug_table != NULL) + { + backtrace_free (state, zdebug_table, ZLIB_TABLE_SIZE, + error_callback, data); + zdebug_table = NULL; + } + + /* Uncompress the official ELF format + (--compress-debug-sections=zlib-gabi, --compress-debug-sections=zstd). */ + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + unsigned char *uncompressed_data; + size_t uncompressed_size; + + if (sections[i].size == 0 || !sections[i].compressed) + continue; + + if (zdebug_table == NULL) + { + zdebug_table = ((uint16_t *) + backtrace_alloc (state, ZDEBUG_TABLE_SIZE, + error_callback, data)); + if (zdebug_table == NULL) + goto fail; + } + + uncompressed_data = NULL; + uncompressed_size = 0; + if (!elf_uncompress_chdr (state, sections[i].data, sections[i].size, + zdebug_table, error_callback, data, + &uncompressed_data, &uncompressed_size)) + goto fail; + sections[i].data = uncompressed_data; + sections[i].size = uncompressed_size; + sections[i].compressed = 0; + + if (debug_view_valid) + --using_debug_view; + else if (split_debug_view_valid[i]) + { + elf_release_view (state, &split_debug_view[i], error_callback, data); + split_debug_view_valid[i] = 0; + } + } + + if (zdebug_table != NULL) + backtrace_free (state, zdebug_table, ZDEBUG_TABLE_SIZE, + error_callback, data); + + if (debug_view_valid && using_debug_view == 0) + { + elf_release_view (state, &debug_view, error_callback, data); + debug_view_valid = 0; + } + + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + dwarf_sections.data[i] = sections[i].data; + dwarf_sections.size[i] = sections[i].size; + } + + if (!backtrace_dwarf_add (state, base_address, &dwarf_sections, + ehdr.e_ident[EI_DATA] == ELFDATA2MSB, + fileline_altlink, + error_callback, data, fileline_fn, + fileline_entry)) + goto fail; + + *found_dwarf = 1; + + return 1; + + fail: + if (shdrs_view_valid) + elf_release_view (state, &shdrs_view, error_callback, data); + if (names_view_valid) + elf_release_view (state, &names_view, error_callback, data); + if (symtab_view_valid) + elf_release_view (state, &symtab_view, error_callback, data); + if (strtab_view_valid) + elf_release_view (state, &strtab_view, error_callback, data); + if (debuglink_view_valid) + elf_release_view (state, &debuglink_view, error_callback, data); + if (debugaltlink_view_valid) + elf_release_view (state, &debugaltlink_view, error_callback, data); + if (gnu_debugdata_view_valid) + elf_release_view (state, &gnu_debugdata_view, error_callback, data); + if (buildid_view_valid) + elf_release_view (state, &buildid_view, error_callback, data); + if (debug_view_valid) + elf_release_view (state, &debug_view, error_callback, data); + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + if 
(split_debug_view_valid[i]) + elf_release_view (state, &split_debug_view[i], error_callback, data); + } + if (opd_view_valid) + elf_release_view (state, &opd->view, error_callback, data); + if (descriptor >= 0) + backtrace_close (descriptor, error_callback, data); + return 0; +} + +/* Data passed to phdr_callback. */ + +struct phdr_data +{ + struct backtrace_state *state; + backtrace_error_callback error_callback; + void *data; + fileline *fileline_fn; + int *found_sym; + int *found_dwarf; + const char *exe_filename; + int exe_descriptor; +}; + +/* Callback passed to dl_iterate_phdr. Load debug info from shared + libraries. */ + +struct PhdrIterate +{ + char* dlpi_name; + ElfW(Addr) dlpi_addr; + ElfW(Addr) dlpi_end_addr; +}; +FastVector<PhdrIterate> s_phdrData(16); + +struct ElfAddrRange +{ + ElfW(Addr) dlpi_addr; + ElfW(Addr) dlpi_end_addr; +}; +FastVector<ElfAddrRange> s_sortedKnownElfRanges(16); + +static int address_in_known_elf_ranges(uintptr_t pc) +{ + auto it = std::lower_bound( s_sortedKnownElfRanges.begin(), s_sortedKnownElfRanges.end(), pc, + []( const ElfAddrRange& lhs, const uintptr_t rhs ) { return uintptr_t(lhs.dlpi_addr) > rhs; } ); + if( it != s_sortedKnownElfRanges.end() && pc <= it->dlpi_end_addr ) + { + return true; + } + return false; +} + +static int +phdr_callback_mock (struct dl_phdr_info *info, size_t size ATTRIBUTE_UNUSED, + void *pdata) +{ + if( address_in_known_elf_ranges(info->dlpi_addr) ) + { + return 0; + } + + auto ptr = s_phdrData.push_next(); + if (info->dlpi_name) + { + size_t sz = strlen (info->dlpi_name) + 1; + ptr->dlpi_name = (char*)tracy_malloc (sz); + memcpy (ptr->dlpi_name, info->dlpi_name, sz); + } + else ptr->dlpi_name = nullptr; + ptr->dlpi_addr = info->dlpi_addr; + + // calculate the end address as well, so we can quickly determine if a PC is within the range of this image + ptr->dlpi_end_addr = uintptr_t(info->dlpi_addr) + (info->dlpi_phnum ? uintptr_t( + info->dlpi_phdr[info->dlpi_phnum - 1].p_vaddr + + info->dlpi_phdr[info->dlpi_phnum - 1].p_memsz) : 0); + + return 0; +} + +static int +#ifdef __i386__ +__attribute__ ((__force_align_arg_pointer__)) +#endif +phdr_callback (struct PhdrIterate *info, void *pdata) +{ + struct phdr_data *pd = (struct phdr_data *) pdata; + const char *filename; + int descriptor; + int does_not_exist; + struct libbacktrace_base_address base_address; + fileline elf_fileline_fn; + int found_dwarf; + + /* There is not much we can do if we don't have the module name, + unless executable is ET_DYN, where we expect the very first + phdr_callback to be for the PIE. 
*/ + if (info->dlpi_name == NULL || info->dlpi_name[0] == '\0') + { + if (pd->exe_descriptor == -1) + return 0; + filename = pd->exe_filename; + descriptor = pd->exe_descriptor; + pd->exe_descriptor = -1; + } + else + { + if (pd->exe_descriptor != -1) + { + backtrace_close (pd->exe_descriptor, pd->error_callback, pd->data); + pd->exe_descriptor = -1; + } + + filename = info->dlpi_name; + descriptor = backtrace_open (info->dlpi_name, pd->error_callback, + pd->data, &does_not_exist); + if (descriptor < 0) + return 0; + } + + base_address.m = info->dlpi_addr; + if (elf_add (pd->state, filename, descriptor, NULL, 0, base_address, NULL, + pd->error_callback, pd->data, &elf_fileline_fn, pd->found_sym, + &found_dwarf, NULL, 0, 0, NULL, 0)) + { + if (found_dwarf) + { + *pd->found_dwarf = 1; + *pd->fileline_fn = elf_fileline_fn; + } + } + + return 0; +} + +static int elf_iterate_phdr_and_add_new_files(phdr_data *pd) +{ + assert(s_phdrData.empty()); + // dl_iterate_phdr, will only add entries for elf files loaded in a previously unseen range + dl_iterate_phdr(phdr_callback_mock, nullptr); + + if(s_phdrData.size() == 0) + { + return 0; + } + + uint32_t headersAdded = 0; + for (auto &v : s_phdrData) + { + phdr_callback(&v, (void *)pd); + + auto newEntry = s_sortedKnownElfRanges.push_next(); + newEntry->dlpi_addr = v.dlpi_addr; + newEntry->dlpi_end_addr = v.dlpi_end_addr; + + tracy_free(v.dlpi_name); + + headersAdded++; + } + + s_phdrData.clear(); + + std::sort( s_sortedKnownElfRanges.begin(), s_sortedKnownElfRanges.end(), + []( const ElfAddrRange& lhs, const ElfAddrRange& rhs ) { return lhs.dlpi_addr > rhs.dlpi_addr; } ); + + return headersAdded; +} + +#ifdef TRACY_LIBBACKTRACE_ELF_DYNLOAD_SUPPORT +/* Request an elf entry update if the pc passed in is not in any of the known elf ranges. +This could mean that new images were dlopened and we need to add those new elf entries */ +static int elf_refresh_address_ranges_if_needed(struct backtrace_state *state, uintptr_t pc) +{ + if ( address_in_known_elf_ranges(pc) ) + { + return 0; + } + + struct phdr_data pd; + int found_sym = 0; + int found_dwarf = 0; + fileline fileline_fn = nullptr; + pd.state = state; + pd.error_callback = nullptr; + pd.data = nullptr; + pd.fileline_fn = &fileline_fn; + pd.found_sym = &found_sym; + pd.found_dwarf = &found_dwarf; + pd.exe_filename = nullptr; + pd.exe_descriptor = -1; + + return elf_iterate_phdr_and_add_new_files(&pd); +} +#endif //#ifdef TRACY_LIBBACKTRACE_ELF_DYNLOAD_SUPPORT + +/* Initialize the backtrace data we need from an ELF executable. At + the ELF level, all we need to do is find the debug info + sections. */ + +int +backtrace_initialize (struct backtrace_state *state, const char *filename, + int descriptor, backtrace_error_callback error_callback, + void *data, fileline *fileline_fn) +{ + int ret; + int found_sym; + int found_dwarf; + fileline elf_fileline_fn = elf_nodebug; + struct phdr_data pd; + + + /* When using fdpic we must use dl_iterate_phdr for all modules, including + the main executable, so that we can get the right base address + mapping. 
*/ + if (!libbacktrace_using_fdpic ()) + { + struct libbacktrace_base_address zero_base_address; + + memset (&zero_base_address, 0, sizeof zero_base_address); + ret = elf_add (state, filename, descriptor, NULL, 0, zero_base_address, + NULL, error_callback, data, &elf_fileline_fn, &found_sym, + &found_dwarf, NULL, 1, 0, NULL, 0); + if (!ret) + return 0; + } + + pd.state = state; + pd.error_callback = error_callback; + pd.data = data; + pd.fileline_fn = &elf_fileline_fn; + pd.found_sym = &found_sym; + pd.found_dwarf = &found_dwarf; + pd.exe_filename = filename; + pd.exe_descriptor = ret < 0 ? descriptor : -1; + + elf_iterate_phdr_and_add_new_files(&pd); + + if (!state->threaded) + { + if (found_sym) + state->syminfo_fn = elf_syminfo; + else if (state->syminfo_fn == NULL) + state->syminfo_fn = elf_nosyms; + } + else + { + if (found_sym) + backtrace_atomic_store_pointer (&state->syminfo_fn, &elf_syminfo); + else + (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, + elf_nosyms); + } + + if (!state->threaded) + *fileline_fn = state->fileline_fn; + else + *fileline_fn = backtrace_atomic_load_pointer (&state->fileline_fn); + + if (*fileline_fn == NULL || *fileline_fn == elf_nodebug) + *fileline_fn = elf_fileline_fn; + + // install an address range refresh callback so we can cope with dynamically loaded elf files +#ifdef TRACY_LIBBACKTRACE_ELF_DYNLOAD_SUPPORT + state->request_known_address_ranges_refresh_fn = elf_refresh_address_ranges_if_needed; +#else + state->request_known_address_ranges_refresh_fn = NULL; +#endif + + return 1; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/fileline.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/fileline.cpp new file mode 100644 index 000000000..5a37ff0c7 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/fileline.cpp @@ -0,0 +1,412 @@ +/* fileline.c -- Get file and line number information in a backtrace. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
*/ + +#include "config.h" + +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> + +#if defined (HAVE_KERN_PROC_ARGS) || defined (HAVE_KERN_PROC) +#include <sys/sysctl.h> +#endif + +#ifdef HAVE_MACH_O_DYLD_H +#include <mach-o/dyld.h> +#endif + +#ifdef HAVE_WINDOWS_H +#ifndef WIN32_MEAN_AND_LEAN +#define WIN32_MEAN_AND_LEAN +#endif + +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#include <windows.h> +#endif + +#include "backtrace.hpp" +#include "internal.hpp" + +#ifndef HAVE_GETEXECNAME +#define getexecname() NULL +#endif + +namespace tracy +{ + +#if !defined (HAVE_KERN_PROC_ARGS) && !defined (HAVE_KERN_PROC) + +#define sysctl_exec_name1(state, error_callback, data) NULL +#define sysctl_exec_name2(state, error_callback, data) NULL + +#else /* defined (HAVE_KERN_PROC_ARGS) || |defined (HAVE_KERN_PROC) */ + +static char * +sysctl_exec_name (struct backtrace_state *state, + int mib0, int mib1, int mib2, int mib3, + backtrace_error_callback error_callback, void *data) +{ + int mib[4]; + size_t len; + char *name; + size_t rlen; + + mib[0] = mib0; + mib[1] = mib1; + mib[2] = mib2; + mib[3] = mib3; + + if (sysctl (mib, 4, NULL, &len, NULL, 0) < 0) + return NULL; + name = (char *) backtrace_alloc (state, len, error_callback, data); + if (name == NULL) + return NULL; + rlen = len; + if (sysctl (mib, 4, name, &rlen, NULL, 0) < 0) + { + backtrace_free (state, name, len, error_callback, data); + return NULL; + } + return name; +} + +#ifdef HAVE_KERN_PROC_ARGS + +static char * +sysctl_exec_name1 (struct backtrace_state *state, + backtrace_error_callback error_callback, void *data) +{ + /* This variant is used on NetBSD. */ + return sysctl_exec_name (state, CTL_KERN, KERN_PROC_ARGS, -1, + KERN_PROC_PATHNAME, error_callback, data); +} + +#else + +#define sysctl_exec_name1(state, error_callback, data) NULL + +#endif + +#ifdef HAVE_KERN_PROC + +static char * +sysctl_exec_name2 (struct backtrace_state *state, + backtrace_error_callback error_callback, void *data) +{ + /* This variant is used on FreeBSD. 
*/ + return sysctl_exec_name (state, CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1, + error_callback, data); +} + +#else + +#define sysctl_exec_name2(state, error_callback, data) NULL + +#endif + +#endif /* defined (HAVE_KERN_PROC_ARGS) || |defined (HAVE_KERN_PROC) */ + +#ifdef HAVE_MACH_O_DYLD_H + +static char * +macho_get_executable_path (struct backtrace_state *state, + backtrace_error_callback error_callback, void *data) +{ + uint32_t len; + char *name; + + len = 0; + if (_NSGetExecutablePath (NULL, &len) == 0) + return NULL; + name = (char *) backtrace_alloc (state, len, error_callback, data); + if (name == NULL) + return NULL; + if (_NSGetExecutablePath (name, &len) != 0) + { + backtrace_free (state, name, len, error_callback, data); + return NULL; + } + return name; +} + +#else /* !defined (HAVE_MACH_O_DYLD_H) */ + +#define macho_get_executable_path(state, error_callback, data) NULL + +#endif /* !defined (HAVE_MACH_O_DYLD_H) */ + +#if HAVE_DECL__PGMPTR + +#define windows_executable_filename() _pgmptr + +#else /* !HAVE_DECL__PGMPTR */ + +#define windows_executable_filename() NULL + +#endif /* !HAVE_DECL__PGMPTR */ + +#ifdef HAVE_WINDOWS_H + +#define FILENAME_BUF_SIZE (MAX_PATH) + +static char * +windows_get_executable_path (char *buf, backtrace_error_callback error_callback, + void *data) +{ + size_t got; + int error; + + got = GetModuleFileNameA (NULL, buf, FILENAME_BUF_SIZE - 1); + error = GetLastError (); + if (got == 0 + || (got == FILENAME_BUF_SIZE - 1 && error == ERROR_INSUFFICIENT_BUFFER)) + { + error_callback (data, + "could not get the filename of the current executable", + error); + return NULL; + } + return buf; +} + +#else /* !defined (HAVE_WINDOWS_H) */ + +#define windows_get_executable_path(buf, error_callback, data) NULL +#define FILENAME_BUF_SIZE 64 + +#endif /* !defined (HAVE_WINDOWS_H) */ + +/* Initialize the fileline information from the executable. Returns 1 + on success, 0 on failure. */ + +static int +fileline_initialize (struct backtrace_state *state, + backtrace_error_callback error_callback, void *data) +{ + int failed; + fileline fileline_fn; + int pass; + int called_error_callback; + int descriptor; + const char *filename; + char buf[FILENAME_BUF_SIZE]; + + if (!state->threaded) + failed = state->fileline_initialization_failed; + else + failed = backtrace_atomic_load_int (&state->fileline_initialization_failed); + + if (failed) + { + error_callback (data, "failed to read executable information", -1); + return 0; + } + + if (!state->threaded) + fileline_fn = state->fileline_fn; + else + fileline_fn = backtrace_atomic_load_pointer (&state->fileline_fn); + if (fileline_fn != NULL) + return 1; + + /* We have not initialized the information. Do it now. */ + + descriptor = -1; + called_error_callback = 0; + for (pass = 0; pass < 10; ++pass) + { + int does_not_exist; + + switch (pass) + { + case 0: + filename = state->filename; + break; + case 1: + filename = getexecname (); + break; + case 2: + /* Test this before /proc/self/exe, as the latter exists but points + to the wine binary (and thus doesn't work). 
*/ + filename = windows_executable_filename (); + break; + case 3: + filename = "/proc/self/exe"; + break; + case 4: + filename = "/proc/curproc/file"; + break; + case 5: + snprintf (buf, sizeof (buf), "/proc/%ld/object/a.out", + (long) getpid ()); + filename = buf; + break; + case 6: + filename = sysctl_exec_name1 (state, error_callback, data); + break; + case 7: + filename = sysctl_exec_name2 (state, error_callback, data); + break; + case 8: + filename = macho_get_executable_path (state, error_callback, data); + break; + case 9: + filename = windows_get_executable_path (buf, error_callback, data); + break; + default: + abort (); + } + + if (filename == NULL) + continue; + + descriptor = backtrace_open (filename, error_callback, data, + &does_not_exist); + if (descriptor < 0 && !does_not_exist) + { + called_error_callback = 1; + break; + } + if (descriptor >= 0) + break; + } + + if (descriptor < 0) + { + if (!called_error_callback) + { + if (state->filename != NULL) + error_callback (data, state->filename, ENOENT); + else + error_callback (data, + "libbacktrace could not find executable to open", + 0); + } + failed = 1; + } + + if (!failed) + { + if (!backtrace_initialize (state, filename, descriptor, error_callback, + data, &fileline_fn)) + failed = 1; + } + + if (failed) + { + if (!state->threaded) + state->fileline_initialization_failed = 1; + else + backtrace_atomic_store_int (&state->fileline_initialization_failed, 1); + return 0; + } + + if (!state->threaded) + state->fileline_fn = fileline_fn; + else + { + backtrace_atomic_store_pointer (&state->fileline_fn, fileline_fn); + + /* Note that if two threads initialize at once, one of the data + sets may be leaked. */ + } + + return 1; +} + +/* Given a PC, find the file name, line number, and function name. */ + +int +backtrace_pcinfo (struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, + backtrace_error_callback error_callback, void *data) +{ + if (!fileline_initialize (state, error_callback, data)) + return 0; + + if (state->fileline_initialization_failed) + return 0; + + return state->fileline_fn (state, pc, callback, error_callback, data); +} + +/* Given a PC, find the symbol for it, and its value. */ + +int +backtrace_syminfo (struct backtrace_state *state, uintptr_t pc, + backtrace_syminfo_callback callback, + backtrace_error_callback error_callback, void *data) +{ + if (!fileline_initialize (state, error_callback, data)) + return 0; + + if (state->fileline_initialization_failed) + return 0; + + state->syminfo_fn (state, pc, callback, error_callback, data); + return 1; +} + +/* A backtrace_syminfo_callback that can call into a + backtrace_full_callback, used when we have a symbol table but no + debug info. */ + +void +backtrace_syminfo_to_full_callback (void *data, uintptr_t pc, + const char *symname, + uintptr_t symval ATTRIBUTE_UNUSED, + uintptr_t symsize ATTRIBUTE_UNUSED) +{ + struct backtrace_call_full *bdata = (struct backtrace_call_full *) data; + + bdata->ret = bdata->full_callback (bdata->full_data, pc, 0, NULL, 0, symname); +} + +/* An error callback that corresponds to + backtrace_syminfo_to_full_callback. 
*/ + +void +backtrace_syminfo_to_full_error_callback (void *data, const char *msg, + int errnum) +{ + struct backtrace_call_full *bdata = (struct backtrace_call_full *) data; + + bdata->full_error_callback (bdata->full_data, msg, errnum); +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/filenames.hpp b/project/thirdparty/tracy-0.11.1/libbacktrace/filenames.hpp new file mode 100644 index 000000000..aa7bd7adf --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/filenames.hpp @@ -0,0 +1,52 @@ +/* btest.c -- Filename header for libbacktrace library + Copyright (C) 2012-2018 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#ifndef GCC_VERSION +# define GCC_VERSION (__GNUC__ * 1000 + __GNUC_MINOR__) +#endif + +#if (GCC_VERSION < 2007) +# define __attribute__(x) +#endif + +#ifndef ATTRIBUTE_UNUSED +# define ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +#endif + +#if defined(__MSDOS__) || defined(_WIN32) || defined(__OS2__) || defined (__CYGWIN__) +# define IS_DIR_SEPARATOR(c) ((c) == '/' || (c) == '\\') +# define HAS_DRIVE_SPEC(f) ((f)[0] != '\0' && (f)[1] == ':') +# define IS_ABSOLUTE_PATH(f) (IS_DIR_SEPARATOR((f)[0]) || HAS_DRIVE_SPEC(f)) +#else +# define IS_DIR_SEPARATOR(c) ((c) == '/') +# define IS_ABSOLUTE_PATH(f) (IS_DIR_SEPARATOR((f)[0])) +#endif diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/internal.hpp b/project/thirdparty/tracy-0.11.1/libbacktrace/internal.hpp new file mode 100644 index 000000000..213959759 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/internal.hpp @@ -0,0 +1,435 @@ +/* internal.h -- Internal header file for stack backtrace library. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#ifndef BACKTRACE_INTERNAL_H +#define BACKTRACE_INTERNAL_H + +/* We assume that and "backtrace.h" have already been + included. */ + +#ifndef GCC_VERSION +# define GCC_VERSION (__GNUC__ * 1000 + __GNUC_MINOR__) +#endif + +#if (GCC_VERSION < 2007) +# define __attribute__(x) +#endif + +#ifndef ATTRIBUTE_UNUSED +# define ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +#endif + +#ifndef ATTRIBUTE_MALLOC +# if (GCC_VERSION >= 2096) +# define ATTRIBUTE_MALLOC __attribute__ ((__malloc__)) +# else +# define ATTRIBUTE_MALLOC +# endif +#endif + +#ifndef ATTRIBUTE_FALLTHROUGH +# if (GCC_VERSION >= 7000) +# define ATTRIBUTE_FALLTHROUGH __attribute__ ((__fallthrough__)) +# else +# define ATTRIBUTE_FALLTHROUGH +# endif +#endif + +#ifndef HAVE_SYNC_FUNCTIONS + +/* Define out the sync functions. These should never be called if + they are not available. */ + +#define __sync_bool_compare_and_swap(A, B, C) (abort(), 1) +#define __sync_lock_test_and_set(A, B) (abort(), 0) +#define __sync_lock_release(A) abort() + +#endif /* !defined (HAVE_SYNC_FUNCTIONS) */ + +#ifdef HAVE_ATOMIC_FUNCTIONS + +/* We have the atomic builtin functions. */ + +#define backtrace_atomic_load_pointer(p) \ + __atomic_load_n ((p), __ATOMIC_ACQUIRE) +#define backtrace_atomic_load_int(p) \ + __atomic_load_n ((p), __ATOMIC_ACQUIRE) +#define backtrace_atomic_store_pointer(p, v) \ + __atomic_store_n ((p), (v), __ATOMIC_RELEASE) +#define backtrace_atomic_store_size_t(p, v) \ + __atomic_store_n ((p), (v), __ATOMIC_RELEASE) +#define backtrace_atomic_store_int(p, v) \ + __atomic_store_n ((p), (v), __ATOMIC_RELEASE) + +#else /* !defined (HAVE_ATOMIC_FUNCTIONS) */ +#ifdef HAVE_SYNC_FUNCTIONS + +/* We have the sync functions but not the atomic functions. Define + the atomic ones in terms of the sync ones. */ + +extern void *backtrace_atomic_load_pointer (void *); +extern int backtrace_atomic_load_int (int *); +extern void backtrace_atomic_store_pointer (void *, void *); +extern void backtrace_atomic_store_size_t (size_t *, size_t); +extern void backtrace_atomic_store_int (int *, int); + +#else /* !defined (HAVE_SYNC_FUNCTIONS) */ + +/* We have neither the sync nor the atomic functions. These will + never be called. 
*/ + +#define backtrace_atomic_load_pointer(p) (abort(), (void *) NULL) +#define backtrace_atomic_load_int(p) (abort(), 0) +#define backtrace_atomic_store_pointer(p, v) abort() +#define backtrace_atomic_store_size_t(p, v) abort() +#define backtrace_atomic_store_int(p, v) abort() + +#endif /* !defined (HAVE_SYNC_FUNCTIONS) */ +#endif /* !defined (HAVE_ATOMIC_FUNCTIONS) */ + +namespace tracy +{ + +/* The type of the function that collects file/line information. This + is like backtrace_pcinfo. */ + +typedef int (*fileline) (struct backtrace_state *state, uintptr_t pc, + backtrace_full_callback callback, + backtrace_error_callback error_callback, void *data); + +/* The type of the function that collects symbol information. This is + like backtrace_syminfo. */ + +typedef void (*syminfo) (struct backtrace_state *state, uintptr_t pc, + backtrace_syminfo_callback callback, + backtrace_error_callback error_callback, void *data); + +/* The type of the function that will trigger an known address range refresh + (if pc passed in is for an address whichs lies ourtisde of known ranges) */ +typedef int (*request_known_address_ranges_refresh)(struct backtrace_state *state, + uintptr_t pc); + +/* What the backtrace state pointer points to. */ + +struct backtrace_state +{ + /* The name of the executable. */ + const char *filename; + /* Non-zero if threaded. */ + int threaded; + /* The master lock for fileline_fn, fileline_data, syminfo_fn, + syminfo_data, fileline_initialization_failed and everything the + data pointers point to. */ + void *lock; + /* The function that returns file/line information. */ + fileline fileline_fn; + /* The data to pass to FILELINE_FN. */ + void *fileline_data; + /* The function that returns symbol information. */ + syminfo syminfo_fn; + /* The data to pass to SYMINFO_FN. */ + void *syminfo_data; + /* Whether initializing the file/line information failed. */ + int fileline_initialization_failed; + /* The lock for the freelist. */ + int lock_alloc; + /* The freelist when using mmap. */ + struct backtrace_freelist_struct *freelist; + /* Trigger an known address range refresh */ + request_known_address_ranges_refresh request_known_address_ranges_refresh_fn; +}; + +/* Open a file for reading. Returns -1 on error. If DOES_NOT_EXIST + is not NULL, *DOES_NOT_EXIST will be set to 0 normally and set to 1 + if the file does not exist. If the file does not exist and + DOES_NOT_EXIST is not NULL, the function will return -1 and will + not call ERROR_CALLBACK. On other errors, or if DOES_NOT_EXIST is + NULL, the function will call ERROR_CALLBACK before returning. */ +extern int backtrace_open (const char *filename, + backtrace_error_callback error_callback, + void *data, + int *does_not_exist); + +/* A view of the contents of a file. This supports mmap when + available. A view will remain in memory even after backtrace_close + is called on the file descriptor from which the view was + obtained. */ + +struct backtrace_view +{ + /* The data that the caller requested. */ + const void *data; + /* The base of the view. */ + void *base; + /* The total length of the view. */ + size_t len; +}; + +/* Create a view of SIZE bytes from DESCRIPTOR at OFFSET. Store the + result in *VIEW. Returns 1 on success, 0 on error. */ +extern int backtrace_get_view (struct backtrace_state *state, int descriptor, + off_t offset, uint64_t size, + backtrace_error_callback error_callback, + void *data, struct backtrace_view *view); + +/* Release a view created by backtrace_get_view. 
*/ +extern void backtrace_release_view (struct backtrace_state *state, + struct backtrace_view *view, + backtrace_error_callback error_callback, + void *data); + +/* Close a file opened by backtrace_open. Returns 1 on success, 0 on + error. */ + +extern int backtrace_close (int descriptor, + backtrace_error_callback error_callback, + void *data); + +/* Sort without using memory. */ + +extern void backtrace_qsort (void *base, size_t count, size_t size, + int (*compar) (const void *, const void *)); + +/* Allocate memory. This is like malloc. If ERROR_CALLBACK is NULL, + this does not report an error, it just returns NULL. */ + +extern void *backtrace_alloc (struct backtrace_state *state, size_t size, + backtrace_error_callback error_callback, + void *data) ATTRIBUTE_MALLOC; + +/* Free memory allocated by backtrace_alloc. If ERROR_CALLBACK is + NULL, this does not report an error. */ + +extern void backtrace_free (struct backtrace_state *state, void *mem, + size_t size, + backtrace_error_callback error_callback, + void *data); + +/* A growable vector of some struct. This is used for more efficient + allocation when we don't know the final size of some group of data + that we want to represent as an array. */ + +struct backtrace_vector +{ + /* The base of the vector. */ + void *base; + /* The number of bytes in the vector. */ + size_t size; + /* The number of bytes available at the current allocation. */ + size_t alc; +}; + +/* Grow VEC by SIZE bytes. Return a pointer to the newly allocated + bytes. Note that this may move the entire vector to a new memory + location. Returns NULL on failure. */ + +extern void *backtrace_vector_grow (struct backtrace_state *state, size_t size, + backtrace_error_callback error_callback, + void *data, + struct backtrace_vector *vec); + +/* Finish the current allocation on VEC. Prepare to start a new + allocation. The finished allocation will never be freed. Returns + a pointer to the base of the finished entries, or NULL on + failure. */ + +extern void* backtrace_vector_finish (struct backtrace_state *state, + struct backtrace_vector *vec, + backtrace_error_callback error_callback, + void *data); + +/* Release any extra space allocated for VEC. This may change + VEC->base. Returns 1 on success, 0 on failure. */ + +extern int backtrace_vector_release (struct backtrace_state *state, + struct backtrace_vector *vec, + backtrace_error_callback error_callback, + void *data); + +/* Free the space managed by VEC. This will reset VEC. */ + +static inline void +backtrace_vector_free (struct backtrace_state *state, + struct backtrace_vector *vec, + backtrace_error_callback error_callback, void *data) +{ + vec->alc += vec->size; + vec->size = 0; + backtrace_vector_release (state, vec, error_callback, data); +} + +/* Read initial debug data from a descriptor, and set the + fileline_data, syminfo_fn, and syminfo_data fields of STATE. + Return the fileln_fn field in *FILELN_FN--this is done this way so + that the synchronization code is only implemented once. This is + called after the descriptor has first been opened. It will close + the descriptor if it is no longer needed. Returns 1 on success, 0 + on error. There will be multiple implementations of this function, + for different file formats. Each system will compile the + appropriate one. 
*/ + +extern int backtrace_initialize (struct backtrace_state *state, + const char *filename, + int descriptor, + backtrace_error_callback error_callback, + void *data, + fileline *fileline_fn); + +/* An enum for the DWARF sections we care about. */ + +enum dwarf_section +{ + DEBUG_INFO, + DEBUG_LINE, + DEBUG_ABBREV, + DEBUG_RANGES, + DEBUG_STR, + DEBUG_ADDR, + DEBUG_STR_OFFSETS, + DEBUG_LINE_STR, + DEBUG_RNGLISTS, + + DEBUG_MAX +}; + +/* Data for the DWARF sections we care about. */ + +struct dwarf_sections +{ + const unsigned char *data[DEBUG_MAX]; + size_t size[DEBUG_MAX]; +}; + +/* DWARF data read from a file, used for .gnu_debugaltlink. */ + +struct dwarf_data; + +/* The load address mapping. */ + +#if defined(__FDPIC__) && defined(HAVE_DL_ITERATE_PHDR) && (defined(HAVE_LINK_H) || defined(HAVE_SYS_LINK_H)) + +#ifdef HAVE_LINK_H + #include +#endif +#ifdef HAVE_SYS_LINK_H + #include +#endif + +#define libbacktrace_using_fdpic() (1) + +struct libbacktrace_base_address +{ + struct elf32_fdpic_loadaddr m; +}; + +#define libbacktrace_add_base(pc, base) \ + ((uintptr_t) (__RELOC_POINTER ((pc), (base).m))) + +#else /* not _FDPIC__ */ + +#define libbacktrace_using_fdpic() (0) + +struct libbacktrace_base_address +{ + uintptr_t m; +}; + +#define libbacktrace_add_base(pc, base) ((pc) + (base).m) + +#endif /* not _FDPIC__ */ + +/* Add file/line information for a DWARF module. */ + +extern int backtrace_dwarf_add (struct backtrace_state *state, + struct libbacktrace_base_address base_address, + const struct dwarf_sections *dwarf_sections, + int is_bigendian, + struct dwarf_data *fileline_altlink, + backtrace_error_callback error_callback, + void *data, fileline *fileline_fn, + struct dwarf_data **fileline_entry); + +/* A data structure to pass to backtrace_syminfo_to_full. */ + +struct backtrace_call_full +{ + backtrace_full_callback full_callback; + backtrace_error_callback full_error_callback; + void *full_data; + int ret; +}; + +/* A backtrace_syminfo_callback that can call into a + backtrace_full_callback, used when we have a symbol table but no + debug info. */ + +extern void backtrace_syminfo_to_full_callback (void *data, uintptr_t pc, + const char *symname, + uintptr_t symval, + uintptr_t symsize); + +/* An error callback that corresponds to + backtrace_syminfo_to_full_callback. */ + +extern void backtrace_syminfo_to_full_error_callback (void *, const char *, + int); + +/* A test-only hook for elf_uncompress_zdebug. */ + +extern int backtrace_uncompress_zdebug (struct backtrace_state *, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback, void *data, + unsigned char **uncompressed, + size_t *uncompressed_size); + +/* A test-only hook for elf_zstd_decompress. */ + +extern int backtrace_uncompress_zstd (struct backtrace_state *, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback, void *data, + unsigned char *uncompressed, + size_t uncompressed_size); + +/* A test-only hook for elf_uncompress_lzma. 
*/ + +extern int backtrace_uncompress_lzma (struct backtrace_state *, + const unsigned char *compressed, + size_t compressed_size, + backtrace_error_callback, void *data, + unsigned char **uncompressed, + size_t *uncompressed_size); + +} + +#endif diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/macho.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/macho.cpp new file mode 100644 index 000000000..b9f084565 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/macho.cpp @@ -0,0 +1,1367 @@ +/* elf.c -- Get debug data from a Mach-O file for backtraces. + Copyright (C) 2020-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include +#include + +#ifdef HAVE_MACH_O_DYLD_H +#include +#endif + +#include "backtrace.hpp" +#include "internal.hpp" + +namespace tracy +{ + +/* Mach-O file header for a 32-bit executable. */ + +struct macho_header_32 +{ + uint32_t magic; /* Magic number (MACH_O_MAGIC_32) */ + uint32_t cputype; /* CPU type */ + uint32_t cpusubtype; /* CPU subtype */ + uint32_t filetype; /* Type of file (object, executable) */ + uint32_t ncmds; /* Number of load commands */ + uint32_t sizeofcmds; /* Total size of load commands */ + uint32_t flags; /* Flags for special features */ +}; + +/* Mach-O file header for a 64-bit executable. */ + +struct macho_header_64 +{ + uint32_t magic; /* Magic number (MACH_O_MAGIC_64) */ + uint32_t cputype; /* CPU type */ + uint32_t cpusubtype; /* CPU subtype */ + uint32_t filetype; /* Type of file (object, executable) */ + uint32_t ncmds; /* Number of load commands */ + uint32_t sizeofcmds; /* Total size of load commands */ + uint32_t flags; /* Flags for special features */ + uint32_t reserved; /* Reserved */ +}; + +/* Mach-O file header for a fat executable. */ + +struct macho_header_fat +{ + uint32_t magic; /* Magic number (MACH_O_MH_(MAGIC|CIGAM)_FAT(_64)?) */ + uint32_t nfat_arch; /* Number of components */ +}; + +/* Values for the header magic field. 
*/ + +#define MACH_O_MH_MAGIC_32 0xfeedface +#define MACH_O_MH_MAGIC_64 0xfeedfacf +#define MACH_O_MH_MAGIC_FAT 0xcafebabe +#define MACH_O_MH_CIGAM_FAT 0xbebafeca +#define MACH_O_MH_MAGIC_FAT_64 0xcafebabf +#define MACH_O_MH_CIGAM_FAT_64 0xbfbafeca + +/* Value for the header filetype field. */ + +#define MACH_O_MH_EXECUTE 0x02 +#define MACH_O_MH_DYLIB 0x06 +#define MACH_O_MH_DSYM 0x0a + +/* A component of a fat file. A fat file starts with a + macho_header_fat followed by nfat_arch instances of this + struct. */ + +struct macho_fat_arch +{ + uint32_t cputype; /* CPU type */ + uint32_t cpusubtype; /* CPU subtype */ + uint32_t offset; /* File offset of this entry */ + uint32_t size; /* Size of this entry */ + uint32_t align; /* Alignment of this entry */ +}; + +/* A component of a 64-bit fat file. This is used if the magic field + is MAGIC_FAT_64. This is only used when some file size or file + offset is too large to represent in the 32-bit format. */ + +struct macho_fat_arch_64 +{ + uint32_t cputype; /* CPU type */ + uint32_t cpusubtype; /* CPU subtype */ + uint64_t offset; /* File offset of this entry */ + uint64_t size; /* Size of this entry */ + uint32_t align; /* Alignment of this entry */ + uint32_t reserved; /* Reserved */ +}; + +/* Values for the fat_arch cputype field (and the header cputype + field). */ + +#define MACH_O_CPU_ARCH_ABI64 0x01000000 + +#define MACH_O_CPU_TYPE_X86 7 +#define MACH_O_CPU_TYPE_ARM 12 +#define MACH_O_CPU_TYPE_PPC 18 + +#define MACH_O_CPU_TYPE_X86_64 (MACH_O_CPU_TYPE_X86 | MACH_O_CPU_ARCH_ABI64) +#define MACH_O_CPU_TYPE_ARM64 (MACH_O_CPU_TYPE_ARM | MACH_O_CPU_ARCH_ABI64) +#define MACH_O_CPU_TYPE_PPC64 (MACH_O_CPU_TYPE_PPC | MACH_O_CPU_ARCH_ABI64) + +/* The header of a load command. */ + +struct macho_load_command +{ + uint32_t cmd; /* The type of load command */ + uint32_t cmdsize; /* Size in bytes of the entire command */ +}; + +/* Values for the load_command cmd field. */ + +#define MACH_O_LC_SEGMENT 0x01 +#define MACH_O_LC_SYMTAB 0x02 +#define MACH_O_LC_SEGMENT_64 0x19 +#define MACH_O_LC_UUID 0x1b + +/* The length of a section of segment name. */ + +#define MACH_O_NAMELEN (16) + +/* LC_SEGMENT load command. */ + +struct macho_segment_command +{ + uint32_t cmd; /* The type of load command (LC_SEGMENT) */ + uint32_t cmdsize; /* Size in bytes of the entire command */ + char segname[MACH_O_NAMELEN]; /* Segment name */ + uint32_t vmaddr; /* Virtual memory address */ + uint32_t vmsize; /* Virtual memory size */ + uint32_t fileoff; /* Offset of data to be mapped */ + uint32_t filesize; /* Size of data in file */ + uint32_t maxprot; /* Maximum permitted virtual protection */ + uint32_t initprot; /* Initial virtual memory protection */ + uint32_t nsects; /* Number of sections in this segment */ + uint32_t flags; /* Flags */ +}; + +/* LC_SEGMENT_64 load command. */ + +struct macho_segment_64_command +{ + uint32_t cmd; /* The type of load command (LC_SEGMENT) */ + uint32_t cmdsize; /* Size in bytes of the entire command */ + char segname[MACH_O_NAMELEN]; /* Segment name */ + uint64_t vmaddr; /* Virtual memory address */ + uint64_t vmsize; /* Virtual memory size */ + uint64_t fileoff; /* Offset of data to be mapped */ + uint64_t filesize; /* Size of data in file */ + uint32_t maxprot; /* Maximum permitted virtual protection */ + uint32_t initprot; /* Initial virtual memory protection */ + uint32_t nsects; /* Number of sections in this segment */ + uint32_t flags; /* Flags */ +}; + +/* LC_SYMTAB load command. 
*/ + +struct macho_symtab_command +{ + uint32_t cmd; /* The type of load command (LC_SEGMENT) */ + uint32_t cmdsize; /* Size in bytes of the entire command */ + uint32_t symoff; /* File offset of symbol table */ + uint32_t nsyms; /* Number of symbols */ + uint32_t stroff; /* File offset of string table */ + uint32_t strsize; /* String table size */ +}; + +/* The length of a Mach-O uuid. */ + +#define MACH_O_UUID_LEN (16) + +/* LC_UUID load command. */ + +struct macho_uuid_command +{ + uint32_t cmd; /* Type of load command (LC_UUID) */ + uint32_t cmdsize; /* Size in bytes of command */ + unsigned char uuid[MACH_O_UUID_LEN]; /* UUID */ +}; + +/* 32-bit section header within a LC_SEGMENT segment. */ + +struct macho_section +{ + char sectname[MACH_O_NAMELEN]; /* Section name */ + char segment[MACH_O_NAMELEN]; /* Segment of this section */ + uint32_t addr; /* Address in memory */ + uint32_t size; /* Section size */ + uint32_t offset; /* File offset */ + uint32_t align; /* Log2 of section alignment */ + uint32_t reloff; /* File offset of relocations */ + uint32_t nreloc; /* Number of relocs for this section */ + uint32_t flags; /* Flags */ + uint32_t reserved1; + uint32_t reserved2; +}; + +/* 64-bit section header within a LC_SEGMENT_64 segment. */ + +struct macho_section_64 +{ + char sectname[MACH_O_NAMELEN]; /* Section name */ + char segment[MACH_O_NAMELEN]; /* Segment of this section */ + uint64_t addr; /* Address in memory */ + uint64_t size; /* Section size */ + uint32_t offset; /* File offset */ + uint32_t align; /* Log2 of section alignment */ + uint32_t reloff; /* File offset of section relocations */ + uint32_t nreloc; /* Number of relocs for this section */ + uint32_t flags; /* Flags */ + uint32_t reserved1; + uint32_t reserved2; + uint32_t reserved3; +}; + +/* 32-bit symbol data. */ + +struct macho_nlist +{ + uint32_t n_strx; /* Index of name in string table */ + uint8_t n_type; /* Type flag */ + uint8_t n_sect; /* Section number */ + uint16_t n_desc; /* Stabs description field */ + uint32_t n_value; /* Value */ +}; + +/* 64-bit symbol data. */ + +struct macho_nlist_64 +{ + uint32_t n_strx; /* Index of name in string table */ + uint8_t n_type; /* Type flag */ + uint8_t n_sect; /* Section number */ + uint16_t n_desc; /* Stabs description field */ + uint64_t n_value; /* Value */ +}; + +/* Value found in nlist n_type field. */ + +#define MACH_O_N_STAB 0xe0 /* Stabs debugging symbol */ +#define MACH_O_N_TYPE 0x0e /* Mask for type bits */ + +/* Values found after masking with MACH_O_N_TYPE. */ +#define MACH_O_N_UNDF 0x00 /* Undefined symbol */ +#define MACH_O_N_ABS 0x02 /* Absolute symbol */ +#define MACH_O_N_SECT 0x0e /* Defined in section from n_sect field */ + + +/* Information we keep for a Mach-O symbol. */ + +struct macho_symbol +{ + const char *name; /* Symbol name */ + uintptr_t address; /* Symbol address */ +}; + +/* Information to pass to macho_syminfo. */ + +struct macho_syminfo_data +{ + struct macho_syminfo_data *next; /* Next module */ + struct macho_symbol *symbols; /* Symbols sorted by address */ + size_t count; /* Number of symbols */ +}; + +/* Names of sections, indexed by enum dwarf_section in internal.h. */ + +static const char * const dwarf_section_names[DEBUG_MAX] = +{ + "__debug_info", + "__debug_line", + "__debug_abbrev", + "__debug_ranges", + "__debug_str", + "", /* DEBUG_ADDR */ + "__debug_str_offs", + "", /* DEBUG_LINE_STR */ + "__debug_rnglists" +}; + +/* Forward declaration. 
*/ + +static int macho_add (struct backtrace_state *, const char *, int, off_t, + const unsigned char *, struct libbacktrace_base_address, + int, backtrace_error_callback, void *, fileline *, + int *); + +/* A dummy callback function used when we can't find any debug info. */ + +static int +macho_nodebug (struct backtrace_state *state ATTRIBUTE_UNUSED, + uintptr_t pc ATTRIBUTE_UNUSED, + backtrace_full_callback callback ATTRIBUTE_UNUSED, + backtrace_error_callback error_callback, void *data) +{ + error_callback (data, "no debug info in Mach-O executable", -1); + return 0; +} + +/* A dummy callback function used when we can't find a symbol + table. */ + +static void +macho_nosyms (struct backtrace_state *state ATTRIBUTE_UNUSED, + uintptr_t addr ATTRIBUTE_UNUSED, + backtrace_syminfo_callback callback ATTRIBUTE_UNUSED, + backtrace_error_callback error_callback, void *data) +{ + error_callback (data, "no symbol table in Mach-O executable", -1); +} + +/* Add a single DWARF section to DWARF_SECTIONS, if we need the + section. Returns 1 on success, 0 on failure. */ + +static int +macho_add_dwarf_section (struct backtrace_state *state, int descriptor, + const char *sectname, uint32_t offset, uint64_t size, + backtrace_error_callback error_callback, void *data, + struct dwarf_sections *dwarf_sections) +{ + int i; + + for (i = 0; i < (int) DEBUG_MAX; ++i) + { + if (dwarf_section_names[i][0] != '\0' + && strncmp (sectname, dwarf_section_names[i], MACH_O_NAMELEN) == 0) + { + struct backtrace_view section_view; + + /* FIXME: Perhaps it would be better to try to use a single + view to read all the DWARF data, as we try to do for + ELF. */ + + if (!backtrace_get_view (state, descriptor, offset, size, + error_callback, data, §ion_view)) + return 0; + dwarf_sections->data[i] = (const unsigned char *) section_view.data; + dwarf_sections->size[i] = size; + break; + } + } + return 1; +} + +/* Collect DWARF sections from a DWARF segment. Returns 1 on success, + 0 on failure. */ + +static int +macho_add_dwarf_segment (struct backtrace_state *state, int descriptor, + off_t offset, unsigned int cmd, const char *psecs, + size_t sizesecs, unsigned int nsects, + backtrace_error_callback error_callback, void *data, + struct dwarf_sections *dwarf_sections) +{ + size_t sec_header_size; + size_t secoffset; + unsigned int i; + + switch (cmd) + { + case MACH_O_LC_SEGMENT: + sec_header_size = sizeof (struct macho_section); + break; + case MACH_O_LC_SEGMENT_64: + sec_header_size = sizeof (struct macho_section_64); + break; + default: + abort (); + } + + secoffset = 0; + for (i = 0; i < nsects; ++i) + { + if (secoffset + sec_header_size > sizesecs) + { + error_callback (data, "section overflow withing segment", 0); + return 0; + } + + switch (cmd) + { + case MACH_O_LC_SEGMENT: + { + struct macho_section section; + + memcpy (§ion, psecs + secoffset, sizeof section); + macho_add_dwarf_section (state, descriptor, section.sectname, + offset + section.offset, section.size, + error_callback, data, dwarf_sections); + } + break; + + case MACH_O_LC_SEGMENT_64: + { + struct macho_section_64 section; + + memcpy (§ion, psecs + secoffset, sizeof section); + macho_add_dwarf_section (state, descriptor, section.sectname, + offset + section.offset, section.size, + error_callback, data, dwarf_sections); + } + break; + + default: + abort (); + } + + secoffset += sec_header_size; + } + + return 1; +} + +/* Compare struct macho_symbol for qsort. 
*/ + +static int +macho_symbol_compare (const void *v1, const void *v2) +{ + const struct macho_symbol *m1 = (const struct macho_symbol *) v1; + const struct macho_symbol *m2 = (const struct macho_symbol *) v2; + + if (m1->address < m2->address) + return -1; + else if (m1->address > m2->address) + return 1; + else + return 0; +} + +/* Compare an address against a macho_symbol for bsearch. We allocate + one extra entry in the array so that this can safely look at the + next entry. */ + +static int +macho_symbol_search (const void *vkey, const void *ventry) +{ + const uintptr_t *key = (const uintptr_t *) vkey; + const struct macho_symbol *entry = (const struct macho_symbol *) ventry; + uintptr_t addr; + + addr = *key; + if (addr < entry->address) + return -1; + else if (entry->name[0] == '\0' + && entry->address == ~(uintptr_t) 0) + return -1; + else if ((entry + 1)->name[0] == '\0' + && (entry + 1)->address == ~(uintptr_t) 0) + return -1; + else if (addr >= (entry + 1)->address) + return 1; + else + return 0; +} + +/* Return whether the symbol type field indicates a symbol table entry + that we care about: a function or data symbol. */ + +static int +macho_defined_symbol (uint8_t type) +{ + if ((type & MACH_O_N_STAB) != 0) + return 0; + switch (type & MACH_O_N_TYPE) + { + case MACH_O_N_UNDF: + return 0; + case MACH_O_N_ABS: + return 1; + case MACH_O_N_SECT: + return 1; + default: + return 0; + } +} + +/* Add symbol table information for a Mach-O file. */ + +static int +macho_add_symtab (struct backtrace_state *state, int descriptor, + struct libbacktrace_base_address base_address, int is_64, + off_t symoff, unsigned int nsyms, off_t stroff, + unsigned int strsize, + backtrace_error_callback error_callback, void *data) +{ + size_t symsize; + struct backtrace_view sym_view; + int sym_view_valid; + struct backtrace_view str_view; + int str_view_valid; + size_t ndefs; + size_t symtaboff; + unsigned int i; + size_t macho_symbol_size; + struct macho_symbol *macho_symbols; + unsigned int j; + struct macho_syminfo_data *sdata; + + sym_view_valid = 0; + str_view_valid = 0; + macho_symbol_size = 0; + macho_symbols = NULL; + + if (is_64) + symsize = sizeof (struct macho_nlist_64); + else + symsize = sizeof (struct macho_nlist); + + if (!backtrace_get_view (state, descriptor, symoff, nsyms * symsize, + error_callback, data, &sym_view)) + goto fail; + sym_view_valid = 1; + + if (!backtrace_get_view (state, descriptor, stroff, strsize, + error_callback, data, &str_view)) + return 0; + str_view_valid = 1; + + ndefs = 0; + symtaboff = 0; + for (i = 0; i < nsyms; ++i, symtaboff += symsize) + { + if (is_64) + { + struct macho_nlist_64 nlist; + + memcpy (&nlist, (const char *) sym_view.data + symtaboff, + sizeof nlist); + if (macho_defined_symbol (nlist.n_type)) + ++ndefs; + } + else + { + struct macho_nlist nlist; + + memcpy (&nlist, (const char *) sym_view.data + symtaboff, + sizeof nlist); + if (macho_defined_symbol (nlist.n_type)) + ++ndefs; + } + } + + /* Add 1 to ndefs to make room for a sentinel. 
*/ + macho_symbol_size = (ndefs + 1) * sizeof (struct macho_symbol); + macho_symbols = ((struct macho_symbol *) + backtrace_alloc (state, macho_symbol_size, error_callback, + data)); + if (macho_symbols == NULL) + goto fail; + + j = 0; + symtaboff = 0; + for (i = 0; i < nsyms; ++i, symtaboff += symsize) + { + uint32_t strx; + uint64_t value; + const char *name; + + strx = 0; + value = 0; + if (is_64) + { + struct macho_nlist_64 nlist; + + memcpy (&nlist, (const char *) sym_view.data + symtaboff, + sizeof nlist); + if (!macho_defined_symbol (nlist.n_type)) + continue; + + strx = nlist.n_strx; + value = nlist.n_value; + } + else + { + struct macho_nlist nlist; + + memcpy (&nlist, (const char *) sym_view.data + symtaboff, + sizeof nlist); + if (!macho_defined_symbol (nlist.n_type)) + continue; + + strx = nlist.n_strx; + value = nlist.n_value; + } + + if (strx >= strsize) + { + error_callback (data, "symbol string index out of range", 0); + goto fail; + } + + name = (const char *) str_view.data + strx; + if (name[0] == '_') + ++name; + macho_symbols[j].name = name; + macho_symbols[j].address = libbacktrace_add_base (value, base_address); + ++j; + } + + sdata = ((struct macho_syminfo_data *) + backtrace_alloc (state, sizeof *sdata, error_callback, data)); + if (sdata == NULL) + goto fail; + + /* We need to keep the string table since it holds the names, but we + can release the symbol table. */ + + backtrace_release_view (state, &sym_view, error_callback, data); + sym_view_valid = 0; + str_view_valid = 0; + + /* Add a trailing sentinel symbol. */ + macho_symbols[j].name = ""; + macho_symbols[j].address = ~(uintptr_t) 0; + + backtrace_qsort (macho_symbols, ndefs + 1, sizeof (struct macho_symbol), + macho_symbol_compare); + + sdata->next = NULL; + sdata->symbols = macho_symbols; + sdata->count = ndefs; + + if (!state->threaded) + { + struct macho_syminfo_data **pp; + + for (pp = (struct macho_syminfo_data **) (void *) &state->syminfo_data; + *pp != NULL; + pp = &(*pp)->next) + ; + *pp = sdata; + } + else + { + while (1) + { + struct macho_syminfo_data **pp; + + pp = (struct macho_syminfo_data **) (void *) &state->syminfo_data; + + while (1) + { + struct macho_syminfo_data *p; + + p = backtrace_atomic_load_pointer (pp); + + if (p == NULL) + break; + + pp = &p->next; + } + + if (__sync_bool_compare_and_swap (pp, NULL, sdata)) + break; + } + } + + return 1; + + fail: + if (macho_symbols != NULL) + backtrace_free (state, macho_symbols, macho_symbol_size, + error_callback, data); + if (sym_view_valid) + backtrace_release_view (state, &sym_view, error_callback, data); + if (str_view_valid) + backtrace_release_view (state, &str_view, error_callback, data); + return 0; +} + +/* Return the symbol name and value for an ADDR. 
*/ + +static void +macho_syminfo (struct backtrace_state *state, uintptr_t addr, + backtrace_syminfo_callback callback, + backtrace_error_callback error_callback ATTRIBUTE_UNUSED, + void *data) +{ + struct macho_syminfo_data *sdata; + struct macho_symbol *sym; + + sym = NULL; + if (!state->threaded) + { + for (sdata = (struct macho_syminfo_data *) state->syminfo_data; + sdata != NULL; + sdata = sdata->next) + { + sym = ((struct macho_symbol *) + bsearch (&addr, sdata->symbols, sdata->count, + sizeof (struct macho_symbol), macho_symbol_search)); + if (sym != NULL) + break; + } + } + else + { + struct macho_syminfo_data **pp; + + pp = (struct macho_syminfo_data **) (void *) &state->syminfo_data; + while (1) + { + sdata = backtrace_atomic_load_pointer (pp); + if (sdata == NULL) + break; + + sym = ((struct macho_symbol *) + bsearch (&addr, sdata->symbols, sdata->count, + sizeof (struct macho_symbol), macho_symbol_search)); + if (sym != NULL) + break; + + pp = &sdata->next; + } + } + + if (sym == NULL) + callback (data, addr, NULL, 0, 0); + else + callback (data, addr, sym->name, sym->address, 0); +} + +/* Look through a fat file to find the relevant executable. Returns 1 + on success, 0 on failure (in both cases descriptor is closed). */ + +static int +macho_add_fat (struct backtrace_state *state, const char *filename, + int descriptor, int swapped, off_t offset, + const unsigned char *match_uuid, + struct libbacktrace_base_address base_address, + int skip_symtab, uint32_t nfat_arch, int is_64, + backtrace_error_callback error_callback, void *data, + fileline *fileline_fn, int *found_sym) +{ + int arch_view_valid; + unsigned int cputype; + size_t arch_size; + struct backtrace_view arch_view; + unsigned int i; + + arch_view_valid = 0; + +#if defined (__x86_64__) + cputype = MACH_O_CPU_TYPE_X86_64; +#elif defined (__i386__) + cputype = MACH_O_CPU_TYPE_X86; +#elif defined (__aarch64__) + cputype = MACH_O_CPU_TYPE_ARM64; +#elif defined (__arm__) + cputype = MACH_O_CPU_TYPE_ARM; +#elif defined (__ppc__) + cputype = MACH_O_CPU_TYPE_PPC; +#elif defined (__ppc64__) + cputype = MACH_O_CPU_TYPE_PPC64; +#else + error_callback (data, "unknown Mach-O architecture", 0); + goto fail; +#endif + + if (is_64) + arch_size = sizeof (struct macho_fat_arch_64); + else + arch_size = sizeof (struct macho_fat_arch); + + if (!backtrace_get_view (state, descriptor, offset, + nfat_arch * arch_size, + error_callback, data, &arch_view)) + goto fail; + + for (i = 0; i < nfat_arch; ++i) + { + uint32_t fcputype; + uint64_t foffset; + + if (is_64) + { + struct macho_fat_arch_64 fat_arch_64; + + memcpy (&fat_arch_64, + (const char *) arch_view.data + i * arch_size, + arch_size); + fcputype = fat_arch_64.cputype; + foffset = fat_arch_64.offset; + if (swapped) + { + fcputype = __builtin_bswap32 (fcputype); + foffset = __builtin_bswap64 (foffset); + } + } + else + { + struct macho_fat_arch fat_arch_32; + + memcpy (&fat_arch_32, + (const char *) arch_view.data + i * arch_size, + arch_size); + fcputype = fat_arch_32.cputype; + foffset = (uint64_t) fat_arch_32.offset; + if (swapped) + { + fcputype = __builtin_bswap32 (fcputype); + foffset = (uint64_t) __builtin_bswap32 ((uint32_t) foffset); + } + } + + if (fcputype == cputype) + { + /* FIXME: What about cpusubtype? 
*/ + backtrace_release_view (state, &arch_view, error_callback, data); + return macho_add (state, filename, descriptor, foffset, match_uuid, + base_address, skip_symtab, error_callback, data, + fileline_fn, found_sym); + } + } + + error_callback (data, "could not find executable in fat file", 0); + + fail: + if (arch_view_valid) + backtrace_release_view (state, &arch_view, error_callback, data); + if (descriptor != -1) + backtrace_close (descriptor, error_callback, data); + return 0; +} + +/* Look for the dsym file for FILENAME. This is called if FILENAME + does not have debug info or a symbol table. Returns 1 on success, + 0 on failure. */ + +static int +macho_add_dsym (struct backtrace_state *state, const char *filename, + struct libbacktrace_base_address base_address, + const unsigned char *uuid, + backtrace_error_callback error_callback, void *data, + fileline* fileline_fn) +{ + const char *p; + const char *dirname; + char *diralc; + size_t dirnamelen; + const char *basename; + size_t basenamelen; + const char *dsymsuffixdir; + size_t dsymsuffixdirlen; + size_t dsymlen; + char *dsym; + char *ps; + int d; + int does_not_exist; + int dummy_found_sym; + + diralc = NULL; + dirnamelen = 0; + dsym = NULL; + dsymlen = 0; + + p = strrchr (filename, '/'); + if (p == NULL) + { + dirname = "."; + dirnamelen = 1; + basename = filename; + basenamelen = strlen (basename); + diralc = NULL; + } + else + { + dirnamelen = p - filename; + diralc = (char*)backtrace_alloc (state, dirnamelen + 1, error_callback, data); + if (diralc == NULL) + goto fail; + memcpy (diralc, filename, dirnamelen); + diralc[dirnamelen] = '\0'; + dirname = diralc; + basename = p + 1; + basenamelen = strlen (basename); + } + + dsymsuffixdir = ".dSYM/Contents/Resources/DWARF/"; + dsymsuffixdirlen = strlen (dsymsuffixdir); + + dsymlen = (dirnamelen + + 1 + + basenamelen + + dsymsuffixdirlen + + basenamelen + + 1); + dsym = (char*)backtrace_alloc (state, dsymlen, error_callback, data); + if (dsym == NULL) + goto fail; + + ps = dsym; + memcpy (ps, dirname, dirnamelen); + ps += dirnamelen; + *ps++ = '/'; + memcpy (ps, basename, basenamelen); + ps += basenamelen; + memcpy (ps, dsymsuffixdir, dsymsuffixdirlen); + ps += dsymsuffixdirlen; + memcpy (ps, basename, basenamelen); + ps += basenamelen; + *ps = '\0'; + + if (diralc != NULL) + { + backtrace_free (state, diralc, dirnamelen + 1, error_callback, data); + diralc = NULL; + } + + d = backtrace_open (dsym, error_callback, data, &does_not_exist); + if (d < 0) + { + /* The file does not exist, so we can't read the debug info. + Just return success. */ + backtrace_free (state, dsym, dsymlen, error_callback, data); + return 1; + } + + if (!macho_add (state, dsym, d, 0, uuid, base_address, 1, + error_callback, data, fileline_fn, &dummy_found_sym)) + goto fail; + + backtrace_free (state, dsym, dsymlen, error_callback, data); + + return 1; + + fail: + if (dsym != NULL) + backtrace_free (state, dsym, dsymlen, error_callback, data); + if (diralc != NULL) + backtrace_free (state, diralc, dirnamelen, error_callback, data); + return 0; +} + +/* Add the backtrace data for a Macho-O file. Returns 1 on success, 0 + on failure (in both cases descriptor is closed). + + FILENAME: the name of the executable. + DESCRIPTOR: an open descriptor for the executable, closed here. + OFFSET: the offset within the file of this executable, for fat files. + MATCH_UUID: if not NULL, UUID that must match. + BASE_ADDRESS: the load address of the executable. 
+ SKIP_SYMTAB: if non-zero, ignore the symbol table; used for dSYM files. + FILELINE_FN: set to the fileline function, by backtrace_dwarf_add. + FOUND_SYM: set to non-zero if we found the symbol table. +*/ + +static int +macho_add (struct backtrace_state *state, const char *filename, int descriptor, + off_t offset, const unsigned char *match_uuid, + struct libbacktrace_base_address base_address, int skip_symtab, + backtrace_error_callback error_callback, void *data, + fileline *fileline_fn, int *found_sym) +{ + struct backtrace_view header_view; + struct macho_header_32 header; + off_t hdroffset; + int is_64; + struct backtrace_view cmds_view; + int cmds_view_valid; + struct dwarf_sections dwarf_sections; + int have_dwarf; + unsigned char uuid[MACH_O_UUID_LEN]; + int have_uuid; + size_t cmdoffset; + unsigned int i; + + *found_sym = 0; + + cmds_view_valid = 0; + + /* The 32-bit and 64-bit file headers start out the same, so we can + just always read the 32-bit version. A fat header is shorter but + it will always be followed by data, so it's OK to read extra. */ + + if (!backtrace_get_view (state, descriptor, offset, + sizeof (struct macho_header_32), + error_callback, data, &header_view)) + goto fail; + + memcpy (&header, header_view.data, sizeof header); + + backtrace_release_view (state, &header_view, error_callback, data); + + switch (header.magic) + { + case MACH_O_MH_MAGIC_32: + is_64 = 0; + hdroffset = offset + sizeof (struct macho_header_32); + break; + case MACH_O_MH_MAGIC_64: + is_64 = 1; + hdroffset = offset + sizeof (struct macho_header_64); + break; + case MACH_O_MH_MAGIC_FAT: + case MACH_O_MH_MAGIC_FAT_64: + { + struct macho_header_fat fat_header; + + hdroffset = offset + sizeof (struct macho_header_fat); + memcpy (&fat_header, &header, sizeof fat_header); + return macho_add_fat (state, filename, descriptor, 0, hdroffset, + match_uuid, base_address, skip_symtab, + fat_header.nfat_arch, + header.magic == MACH_O_MH_MAGIC_FAT_64, + error_callback, data, fileline_fn, found_sym); + } + case MACH_O_MH_CIGAM_FAT: + case MACH_O_MH_CIGAM_FAT_64: + { + struct macho_header_fat fat_header; + uint32_t nfat_arch; + + hdroffset = offset + sizeof (struct macho_header_fat); + memcpy (&fat_header, &header, sizeof fat_header); + nfat_arch = __builtin_bswap32 (fat_header.nfat_arch); + return macho_add_fat (state, filename, descriptor, 1, hdroffset, + match_uuid, base_address, skip_symtab, + nfat_arch, + header.magic == MACH_O_MH_CIGAM_FAT_64, + error_callback, data, fileline_fn, found_sym); + } + default: + error_callback (data, "executable file is not in Mach-O format", 0); + goto fail; + } + + switch (header.filetype) + { + case MACH_O_MH_EXECUTE: + case MACH_O_MH_DYLIB: + case MACH_O_MH_DSYM: + break; + default: + error_callback (data, "executable file is not an executable", 0); + goto fail; + } + + if (!backtrace_get_view (state, descriptor, hdroffset, header.sizeofcmds, + error_callback, data, &cmds_view)) + goto fail; + cmds_view_valid = 1; + + memset (&dwarf_sections, 0, sizeof dwarf_sections); + have_dwarf = 0; + memset (&uuid, 0, sizeof uuid); + have_uuid = 0; + + cmdoffset = 0; + for (i = 0; i < header.ncmds; ++i) + { + const char *pcmd; + struct macho_load_command load_command; + + if (cmdoffset + sizeof load_command > header.sizeofcmds) + break; + + pcmd = (const char *) cmds_view.data + cmdoffset; + memcpy (&load_command, pcmd, sizeof load_command); + + switch (load_command.cmd) + { + case MACH_O_LC_SEGMENT: + { + struct macho_segment_command segcmd; + + memcpy (&segcmd, pcmd, sizeof 
segcmd); + if (memcmp (segcmd.segname, + "__DWARF\0\0\0\0\0\0\0\0\0", + MACH_O_NAMELEN) == 0) + { + if (!macho_add_dwarf_segment (state, descriptor, offset, + load_command.cmd, + pcmd + sizeof segcmd, + (load_command.cmdsize + - sizeof segcmd), + segcmd.nsects, error_callback, + data, &dwarf_sections)) + goto fail; + have_dwarf = 1; + } + } + break; + + case MACH_O_LC_SEGMENT_64: + { + struct macho_segment_64_command segcmd; + + memcpy (&segcmd, pcmd, sizeof segcmd); + if (memcmp (segcmd.segname, + "__DWARF\0\0\0\0\0\0\0\0\0", + MACH_O_NAMELEN) == 0) + { + if (!macho_add_dwarf_segment (state, descriptor, offset, + load_command.cmd, + pcmd + sizeof segcmd, + (load_command.cmdsize + - sizeof segcmd), + segcmd.nsects, error_callback, + data, &dwarf_sections)) + goto fail; + have_dwarf = 1; + } + } + break; + + case MACH_O_LC_SYMTAB: + if (!skip_symtab) + { + struct macho_symtab_command symcmd; + + memcpy (&symcmd, pcmd, sizeof symcmd); + if (!macho_add_symtab (state, descriptor, base_address, is_64, + offset + symcmd.symoff, symcmd.nsyms, + offset + symcmd.stroff, symcmd.strsize, + error_callback, data)) + goto fail; + + *found_sym = 1; + } + break; + + case MACH_O_LC_UUID: + { + struct macho_uuid_command uuidcmd; + + memcpy (&uuidcmd, pcmd, sizeof uuidcmd); + memcpy (&uuid[0], &uuidcmd.uuid[0], MACH_O_UUID_LEN); + have_uuid = 1; + } + break; + + default: + break; + } + + cmdoffset += load_command.cmdsize; + } + + if (!backtrace_close (descriptor, error_callback, data)) + goto fail; + descriptor = -1; + + backtrace_release_view (state, &cmds_view, error_callback, data); + cmds_view_valid = 0; + + if (match_uuid != NULL) + { + /* If we don't have a UUID, or it doesn't match, just ignore + this file. */ + if (!have_uuid + || memcmp (match_uuid, &uuid[0], MACH_O_UUID_LEN) != 0) + return 1; + } + + if (have_dwarf) + { + int is_big_endian; + + is_big_endian = 0; +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + is_big_endian = 1; +#endif +#endif + + if (!backtrace_dwarf_add (state, base_address, &dwarf_sections, + is_big_endian, NULL, error_callback, data, + fileline_fn, NULL)) + goto fail; + } + + if (!have_dwarf && have_uuid) + { + if (!macho_add_dsym (state, filename, base_address, &uuid[0], + error_callback, data, fileline_fn)) + goto fail; + } + + return 1; + + fail: + if (cmds_view_valid) + backtrace_release_view (state, &cmds_view, error_callback, data); + if (descriptor != -1) + backtrace_close (descriptor, error_callback, data); + return 0; +} + +#ifdef HAVE_MACH_O_DYLD_H + +/* Initialize the backtrace data we need from a Mach-O executable + using the dyld support functions. This closes descriptor. 
*/ + +int +backtrace_initialize (struct backtrace_state *state, const char *filename, + int descriptor, backtrace_error_callback error_callback, + void *data, fileline *fileline_fn) +{ + uint32_t c; + uint32_t i; + int closed_descriptor; + int found_sym; + fileline macho_fileline_fn; + + closed_descriptor = 0; + found_sym = 0; + macho_fileline_fn = macho_nodebug; + + c = _dyld_image_count (); + for (i = 0; i < c; ++i) + { + struct libbacktrace_base_address base_address; + const char *name; + int d; + fileline mff; + int mfs; + + name = _dyld_get_image_name (i); + if (name == NULL) + continue; + + if (strcmp (name, filename) == 0 && !closed_descriptor) + { + d = descriptor; + closed_descriptor = 1; + } + else + { + int does_not_exist; + + d = backtrace_open (name, error_callback, data, &does_not_exist); + if (d < 0) + continue; + } + + base_address.m = _dyld_get_image_vmaddr_slide (i); + + mff = macho_nodebug; + if (!macho_add (state, name, d, 0, NULL, base_address, 0, + error_callback, data, &mff, &mfs)) + continue; + + if (mff != macho_nodebug) + macho_fileline_fn = mff; + if (mfs) + found_sym = 1; + } + + if (!closed_descriptor) + backtrace_close (descriptor, error_callback, data); + + if (!state->threaded) + { + if (found_sym) + state->syminfo_fn = macho_syminfo; + else if (state->syminfo_fn == NULL) + state->syminfo_fn = macho_nosyms; + } + else + { + if (found_sym) + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); + else + (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, + macho_nosyms); + } + + if (!state->threaded) + *fileline_fn = state->fileline_fn; + else + *fileline_fn = backtrace_atomic_load_pointer (&state->fileline_fn); + + if (*fileline_fn == NULL || *fileline_fn == macho_nodebug) + *fileline_fn = macho_fileline_fn; + + return 1; +} + +#else /* !defined (HAVE_MACH_O_DYLD_H) */ + +/* Initialize the backtrace data we need from a Mach-O executable + without using the dyld support functions. This closes + descriptor. */ + +int +backtrace_initialize (struct backtrace_state *state, const char *filename, + int descriptor, backtrace_error_callback error_callback, + void *data, fileline *fileline_fn) +{ + fileline macho_fileline_fn; + struct libbacktrace_base_address zero_base_address; + int found_sym; + + macho_fileline_fn = macho_nodebug; + memset (&zero_base_address, 0, sizeof zero_base_address); + if (!macho_add (state, filename, descriptor, 0, NULL, zero_base_address, 0, + error_callback, data, &macho_fileline_fn, &found_sym)) + return 0; + + if (!state->threaded) + { + if (found_sym) + state->syminfo_fn = macho_syminfo; + else if (state->syminfo_fn == NULL) + state->syminfo_fn = macho_nosyms; + } + else + { + if (found_sym) + backtrace_atomic_store_pointer (&state->syminfo_fn, &macho_syminfo); + else + (void) __sync_bool_compare_and_swap (&state->syminfo_fn, NULL, + macho_nosyms); + } + + if (!state->threaded) + *fileline_fn = state->fileline_fn; + else + *fileline_fn = backtrace_atomic_load_pointer (&state->fileline_fn); + + if (*fileline_fn == NULL || *fileline_fn == macho_nodebug) + *fileline_fn = macho_fileline_fn; + + return 1; +} + +#endif /* !defined (HAVE_MACH_O_DYLD_H) */ + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/mmapio.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/mmapio.cpp new file mode 100644 index 000000000..0e8f599bb --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/mmapio.cpp @@ -0,0 +1,115 @@ +/* mmapio.c -- File views using mmap. + Copyright (C) 2012-2021 Free Software Foundation, Inc. 
+ Written by Ian Lance Taylor, Google. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include +#include + +#include "backtrace.hpp" +#include "internal.hpp" + +#ifndef HAVE_DECL_GETPAGESIZE +extern int getpagesize (void); +#endif + +#ifndef MAP_FAILED +#define MAP_FAILED ((void *)-1) +#endif + +namespace tracy +{ + +/* This file implements file views and memory allocation when mmap is + available. */ + +/* Create a view of SIZE bytes from DESCRIPTOR at OFFSET. */ + +int +backtrace_get_view (struct backtrace_state *state ATTRIBUTE_UNUSED, + int descriptor, off_t offset, uint64_t size, + backtrace_error_callback error_callback, + void *data, struct backtrace_view *view) +{ + size_t pagesize; + unsigned int inpage; + off_t pageoff; + void *map; + + if ((uint64_t) (size_t) size != size) + { + error_callback (data, "file size too large", 0); + return 0; + } + + pagesize = getpagesize (); + inpage = offset % pagesize; + pageoff = offset - inpage; + + size += inpage; + size = (size + (pagesize - 1)) & ~ (pagesize - 1); + + map = mmap (NULL, size, PROT_READ, MAP_PRIVATE, descriptor, pageoff); + if (map == MAP_FAILED) + { + error_callback (data, "mmap", errno); + return 0; + } + + view->data = (char *) map + inpage; + view->base = map; + view->len = size; + + return 1; +} + +/* Release a view read by backtrace_get_view. */ + +void +backtrace_release_view (struct backtrace_state *state ATTRIBUTE_UNUSED, + struct backtrace_view *view, + backtrace_error_callback error_callback, + void *data) +{ + union { + const void *cv; + void *v; + } cc; + + cc.cv = view->base; + if (munmap (cc.v, view->len) < 0) + error_callback (data, "munmap", errno); +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/posix.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/posix.cpp new file mode 100644 index 000000000..8233a8ea3 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/posix.cpp @@ -0,0 +1,109 @@ +/* posix.c -- POSIX file I/O routines for the backtrace library. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include "backtrace.hpp" +#include "internal.hpp" + +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +#ifndef O_CLOEXEC +#define O_CLOEXEC 0 +#endif + +#ifndef FD_CLOEXEC +#define FD_CLOEXEC 1 +#endif + +namespace tracy +{ + +/* Open a file for reading. */ + +int +backtrace_open (const char *filename, backtrace_error_callback error_callback, + void *data, int *does_not_exist) +{ + int descriptor; + + if (does_not_exist != NULL) + *does_not_exist = 0; + + descriptor = open (filename, (int) (O_RDONLY | O_BINARY | O_CLOEXEC)); + if (descriptor < 0) + { + /* If DOES_NOT_EXIST is not NULL, then don't call ERROR_CALLBACK + if the file does not exist. We treat lacking permission to + open the file as the file not existing; this case arises when + running the libgo syscall package tests as root. */ + if (does_not_exist != NULL && (errno == ENOENT || errno == EACCES)) + *does_not_exist = 1; + else + error_callback (data, filename, errno); + return -1; + } + +#ifdef HAVE_FCNTL + /* Set FD_CLOEXEC just in case the kernel does not support + O_CLOEXEC. It doesn't matter if this fails for some reason. + FIXME: At some point it should be safe to only do this if + O_CLOEXEC == 0. */ + fcntl (descriptor, F_SETFD, FD_CLOEXEC); +#endif + + return descriptor; +} + +/* Close DESCRIPTOR. */ + +int +backtrace_close (int descriptor, backtrace_error_callback error_callback, + void *data) +{ + if (close (descriptor) < 0) + { + error_callback (data, "close", errno); + return 0; + } + return 1; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/sort.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/sort.cpp new file mode 100644 index 000000000..6daee0a64 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/sort.cpp @@ -0,0 +1,113 @@ +/* sort.c -- Sort without allocating memory + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include + +#include "backtrace.hpp" +#include "internal.hpp" + +namespace tracy +{ + +/* The GNU glibc version of qsort allocates memory, which we must not + do if we are invoked by a signal handler. So provide our own + sort. */ + +static void +swap (char *a, char *b, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++, a++, b++) + { + char t; + + t = *a; + *a = *b; + *b = t; + } +} + +void +backtrace_qsort (void *basearg, size_t count, size_t size, + int (*compar) (const void *, const void *)) +{ + char *base = (char *) basearg; + size_t i; + size_t mid; + + tail_recurse: + if (count < 2) + return; + + /* The symbol table and DWARF tables, which is all we use this + routine for, tend to be roughly sorted. Pick the middle element + in the array as our pivot point, so that we are more likely to + cut the array in half for each recursion step. */ + swap (base, base + (count / 2) * size, size); + + mid = 0; + for (i = 1; i < count; i++) + { + if ((*compar) (base, base + i * size) > 0) + { + ++mid; + if (i != mid) + swap (base + mid * size, base + i * size, size); + } + } + + if (mid > 0) + swap (base, base + mid * size, size); + + /* Recurse with the smaller array, loop with the larger one. That + ensures that our maximum stack depth is log count. */ + if (2 * mid < count) + { + backtrace_qsort (base, mid, size, compar); + base += (mid + 1) * size; + count -= mid + 1; + goto tail_recurse; + } + else + { + backtrace_qsort (base + (mid + 1) * size, count - (mid + 1), + size, compar); + count = mid; + goto tail_recurse; + } +} + +} diff --git a/project/thirdparty/tracy-0.11.1/libbacktrace/state.cpp b/project/thirdparty/tracy-0.11.1/libbacktrace/state.cpp new file mode 100644 index 000000000..ea3c137c5 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/libbacktrace/state.cpp @@ -0,0 +1,76 @@ +/* state.c -- Create the backtrace state. + Copyright (C) 2012-2021 Free Software Foundation, Inc. + Written by Ian Lance Taylor, Google. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + (1) Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + (2) Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + (3) The name of the author may not be used to + endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. */ + +#include "config.h" + +#include +#include + +#include "backtrace.hpp" +#include "internal.hpp" + +namespace tracy +{ + +/* Create the backtrace state. This will then be passed to all the + other routines. */ + +struct backtrace_state * +backtrace_create_state (const char *filename, int threaded, + backtrace_error_callback error_callback, + void *data) +{ + struct backtrace_state init_state; + struct backtrace_state *state; + +#ifndef HAVE_SYNC_FUNCTIONS + if (threaded) + { + error_callback (data, "backtrace library does not support threads", 0); + return NULL; + } +#endif + + memset (&init_state, 0, sizeof init_state); + init_state.filename = filename; + init_state.threaded = threaded; + + state = ((struct backtrace_state *) + backtrace_alloc (&init_state, sizeof *state, error_callback, data)); + if (state == NULL) + return NULL; + *state = init_state; + + return state; +} + +} diff --git a/project/thirdparty/tracy-0.11.1/tracy/Tracy.hpp b/project/thirdparty/tracy-0.11.1/tracy/Tracy.hpp new file mode 100644 index 000000000..e75d02ce9 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/Tracy.hpp @@ -0,0 +1,300 @@ +#ifndef __TRACY_HPP__ +#define __TRACY_HPP__ + +#include "../common/TracyColor.hpp" +#include "../common/TracySystem.hpp" + +#ifndef TracyFunction +# define TracyFunction __FUNCTION__ +#endif + +#ifndef TracyFile +# define TracyFile __FILE__ +#endif + +#ifndef TracyLine +# define TracyLine __LINE__ +#endif + +#ifndef TRACY_ENABLE + +#define TracyNoop + +#define ZoneNamed(x,y) +#define ZoneNamedN(x,y,z) +#define ZoneNamedC(x,y,z) +#define ZoneNamedNC(x,y,z,w) + +#define ZoneTransient(x,y) +#define ZoneTransientN(x,y,z) + +#define ZoneScoped +#define ZoneScopedN(x) +#define ZoneScopedC(x) +#define ZoneScopedNC(x,y) + +#define ZoneText(x,y) +#define ZoneTextV(x,y,z) +#define ZoneTextF(x,...) +#define ZoneTextVF(x,y,...) +#define ZoneName(x,y) +#define ZoneNameV(x,y,z) +#define ZoneNameF(x,...) +#define ZoneNameVF(x,y,...) 
+#define ZoneColor(x) +#define ZoneColorV(x,y) +#define ZoneValue(x) +#define ZoneValueV(x,y) +#define ZoneIsActive false +#define ZoneIsActiveV(x) false + +#define FrameMark +#define FrameMarkNamed(x) +#define FrameMarkStart(x) +#define FrameMarkEnd(x) + +#define FrameImage(x,y,z,w,a) + +#define TracyLockable( type, varname ) type varname +#define TracyLockableN( type, varname, desc ) type varname +#define TracySharedLockable( type, varname ) type varname +#define TracySharedLockableN( type, varname, desc ) type varname +#define LockableBase( type ) type +#define SharedLockableBase( type ) type +#define LockMark(x) (void)x +#define LockableName(x,y,z) + +#define TracyPlot(x,y) +#define TracyPlotConfig(x,y,z,w,a) + +#define TracyMessage(x,y) +#define TracyMessageL(x) +#define TracyMessageC(x,y,z) +#define TracyMessageLC(x,y) +#define TracyAppInfo(x,y) + +#define TracyAlloc(x,y) +#define TracyFree(x) +#define TracySecureAlloc(x,y) +#define TracySecureFree(x) + +#define TracyAllocN(x,y,z) +#define TracyFreeN(x,y) +#define TracySecureAllocN(x,y,z) +#define TracySecureFreeN(x,y) + +#define ZoneNamedS(x,y,z) +#define ZoneNamedNS(x,y,z,w) +#define ZoneNamedCS(x,y,z,w) +#define ZoneNamedNCS(x,y,z,w,a) + +#define ZoneTransientS(x,y,z) +#define ZoneTransientNS(x,y,z,w) + +#define ZoneScopedS(x) +#define ZoneScopedNS(x,y) +#define ZoneScopedCS(x,y) +#define ZoneScopedNCS(x,y,z) + +#define TracyAllocS(x,y,z) +#define TracyFreeS(x,y) +#define TracySecureAllocS(x,y,z) +#define TracySecureFreeS(x,y) + +#define TracyAllocNS(x,y,z,w) +#define TracyFreeNS(x,y,z) +#define TracySecureAllocNS(x,y,z,w) +#define TracySecureFreeNS(x,y,z) + +#define TracyMessageS(x,y,z) +#define TracyMessageLS(x,y) +#define TracyMessageCS(x,y,z,w) +#define TracyMessageLCS(x,y,z) + +#define TracySourceCallbackRegister(x,y) +#define TracyParameterRegister(x,y) +#define TracyParameterSetup(x,y,z,w) +#define TracyIsConnected false +#define TracyIsStarted false +#define TracySetProgramName(x) + +#define TracyFiberEnter(x) +#define TracyFiberEnterHint(x,y) +#define TracyFiberLeave + +#else + +#include + +#include "../client/TracyLock.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyScoped.hpp" + +#define TracyNoop tracy::ProfilerAvailable() + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) + +# define ZoneTransient( 
varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, TRACY_CALLSTACK, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ) +# define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, TRACY_CALLSTACK, active ) +#else +# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) +# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) +# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) +# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) + +# define ZoneTransient( varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, active ) +# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ) +# define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, active ) +#endif + +#define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) +#define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true ) +#define ZoneScopedC( color ) ZoneNamedC( ___tracy_scoped_zone, color, true ) +#define ZoneScopedNC( name, color ) ZoneNamedNC( ___tracy_scoped_zone, name, color, true ) + +#define ZoneText( txt, size ) ___tracy_scoped_zone.Text( txt, size ) +#define ZoneTextV( varname, txt, size ) varname.Text( txt, size ) +#define ZoneTextF( fmt, ... ) ___tracy_scoped_zone.TextFmt( fmt, ##__VA_ARGS__ ) +#define ZoneTextVF( varname, fmt, ... ) varname.TextFmt( fmt, ##__VA_ARGS__ ) +#define ZoneName( txt, size ) ___tracy_scoped_zone.Name( txt, size ) +#define ZoneNameV( varname, txt, size ) varname.Name( txt, size ) +#define ZoneNameF( fmt, ... ) ___tracy_scoped_zone.NameFmt( fmt, ##__VA_ARGS__ ) +#define ZoneNameVF( varname, fmt, ... 
) varname.NameFmt( fmt, ##__VA_ARGS__ )
+#define ZoneColor( color ) ___tracy_scoped_zone.Color( color )
+#define ZoneColorV( varname, color ) varname.Color( color )
+#define ZoneValue( value ) ___tracy_scoped_zone.Value( value )
+#define ZoneValueV( varname, value ) varname.Value( value )
+#define ZoneIsActive ___tracy_scoped_zone.IsActive()
+#define ZoneIsActiveV( varname ) varname.IsActive()
+
+#define FrameMark tracy::Profiler::SendFrameMark( nullptr )
+#define FrameMarkNamed( name ) tracy::Profiler::SendFrameMark( name )
+#define FrameMarkStart( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart )
+#define FrameMarkEnd( name ) tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd )
+
+#define FrameImage( image, width, height, offset, flip ) tracy::Profiler::SendFrameImage( image, width, height, offset, flip )
+
+#define TracyLockable( type, varname ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, TracyFile, TracyLine, 0 }; return &srcloc; }() }
+#define TracyLockableN( type, varname, desc ) tracy::Lockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() }
+#define TracySharedLockable( type, varname ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, #type " " #varname, TracyFile, TracyLine, 0 }; return &srcloc; }() }
+#define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable<type> varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() }
+#define LockableBase( type ) tracy::Lockable<type>
+#define SharedLockableBase( type ) tracy::SharedLockable<type>
+#define LockMark( varname ) static constexpr tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &__tracy_lock_location_##varname )
+#define LockableName( varname, txt, size ) varname.CustomName( txt, size )
+
+#define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val )
+#define TracyPlotConfig( name, type, step, fill, color ) tracy::Profiler::ConfigurePlot( name, type, step, fill, color )
+
+#define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size )
+
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK )
+# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK )
+# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK )
+# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK )
+
+# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false )
+# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false )
+# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true )
+# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true )
+
+# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name )
+# define TracyFreeN( ptr, name ) 
tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ) +# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) +#else +# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ) +# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ) +# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ) +# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ) + +# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ) +# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ) +# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ) +# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ) + +# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ) +# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ) +# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ) +# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ) +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) + +# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, depth, active ) +# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ) + +# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) +# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) +# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) +# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) + +# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) +# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) +# define 
TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) +# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) + +# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) +# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) +# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) + +# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ) +# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) +# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) +# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) +#else +# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) +# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) +# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active ) +# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active ) + +# define ZoneTransientS( varname, depth, active ) ZoneTransient( varname, active ) +# define ZoneTransientNS( varname, name, depth, active ) ZoneTransientN( varname, name, active ) + +# define ZoneScopedS( depth ) ZoneScoped +# define ZoneScopedNS( name, depth ) ZoneScopedN( name ) +# define ZoneScopedCS( color, depth ) ZoneScopedC( color ) +# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color ) + +# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size ) +# define TracyFreeS( ptr, depth ) TracyFree( ptr ) +# define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size ) +# define TracySecureFreeS( ptr, depth ) TracySecureFree( ptr ) + +# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name ) +# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name ) +# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name ) +# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name ) + +# define TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) +# define TracyMessageLS( txt, depth ) TracyMessageL( txt ) +# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color ) +# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) +#endif + +#define TracySourceCallbackRegister( cb, data ) tracy::Profiler::SourceCallbackRegister( cb, data ) +#define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) +#define TracyParameterSetup( idx, name, isBool, val ) tracy::Profiler::ParameterSetup( idx, name, isBool, val ) +#define TracyIsConnected tracy::GetProfiler().IsConnected() +#define TracySetProgramName( name ) tracy::GetProfiler().SetProgramName( name ); + +#ifdef TRACY_FIBERS +# define TracyFiberEnter( fiber ) tracy::Profiler::EnterFiber( fiber, 0 ) +# define TracyFiberEnterHint( fiber, groupHint ) tracy::Profiler::EnterFiber( fiber, groupHint ) +# define TracyFiberLeave tracy::Profiler::LeaveFiber() +#endif + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyC.h b/project/thirdparty/tracy-0.11.1/tracy/TracyC.h 
new file mode 100644 index 000000000..8b447beb5 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyC.h @@ -0,0 +1,417 @@ +#ifndef __TRACYC_HPP__ +#define __TRACYC_HPP__ + +#include +#include + +#include "../client/TracyCallstack.h" +#include "../common/TracyApi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum TracyPlotFormatEnum +{ + TracyPlotFormatNumber, + TracyPlotFormatMemory, + TracyPlotFormatPercentage, + TracyPlotFormatWatt +}; + +TRACY_API void ___tracy_set_thread_name( const char* name ); + +#define TracyCSetThreadName( name ) ___tracy_set_thread_name( name ); + +#ifndef TracyFunction +# define TracyFunction __FUNCTION__ +#endif + +#ifndef TracyFile +# define TracyFile __FILE__ +#endif + +#ifndef TracyLine +# define TracyLine __LINE__ +#endif + +#ifndef TRACY_ENABLE + +typedef const void* TracyCZoneCtx; + +typedef const void* TracyCLockCtx; + +#define TracyCZone(c,x) +#define TracyCZoneN(c,x,y) +#define TracyCZoneC(c,x,y) +#define TracyCZoneNC(c,x,y,z) +#define TracyCZoneEnd(c) +#define TracyCZoneText(c,x,y) +#define TracyCZoneName(c,x,y) +#define TracyCZoneColor(c,x) +#define TracyCZoneValue(c,x) + +#define TracyCAlloc(x,y) +#define TracyCFree(x) +#define TracyCSecureAlloc(x,y) +#define TracyCSecureFree(x) + +#define TracyCAllocN(x,y,z) +#define TracyCFreeN(x,y) +#define TracyCSecureAllocN(x,y,z) +#define TracyCSecureFreeN(x,y) + +#define TracyCFrameMark +#define TracyCFrameMarkNamed(x) +#define TracyCFrameMarkStart(x) +#define TracyCFrameMarkEnd(x) +#define TracyCFrameImage(x,y,z,w,a) + +#define TracyCPlot(x,y) +#define TracyCPlotF(x,y) +#define TracyCPlotI(x,y) +#define TracyCPlotConfig(x,y,z,w,a) + +#define TracyCMessage(x,y) +#define TracyCMessageL(x) +#define TracyCMessageC(x,y,z) +#define TracyCMessageLC(x,y) +#define TracyCAppInfo(x,y) + +#define TracyCZoneS(x,y,z) +#define TracyCZoneNS(x,y,z,w) +#define TracyCZoneCS(x,y,z,w) +#define TracyCZoneNCS(x,y,z,w,a) + +#define TracyCAllocS(x,y,z) +#define TracyCFreeS(x,y) +#define TracyCSecureAllocS(x,y,z) +#define TracyCSecureFreeS(x,y) + +#define TracyCAllocNS(x,y,z,w) +#define TracyCFreeNS(x,y,z) +#define TracyCSecureAllocNS(x,y,z,w) +#define TracyCSecureFreeNS(x,y,z) + +#define TracyCMessageS(x,y,z) +#define TracyCMessageLS(x,y) +#define TracyCMessageCS(x,y,z,w) +#define TracyCMessageLCS(x,y,z) + +#define TracyCLockCtx(l) +#define TracyCLockAnnounce(l) +#define TracyCLockTerminate(l) +#define TracyCLockBeforeLock(l) +#define TracyCLockAfterLock(l) +#define TracyCLockAfterUnlock(l) +#define TracyCLockAfterTryLock(l,x) +#define TracyCLockMark(l) +#define TracyCLockCustomName(l,x,y) + +#define TracyCIsConnected 0 +#define TracyCIsStarted 0 + +#ifdef TRACY_FIBERS +# define TracyCFiberEnter(fiber) +# define TracyCFiberLeave +#endif + +#else + +#ifndef TracyConcat +# define TracyConcat(x,y) TracyConcatIndirect(x,y) +#endif +#ifndef TracyConcatIndirect +# define TracyConcatIndirect(x,y) x##y +#endif + +struct ___tracy_source_location_data +{ + const char* name; + const char* function; + const char* file; + uint32_t line; + uint32_t color; +}; + +struct ___tracy_c_zone_context +{ + uint32_t id; + int active; +}; + +struct ___tracy_gpu_time_data +{ + int64_t gpuTime; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_begin_data { + uint64_t srcloc; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_begin_callstack_data { + uint64_t srcloc; + int depth; + uint16_t queryId; + uint8_t context; +}; + +struct ___tracy_gpu_zone_end_data { + uint16_t queryId; + uint8_t context; +}; + 
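(Editor's illustrative sketch, not part of the patch: the structs above and the zone macros defined further down in this header are what a C translation unit actually touches. The snippet assumes TRACY_ENABLE is defined, that the header is reachable as "tracy/TracyC.h", and that do_work is a placeholder function; it is not taken from the Tracy sources.)

#include "tracy/TracyC.h"

void do_work( void )
{
    /* TracyCZoneN creates a static struct ___tracy_source_location_data for this
       call site and begins a zone via ___tracy_emit_zone_begin (or its callstack
       variant); ctx receives the resulting ___tracy_c_zone_context handle. */
    TracyCZoneN( ctx, "do_work", 1 );

    /* ... the code being profiled ... */

    /* C zones are not scope-bound like the C++ ScopedZone, so they must be
       closed explicitly. */
    TracyCZoneEnd( ctx );
}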
+struct ___tracy_gpu_new_context_data { + int64_t gpuTime; + float period; + uint8_t context; + uint8_t flags; + uint8_t type; +}; + +struct ___tracy_gpu_context_name_data { + uint8_t context; + const char* name; + uint16_t len; +}; + +struct ___tracy_gpu_calibration_data { + int64_t gpuTime; + int64_t cpuDelta; + uint8_t context; +}; + +struct ___tracy_gpu_time_sync_data { + int64_t gpuTime; + uint8_t context; +}; + +struct __tracy_lockable_context_data; + +// Some containers don't support storing const types. +// This struct, as visible to user, is immutable, so treat it as if const was declared here. +typedef /*const*/ struct ___tracy_c_zone_context TracyCZoneCtx; + +typedef struct __tracy_lockable_context_data* TracyCLockCtx; + +#ifdef TRACY_MANUAL_LIFETIME +TRACY_API void ___tracy_startup_profiler(void); +TRACY_API void ___tracy_shutdown_profiler(void); +TRACY_API int ___tracy_profiler_started(void); + +# define TracyCIsStarted ___tracy_profiler_started() +#else +# define TracyCIsStarted 1 +#endif + +TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, uint32_t color ); +TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color ); + +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int depth, int active ); +TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ); +TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ); +TRACY_API void ___tracy_emit_zone_color( TracyCZoneCtx ctx, uint32_t color ); +TRACY_API void ___tracy_emit_zone_value( TracyCZoneCtx ctx, uint64_t value ); + +TRACY_API void ___tracy_emit_gpu_zone_begin( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_callstack( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_end( const struct ___tracy_gpu_zone_end_data data ); +TRACY_API void ___tracy_emit_gpu_time( const struct ___tracy_gpu_time_data ); +TRACY_API void ___tracy_emit_gpu_new_context( const struct ___tracy_gpu_new_context_data ); +TRACY_API void ___tracy_emit_gpu_context_name( const struct ___tracy_gpu_context_name_data ); +TRACY_API void ___tracy_emit_gpu_calibration( const struct ___tracy_gpu_calibration_data ); +TRACY_API void ___tracy_emit_gpu_time_sync( const struct ___tracy_gpu_time_sync_data ); + +TRACY_API void ___tracy_emit_gpu_zone_begin_serial( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_begin_alloc_serial( const struct ___tracy_gpu_zone_begin_data ); +TRACY_API void 
___tracy_emit_gpu_zone_begin_alloc_callstack_serial( const struct ___tracy_gpu_zone_begin_callstack_data ); +TRACY_API void ___tracy_emit_gpu_zone_end_serial( const struct ___tracy_gpu_zone_end_data data ); +TRACY_API void ___tracy_emit_gpu_time_serial( const struct ___tracy_gpu_time_data ); +TRACY_API void ___tracy_emit_gpu_new_context_serial( const struct ___tracy_gpu_new_context_data ); +TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_context_name_data ); +TRACY_API void ___tracy_emit_gpu_calibration_serial( const struct ___tracy_gpu_calibration_data ); +TRACY_API void ___tracy_emit_gpu_time_sync_serial( const struct ___tracy_gpu_time_sync_data ); + +TRACY_API int ___tracy_connected(void); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#else +# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); +# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); +# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); +# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); +#endif + +#define TracyCZoneEnd( ctx ) ___tracy_emit_zone_end( ctx ); + +#define TracyCZoneText( ctx, txt, size ) ___tracy_emit_zone_text( ctx, txt, size ); +#define TracyCZoneName( ctx, txt, size ) 
___tracy_emit_zone_name( ctx, txt, size ); +#define TracyCZoneColor( ctx, color ) ___tracy_emit_zone_color( ctx, color ); +#define TracyCZoneValue( ctx, value ) ___tracy_emit_zone_value( ctx, value ); + + +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int secure ); +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth, int secure ); +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int secure ); +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth, int secure ); +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int secure, const char* name ); +TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int depth, int secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int depth, int secure, const char* name ); + +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ); +TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ); +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ); +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ); + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 0 ) +# define TracyCFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 0 ) +# define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 1 ) +# define TracyCSecureFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 1 ) + +# define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 0, name ) +# define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 0, name ) +# define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 1, name ) +# define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 1, name ) + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); +# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); +#else +# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size, 0 ); +# define TracyCFree( ptr ) ___tracy_emit_memory_free( ptr, 0 ); +# define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size, 1 ); +# define TracyCSecureFree( ptr ) ___tracy_emit_memory_free( ptr, 1 ); + +# define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_named( ptr, size, 0, name ); +# define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_named( ptr, 0, name ); +# define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_named( ptr, size, 1, name ); +# define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_named( ptr, 1, name ); + +# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, 0 ); +# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, 0 ); +# define 
TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, 0 ); +# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, 0 ); +#endif + + +TRACY_API void ___tracy_emit_frame_mark( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_start( const char* name ); +TRACY_API void ___tracy_emit_frame_mark_end( const char* name ); +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ); + +#define TracyCFrameMark ___tracy_emit_frame_mark( 0 ); +#define TracyCFrameMarkNamed( name ) ___tracy_emit_frame_mark( name ); +#define TracyCFrameMarkStart( name ) ___tracy_emit_frame_mark_start( name ); +#define TracyCFrameMarkEnd( name ) ___tracy_emit_frame_mark_end( name ); +#define TracyCFrameImage( image, width, height, offset, flip ) ___tracy_emit_frame_image( image, width, height, offset, flip ); + + +TRACY_API void ___tracy_emit_plot( const char* name, double val ); +TRACY_API void ___tracy_emit_plot_float( const char* name, float val ); +TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ); +TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ); +TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); + +#define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); +#define TracyCPlotF( name, val ) ___tracy_emit_plot_float( name, val ); +#define TracyCPlotI( name, val ) ___tracy_emit_plot_int( name, val ); +#define TracyCPlotConfig( name, type, step, fill, color ) ___tracy_emit_plot_config( name, type, step, fill, color ); +#define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); + + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +# define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +# define TracyCZoneCS( ctx, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +# define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); + +# define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 0 ) +# define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 0 ) +# define TracyCSecureAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 1 ) +# define TracyCSecureFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 1 ) + +# define TracyCAllocNS( ptr, size, depth, name ) 
___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 0, name ) +# define TracyCFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 0, name ) +# define TracyCSecureAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 1, name ) +# define TracyCSecureFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 1, name ) + +# define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); +# define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); +# define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); +# define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); +#else +# define TracyCZoneS( ctx, depth, active ) TracyCZone( ctx, active ) +# define TracyCZoneNS( ctx, name, depth, active ) TracyCZoneN( ctx, name, active ) +# define TracyCZoneCS( ctx, color, depth, active ) TracyCZoneC( ctx, color, active ) +# define TracyCZoneNCS( ctx, name, color, depth, active ) TracyCZoneNC( ctx, name, color, active ) + +# define TracyCAllocS( ptr, size, depth ) TracyCAlloc( ptr, size ) +# define TracyCFreeS( ptr, depth ) TracyCFree( ptr ) +# define TracyCSecureAllocS( ptr, size, depth ) TracyCSecureAlloc( ptr, size ) +# define TracyCSecureFreeS( ptr, depth ) TracyCSecureFree( ptr ) + +# define TracyCAllocNS( ptr, size, depth, name ) TracyCAllocN( ptr, size, name ) +# define TracyCFreeNS( ptr, depth, name ) TracyCFreeN( ptr, name ) +# define TracyCSecureAllocNS( ptr, size, depth, name ) TracyCSecureAllocN( ptr, size, name ) +# define TracyCSecureFreeNS( ptr, depth, name ) TracyCSecureFreeN( ptr, name ) + +# define TracyCMessageS( txt, size, depth ) TracyCMessage( txt, size ) +# define TracyCMessageLS( txt, depth ) TracyCMessageL( txt ) +# define TracyCMessageCS( txt, size, color, depth ) TracyCMessageC( txt, size, color ) +# define TracyCMessageLCS( txt, color, depth ) TracyCMessageLC( txt, color ) +#endif + + +TRACY_API struct __tracy_lockable_context_data* ___tracy_announce_lockable_ctx( const struct ___tracy_source_location_data* srcloc ); +TRACY_API void ___tracy_terminate_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API int ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_unlock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int acquired ); +TRACY_API void ___tracy_mark_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const struct ___tracy_source_location_data* srcloc ); +TRACY_API void ___tracy_custom_name_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const char* name, size_t nameSz ); + +#define TracyCLockAnnounce( lock ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; lock = ___tracy_announce_lockable_ctx( &TracyConcat(__tracy_source_location,TracyLine) ); +#define TracyCLockTerminate( lock ) ___tracy_terminate_lockable_ctx( lock ); +#define TracyCLockBeforeLock( lock ) ___tracy_before_lock_lockable_ctx( lock ); +#define TracyCLockAfterLock( lock ) ___tracy_after_lock_lockable_ctx( lock ); +#define TracyCLockAfterUnlock( lock ) 
___tracy_after_unlock_lockable_ctx( lock ); +#define TracyCLockAfterTryLock( lock, acquired ) ___tracy_after_try_lock_lockable_ctx( lock, acquired ); +#define TracyCLockMark( lock ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; ___tracy_mark_lockable_ctx( lock, &TracyConcat(__tracy_source_location,TracyLine) ); +#define TracyCLockCustomName( lock, name, nameSz ) ___tracy_custom_name_lockable_ctx( lock, name, nameSz ); + +#define TracyCIsConnected ___tracy_connected() + +#ifdef TRACY_FIBERS +TRACY_API void ___tracy_fiber_enter( const char* fiber ); +TRACY_API void ___tracy_fiber_leave( void ); + +# define TracyCFiberEnter( fiber ) ___tracy_fiber_enter( fiber ); +# define TracyCFiberLeave ___tracy_fiber_leave(); +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyD3D11.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyD3D11.hpp new file mode 100644 index 000000000..3ed151bff --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyD3D11.hpp @@ -0,0 +1,446 @@ +#ifndef __TRACYD3D11_HPP__ +#define __TRACYD3D11_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyD3D11Context(device,queue) nullptr +#define TracyD3D11Destroy(ctx) +#define TracyD3D11ContextName(ctx, name, size) + +#define TracyD3D11NewFrame(ctx) + +#define TracyD3D11Zone(ctx, name) +#define TracyD3D11ZoneC(ctx, name, color) +#define TracyD3D11NamedZone(ctx, varname, name, active) +#define TracyD3D11NamedZoneC(ctx, varname, name, color, active) +#define TracyD3D11ZoneTransient(ctx, varname, name, active) + +#define TracyD3D11ZoneS(ctx, name, depth) +#define TracyD3D11ZoneCS(ctx, name, color, depth) +#define TracyD3D11NamedZoneS(ctx, varname, name, depth, active) +#define TracyD3D11NamedZoneCS(ctx, varname, name, color, depth, active) +#define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) + +#define TracyD3D11Collect(ctx) + +namespace tracy +{ +class D3D11ZoneScope {}; +} + +using TracyD3D11Ctx = void*; + +#else + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyYield.hpp" + +#include + +#define TracyD3D11Panic(msg, ...) 
do { assert(false && "TracyD3D11: " msg); TracyMessageLC("TracyD3D11: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false);
+
+namespace tracy
+{
+
+class D3D11Ctx
+{
+    friend class D3D11ZoneScope;
+
+    static constexpr uint32_t MaxQueries = 64 * 1024;
+
+    enum CollectMode { POLL, BLOCK };
+
+public:
+    D3D11Ctx( ID3D11Device* device, ID3D11DeviceContext* devicectx )
+    {
+        // TODO: consider calling ID3D11Device::GetImmediateContext() instead of passing it as an argument
+        m_device = device;
+        device->AddRef();
+        m_immediateDevCtx = devicectx;
+        devicectx->AddRef();
+
+        {
+            D3D11_QUERY_DESC desc = { };
+            desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
+            if (FAILED(m_device->CreateQuery(&desc, &m_disjointQuery)))
+            {
+                TracyD3D11Panic("unable to create disjoint timestamp query.", return);
+            }
+        }
+
+        for (ID3D11Query*& query : m_queries)
+        {
+            D3D11_QUERY_DESC desc = { };
+            desc.Query = D3D11_QUERY_TIMESTAMP;
+            if (FAILED(m_device->CreateQuery(&desc, &query)))
+            {
+                TracyD3D11Panic("unable to create timestamp query.", return);
+            }
+        }
+
+        // Calibrate CPU and GPU timestamps
+        int64_t tcpu = 0;
+        int64_t tgpu = 0;
+        for (int attempts = 0; attempts < 50; attempts++)
+        {
+            m_immediateDevCtx->Begin(m_disjointQuery);
+            m_immediateDevCtx->End(m_queries[0]);
+            m_immediateDevCtx->End(m_disjointQuery);
+
+            int64_t tcpu0 = Profiler::GetTime();
+            WaitForQuery(m_disjointQuery);
+            int64_t tcpu1 = Profiler::GetTime();
+
+            D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
+            if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), 0) != S_OK)
+            {
+                TracyMessageLC("TracyD3D11: unable to query GPU timestamp; retrying...", tracy::Color::Tomato);
+                continue;
+            }
+
+            if (disjoint.Disjoint)
+                continue;
+
+            UINT64 timestamp = 0;
+            if (m_immediateDevCtx->GetData(m_queries[0], &timestamp, sizeof(timestamp), 0) != S_OK)
+                continue; // this should never happen, since the enclosing disjoint query succeeded
+
+            tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2;
+            tgpu = timestamp * (1000000000 / disjoint.Frequency);
+            break;
+        }
+
+        // ready to roll
+        m_contextId = GetGpuCtxCounter().fetch_add(1);
+        m_immediateDevCtx->Begin(m_disjointQuery);
+        m_previousCheckpoint = m_nextCheckpoint = 0;
+
+        auto* item = Profiler::QueueSerial();
+        MemWrite( &item->hdr.type, QueueType::GpuNewContext );
+        MemWrite( &item->gpuNewContext.cpuTime, tcpu );
+        MemWrite( &item->gpuNewContext.gpuTime, tgpu );
+        MemWrite( &item->gpuNewContext.thread, uint32_t(0) ); // #TODO: why not GetThreadHandle()? 
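+        // period scales raw GPU timestamps on the server side; it is 1.0f here because the
+        // timestamps sent by this context are already converted to nanoseconds above.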
+        MemWrite( &item->gpuNewContext.period, 1.0f );
+        MemWrite( &item->gpuNewContext.context, m_contextId);
+        MemWrite( &item->gpuNewContext.flags, uint8_t(0) );
+        MemWrite( &item->gpuNewContext.type, GpuContextType::Direct3D11 );
+
+#ifdef TRACY_ON_DEMAND
+        GetProfiler().DeferItem( *item );
+#endif
+
+        Profiler::QueueSerialFinish();
+    }
+
+    ~D3D11Ctx()
+    {
+        // collect all pending timestamps before destroying everything
+        do
+        {
+            Collect(BLOCK);
+        } while (m_previousCheckpoint != m_queryCounter);
+
+        for (ID3D11Query* query : m_queries)
+        {
+            query->Release();
+        }
+        m_immediateDevCtx->End(m_disjointQuery);
+        m_disjointQuery->Release();
+        m_immediateDevCtx->Release();
+        m_device->Release();
+    }
+
+    void Name( const char* name, uint16_t len )
+    {
+        auto ptr = (char*)tracy_malloc( len );
+        memcpy( ptr, name, len );
+
+        auto item = Profiler::QueueSerial();
+        MemWrite( &item->hdr.type, QueueType::GpuContextName );
+        MemWrite( &item->gpuContextNameFat.context, m_contextId );
+        MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
+        MemWrite( &item->gpuContextNameFat.size, len );
+#ifdef TRACY_ON_DEMAND
+        GetProfiler().DeferItem( *item );
+#endif
+        Profiler::QueueSerialFinish();
+    }
+
+    void Collect(CollectMode mode = POLL)
+    {
+        ZoneScopedC( Color::Red4 );
+
+#ifdef TRACY_ON_DEMAND
+        if( !GetProfiler().IsConnected() )
+        {
+            m_previousCheckpoint = m_nextCheckpoint = m_queryCounter;
+            return;
+        }
+#endif
+
+        if (m_previousCheckpoint == m_nextCheckpoint)
+        {
+            uintptr_t nextCheckpoint = m_queryCounter;
+            if (nextCheckpoint == m_nextCheckpoint)
+            {
+                return;
+            }
+            m_nextCheckpoint = nextCheckpoint;
+            m_immediateDevCtx->End(m_disjointQuery);
+        }
+
+        if (mode == CollectMode::BLOCK)
+        {
+            WaitForQuery(m_disjointQuery);
+        }
+
+        D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { };
+        if (m_immediateDevCtx->GetData(m_disjointQuery, &disjoint, sizeof(disjoint), D3D11_ASYNC_GETDATA_DONOTFLUSH) != S_OK)
+        {
+            return;
+        }
+
+        if (disjoint.Disjoint == TRUE)
+        {
+            m_previousCheckpoint = m_nextCheckpoint;
+            TracyD3D11Panic("disjoint timestamps detected; dropping.");
+            return;
+        }
+
+        auto begin = m_previousCheckpoint;
+        auto end = m_nextCheckpoint;
+        for (auto i = begin; i != end; ++i)
+        {
+            uint32_t k = RingIndex(i);
+            UINT64 timestamp = 0;
+            if (m_immediateDevCtx->GetData(m_queries[k], &timestamp, sizeof(timestamp), 0) != S_OK)
+            {
+                TracyD3D11Panic("timestamp expected to be ready, but it was not!");
+                break;
+            }
+            timestamp *= (1000000000ull / disjoint.Frequency);
+            auto* item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuTime);
+            MemWrite(&item->gpuTime.gpuTime, static_cast<int64_t>(timestamp));
+            MemWrite(&item->gpuTime.queryId, static_cast<uint16_t>(k));
+            MemWrite(&item->gpuTime.context, m_contextId);
+            Profiler::QueueSerialFinish();
+        }
+
+        // disjoint timestamp queries should only be invoked once per frame or less
+        // https://learn.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_query
+        m_immediateDevCtx->Begin(m_disjointQuery);
+        m_previousCheckpoint = m_nextCheckpoint;
+    }
+
+private:
+    tracy_force_inline uint32_t RingIndex(uintptr_t index)
+    {
+        index %= MaxQueries;
+        return static_cast<uint32_t>(index);
+    }
+
+    tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end)
+    {
+        // wrap-around safe: all unsigned
+        uintptr_t count = end - begin;
+        return static_cast<uint32_t>(count);
+    }
+
+    tracy_force_inline uint32_t NextQueryId()
+    {
+        auto id = m_queryCounter++;
+        if (RingCount(m_previousCheckpoint, id) >= MaxQueries)
+        {
+            TracyD3D11Panic("too many pending timestamp queries.");
+            // #TODO: return 
some sentinel value; ideally a "hidden" query index + } + return RingIndex(id); + } + + tracy_force_inline ID3D11Query* GetQueryObjectFromId(uint32_t id) + { + return m_queries[id]; + } + + tracy_force_inline void WaitForQuery(ID3D11Query* query) + { + m_immediateDevCtx->Flush(); + while (m_immediateDevCtx->GetData(query, nullptr, 0, 0) != S_OK) + YieldThread(); // busy-wait :-( attempt to reduce power usage with _mm_pause() & friends... + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + ID3D11Device* m_device = nullptr; + ID3D11DeviceContext* m_immediateDevCtx = nullptr; + + ID3D11Query* m_queries[MaxQueries]; + ID3D11Query* m_disjointQuery = nullptr; + + uint8_t m_contextId = 255; // NOTE: apparently, 255 means invalid id; is this documented anywhere? + + uintptr_t m_queryCounter = 0; + + uintptr_t m_previousCheckpoint = 0; + uintptr_t m_nextCheckpoint = 0; +}; + +class D3D11ZoneScope +{ +public: + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, bool active ) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcloc)); + } + + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active ) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcloc)); + } + + tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + + tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active) + : D3D11ZoneScope(ctx, active) + { + if( !m_active ) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + + tracy_force_inline ~D3D11ZoneScope() + { + if( !m_active ) return; + + const auto queryId = m_ctx->NextQueryId(); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + +private: + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, bool active ) +#ifdef TRACY_ON_DEMAND + : m_active( active && GetProfiler().IsConnected() ) +#else + : m_active( active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + } + + void WriteQueueItem(tracy::QueueItem* item, tracy::QueueType queueItemType, uint64_t sourceLocation) 
+ { + const auto queryId = m_ctx->NextQueryId(); + m_ctx->m_immediateDevCtx->End(m_ctx->GetQueryObjectFromId(queryId)); + + MemWrite( &item->hdr.type, queueItemType); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, sourceLocation ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, m_ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + + D3D11Ctx* m_ctx; +}; + +static inline D3D11Ctx* CreateD3D11Context( ID3D11Device* device, ID3D11DeviceContext* devicectx ) +{ + auto ctx = (D3D11Ctx*)tracy_malloc( sizeof( D3D11Ctx ) ); + new(ctx) D3D11Ctx( device, devicectx ); + return ctx; +} + +static inline void DestroyD3D11Context( D3D11Ctx* ctx ) +{ + ctx->~D3D11Ctx(); + tracy_free( ctx ); +} +} + +#undef TracyD3D11Panic + +using TracyD3D11Ctx = tracy::D3D11Ctx*; + +#define TracyD3D11Context( device, devicectx ) tracy::CreateD3D11Context( device, devicectx ); +#define TracyD3D11Destroy(ctx) tracy::DestroyD3D11Context(ctx); +#define TracyD3D11ContextName(ctx, name, size) ctx->Name(name, size); + +#define TracyD3D11UnnamedZone ___tracy_gpu_d3d11_zone +#define TracyD3D11SrcLocSymbol TracyConcat(__tracy_gpu_d3d11_source_location,TracyLine) +#define TracyD3D11SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D11SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, TRACY_CALLSTACK, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, TRACY_CALLSTACK, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, TRACY_CALLSTACK, active ); +# define TracyD3D11ZoneTransient(ctx, varname, name, active) TracyD3D11ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +#else +# define TracyD3D11Zone( ctx, name ) TracyD3D11NamedZone( ctx, TracyD3D11UnnamedZone, name, true ) +# define TracyD3D11ZoneC( ctx, name, color ) TracyD3D11NamedZoneC( ctx, TracyD3D11UnnamedZone, name, color, true ) +# define TracyD3D11NamedZone( ctx, varname, name, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); +# define TracyD3D11NamedZoneC( ctx, varname, name, color, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, active ); +# define TracyD3D11ZoneTransient(ctx, varname, name, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), active }; +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyD3D11ZoneS( ctx, name, depth ) TracyD3D11NamedZoneS( ctx, TracyD3D11UnnamedZone, name, depth, true ) +# define TracyD3D11ZoneCS( ctx, name, color, depth ) TracyD3D11NamedZoneCS( ctx, TracyD3D11UnnamedZone, name, color, depth, true ) +# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11SrcLocObject(name, 0); tracy::D3D11ZoneScope varname( ctx, 
&TracyD3D11SrcLocSymbol, depth, active ); +# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11SrcLocObject(name, color); tracy::D3D11ZoneScope varname( ctx, &TracyD3D11SrcLocSymbol, depth, active ); +# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) tracy::D3D11ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), depth, active }; +#else +# define TracyD3D11ZoneS( ctx, name, depth, active ) TracyD3D11Zone( ctx, name ) +# define TracyD3D11ZoneCS( ctx, name, color, depth, active ) TracyD3D11ZoneC( name, color ) +# define TracyD3D11NamedZoneS( ctx, varname, name, depth, active ) TracyD3D11NamedZone( ctx, varname, name, active ) +# define TracyD3D11NamedZoneCS( ctx, varname, name, color, depth, active ) TracyD3D11NamedZoneC( ctx, varname, name, color, active ) +# define TracyD3D11ZoneTransientS(ctx, varname, name, depth, active) TracyD3D11ZoneTransient(ctx, varname, name, active) +#endif + +#define TracyD3D11Collect( ctx ) ctx->Collect(); + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyD3D12.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyD3D12.hpp new file mode 100644 index 000000000..41567937e --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyD3D12.hpp @@ -0,0 +1,500 @@ +#ifndef __TRACYD3D12_HPP__ +#define __TRACYD3D12_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyD3D12Context(device, queue) nullptr +#define TracyD3D12Destroy(ctx) +#define TracyD3D12ContextName(ctx, name, size) + +#define TracyD3D12NewFrame(ctx) + +#define TracyD3D12Zone(ctx, cmdList, name) +#define TracyD3D12ZoneC(ctx, cmdList, name, color) +#define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) +#define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) +#define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) + +#define TracyD3D12ZoneS(ctx, cmdList, name, depth) +#define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) +#define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) +#define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) +#define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) + +#define TracyD3D12Collect(ctx) + +namespace tracy +{ + class D3D12ZoneScope {}; +} + +using TracyD3D12Ctx = void*; + +#else + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" + +#include +#include +#include +#include +#include + +#define TracyD3D12Panic(msg, ...) do { assert(false && "TracyD3D12: " msg); TracyMessageLC("TracyD3D12: " msg, tracy::Color::Red4); __VA_ARGS__; } while(false); + +namespace tracy +{ + + struct D3D12QueryPayload + { + uint32_t m_queryIdStart = 0; + uint32_t m_queryCount = 0; + }; + + // Command queue context. + class D3D12QueueCtx + { + friend class D3D12ZoneScope; + + ID3D12Device* m_device = nullptr; + ID3D12CommandQueue* m_queue = nullptr; + uint8_t m_contextId = 255; // TODO: apparently, 255 means "invalid id"; is this documented somewhere? + ID3D12QueryHeap* m_queryHeap = nullptr; + ID3D12Resource* m_readbackBuffer = nullptr; + + // In-progress payload. 
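+        // Query slots are handed out in (begin, end) pairs from a ring of m_queryLimit
+        // entries. m_queryCounter counts the slots used since the last NewFrame();
+        // NewFrame() packages that range into a D3D12QueryPayload and signals
+        // m_payloadFence, so Collect() can tell which timestamps are ready to read back.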
+ uint32_t m_queryLimit = 0; + std::atomic m_queryCounter = 0; + uint32_t m_previousQueryCounter = 0; + + uint32_t m_activePayload = 0; + ID3D12Fence* m_payloadFence = nullptr; + std::queue m_payloadQueue; + + UINT64 m_prevCalibrationTicksCPU = 0; + + void RecalibrateClocks() + { + UINT64 cpuTimestamp; + UINT64 gpuTimestamp; + if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("failed to obtain queue clock calibration counters.", return); + } + + int64_t cpuDeltaTicks = cpuTimestamp - m_prevCalibrationTicksCPU; + if (cpuDeltaTicks > 0) + { + static const int64_t nanosecodsPerTick = int64_t(1000000000) / GetFrequencyQpc(); + int64_t cpuDeltaNS = cpuDeltaTicks * nanosecodsPerTick; + // Save the device cpu timestamp, not the Tracy profiler timestamp: + m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuCalibration); + MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp); + MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp); + MemWrite(&item->gpuCalibration.cpuDelta, cpuDeltaNS); + MemWrite(&item->gpuCalibration.context, GetId()); + SubmitQueueItem(item); + } + } + + tracy_force_inline void SubmitQueueItem(tracy::QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + public: + D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) + : m_device(device) + , m_queue(queue) + { + // Verify we support timestamp queries on this queue. + + if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) + { + D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; + + HRESULT hr = device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)); + if (FAILED(hr) || (featureData.CopyQueueTimestampQueriesSupported == FALSE)) + { + TracyD3D12Panic("Platform does not support profiling of copy queues.", return); + } + } + + static constexpr uint32_t MaxQueries = 64 * 1024; // Must be even, because queries are (begin, end) pairs + m_queryLimit = MaxQueries; + + D3D12_QUERY_HEAP_DESC heapDesc{}; + heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + heapDesc.Count = m_queryLimit; + heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. + + while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) + { + m_queryLimit /= 2; + heapDesc.Count = m_queryLimit; + } + + // Create a readback buffer, which will be used as a destination for the query data. + + D3D12_RESOURCE_DESC readbackBufferDesc{}; + readbackBufferDesc.Alignment = 0; + readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); + readbackBufferDesc.Height = 1; + readbackBufferDesc.DepthOrArraySize = 1; + readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; + readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. 
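+            // This buffer receives the resolved timestamps (one uint64_t per query slot)
+            // from ResolveQueryData() and is mapped on the CPU in Collect().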
+ readbackBufferDesc.MipLevels = 1; + readbackBufferDesc.SampleDesc.Count = 1; + readbackBufferDesc.SampleDesc.Quality = 0; + readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + + D3D12_HEAP_PROPERTIES readbackHeapProps{}; + readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; + readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; + readbackHeapProps.CreationNodeMask = 0; + readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. + + if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) + { + TracyD3D12Panic("Failed to create query readback buffer.", return); + } + + if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_payloadFence)))) + { + TracyD3D12Panic("Failed to create payload fence.", return); + } + + float period = [queue]() + { + uint64_t timestampFrequency; + if (FAILED(queue->GetTimestampFrequency(×tampFrequency))) + { + return 0.0f; + } + return static_cast( 1E+09 / static_cast(timestampFrequency) ); + }(); + + if (period == 0.0f) + { + TracyD3D12Panic("Failed to get timestamp frequency.", return); + } + + uint64_t cpuTimestamp; + uint64_t gpuTimestamp; + if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) + { + TracyD3D12Panic("Failed to get queue clock calibration.", return); + } + + // Save the device cpu timestamp, not the profiler's timestamp. + m_prevCalibrationTicksCPU = cpuTimestamp; + + cpuTimestamp = Profiler::GetTime(); + + // all checked: ready to roll + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); + MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); + MemWrite(&item->gpuNewContext.thread, decltype(item->gpuNewContext.thread)(0)); // #TODO: why 0 instead of GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, GetId()); + MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); + SubmitQueueItem(item); + } + + ~D3D12QueueCtx() + { + ZoneScopedC(Color::Red4); + // collect all pending timestamps + while (m_payloadFence->GetCompletedValue() != m_activePayload) + /* busy-wait ... 
*/; + Collect(); + m_payloadFence->Release(); + m_readbackBuffer->Release(); + m_queryHeap->Release(); + } + + + void NewFrame() + { + uint32_t queryCounter = m_queryCounter.exchange(0); + m_payloadQueue.emplace(D3D12QueryPayload{ m_previousQueryCounter, queryCounter }); + m_previousQueryCounter += queryCounter; + + if (m_previousQueryCounter >= m_queryLimit) + { + m_previousQueryCounter -= m_queryLimit; + } + + m_queue->Signal(m_payloadFence, ++m_activePayload); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, GetId()); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_queryCounter = 0; + + return; + } +#endif + + // Find out what payloads are available. + const auto newestReadyPayload = m_payloadFence->GetCompletedValue(); + const auto payloadCount = m_payloadQueue.size() - (m_activePayload - newestReadyPayload); + + if (!payloadCount) + { + return; // No payloads are available yet, exit out. + } + + D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; + + // Map the readback buffer so we can fetch the query data from the GPU. + void* readbackBufferMapping = nullptr; + + if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) + { + TracyD3D12Panic("Failed to map readback buffer.", return); + } + + auto* timestampData = static_cast(readbackBufferMapping); + + for (uint32_t i = 0; i < payloadCount; ++i) + { + const auto& payload = m_payloadQueue.front(); + + for (uint32_t j = 0; j < payload.m_queryCount; ++j) + { + const auto counter = (payload.m_queryIdStart + j) % m_queryLimit; + const auto timestamp = timestampData[counter]; + const auto queryId = counter; + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, timestamp); + MemWrite(&item->gpuTime.queryId, static_cast(queryId)); + MemWrite(&item->gpuTime.context, GetId()); + + Profiler::QueueSerialFinish(); + } + + m_payloadQueue.pop(); + } + + m_readbackBuffer->Unmap(0, nullptr); + + // Recalibrate to account for drift. + RecalibrateClocks(); + } + + private: + tracy_force_inline uint32_t NextQueryId() + { + uint32_t queryCounter = m_queryCounter.fetch_add(2); + if (queryCounter >= m_queryLimit) + { + TracyD3D12Panic("Submitted too many GPU queries! Consider increasing MaxQueries."); + // #TODO: consider returning an invalid id or sentinel value here + } + + const uint32_t id = (m_previousQueryCounter + queryCounter) % m_queryLimit; + + return id; + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + }; + + class D3D12ZoneScope + { + const bool m_active; + D3D12QueueCtx* m_ctx = nullptr; + ID3D12GraphicsCommandList* m_cmdList = nullptr; + uint32_t m_queryId = 0; // Used for tracking in nested zones. 
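+        // Note: m_queryId is the begin-timestamp slot; NextQueryId() reserves two
+        // consecutive slots, and the destructor writes the end timestamp into
+        // m_queryId + 1 before resolving both slots into the readback buffer.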
+ + tracy_force_inline void WriteQueueItem(QueueItem* item, QueueType type, uint64_t srcLocation) + { + MemWrite(&item->hdr.type, type); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcLocation); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, static_cast(m_queryId)); + MemWrite(&item->gpuZoneBegin.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, bool active) +#ifdef TRACY_ON_DEMAND + : m_active(active&& GetProfiler().IsConnected()) +#else + : m_active(active) +#endif + { + if (!m_active) return; + + m_ctx = ctx; + m_cmdList = cmdList; + + m_queryId = m_ctx->NextQueryId(); + m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); + } + + public: + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcLocation)); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } + + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active) + : D3D12ZoneScope(ctx, cmdList, active) + { + if (!m_active) return; + + const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); + + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + + tracy_force_inline ~D3D12ZoneScope() + { + if (!m_active) return; + + const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. 
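+            // The end timestamp lands in that slot; both slots are then resolved into
+            // the readback buffer at byte offset m_queryId * sizeof(uint64_t) below.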
+ m_cmdList->EndQuery(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, queryId); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, static_cast(queryId)); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + + m_cmdList->ResolveQueryData(m_ctx->m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer, m_queryId * sizeof(uint64_t)); + } + }; + + static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) + { + auto* ctx = static_cast(tracy_malloc(sizeof(D3D12QueueCtx))); + new (ctx) D3D12QueueCtx{ device, queue }; + + return ctx; + } + + static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) + { + ctx->~D3D12QueueCtx(); + tracy_free(ctx); + } + +} + +#undef TracyD3D12Panic + +using TracyD3D12Ctx = tracy::D3D12QueueCtx*; + +#define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); +#define TracyD3D12Destroy(ctx) tracy::DestroyD3D12Context(ctx); +#define TracyD3D12ContextName(ctx, name, size) ctx->Name(name, size); + +#define TracyD3D12NewFrame(ctx) ctx->NewFrame(); + +#define TracyD3D12UnnamedZone ___tracy_gpu_d3d12_zone +#define TracyD3D12SrcLocSymbol TracyConcat(__tracy_d3d12_source_location,TracyLine) +#define TracyD3D12SrcLocObject(name, color) static constexpr tracy::SourceLocationData TracyD3D12SrcLocSymbol { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; + +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, TRACY_CALLSTACK, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, TRACY_CALLSTACK, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, TRACY_CALLSTACK, active }; +# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, TRACY_CALLSTACK, active) +#else +# define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, TracyD3D12UnnamedZone, cmdList, name, true) +# define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, TracyD3D12UnnamedZone, cmdList, name, color, true) +# define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; +# define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, active }; +# define TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, active }; +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12NamedZoneS(ctx, TracyD3D12UnnamedZone, cmdList, name, depth, true) +# define 
TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12NamedZoneCS(ctx, TracyD3D12UnnamedZone, cmdList, name, color, depth, true) +# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12SrcLocObject(name, 0); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; +# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12SrcLocObject(name, color); tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyD3D12SrcLocSymbol, depth, active }; +# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) tracy::D3D12ZoneScope varname{ ctx, TracyLine, TracyFile, strlen(TracyFile), TracyFunction, strlen(TracyFunction), name, strlen(name), cmdList, depth, active }; +#else +# define TracyD3D12ZoneS(ctx, cmdList, name, depth) TracyD3D12Zone(ctx, cmdList, name) +# define TracyD3D12ZoneCS(ctx, cmdList, name, color, depth) TracyD3D12Zone(ctx, cmdList, name, color) +# define TracyD3D12NamedZoneS(ctx, varname, cmdList, name, depth, active) TracyD3D12NamedZone(ctx, varname, cmdList, name, active) +# define TracyD3D12NamedZoneCS(ctx, varname, cmdList, name, color, depth, active) TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) +# define TracyD3D12ZoneTransientS(ctx, varname, cmdList, name, depth, active) TracyD3D12ZoneTransient(ctx, varname, cmdList, name, active) +#endif + +#define TracyD3D12Collect(ctx) ctx->Collect(); + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyLua.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyLua.hpp new file mode 100644 index 000000000..51dead51f --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyLua.hpp @@ -0,0 +1,446 @@ +#ifndef __TRACYLUA_HPP__ +#define __TRACYLUA_HPP__ + +// Include this file after you include lua headers. 
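+//
+// Illustrative usage sketch (not part of the upstream header; the lua_State setup is
+// assumed, only LuaRegister/LuaRemove below are real):
+//
+//   lua_State* L = luaL_newstate();
+//   tracy::LuaRegister( L );        // exposes the global `tracy` table to scripts
+//   ...
+//   // With TRACY_ENABLE undefined, tracy.* calls can instead be blanked out of the
+//   // script source before loading it:
+//   tracy::LuaRemove( scriptSource );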
+ +#ifndef TRACY_ENABLE + +#include + +namespace tracy +{ + +namespace detail +{ +static inline int noop( lua_State* L ) { return 0; } +} + +static inline void LuaRegister( lua_State* L ) +{ + lua_newtable( L ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBegin" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginN" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneBeginNS" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneEnd" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneText" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "ZoneName" ); + lua_pushcfunction( L, detail::noop ); + lua_setfield( L, -2, "Message" ); + lua_setglobal( L, "tracy" ); +} + +static inline char* FindEnd( char* ptr ) +{ + unsigned int cnt = 1; + while( cnt != 0 ) + { + if( *ptr == '(' ) cnt++; + else if( *ptr == ')' ) cnt--; + ptr++; + } + return ptr; +} + +static inline void LuaRemove( char* script ) +{ + while( *script ) + { + if( strncmp( script, "tracy.", 6 ) == 0 ) + { + if( strncmp( script + 6, "Zone", 4 ) == 0 ) + { + if( strncmp( script + 10, "End()", 5 ) == 0 ) + { + memset( script, ' ', 15 ); + script += 15; + } + else if( strncmp( script + 10, "Begin()", 7 ) == 0 ) + { + memset( script, ' ', 17 ); + script += 17; + } + else if( strncmp( script + 10, "Text(", 5 ) == 0 ) + { + auto end = FindEnd( script + 15 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "Name(", 5 ) == 0 ) + { + auto end = FindEnd( script + 15 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginN(", 7 ) == 0 ) + { + auto end = FindEnd( script + 17 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginS(", 7 ) == 0 ) + { + auto end = FindEnd( script + 17 ); + memset( script, ' ', end - script ); + script = end; + } + else if( strncmp( script + 10, "BeginNS(", 8 ) == 0 ) + { + auto end = FindEnd( script + 18 ); + memset( script, ' ', end - script ); + script = end; + } + else + { + script += 10; + } + } + else if( strncmp( script + 6, "Message(", 8 ) == 0 ) + { + auto end = FindEnd( script + 14 ); + memset( script, ' ', end - script ); + script = end; + } + else + { + script += 6; + } + } + else + { + script++; + } + } +} + +} + +#else + +#include +#include + +#include "../common/TracyColor.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyForceInline.hpp" +#include "../common/TracySystem.hpp" +#include "../client/TracyProfiler.hpp" + +namespace tracy +{ + +#ifdef TRACY_ON_DEMAND +TRACY_API LuaZoneState& GetLuaZoneState(); +#endif + +namespace detail +{ + +#ifdef TRACY_HAS_CALLSTACK +static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth ) +{ + assert( depth <= 64 ); + lua_Debug dbg[64]; + const char* func[64]; + uint32_t fsz[64]; + uint32_t ssz[64]; + + uint8_t cnt; + uint16_t spaceNeeded = sizeof( cnt ); + for( cnt=0; cnt::max)() ); + memcpy( dst, fsz+i, 2 ); dst += 2; + memcpy( dst, func[i], fsz[i] ); dst += fsz[i]; + assert( ssz[i] <= (std::numeric_limits::max)() ); + memcpy( dst, ssz+i, 2 ); dst += 2; + memcpy( dst, dbg[i].source, ssz[i] ), dst += ssz[i]; + } + assert( dst - ptr == spaceNeeded + 2 ); + + TracyQueuePrepare( QueueType::CallstackAlloc ); + MemWrite( &item->callstackAllocFat.ptr, (uint64_t)ptr ); + MemWrite( 
&item->callstackAllocFat.nativePtr, (uint64_t)Callstack( depth ) ); + TracyQueueCommit( callstackAllocFatThread ); +} + +static inline void LuaShortenSrc( char* dst, const char* src ) +{ + size_t l = std::min( (size_t)255, strlen( src ) ); + memcpy( dst, src, l ); + dst[l] = 0; +} + +static inline int LuaZoneBeginS( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + +#ifdef TRACY_CALLSTACK + const uint32_t depth = TRACY_CALLSTACK; +#else + const auto depth = uint32_t( lua_tointeger( L, 1 ) ); +#endif + SendLuaCallstack( L, depth ); + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? dbg.name : dbg.short_src ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + + return 0; +} + +static inline int LuaZoneBeginNS( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + +#ifdef TRACY_CALLSTACK + const uint32_t depth = TRACY_CALLSTACK; +#else + const auto depth = uint32_t( lua_tointeger( L, 2 ) ); +#endif + SendLuaCallstack( L, depth ); + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + size_t nsz; + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto name = lua_tolstring( L, 1, &nsz ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? dbg.name : dbg.short_src, name, nsz ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + + return 0; +} +#endif + +static inline int LuaZoneBegin( lua_State* L ) +{ +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK + return LuaZoneBeginS( L ); +#else +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? 
dbg.name : dbg.short_src ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + return 0; +#endif +} + +static inline int LuaZoneBeginN( lua_State* L ) +{ +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK + return LuaZoneBeginNS( L ); +#else +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if( zoneCnt != 0 && !GetLuaZoneState().active ) return 0; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if( !GetLuaZoneState().active ) return 0; +#endif + + lua_Debug dbg; + lua_getstack( L, 1, &dbg ); + lua_getinfo( L, "Snl", &dbg ); + size_t nsz; + char src[256]; + LuaShortenSrc( src, dbg.source ); + const auto name = lua_tolstring( L, 1, &nsz ); + const auto srcloc = Profiler::AllocSourceLocation( dbg.currentline, src, dbg.name ? dbg.name : dbg.short_src, name, nsz ); + + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + return 0; +#endif +} + +static inline int LuaZoneEnd( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + assert( GetLuaZoneState().counter != 0 ); + GetLuaZoneState().counter--; + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + TracyQueuePrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyQueueCommit( zoneEndThread ); + return 0; +} + +static inline int LuaZoneText( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::ZoneText ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommit( zoneTextFatThread ); + return 0; +} + +static inline int LuaZoneName( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetLuaZoneState().active ) return 0; + if( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return 0; + } +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::ZoneName ); + MemWrite( &item->zoneTextFat.text, (uint64_t)ptr ); + MemWrite( &item->zoneTextFat.size, (uint16_t)size ); + TracyQueueCommit( zoneTextFatThread ); + return 0; +} + +static inline int LuaMessage( lua_State* L ) +{ +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return 0; +#endif + + auto txt = lua_tostring( L, 1 ); + const auto size = strlen( txt ); + assert( size < (std::numeric_limits::max)() ); + + auto ptr = (char*)tracy_malloc( size ); + memcpy( ptr, txt, size ); + + TracyQueuePrepare( QueueType::Message ); + MemWrite( &item->messageFat.time, Profiler::GetTime() ); + MemWrite( &item->messageFat.text, (uint64_t)ptr ); + MemWrite( &item->messageFat.size, (uint16_t)size ); + TracyQueueCommit( messageFatThread ); + return 0; +} + +} + +static inline void LuaRegister( lua_State* L ) +{ + 
lua_newtable( L ); + lua_pushcfunction( L, detail::LuaZoneBegin ); + lua_setfield( L, -2, "ZoneBegin" ); + lua_pushcfunction( L, detail::LuaZoneBeginN ); + lua_setfield( L, -2, "ZoneBeginN" ); +#ifdef TRACY_HAS_CALLSTACK + lua_pushcfunction( L, detail::LuaZoneBeginS ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::LuaZoneBeginNS ); + lua_setfield( L, -2, "ZoneBeginNS" ); +#else + lua_pushcfunction( L, detail::LuaZoneBegin ); + lua_setfield( L, -2, "ZoneBeginS" ); + lua_pushcfunction( L, detail::LuaZoneBeginN ); + lua_setfield( L, -2, "ZoneBeginNS" ); +#endif + lua_pushcfunction( L, detail::LuaZoneEnd ); + lua_setfield( L, -2, "ZoneEnd" ); + lua_pushcfunction( L, detail::LuaZoneText ); + lua_setfield( L, -2, "ZoneText" ); + lua_pushcfunction( L, detail::LuaZoneName ); + lua_setfield( L, -2, "ZoneName" ); + lua_pushcfunction( L, detail::LuaMessage ); + lua_setfield( L, -2, "Message" ); + lua_setglobal( L, "tracy" ); +} + +static inline void LuaRemove( char* script ) {} + +} + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyOpenCL.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyOpenCL.hpp new file mode 100644 index 000000000..20d0a7cab --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyOpenCL.hpp @@ -0,0 +1,414 @@ +#ifndef __TRACYOPENCL_HPP__ +#define __TRACYOPENCL_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyCLContext(c, x) nullptr +#define TracyCLDestroy(c) +#define TracyCLContextName(c, x, y) + +#define TracyCLNamedZone(c, x, y, z) +#define TracyCLNamedZoneC(c, x, y, z, w) +#define TracyCLZone(c, x) +#define TracyCLZoneC(c, x, y) +#define TracyCLZoneTransient(c,x,y,z) + +#define TracyCLNamedZoneS(c, x, y, z, w) +#define TracyCLNamedZoneCS(c, x, y, z, w, v) +#define TracyCLZoneS(c, x, y) +#define TracyCLZoneCS(c, x, y, z) +#define TracyCLZoneTransientS(c,x,y,z,w) + +#define TracyCLNamedZoneSetEvent(x, e) +#define TracyCLZoneSetEvent(e) + +#define TracyCLCollect(c) + +namespace tracy +{ + class OpenCLCtxScope {}; +} + +using TracyCLCtx = void*; + +#else + +#include + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyCallstack.hpp" +#include "../client/TracyProfiler.hpp" +#include "../common/TracyAlloc.hpp" + +#define TRACY_CL_TO_STRING_INDIRECT(T) #T +#define TRACY_CL_TO_STRING(T) TRACY_CL_TO_STRING_INDIRECT(T) +#define TRACY_CL_ASSERT(p) if(!(p)) { \ + TracyMessageL( "TRACY_CL_ASSERT failed on " TracyFile ":" TRACY_CL_TO_STRING(TracyLine) ); \ + assert(false && "TRACY_CL_ASSERT failed"); \ +} +#define TRACY_CL_CHECK_ERROR(err) if(err != CL_SUCCESS) { \ + std::ostringstream oss; \ + oss << "TRACY_CL_CHECK_ERROR failed on " << TracyFile << ":" << TracyLine \ + << ": error code " << err; \ + auto msg = oss.str(); \ + TracyMessage(msg.data(), msg.size()); \ + assert(false && "TRACY_CL_CHECK_ERROR failed"); \ +} + +namespace tracy { + + enum class EventPhase : uint8_t + { + Begin, + End + }; + + struct EventInfo + { + cl_event event; + EventPhase phase; + }; + + class OpenCLCtx + { + public: + enum { QueryCount = 64 * 1024 }; + + OpenCLCtx(cl_context context, cl_device_id device) + : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) + , m_head(0) + , m_tail(0) + { + int64_t tcpu, tgpu; + TRACY_CL_ASSERT(m_contextId != 255); + + cl_int err = CL_SUCCESS; + cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + TRACY_CL_CHECK_ERROR(err) + uint32_t dummyValue = 42; + cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
sizeof(uint32_t), nullptr, &err); + TRACY_CL_CHECK_ERROR(err) + cl_event writeBufferEvent; + TRACY_CL_CHECK_ERROR(clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent)); + TRACY_CL_CHECK_ERROR(clWaitForEvents(1, &writeBufferEvent)); + + tcpu = Profiler::GetTime(); + + cl_int eventStatus; + TRACY_CL_CHECK_ERROR(clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr)); + TRACY_CL_ASSERT(eventStatus == CL_COMPLETE); + TRACY_CL_CHECK_ERROR(clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr)); + TRACY_CL_CHECK_ERROR(clReleaseEvent(writeBufferEvent)); + TRACY_CL_CHECK_ERROR(clReleaseMemObject(dummyBuffer)); + TRACY_CL_CHECK_ERROR(clReleaseCommandQueue(queue)); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, tcpu); + MemWrite(&item->gpuNewContext.gpuTime, tgpu); + memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); + MemWrite(&item->gpuNewContext.period, 1.0f); + MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL); + MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId); + MemWrite(&item->gpuNewContext.flags, (uint8_t)0); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, (uint8_t)m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect() + { + ZoneScopedC(Color::Red4); + + if (m_tail == m_head) return; + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + m_head = m_tail = 0; + } +#endif + + for (; m_tail != m_head; m_tail = (m_tail + 1) % QueryCount) + { + EventInfo eventInfo = GetQuery(m_tail); + cl_int eventStatus; + cl_int err = clGetEventInfo(eventInfo.event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr); + if (err != CL_SUCCESS) + { + std::ostringstream oss; + oss << "clGetEventInfo falied with error code " << err << ", on event " << eventInfo.event << ", skipping..."; + auto msg = oss.str(); + TracyMessage(msg.data(), msg.size()); + if (eventInfo.event == nullptr) { + TracyMessageL("A TracyCLZone must be paird with a TracyCLZoneSetEvent, check your code!"); + } + assert(false && "clGetEventInfo failed, maybe a TracyCLZone is not paired with TracyCLZoneSetEvent"); + continue; + } + if (eventStatus != CL_COMPLETE) return; + + cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin) + ? 
CL_PROFILING_COMMAND_START + : CL_PROFILING_COMMAND_END; + + cl_ulong eventTimeStamp = 0; + err = clGetEventProfilingInfo(eventInfo.event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr); + if (err == CL_PROFILING_INFO_NOT_AVAILABLE) + { + TracyMessageL("command queue is not created with CL_QUEUE_PROFILING_ENABLE flag, check your code!"); + assert(false && "command queue is not created with CL_QUEUE_PROFILING_ENABLE flag"); + } + else + TRACY_CL_CHECK_ERROR(err); + + TRACY_CL_ASSERT(eventTimeStamp != 0); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, (int64_t)eventTimeStamp); + MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + + if (eventInfo.phase == EventPhase::End) + { + // Done with the event, so release it + TRACY_CL_CHECK_ERROR(clReleaseEvent(eventInfo.event)); + } + } + } + + tracy_force_inline uint8_t GetId() const + { + return m_contextId; + } + + tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo) + { + const auto id = m_head; + m_head = (m_head + 1) % QueryCount; + TRACY_CL_ASSERT(m_head != m_tail); + m_query[id] = eventInfo; + return id; + } + + tracy_force_inline EventInfo& GetQuery(unsigned int id) + { + TRACY_CL_ASSERT(id < QueryCount); + return m_query[id]; + } + + private: + + unsigned int m_contextId; + + EventInfo m_query[QueryCount]; + unsigned int m_head; // index at which a new event should be inserted + unsigned int m_tail; // oldest event + + }; + + class OpenCLCtxScope { + public: + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active&& GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + GetProfiler().SendCallstack(depth); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active && 
GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcloc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active) +#ifdef TRACY_ON_DEMAND + : m_active(is_active && GetProfiler().IsConnected()) +#else + : m_active(is_active) +#endif + , m_ctx(ctx) + , m_event(nullptr) + { + if (!m_active) return; + + m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin }); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite(&item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial); + MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneBegin.srcloc, srcloc); + MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId); + MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline void SetEvent(cl_event event) + { + if (!m_active) return; + m_event = event; + TRACY_CL_CHECK_ERROR(clRetainEvent(m_event)); + m_ctx->GetQuery(m_beginQueryId).event = m_event; + } + + tracy_force_inline ~OpenCLCtxScope() + { + if (!m_active) return; + const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End }); + + auto item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); + MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); + MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); + MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId); + MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); + Profiler::QueueSerialFinish(); + } + + const bool m_active; + OpenCLCtx* m_ctx; + cl_event m_event; + unsigned int m_beginQueryId; + }; + + static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device) + { + auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx)); + new (ctx) OpenCLCtx(context, device); + return ctx; + } + + static inline void DestroyCLContext(OpenCLCtx* ctx) + { + ctx->~OpenCLCtx(); + tracy_free(ctx); + } + +} // namespace tracy + +using TracyCLCtx = tracy::OpenCLCtx*; + +#define TracyCLContext(ctx, device) tracy::CreateCLContext(ctx, device); +#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx); +#define TracyCLContextName(ctx, name, size) ctx->Name(name, size); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, 
&TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true) +# define TracyCLZoneTransient( ctx, varname, name, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ); +#else +# define TracyCLNamedZone(ctx, varname, name, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active); +# define TracyCLNamedZoneC(ctx, varname, name, color, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), active); +# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true) +# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true ) +# define TracyCLZoneTransient( ctx, varname, name, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ); +#endif + +#ifdef TRACY_HAS_CALLSTACK +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active); +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine){ name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active); +# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true) +# define TracyCLZoneTransientS( ctx, varname, name, depth, active ) tracy::OpenCLCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ); +#else +# define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active) +# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active) +# define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name) +# define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color) +# define TracyCLZoneTransientS( ctx, varname, name, depth, active ) TracyCLZoneTransient( ctx, varname, name, active ) +#endif + +#define 
TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event) +#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event) + +#define TracyCLCollect(ctx) ctx->Collect() + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyOpenGL.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyOpenGL.hpp new file mode 100644 index 000000000..3bdadccee --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyOpenGL.hpp @@ -0,0 +1,325 @@ +#ifndef __TRACYOPENGL_HPP__ +#define __TRACYOPENGL_HPP__ + +#if !defined TRACY_ENABLE || defined __APPLE__ + +#define TracyGpuContext +#define TracyGpuContextName(x,y) +#define TracyGpuNamedZone(x,y,z) +#define TracyGpuNamedZoneC(x,y,z,w) +#define TracyGpuZone(x) +#define TracyGpuZoneC(x,y) +#define TracyGpuZoneTransient(x,y,z) +#define TracyGpuCollect + +#define TracyGpuNamedZoneS(x,y,z,w) +#define TracyGpuNamedZoneCS(x,y,z,w,a) +#define TracyGpuZoneS(x,y) +#define TracyGpuZoneCS(x,y,z) +#define TracyGpuZoneTransientS(x,y,z,w) + +namespace tracy +{ +struct SourceLocationData; +class GpuCtxScope +{ +public: + GpuCtxScope( const SourceLocationData*, bool ) {} + GpuCtxScope( const SourceLocationData*, int, bool ) {} +}; +} + +#else + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +#if !defined GL_TIMESTAMP && defined GL_TIMESTAMP_EXT +# define GL_TIMESTAMP GL_TIMESTAMP_EXT +# define GL_QUERY_COUNTER_BITS GL_QUERY_COUNTER_BITS_EXT +# define glGetQueryObjectiv glGetQueryObjectivEXT +# define glGetQueryObjectui64v glGetQueryObjectui64vEXT +# define glQueryCounter glQueryCounterEXT +#endif + +#define TracyGpuContext tracy::GetGpuCtx().ptr = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::GetGpuCtx().ptr) tracy::GpuCtx; +#define TracyGpuContextName( name, size ) tracy::GetGpuCtx().ptr->Name( name, size ); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyGpuNamedZoneC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), TRACY_CALLSTACK, active ); +# define TracyGpuZone( name ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, TRACY_CALLSTACK, true ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, TRACY_CALLSTACK, true ) +# define TracyGpuZoneTransient( varname, name, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ); +#else +# define TracyGpuNamedZone( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyGpuNamedZoneC( varname, name, color, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +# define TracyGpuZone( name ) TracyGpuNamedZone( ___tracy_gpu_zone, name, true ) +# define TracyGpuZoneC( name, color ) TracyGpuNamedZoneC( ___tracy_gpu_zone, name, color, true ) +# define TracyGpuZoneTransient( varname, name, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ); +#endif +#define TracyGpuCollect tracy::GetGpuCtx().ptr->Collect(); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyGpuNamedZoneS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyGpuNamedZoneCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::GpuCtxScope varname( &TracyConcat(__tracy_gpu_source_location,TracyLine), depth, active ); +# define TracyGpuZoneS( name, depth ) TracyGpuNamedZoneS( ___tracy_gpu_zone, name, depth, true ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuNamedZoneCS( ___tracy_gpu_zone, name, color, depth, true ) +# define TracyGpuZoneTransientS( varname, name, depth, active ) tracy::GpuCtxScope varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ); +#else +# define TracyGpuNamedZoneS( varname, name, depth, active ) TracyGpuNamedZone( varname, name, active ) +# define TracyGpuNamedZoneCS( varname, name, color, depth, active ) TracyGpuNamedZoneC( varname, name, color, active ) +# define TracyGpuZoneS( name, depth ) TracyGpuZone( name ) +# define TracyGpuZoneCS( name, color, depth ) TracyGpuZoneC( name, color ) +# define TracyGpuZoneTransientS( varname, name, depth, active ) TracyGpuZoneTransient( varname, name, active ) +#endif + +namespace tracy +{ + +class GpuCtx +{ + friend class GpuCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: + GpuCtx() + : m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + { + assert( m_context != 255 ); + + glGenQueries( QueryCount, m_query ); + + int64_t tgpu; + glGetInteger64v( GL_TIMESTAMP, &tgpu ); + int64_t tcpu = Profiler::GetTime(); + + GLint bits; + glGetQueryiv( GL_TIMESTAMP, GL_QUERY_COUNTER_BITS, &bits ); + + const float period = 1.f; + const auto thread = GetThreadHandle(); + TracyLfqPrepare( QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + MemWrite( &item->gpuNewContext.thread, thread ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.flags, uint8_t( 0 ) ); + MemWrite( &item->gpuNewContext.type, GpuContextType::OpenGl ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + + TracyLfqCommit; + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + TracyLfqPrepare( QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_context ); + 
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + TracyLfqCommit; + } + + void Collect() + { + ZoneScopedC( Color::Red4 ); + + if( m_tail == m_head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + m_head = m_tail = 0; + return; + } +#endif + + while( m_tail != m_head ) + { + GLint available; + glGetQueryObjectiv( m_query[m_tail], GL_QUERY_RESULT_AVAILABLE, &available ); + if( !available ) return; + + uint64_t time; + glGetQueryObjectui64v( m_query[m_tail], GL_QUERY_RESULT, &time ); + + TracyLfqPrepare( QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, (int64_t)time ); + MemWrite( &item->gpuTime.queryId, (uint16_t)m_tail ); + MemWrite( &item->gpuTime.context, m_context ); + TracyLfqCommit; + + m_tail = ( m_tail + 1 ) % QueryCount; + } + } + +private: + tracy_force_inline unsigned int NextQueryId() + { + const auto id = m_head; + m_head = ( m_head + 1 ) % QueryCount; + assert( m_head != m_tail ); + return id; + } + + tracy_force_inline unsigned int TranslateOpenGlQueryId( unsigned int id ) + { + return m_query[id]; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + unsigned int m_query[QueryCount]; + uint8_t m_context; + + unsigned int m_head; + unsigned int m_tail; +}; + +class GpuCtxScope +{ +public: + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + TracyLfqPrepare( QueueType::GpuZoneBegin ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + +#ifdef TRACY_FIBERS + TracyLfqPrepare( QueueType::GpuZoneBegin ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); +#else + GetProfiler().SendCallstack( depth ); + TracyLfqPrepare( QueueType::GpuZoneBeginCallstack ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); +#endif + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = 
GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLoc ); + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + +#ifdef TRACY_FIBERS + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLoc ); + memset( &item->gpuZoneBegin.thread, 0, sizeof( item->gpuZoneBegin.thread ) ); +#else + GetProfiler().SendCallstack( depth ); + TracyLfqPrepare( QueueType::GpuZoneBeginAllocSrcLocCallstack ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); +#endif + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, GetGpuCtx().ptr->GetId() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + TracyLfqCommit; + } + + tracy_force_inline ~GpuCtxScope() + { + if( !m_active ) return; + + const auto queryId = GetGpuCtx().ptr->NextQueryId(); + glQueryCounter( GetGpuCtx().ptr->TranslateOpenGlQueryId( queryId ), GL_TIMESTAMP ); + + TracyLfqPrepare( QueueType::GpuZoneEnd ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + memset( &item->gpuZoneEnd.thread, 0, sizeof( item->gpuZoneEnd.thread ) ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, GetGpuCtx().ptr->GetId() ); + TracyLfqCommit; + } + +private: + const bool m_active; +}; + +} + +#endif + +#endif diff --git a/project/thirdparty/tracy-0.11.1/tracy/TracyVulkan.hpp b/project/thirdparty/tracy-0.11.1/tracy/TracyVulkan.hpp new file mode 100644 index 000000000..c34b71852 --- /dev/null +++ b/project/thirdparty/tracy-0.11.1/tracy/TracyVulkan.hpp @@ -0,0 +1,723 @@ +#ifndef __TRACYVULKAN_HPP__ +#define __TRACYVULKAN_HPP__ + +#if !defined TRACY_ENABLE + +#define TracyVkContext(x,y,z,w) nullptr +#define TracyVkContextCalibrated(x,y,z,w,a,b) nullptr +#if defined VK_EXT_host_query_reset +#define TracyVkContextHostCalibrated(x,y,z,w,a) nullptr +#endif +#define TracyVkDestroy(x) +#define TracyVkContextName(c,x,y) +#define TracyVkNamedZone(c,x,y,z,w) +#define TracyVkNamedZoneC(c,x,y,z,w,a) +#define TracyVkZone(c,x,y) +#define TracyVkZoneC(c,x,y,z) +#define TracyVkZoneTransient(c,x,y,z,w) +#define TracyVkCollect(c,x) + +#define TracyVkNamedZoneS(c,x,y,z,w,a) +#define TracyVkNamedZoneCS(c,x,y,z,w,v,a) +#define TracyVkZoneS(c,x,y,z) +#define TracyVkZoneCS(c,x,y,z,w) +#define TracyVkZoneTransientS(c,x,y,z,w,a) + +namespace tracy +{ +class VkCtxScope {}; +} + +using 
TracyVkCtx = void*; + +#else + +#if !defined VK_NULL_HANDLE +# error "You must include Vulkan headers before including TracyVulkan.hpp" +#endif + +#include +#include +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" + +#include + +namespace tracy +{ + +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define LoadVkDeviceCoreSymbols(Operation) \ + Operation(vkBeginCommandBuffer) \ + Operation(vkCmdResetQueryPool) \ + Operation(vkCmdWriteTimestamp) \ + Operation(vkCreateQueryPool) \ + Operation(vkDestroyQueryPool) \ + Operation(vkEndCommandBuffer) \ + Operation(vkGetQueryPoolResults) \ + Operation(vkQueueSubmit) \ + Operation(vkQueueWaitIdle) \ + Operation(vkResetQueryPool) + +#define LoadVkDeviceExtensionSymbols(Operation) \ + Operation(vkGetCalibratedTimestampsEXT) + +#define LoadVkInstanceExtensionSymbols(Operation) \ + Operation(vkGetPhysicalDeviceCalibrateableTimeDomainsEXT) + +#define LoadVkInstanceCoreSymbols(Operation) \ + Operation(vkGetPhysicalDeviceProperties) + +struct VkSymbolTable +{ +#define MAKE_PFN(name) PFN_##name name; + LoadVkDeviceCoreSymbols(MAKE_PFN) + LoadVkDeviceExtensionSymbols(MAKE_PFN) + LoadVkInstanceExtensionSymbols(MAKE_PFN) + LoadVkInstanceCoreSymbols(MAKE_PFN) +#undef MAKE_PFN +}; + +#define VK_FUNCTION_WRAPPER(callSignature) m_symbols.callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) m_ctx->m_symbols.callSignature +#else +#define VK_FUNCTION_WRAPPER(callSignature) callSignature +#define CONTEXT_VK_FUNCTION_WRAPPER(callSignature) callSignature +#endif + +class VkCtx +{ + friend class VkCtxScope; + + enum { QueryCount = 64 * 1024 }; + +public: +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr, bool calibrated ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT) +#endif + : m_device( device ) + , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) + , m_context( GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ) ) + , m_head( 0 ) + , m_tail( 0 ) + , m_oldCnt( 0 ) + , m_queryCount( QueryCount ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif + { + assert( m_context != 255 ); + +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + if ( calibrated ) + { + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; + } + +#endif + + if( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) && m_vkGetCalibratedTimestampsEXT ) + { + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); + } + + CreateQueryPool(); + + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &cmdbuf; + + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + VK_FUNCTION_WRAPPER( 
vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + + int64_t tcpu, tgpu; + if( m_timeDomain == VK_TIME_DOMAIN_DEVICE_EXT ) + { + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, m_query, 0 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + + tcpu = Profiler::GetTime(); + VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( device, m_query, 0, 1, sizeof( tgpu ), &tgpu, sizeof( tgpu ), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT ) ); + + VK_FUNCTION_WRAPPER( vkBeginCommandBuffer( cmdbuf, &beginInfo ) ); + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, 1 ) ); + VK_FUNCTION_WRAPPER( vkEndCommandBuffer( cmdbuf ) ); + VK_FUNCTION_WRAPPER( vkQueueSubmit( queue, 1, &submitInfo, VK_NULL_HANDLE ) ); + VK_FUNCTION_WRAPPER( vkQueueWaitIdle( queue ) ); + } + else + { + FindCalibratedTimestampDeviation(); + Calibrate( device, m_prevCalibration, tgpu ); + tcpu = Profiler::GetTime(); + } + + WriteInitialItem( physdev, tcpu, tgpu ); + + m_res = (int64_t*)tracy_malloc( sizeof( int64_t ) * m_queryCount ); + } + +#if defined VK_EXT_host_query_reset + /** + * This alternative constructor does not use command buffers and instead uses functionality from + * VK_EXT_host_query_reset (core with 1.2 and non-optional) and VK_EXT_calibrated_timestamps. This requires + * the physical device to have another time domain apart from DEVICE to be calibrateable. + */ +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkCtx( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) +#else + VkCtx( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT vkResetQueryPool, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT vkGetPhysicalDeviceCalibrateableTimeDomainsEXT, PFN_vkGetCalibratedTimestampsEXT vkGetCalibratedTimestampsEXT ) +#endif + : m_device( device ) + , m_timeDomain( VK_TIME_DOMAIN_DEVICE_EXT ) + , m_context( GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed) ) + , m_head( 0 ) + , m_tail( 0 ) + , m_oldCnt( 0 ) + , m_queryCount( QueryCount ) +#if !defined TRACY_VK_USE_SYMBOL_TABLE + , m_vkGetCalibratedTimestampsEXT( vkGetCalibratedTimestampsEXT ) +#endif + { + assert( m_context != 255); + +#if defined TRACY_VK_USE_SYMBOL_TABLE + PopulateSymbolTable(instance, instanceProcAddr, deviceProcAddr); + m_vkGetCalibratedTimestampsEXT = m_symbols.vkGetCalibratedTimestampsEXT; +#endif + + assert( VK_FUNCTION_WRAPPER( vkResetQueryPool ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) != nullptr ); + assert( VK_FUNCTION_WRAPPER( vkGetCalibratedTimestampsEXT ) != nullptr ); + + FindAvailableTimeDomains( physdev, VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) ); + + // We require a host time domain to be available to properly calibrate. 
+ FindCalibratedTimestampDeviation(); + int64_t tgpu; + Calibrate( device, m_prevCalibration, tgpu ); + int64_t tcpu = Profiler::GetTime(); + + CreateQueryPool(); + VK_FUNCTION_WRAPPER( vkResetQueryPool( device, m_query, 0, m_queryCount ) ); + + WriteInitialItem( physdev, tcpu, tgpu ); + + // We need the buffer to be twice as large for availability values + size_t resSize = sizeof( int64_t ) * m_queryCount * 2; + m_res = (int64_t*)tracy_malloc( resSize ); + } +#endif + + ~VkCtx() + { + tracy_free( m_res ); + VK_FUNCTION_WRAPPER( vkDestroyQueryPool( m_device, m_query, nullptr ) ); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_context ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + + void Collect( VkCommandBuffer cmdbuf ) + { + ZoneScopedC( Color::Red4 ); + + const uint64_t head = m_head.load(std::memory_order_relaxed); + if( m_tail == head ) return; + +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) + { + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + m_tail = head; + m_oldCnt = 0; + int64_t tgpu; + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) Calibrate( m_device, m_prevCalibration, tgpu ); + return; + } +#endif + assert( head > m_tail ); + + const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount ); + + unsigned int cnt; + if( m_oldCnt != 0 ) + { + cnt = m_oldCnt; + m_oldCnt = 0; + } + else + { + cnt = (unsigned int)( head - m_tail ); + assert( cnt <= m_queryCount ); + if( wrappedTail + cnt > m_queryCount ) + { + cnt = m_queryCount - wrappedTail; + } + } + + + VK_FUNCTION_WRAPPER( vkGetQueryPoolResults( m_device, m_query, wrappedTail, cnt, sizeof( int64_t ) * m_queryCount * 2, m_res, sizeof( int64_t ) * 2, VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT ) ); + + for( unsigned int idx=0; idxhdr.type, QueueType::GpuTime ); + MemWrite( &item->gpuTime.gpuTime, m_res[idx * 2] ); + MemWrite( &item->gpuTime.queryId, uint16_t( wrappedTail + idx ) ); + MemWrite( &item->gpuTime.context, m_context ); + Profiler::QueueSerialFinish(); + } + + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) + { + int64_t tgpu, tcpu; + Calibrate( m_device, tcpu, tgpu ); + const auto refCpu = Profiler::GetTime(); + const auto delta = tcpu - m_prevCalibration; + if( delta > 0 ) + { + m_prevCalibration = tcpu; + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuCalibration ); + MemWrite( &item->gpuCalibration.gpuTime, tgpu ); + MemWrite( &item->gpuCalibration.cpuTime, refCpu ); + MemWrite( &item->gpuCalibration.cpuDelta, delta ); + MemWrite( &item->gpuCalibration.context, m_context ); + Profiler::QueueSerialFinish(); + } + } + + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ); + + m_tail += cnt; + } + + tracy_force_inline unsigned int NextQueryId() + { + const uint64_t id = m_head.fetch_add(1, std::memory_order_relaxed); + return id % m_queryCount; + } + + tracy_force_inline uint8_t GetId() const + { + return m_context; + } + + tracy_force_inline VkQueryPool GetQueryPool() const + { + return m_query; + } + +private: + tracy_force_inline void Calibrate( VkDevice device, int64_t& tCpu, int64_t& 
tGpu ) + { + assert( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ); + VkCalibratedTimestampInfoEXT spec[2] = { + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT }, + { VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, m_timeDomain }, + }; + uint64_t ts[2]; + uint64_t deviation; + do + { + m_vkGetCalibratedTimestampsEXT( device, 2, spec, ts, &deviation ); + } + while( deviation > m_deviation ); + +#if defined _WIN32 + tGpu = ts[0]; + tCpu = ts[1] * m_qpcToNs; +#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW + tGpu = ts[0]; + tCpu = ts[1]; +#else + assert( false ); +#endif + } + + tracy_force_inline void CreateQueryPool() + { + VkQueryPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + poolInfo.queryCount = m_queryCount; + poolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP; + while ( VK_FUNCTION_WRAPPER( vkCreateQueryPool( m_device, &poolInfo, nullptr, &m_query ) != VK_SUCCESS ) ) + { + m_queryCount /= 2; + poolInfo.queryCount = m_queryCount; + } + } + + tracy_force_inline void FindAvailableTimeDomains( VkPhysicalDevice physicalDevice, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT ) + { + uint32_t num; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, nullptr ); + if(num > 4) num = 4; + VkTimeDomainEXT data[4]; + _vkGetPhysicalDeviceCalibrateableTimeDomainsEXT( physicalDevice, &num, data ); + VkTimeDomainEXT supportedDomain = (VkTimeDomainEXT)-1; +#if defined _WIN32 + supportedDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT; +#elif defined __linux__ && defined CLOCK_MONOTONIC_RAW + supportedDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT; +#endif + for( uint32_t i=0; i deviation[i] ) { + minDeviation = deviation[i]; + } + } + m_deviation = minDeviation * 3 / 2; + +#if defined _WIN32 + m_qpcToNs = int64_t( 1000000000. 
/ GetFrequencyQpc() ); +#endif + } + + tracy_force_inline void WriteInitialItem( VkPhysicalDevice physdev, int64_t tcpu, int64_t tgpu ) + { + uint8_t flags = 0; + if( m_timeDomain != VK_TIME_DOMAIN_DEVICE_EXT ) flags |= GpuContextCalibration; + + VkPhysicalDeviceProperties prop; + VK_FUNCTION_WRAPPER( vkGetPhysicalDeviceProperties( physdev, &prop ) ); + const float period = prop.limits.timestampPeriod; + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuNewContext ); + MemWrite( &item->gpuNewContext.cpuTime, tcpu ); + MemWrite( &item->gpuNewContext.gpuTime, tgpu ); + memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); + MemWrite( &item->gpuNewContext.period, period ); + MemWrite( &item->gpuNewContext.context, m_context ); + MemWrite( &item->gpuNewContext.flags, flags ); + MemWrite( &item->gpuNewContext.type, GpuContextType::Vulkan ); + +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem( *item ); +#endif + Profiler::QueueSerialFinish(); + } + +#if defined TRACY_VK_USE_SYMBOL_TABLE + void PopulateSymbolTable( VkInstance instance, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr deviceProcAddr ) + { +#define VK_GET_DEVICE_SYMBOL( name ) \ + (PFN_##name)deviceProcAddr( m_device, #name ); +#define VK_LOAD_DEVICE_SYMBOL( name ) \ + m_symbols.name = VK_GET_DEVICE_SYMBOL( name ); +#define VK_GET_INSTANCE_SYMBOL( name ) \ + (PFN_##name)instanceProcAddr( instance, #name ); +#define VK_LOAD_INSTANCE_SYMBOL( name ) \ + m_symbols.name = VK_GET_INSTANCE_SYMBOL( name ); + + LoadVkDeviceCoreSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkDeviceExtensionSymbols( VK_LOAD_DEVICE_SYMBOL ) + LoadVkInstanceExtensionSymbols( VK_LOAD_INSTANCE_SYMBOL ) + LoadVkInstanceCoreSymbols( VK_LOAD_INSTANCE_SYMBOL ) +#undef VK_GET_DEVICE_SYMBOL +#undef VK_LOAD_DEVICE_SYMBOL +#undef VK_GET_INSTANCE_SYMBOL +#undef VK_LOAD_INSTANCE_SYMBOL + } +#endif + + VkDevice m_device; + VkQueryPool m_query; + VkTimeDomainEXT m_timeDomain; +#if defined TRACY_VK_USE_SYMBOL_TABLE + VkSymbolTable m_symbols; +#endif + uint64_t m_deviation; +#ifdef _WIN32 + int64_t m_qpcToNs; +#endif + int64_t m_prevCalibration; + uint8_t m_context; + + std::atomic m_head; + uint64_t m_tail; + unsigned int m_oldCnt; + unsigned int m_queryCount; + + int64_t* m_res; + + PFN_vkGetCalibratedTimestampsEXT m_vkGetCalibratedTimestampsEXT; +}; + +class VkCtxScope +{ +public: + tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + 
: m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, int depth, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_cmdbuf = cmdbuf; + m_ctx = ctx; + + const auto queryId = ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); + + const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); + auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + + tracy_force_inline ~VkCtxScope() + { + if( !m_active ) return; + + const auto queryId = m_ctx->NextQueryId(); + CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( m_cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, m_ctx->m_query, queryId ) ); + + auto item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( 
&item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, m_ctx->GetId() ); + Profiler::QueueSerialFinish(); + } + +private: + const bool m_active; + + VkCommandBuffer m_cmdbuf; + VkCtx* m_ctx; +}; + +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr, bool calibrated = false ) +#else +static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, VkQueue queue, VkCommandBuffer cmdbuf, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif +{ + auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, queue, cmdbuf, instanceProcAddr, getDeviceProcAddr, calibrated ); +#else + new(ctx) VkCtx( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif + return ctx; +} + +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +static inline VkCtx* CreateVkContext( VkInstance instance, VkPhysicalDevice physdev, VkDevice device, PFN_vkGetInstanceProcAddr instanceProcAddr, PFN_vkGetDeviceProcAddr getDeviceProcAddr ) +#else +static inline VkCtx* CreateVkContext( VkPhysicalDevice physdev, VkDevice device, PFN_vkResetQueryPoolEXT qpreset, PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsEXT gpdctd, PFN_vkGetCalibratedTimestampsEXT gct ) +#endif +{ + auto ctx = (VkCtx*)tracy_malloc( sizeof( VkCtx ) ); +#if defined TRACY_VK_USE_SYMBOL_TABLE + new(ctx) VkCtx( instance, physdev, device, instanceProcAddr, getDeviceProcAddr ); +#else + new(ctx) VkCtx( physdev, device, qpreset, gpdctd, gct ); +#endif + return ctx; +} +#endif + +static inline void DestroyVkContext( VkCtx* ctx ) +{ + ctx->~VkCtx(); + tracy_free( ctx ); +} + +} + +using TracyVkCtx = tracy::VkCtx*; + +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ); +#else +#define TracyVkContext( physdev, device, queue, cmdbuf ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, nullptr, nullptr ); +#endif +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextCalibrated( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, queue, cmdbuf, instanceProcAddr, deviceProcAddr, true ); +#else +#define TracyVkContextCalibrated( physdev, device, queue, cmdbuf, gpdctd, gct ) tracy::CreateVkContext( physdev, device, queue, cmdbuf, gpdctd, gct ); +#endif +#if defined VK_EXT_host_query_reset +#if defined TRACY_VK_USE_SYMBOL_TABLE +#define TracyVkContextHostCalibrated( instance, physdev, device, instanceProcAddr, deviceProcAddr ) tracy::CreateVkContext( instance, physdev, device, instanceProcAddr, deviceProcAddr ); +#else +#define TracyVkContextHostCalibrated( physdev, device, qpreset, gpdctd, gct ) tracy::CreateVkContext( physdev, device, qpreset, gpdctd, gct ); +#endif +#endif +#define TracyVkDestroy( ctx ) tracy::DestroyVkContext( ctx ); +#define TracyVkContextName( ctx, name, size ) ctx->Name( name, size ); +#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK +# define TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, TRACY_CALLSTACK, active ); +# define TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, TRACY_CALLSTACK, active ); +# define TracyVkZone( ctx, cmdbuf, name ) TracyVkNamedZoneS( ctx, ___tracy_gpu_zone, cmdbuf, name, TRACY_CALLSTACK, true ) +# define TracyVkZoneC( ctx, cmdbuf, name, color ) TracyVkNamedZoneCS( ctx, ___tracy_gpu_zone, cmdbuf, name, color, TRACY_CALLSTACK, true ) +# define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) TracyVkZoneTransientS( ctx, varname, cmdbuf, name, TRACY_CALLSTACK, active ) +#else +# define TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, active ); +# define TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, active ); +# define TracyVkZone( ctx, cmdbuf, name ) TracyVkNamedZone( ctx, ___tracy_gpu_zone, cmdbuf, name, true ) +# define TracyVkZoneC( ctx, cmdbuf, name, color ) TracyVkNamedZoneC( ctx, ___tracy_gpu_zone, cmdbuf, name, color, true ) +# define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, active ); +#endif +#define TracyVkCollect( ctx, cmdbuf ) ctx->Collect( cmdbuf ); + +#ifdef TRACY_HAS_CALLSTACK +# define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active ); +# define TracyVkNamedZoneCS( ctx, varname, cmdbuf, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active ); +# define TracyVkZoneS( ctx, cmdbuf, name, depth ) TracyVkNamedZoneS( ctx, ___tracy_gpu_zone, cmdbuf, name, depth, true ) +# define TracyVkZoneCS( ctx, cmdbuf, name, color, depth ) TracyVkNamedZoneCS( ctx, ___tracy_gpu_zone, cmdbuf, name, color, depth, true ) +# define TracyVkZoneTransientS( ctx, varname, cmdbuf, name, depth, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, depth, active ); +#else +# define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) TracyVkNamedZone( ctx, varname, cmdbuf, name, active ) +# define 
TracyVkNamedZoneCS( ctx, varname, cmdbuf, name, color, depth, active ) TracyVkNamedZoneC( ctx, varname, cmdbuf, name, color, active ) +# define TracyVkZoneS( ctx, cmdbuf, name, depth ) TracyVkZone( ctx, cmdbuf, name ) +# define TracyVkZoneCS( ctx, cmdbuf, name, color, depth ) TracyVkZoneC( ctx, cmdbuf, name, color ) +# define TracyVkZoneTransientS( ctx, varname, cmdbuf, name, depth, active ) TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) +#endif + +#endif + +#endif diff --git a/src/hx/Debug.cpp b/src/hx/Debug.cpp index 9cd52c3f0..25c7d4dba 100644 --- a/src/hx/Debug.cpp +++ b/src/hx/Debug.cpp @@ -192,10 +192,8 @@ StackContext::StackContext() mIsUnwindingException = false; #endif - #ifdef HXCPP_TELEMETRY - //mTelemetry = tlmCreate(this); - // Do not automatically start - mTelemetry = 0; + #if HXCPP_TELEMETRY + mTelemetry = tlmCreate(this); #endif #ifdef HXCPP_DEBUGGER diff --git a/src/hx/Telemetry.cpp b/src/hx/Telemetry.cpp index eaa29d2f5..593299a4c 100644 --- a/src/hx/Telemetry.cpp +++ b/src/hx/Telemetry.cpp @@ -649,6 +649,8 @@ void __hxt_new_hash(void* obj, int inSize) stack->mTelemetry->HXTAllocation(obj, inSize, (const char *)"Hash"); #endif } +void __hxt_gc_alloc(void* obj, int inSize) { } +void __hxt_gc_free_large(void*) {} void __hxt_gc_new(hx::StackContext *stack, void* obj, int inSize, const char* name) { #ifdef HXCPP_STACK_TRACE diff --git a/src/hx/TelemetryTracy.cpp b/src/hx/TelemetryTracy.cpp new file mode 100644 index 000000000..4058ac842 --- /dev/null +++ b/src/hx/TelemetryTracy.cpp @@ -0,0 +1,262 @@ +#include +#include +#include + +#include + +namespace +{ + TracyCZoneCtx gcZone; + + // ___tracy_source_location_data gcSourceLocation = { "GC", "__hxt_gc_start", TracyFile, (uint32_t)TracyLine, 0 }; + + const char* sohName = "Small Object Heap"; + const char* lohName = "Large Object Heap"; + + std::vector created; + hx::QuickVec largeAllocs; + HxMutex largeAllocsLock; + + bool isLargeObject(void* ptr) + { + auto ptrHeader = reinterpret_cast(ptr) - sizeof(int); + auto flags = *reinterpret_cast(ptrHeader); + + return (flags & 0xffff) == 0; + } +} + +namespace hx +{ + class Telemetry + { + public: + std::vector tracyZones; + hx::QuickVec smallAllocs; + + Telemetry() : tracyZones(0), smallAllocs() {} + }; +} + +int __hxcpp_hxt_start_telemetry(bool profiler, bool allocations) +{ + hx::Throw(HX_CSTRING("Not implemented")); + + return 0; +} + +TelemetryFrame* __hxcpp_hxt_dump_telemetry(int) +{ + hx::Throw(HX_CSTRING("Not implemented")); + + return 0; +} + +void __hxcpp_hxt_stash_telemetry() { } + +void __hxcpp_hxt_ignore_allocs(int) {} + +void __hxt_new_string(void* obj, int inSize) { } + +void __hxt_new_array(void* obj, int inSize) { } + +void __hxt_new_hash(void* obj, int inSize) { } + +void __hxt_gc_new(hx::StackContext* stack, void* obj, int inSize, const char* name) { } + +void __hxt_gc_alloc(void* obj, int inSize) +{ + #ifdef HXCPP_TRACY_MEMORY + if (isLargeObject(obj)) + { + AutoLock lock(largeAllocsLock); + + largeAllocs.push(obj); + + TracyAllocN(obj, inSize, lohName); + } + else + { + hx::StackContext::getCurrent()->mTelemetry->smallAllocs.push(obj); + + TracyAllocN(obj, inSize, sohName); + } + #endif +} + +void __hxt_gc_free_large(void* obj) +{ + AutoLock lock(largeAllocsLock); + + for (auto i = 0; i < largeAllocs.size(); i++) + { + if (largeAllocs[i] == obj) + { + largeAllocs.erase(i); + + TracyFreeN(obj, lohName); + + return; + } + } +} + +void __hxt_gc_realloc(void* oldObj, void* newObj, int newSize) { } + +void __hxt_gc_after_mark(int byteMarkId, int 
endianByteMarkId) +{ + #ifdef HXCPP_TRACY_MEMORY + for (auto&& telemetry : created) + { + hx::QuickVec smallRetained; + + smallRetained.safeReserveExtra(telemetry->smallAllocs.size()); + + for (auto i = 0; i < telemetry->smallAllocs.size(); i++) + { + auto ptr = telemetry->smallAllocs[i]; + auto markByte = reinterpret_cast(ptr)[endianByteMarkId]; + if (markByte != byteMarkId) + { + TracyFreeN(ptr, sohName); + } + else + { + smallRetained.push(ptr); + } + } + + telemetry->smallAllocs.swap(smallRetained); + } + + hx::QuickVec largeRetained; + + largeRetained.safeReserveExtra(largeAllocs.size()); + + for (auto i = 0; i < largeAllocs.size(); i++) + { + auto ptr = largeAllocs[i]; + auto markByte = reinterpret_cast(ptr)[endianByteMarkId]; + if (markByte != byteMarkId) + { + TracyFreeN(ptr, lohName); + } + else + { + largeRetained.push(ptr); + } + } + + largeAllocs.swap(largeRetained); + #endif +} + +void __hxt_gc_start() +{ + // gcZone = ___tracy_emit_zone_begin(&gcSourceLocation, true); +} + +void __hxt_gc_end() +{ + // ___tracy_emit_zone_end(gcZone); +} + +hx::Telemetry* hx::tlmCreate(StackContext* stack) +{ + auto obj = new hx::Telemetry(); + + created.push_back(obj); + + return obj; +} + +void hx::tlmDestroy(hx::Telemetry* telemetry) +{ + created.erase(std::find(created.begin(), created.end(), telemetry)); + + delete telemetry; +} + +void hx::tlmAttach(hx::Telemetry* telemetry, hx::StackContext* stack) +{ + // +} + +void hx::tlmDetach(hx::Telemetry* telemetry) +{ + // +} + +void hx::tlmSampleEnter(hx::Telemetry* telemetry, hx::StackFrame* frame) +{ + auto srcloc = + ___tracy_alloc_srcloc( + frame->lineNumber, + frame->position->fileName, + strlen(frame->position->fileName), + frame->position->fullName, + strlen(frame->position->fullName), + 0); + + #if HXCPP_TRACY_INCLUDE_CALLSTACKS + // Note: Tracy doesn't support callstacks outside this range: depth >= 1 && depth < 63 + // Determine depth from tracyZones vector: +1 since we are about to add one + auto depth = telemetry->tracyZones.size() + 1; + + telemetry->tracyZones.push_back(___tracy_emit_zone_begin_alloc_callstack(srcloc, depth, true)); + #else + telemetry->tracyZones.push_back(___tracy_emit_zone_begin_alloc(srcloc, true)); + #endif +} + +void hx::tlmSampleExit(hx::Telemetry* telemetry) +{ + if (telemetry->tracyZones.empty()) + { + return; + } + + ___tracy_emit_zone_end(telemetry->tracyZones.back()); + + telemetry->tracyZones.pop_back(); +} + +void __hxcpp_tracy_framemark() +{ + ::tracy::Profiler::SendFrameMark(0); +} + +void __hxcpp_tracy_plot(String name, ::Float val) +{ + hx::strbuf buffer; + ::tracy::Profiler::PlotData(name.utf8_str(&buffer), val); +} + +void __hxcpp_tracy_plot_config(String name, uint8_t format, bool step, bool fill, int color) +{ + hx::strbuf buffer; + ::tracy::Profiler::ConfigurePlot(name.utf8_str(&buffer), ::tracy::PlotFormatType(format), step, fill, color); +} + +void __hxcpp_tracy_message(String msg, int color) +{ + hx::strbuf buffer; + ::tracy::Profiler::MessageColor(msg.utf8_str(&buffer), msg.length, color, 0); +} + +void __hxcpp_tracy_message_app_info(String info) +{ + hx::strbuf buffer; + ::tracy::Profiler::MessageAppInfo(info.utf8_str(&buffer), info.length); +} + +void __hxcpp_tracy_set_thread_name_and_group(String name, int groupHint) +{ + hx::strbuf buffer; + ::tracy::SetThreadNameWithHint(name.utf8_str(&buffer), groupHint); +} + +int __hxcpp_tracy_get_zone_count() +{ + return static_cast(hx::StackContext::getCurrent()->mTelemetry->tracyZones.size()); +} diff --git a/src/hx/gc/Immix.cpp
b/src/hx/gc/Immix.cpp index 05d0ac842..880b51c62 100644 --- a/src/hx/gc/Immix.cpp +++ b/src/hx/gc/Immix.cpp @@ -3207,6 +3207,10 @@ class GlobalAllocator void FreeLarge(void *inLarge) { +#ifdef HXCPP_TELEMETRY + __hxt_gc_free_large(inLarge); +#endif + ((unsigned char *)inLarge)[HX_ENDIAN_MARK_ID_BYTE] = 0; // AllocLarge will not lock this list unless it decides there is a suitable // value, so we can't doa realloc without potentially crashing it. @@ -3326,6 +3330,10 @@ class GlobalAllocator if (do_lock) mLargeListLock.Unlock(); +#ifdef HXCPP_TELEMETRY + __hxt_gc_alloc(result + 2, inSize); +#endif + return result+2; } @@ -6323,6 +6331,10 @@ class LocalAllocator : public hx::StackContext hx::GCOnNewPointer(buffer); #endif + #ifdef HXCPP_TELEMETRY + __hxt_gc_alloc(buffer, inSize); + #endif + return buffer; } if (mFraggedRows) @@ -6653,8 +6665,7 @@ void SetTopOfStack(int *inTop,bool inForce) void *InternalNew(int inSize,bool inIsObject) { - //HX_STACK_FRAME("GC", "new", 0, "GC::new", "src/hx/GCInternal.cpp", __LINE__, 0) - HX_STACK_FRAME("GC", "new", 0, "GC::new", "src/hx/GCInternal.cpp", inSize, 0) + // HX_STACK_FRAME("GC", "new", 0, "GC::new", __FILE__, __LINE__, 0) #ifdef HXCPP_DEBUG if (sgSpamCollects && sgAllocsSinceLastSpam>=sgSpamCollects) @@ -6765,7 +6776,7 @@ void *InternalRealloc(int inFromSize, void *inData,int inSize, bool inExpand) return hx::InternalNew(inSize,false); } - HX_STACK_FRAME("GC", "realloc", 0, "GC::relloc", __FILE__ , __LINE__, 0) + // HX_STACK_FRAME("GC", "realloc", 0, "GC::relloc", __FILE__ , __LINE__, 0) #ifdef HXCPP_DEBUG if (sgSpamCollects && sgAllocsSinceLastSpam>=sgSpamCollects) diff --git a/toolchain/common-defines.xml b/toolchain/common-defines.xml index 3f8c111a9..5481a41ee 100644 --- a/toolchain/common-defines.xml +++ b/toolchain/common-defines.xml @@ -39,6 +39,18 @@ + + + + + + + + + + + + diff --git a/toolchain/haxe-target.xml b/toolchain/haxe-target.xml index a795596ab..081c15bc5 100644 --- a/toolchain/haxe-target.xml +++ b/toolchain/haxe-target.xml @@ -133,6 +133,16 @@ + + + + + + + + + + @@ -159,7 +169,12 @@ - + +
+ + +
+ @@ -233,6 +248,7 @@ +