diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e1945873903..2a174cb8e84e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -687,6 +687,7 @@ add_subdirectory(${EXTERNAL}/robin-map/tnt)
 add_subdirectory(${EXTERNAL}/smol-v/tnt)
 add_subdirectory(${EXTERNAL}/benchmark/tnt)
 add_subdirectory(${EXTERNAL}/meshoptimizer/tnt)
+add_subdirectory(${EXTERNAL}/mikktspace)
 add_subdirectory(${EXTERNAL}/cgltf/tnt)
 add_subdirectory(${EXTERNAL}/draco/tnt)
 add_subdirectory(${EXTERNAL}/jsmn/tnt)
diff --git a/README.md b/README.md
index 4fae4a07cb3d..19fd0df12ae8 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ repositories {
 }
 
 dependencies {
-    implementation 'com.google.android.filament:filament-android:1.31.6'
+    implementation 'com.google.android.filament:filament-android:1.31.7'
 }
 ```
 
@@ -51,7 +51,7 @@ Here are all the libraries available in the group `com.google.android.filament`:
 iOS projects can use CocoaPods to install the latest release:
 
 ```
-pod 'Filament', '~> 1.31.6'
+pod 'Filament', '~> 1.31.7'
 ```
 
 ### Snapshots
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 57a57b816bbb..e92aef42bd24 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -7,6 +7,8 @@ A new header is inserted each time a *tag* is created.
 Instead, if you are authoring a PR for the main branch, add your release note to
 [NEW_RELEASE_NOTES.md](./NEW_RELEASE_NOTES.md).
 
+## v1.31.7
+
 ## v1.31.6
 
 - engine: the default render channel is now 2 instead of 0
diff --git a/android/gradle.properties b/android/gradle.properties
index 50a961ba32b9..1b9b322b1351 100644
--- a/android/gradle.properties
+++ b/android/gradle.properties
@@ -1,5 +1,5 @@
 GROUP=com.google.android.filament
-VERSION_NAME=1.31.6
+VERSION_NAME=1.31.7
 
 POM_DESCRIPTION=Real-time physically based rendering engine for Android.
 
diff --git a/filament/CMakeLists.txt b/filament/CMakeLists.txt
index a153b1bce45f..e94884c05b16 100644
--- a/filament/CMakeLists.txt
+++ b/filament/CMakeLists.txt
@@ -50,7 +50,6 @@ set(PUBLIC_HDRS
 
 set(SRCS
         src/AtlasAllocator.cpp
-        src/Box.cpp
         src/BufferObject.cpp
         src/Camera.cpp
         src/Color.cpp
diff --git a/filament/backend/include/backend/PixelBufferDescriptor.h b/filament/backend/include/backend/PixelBufferDescriptor.h
index 2a1a2c665478..1b498032fdcf 100644
--- a/filament/backend/include/backend/PixelBufferDescriptor.h
+++ b/filament/backend/include/backend/PixelBufferDescriptor.h
@@ -279,8 +279,8 @@ class UTILS_PUBLIC PixelBufferDescriptor : public BufferDescriptor {
                 break;
         }
 
-        size_t bpr = bpp * stride;
-        size_t bprAligned = (bpr + (alignment - 1)) & (~alignment + 1);
+        size_t const bpr = bpp * stride;
+        size_t const bprAligned = (bpr + (alignment - 1)) & (~alignment + 1);
         return bprAligned * height;
     }
 
diff --git a/filament/backend/src/CommandStream.cpp b/filament/backend/src/CommandStream.cpp
index 6e3d1c1b5fb9..29bb21845751 100644
--- a/filament/backend/src/CommandStream.cpp
+++ b/filament/backend/src/CommandStream.cpp
@@ -75,6 +75,7 @@ CommandStream::CommandStream(Driver& driver, CircularBuffer& buffer) noexcept
 
 void CommandStream::execute(void* buffer) {
     SYSTRACE_CALL();
+    SYSTRACE_CONTEXT();
 
     Profiler profiler;
 
diff --git a/filament/backend/src/opengl/GLUtils.h b/filament/backend/src/opengl/GLUtils.h
index ffb640eb8115..bdb1327a285a 100644
--- a/filament/backend/src/opengl/GLUtils.h
+++ b/filament/backend/src/opengl/GLUtils.h
@@ -120,7 +120,12 @@ constexpr inline GLenum getBufferBindingType(BufferObjectBinding bindingType) no
         case BufferObjectBinding::UNIFORM:
             return GL_UNIFORM_BUFFER;
         case BufferObjectBinding::SHADER_STORAGE:
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
             return GL_SHADER_STORAGE_BUFFER;
+#else
+            utils::panic(__func__, __FILE__, __LINE__, "SHADER_STORAGE not supported");
+            return 0x90D2; // just to return something
+#endif
     }
 }
 
diff --git a/filament/backend/src/opengl/OpenGLContext.cpp b/filament/backend/src/opengl/OpenGLContext.cpp
index 10c6f541bdcc..48772dcdf996 100644
--- a/filament/backend/src/opengl/OpenGLContext.cpp
+++ b/filament/backend/src/opengl/OpenGLContext.cpp
@@ -16,6 +16,10 @@
 
 #include "OpenGLContext.h"
 
+#include <backend/platforms/OpenGLPlatform.h>
+
+#include <utility>
+
 // change to true to display all GL extensions in the console on start-up
 #define DEBUG_PRINT_EXTENSIONS false
 
@@ -70,7 +74,9 @@ OpenGLContext::OpenGLContext() noexcept {
     constexpr GLint MAX_FRAGMENT_SAMPLER_COUNT = caps3.MAX_FRAGMENT_SAMPLER_COUNT;
 
     if constexpr (BACKEND_OPENGL_VERSION == BACKEND_OPENGL_VERSION_GLES) {
+#if defined(GL_ES_VERSION_2_0)
         initExtensionsGLES();
+#endif
         if (state.major == 3) {
             assert_invariant(gets.max_texture_image_units >= 16);
             assert_invariant(gets.max_combined_texture_image_units >= 32);
@@ -88,8 +94,9 @@ OpenGLContext::OpenGLContext() noexcept {
             }
         }
     } else if constexpr (BACKEND_OPENGL_VERSION == BACKEND_OPENGL_VERSION_GL) {
-        // OpenGL version
+#if defined(GL_VERSION_4_1)
         initExtensionsGL();
+#endif
         if (state.major == 4) {
             assert_invariant(state.minor >= 1);
             mShaderModel = ShaderModel::DESKTOP;
@@ -351,6 +358,8 @@ void OpenGLContext::setDefaultState() noexcept {
 #endif
 }
 
+#if defined(GL_ES_VERSION_2_0)
+
 void OpenGLContext::initExtensionsGLES() noexcept {
     const char * const extensions = (const char*)glGetString(GL_EXTENSIONS);
     GLUtils::unordered_string_set const exts = GLUtils::split(extensions);
@@ -395,6 +404,10 @@ void OpenGLContext::initExtensionsGLES() noexcept {
     }
 }
 
+#endif // defined(GL_ES_VERSION_2_0)
+
+#if defined(GL_VERSION_4_1)
+
 void OpenGLContext::initExtensionsGL() noexcept {
     GLUtils::unordered_string_set exts;
     GLint n = 0;
@@ -418,21 +431,31 @@ void OpenGLContext::initExtensionsGL() noexcept {
     ext.EXT_color_buffer_float = true;  // Assumes core profile.
     ext.EXT_color_buffer_half_float = true;  // Assumes core profile.
     ext.EXT_debug_marker = exts.has("GL_EXT_debug_marker"sv);
+    ext.EXT_disjoint_timer_query = true;
+    ext.EXT_multisampled_render_to_texture = false;
+    ext.EXT_multisampled_render_to_texture2 = false;
     ext.EXT_shader_framebuffer_fetch = exts.has("GL_EXT_shader_framebuffer_fetch"sv);
+    ext.EXT_texture_compression_bptc = exts.has("GL_EXT_texture_compression_bptc"sv);
     ext.EXT_texture_compression_etc2 = exts.has("GL_ARB_ES3_compatibility"sv);
+    ext.EXT_texture_compression_rgtc = exts.has("GL_EXT_texture_compression_rgtc"sv);
     ext.EXT_texture_compression_s3tc = exts.has("GL_EXT_texture_compression_s3tc"sv);
     ext.EXT_texture_compression_s3tc_srgb = exts.has("GL_EXT_texture_compression_s3tc_srgb"sv);
-    ext.EXT_texture_compression_rgtc = exts.has("GL_EXT_texture_compression_rgtc"sv);
-    ext.EXT_texture_compression_bptc = exts.has("GL_EXT_texture_compression_bptc"sv);
+    ext.EXT_texture_cube_map_array = true;
     ext.EXT_texture_filter_anisotropic = exts.has("GL_EXT_texture_filter_anisotropic"sv);
     ext.EXT_texture_sRGB = exts.has("GL_EXT_texture_sRGB"sv);
     ext.GOOGLE_cpp_style_line_directive = exts.has("GL_GOOGLE_cpp_style_line_directive"sv);
     ext.KHR_debug = major >= 4 && minor >= 3;
     ext.KHR_texture_compression_astc_hdr = exts.has("GL_KHR_texture_compression_astc_hdr"sv);
     ext.KHR_texture_compression_astc_ldr = exts.has("GL_KHR_texture_compression_astc_ldr"sv);
-    ext.OES_EGL_image_external_essl3 = exts.has("GL_OES_EGL_image_external_essl3"sv);
+    ext.OES_EGL_image_external_essl3 = false;
+    ext.QCOM_tiled_rendering = false;
+    ext.WEBGL_compressed_texture_etc = false;
+    ext.WEBGL_compressed_texture_s3tc = false;
+    ext.WEBGL_compressed_texture_s3tc_srgb = false;
 }
 
+#endif // defined(GL_VERSION_4_1)
+
 void OpenGLContext::bindBuffer(GLenum target, GLuint buffer) noexcept {
     if (target == GL_ELEMENT_ARRAY_BUFFER) {
         constexpr size_t targetIndex = getIndexForBufferTarget(GL_ELEMENT_ARRAY_BUFFER);
@@ -465,13 +488,8 @@ void OpenGLContext::pixelStore(GLenum pname, GLint param) noexcept {
 
     switch (pname) {
         case GL_PACK_ALIGNMENT:     pcur = &state.pack.alignment;       break;
-        case GL_PACK_ROW_LENGTH:    pcur = &state.pack.row_length;      break;
-        case GL_PACK_SKIP_PIXELS:   pcur = &state.pack.skip_pixels;     break;  // convenience
-        case GL_PACK_SKIP_ROWS:     pcur = &state.pack.skip_row;        break;  // convenience
         case GL_UNPACK_ALIGNMENT:   pcur = &state.unpack.alignment;     break;
         case GL_UNPACK_ROW_LENGTH:  pcur = &state.unpack.row_length;    break;
-        case GL_UNPACK_SKIP_PIXELS: pcur = &state.unpack.skip_pixels;   break;  // convenience
-        case GL_UNPACK_SKIP_ROWS:   pcur = &state.unpack.skip_row;      break;  // convenience
         default:
             goto default_case;
     }
@@ -618,12 +636,10 @@ void OpenGLContext::resetState() noexcept {
     GLenum const bufferTargets[] = {
         GL_UNIFORM_BUFFER,
         GL_TRANSFORM_FEEDBACK_BUFFER,
-#if !defined(__EMSCRIPTEN__)
+#if !defined(__EMSCRIPTEN__) && (defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1))
         GL_SHADER_STORAGE_BUFFER,
 #endif
         GL_ARRAY_BUFFER,
-        GL_COPY_READ_BUFFER,
-        GL_COPY_WRITE_BUFFER,
         GL_ELEMENT_ARRAY_BUFFER,
         GL_PIXEL_PACK_BUFFER,
         GL_PIXEL_UNPACK_BUFFER,
@@ -646,23 +662,30 @@ void OpenGLContext::resetState() noexcept {
     // Reset state.textures to its default state to avoid the complexity and error-prone
     // nature of resetting the GL state to its existing state
     state.textures = {};
-    const GLuint textureTargets[] = {
-        GL_TEXTURE_2D,
-        GL_TEXTURE_2D_ARRAY,
-        GL_TEXTURE_CUBE_MAP,
-        GL_TEXTURE_3D,
+    const std::pair<GLuint, bool> textureTargets[] = {
+            { GL_TEXTURE_2D,                true },
+            { GL_TEXTURE_2D_ARRAY,          true },
+            { GL_TEXTURE_CUBE_MAP,          true },
+            { GL_TEXTURE_3D,                true },
 #if !defined(__EMSCRIPTEN__)
-        GL_TEXTURE_2D_MULTISAMPLE,
-        GL_TEXTURE_EXTERNAL_OES,
-        GL_TEXTURE_CUBE_MAP_ARRAY,
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
+            { GL_TEXTURE_2D_MULTISAMPLE,    true },
+#endif
+#if defined(GL_OES_EGL_image_external)
+            { GL_TEXTURE_EXTERNAL_OES,      ext.OES_EGL_image_external_essl3 },
+#endif
+#if defined(GL_VERSION_4_1) || defined(GL_EXT_texture_cube_map_array)
+            { GL_TEXTURE_CUBE_MAP_ARRAY,    ext.EXT_texture_cube_map_array },
+#endif
 #endif
     };
     for (GLint unit = 0; unit < gets.max_combined_texture_image_units; ++unit) {
         glActiveTexture(GL_TEXTURE0 + unit);
         glBindSampler(unit, 0);
-
-        for (auto const target : textureTargets) {
-            glBindTexture(target, 0);
+        for (auto [target, available] : textureTargets) {
+            if (available) {
+                glBindTexture(target, 0);
+            }
         }
     }
     glActiveTexture(GL_TEXTURE0 + state.textures.active);
@@ -670,14 +693,10 @@ void OpenGLContext::resetState() noexcept {
     // state.unpack
     glPixelStorei(GL_UNPACK_ALIGNMENT, state.unpack.alignment);
     glPixelStorei(GL_UNPACK_ROW_LENGTH, state.unpack.row_length);
-    glPixelStorei(GL_UNPACK_SKIP_PIXELS, state.unpack.skip_pixels);
-    glPixelStorei(GL_UNPACK_SKIP_ROWS, state.unpack.skip_row);
 
     // state.pack
     glPixelStorei(GL_PACK_ALIGNMENT, state.pack.alignment);
-    glPixelStorei(GL_PACK_ROW_LENGTH, state.pack.row_length);
-    glPixelStorei(GL_PACK_SKIP_PIXELS, state.pack.skip_pixels);
-    glPixelStorei(GL_PACK_SKIP_ROWS, state.pack.skip_row);
+    glPixelStorei(GL_PACK_ROW_LENGTH, 0); // we rely on GL_PACK_ROW_LENGTH being zero
 
     // state.window
     glScissor(
@@ -696,4 +715,30 @@ void OpenGLContext::resetState() noexcept {
     
 }
 
+OpenGLContext::FenceSync OpenGLContext::createFenceSync( // wraps glFenceSync() in a FenceSync
+        OpenGLPlatform&) noexcept { // the platform argument is unused here
+    auto sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); // flags must be 0 per the GL spec
+    CHECK_GL_ERROR(utils::slog.e)
+    return { .sync = sync }; // fills the GLsync member of FenceSync's anonymous union
+}
+
+void OpenGLContext::destroyFenceSync( // deletes the GL sync object held by a FenceSync
+        OpenGLPlatform&, FenceSync sync) noexcept { // the platform argument is unused here
+    glDeleteSync(sync.sync); // reads the GLsync member of the union
+    CHECK_GL_ERROR(utils::slog.e)
+}
+
+OpenGLContext::FenceSync::Status OpenGLContext::clientWaitSync( // queries a FenceSync's state
+        OpenGLPlatform&, FenceSync sync) const noexcept { // the platform argument is unused here
+    GLenum const status = glClientWaitSync(sync.sync, 0, 0u); // flags = 0, timeout = 0: no wait
+    CHECK_GL_ERROR(utils::slog.e)
+    using Status = OpenGLContext::FenceSync::Status;
+    switch (status) { // translate the GL result into FenceSync::Status
+        case GL_ALREADY_SIGNALED:       return Status::ALREADY_SIGNALED;
+        case GL_TIMEOUT_EXPIRED:        return Status::TIMEOUT_EXPIRED;
+        case GL_CONDITION_SATISFIED:    return Status::CONDITION_SATISFIED;
+        default:                        return Status::FAILURE; // e.g. GL_WAIT_FAILED
+    }
+}
+
 } // namesapce filament
diff --git a/filament/backend/src/opengl/OpenGLContext.h b/filament/backend/src/opengl/OpenGLContext.h
index 85f3ce7fc239..ded404f83aa5 100644
--- a/filament/backend/src/opengl/OpenGLContext.h
+++ b/filament/backend/src/opengl/OpenGLContext.h
@@ -33,6 +33,8 @@
 
 namespace filament::backend {
 
+class OpenGLPlatform;
+
 class OpenGLContext {
 public:
     static constexpr const size_t MAX_TEXTURE_UNIT_COUNT = MAX_SAMPLER_COUNT;
@@ -110,9 +112,6 @@ class OpenGLContext {
             GLenum sfailBack, GLenum dpfailBack, GLenum dppassBack) noexcept;
     inline void stencilMaskSeparate(GLuint maskFront, GLuint maskBack) noexcept;
     inline void polygonOffset(GLfloat factor, GLfloat units) noexcept;
-    inline void beginQuery(GLenum target, GLuint query) noexcept;
-    inline void endQuery(GLenum target) noexcept;
-    inline GLuint getQuery(GLenum target) const noexcept;
 
     inline void setScissor(GLint left, GLint bottom, GLsizei width, GLsizei height) noexcept;
     inline void viewport(GLint left, GLint bottom, GLsizei width, GLsizei height) noexcept;
@@ -121,6 +120,26 @@ class OpenGLContext {
     void deleteBuffers(GLsizei n, const GLuint* buffers, GLenum target) noexcept;
     void deleteVertexArrays(GLsizei n, const GLuint* arrays) noexcept;
 
+    // we abstract GL's sync because it's not available in ES2, but we can use EGL's sync
+    // instead, if available.
+    struct FenceSync {
+        enum class Status { // result type of clientWaitSync()
+            ALREADY_SIGNALED,
+            TIMEOUT_EXPIRED,
+            CONDITION_SATISFIED,
+            FAILURE
+        };
+        union { // the two handle representations share the same storage
+            void* fence; // NOTE(review): presumably a platform (e.g. EGL) fence -- confirm
+            GLsync sync; // GL sync object handle
+        };
+    };
+
+    FenceSync createFenceSync(OpenGLPlatform& platform) noexcept;
+    void destroyFenceSync(OpenGLPlatform& platform, FenceSync sync) noexcept;
+    FenceSync::Status clientWaitSync(OpenGLPlatform& platform, FenceSync sync) const noexcept;
+
+
     // glGet*() values
     struct {
         GLfloat max_anisotropy;
@@ -149,21 +168,21 @@ class OpenGLContext {
         bool EXT_color_buffer_half_float;
         bool EXT_debug_marker;
         bool EXT_disjoint_timer_query;
-        bool EXT_multisampled_render_to_texture;
         bool EXT_multisampled_render_to_texture2;
+        bool EXT_multisampled_render_to_texture;
         bool EXT_shader_framebuffer_fetch;
-        bool KHR_texture_compression_astc_hdr;
-        bool KHR_texture_compression_astc_ldr;
+        bool EXT_texture_compression_bptc;
         bool EXT_texture_compression_etc2;
+        bool EXT_texture_compression_rgtc;
         bool EXT_texture_compression_s3tc;
         bool EXT_texture_compression_s3tc_srgb;
-        bool EXT_texture_compression_rgtc;
-        bool EXT_texture_compression_bptc;
         bool EXT_texture_cube_map_array;
         bool EXT_texture_filter_anisotropic;
         bool EXT_texture_sRGB;
         bool GOOGLE_cpp_style_line_directive;
         bool KHR_debug;
+        bool KHR_texture_compression_astc_hdr;
+        bool KHR_texture_compression_astc_ldr;
         bool OES_EGL_image_external_essl3;
         bool QCOM_tiled_rendering;
         bool WEBGL_compressed_texture_etc;
@@ -315,8 +334,8 @@ class OpenGLContext {
                     GLintptr offset = 0;
                     GLsizeiptr size = 0;
                 } buffers[MAX_BUFFER_BINDINGS];
-            } targets[2];   // there are only 2 indexed buffer target (uniform and transform feedback)
-            GLuint genericBinding[9] = { 0 };
+            } targets[3];   // there are only 3 indexed buffer targets
+            GLuint genericBinding[7] = {};
         } buffers;
 
         struct {
@@ -332,15 +351,10 @@ class OpenGLContext {
         struct {
             GLint row_length = 0;
             GLint alignment = 4;
-            GLint skip_pixels = 0;
-            GLint skip_row = 0;
         } unpack;
 
         struct {
-            GLint row_length = 0;
             GLint alignment = 4;
-            GLint skip_pixels = 0;
-            GLint skip_row = 0;
         } pack;
 
         struct {
@@ -348,10 +362,6 @@ class OpenGLContext {
             vec4gli viewport { 0 };
             vec2glf depthRange { 0.0f, 1.0f };
         } window;
-
-        struct {
-            GLuint timer = -1u;
-        } queries;
     } state;
 
 private:
@@ -403,8 +413,12 @@ class OpenGLContext {
     RenderPrimitive mDefaultVAO;
 
     // this is chosen to minimize code size
+#if defined(GL_ES_VERSION_2_0)
     void initExtensionsGLES() noexcept;
+#endif
+#if defined(GL_VERSION_4_1)
     void initExtensionsGL() noexcept;
+#endif
 
     template <typename T, typename F>
     static inline void update_state(T& state, T const& expected, F functor, bool force = false) noexcept {
@@ -429,7 +443,9 @@ constexpr size_t OpenGLContext::getIndexForTextureTarget(GLuint target) noexcept
         case GL_TEXTURE_2D:                     return 0;
         case GL_TEXTURE_2D_ARRAY:               return 1;
         case GL_TEXTURE_CUBE_MAP:               return 2;
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
         case GL_TEXTURE_2D_MULTISAMPLE:         return 3;
+#endif
         case GL_TEXTURE_EXTERNAL_OES:           return 4;
         case GL_TEXTURE_3D:                     return 5;
         case GL_TEXTURE_CUBE_MAP_ARRAY:         return 6;
@@ -450,17 +466,15 @@ constexpr size_t OpenGLContext::getIndexForCap(GLenum cap) noexcept { //NOLINT
         case GL_SAMPLE_ALPHA_TO_COVERAGE:       index =  6; break;
         case GL_SAMPLE_COVERAGE:                index =  7; break;
         case GL_POLYGON_OFFSET_FILL:            index =  8; break;
-        case GL_PRIMITIVE_RESTART_FIXED_INDEX:  index =  9; break;
-        case GL_RASTERIZER_DISCARD:             index = 10; break;
 #ifdef GL_ARB_seamless_cube_map
-        case GL_TEXTURE_CUBE_MAP_SEAMLESS:      index = 11; break;
+        case GL_TEXTURE_CUBE_MAP_SEAMLESS:      index =  9; break;
 #endif
 #if BACKEND_OPENGL_VERSION == BACKEND_OPENGL_VERSION_GL
-        case GL_PROGRAM_POINT_SIZE:             index = 12; break;
+        case GL_PROGRAM_POINT_SIZE:             index = 10; break;
 #endif
-        default: index = 13; break; // should never happen
+        default: break;
     }
-    assert_invariant(index < 13 && index < state.enables.caps.size());
+    assert_invariant(index < state.enables.caps.size());
     return index;
 }
 
@@ -470,15 +484,14 @@ constexpr size_t OpenGLContext::getIndexForBufferTarget(GLenum target) noexcept
         // The indexed buffers MUST be first in this list (those usable with bindBufferRange)
         case GL_UNIFORM_BUFFER:             index = 0; break;
         case GL_TRANSFORM_FEEDBACK_BUFFER:  index = 1; break;
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
         case GL_SHADER_STORAGE_BUFFER:      index = 2; break;
-
+#endif
         case GL_ARRAY_BUFFER:               index = 3; break;
-        case GL_COPY_READ_BUFFER:           index = 4; break;
-        case GL_COPY_WRITE_BUFFER:          index = 5; break;
-        case GL_ELEMENT_ARRAY_BUFFER:       index = 6; break;
-        case GL_PIXEL_PACK_BUFFER:          index = 7; break;
-        case GL_PIXEL_UNPACK_BUFFER:        index = 8; break;
-        default: index = 9; break; // should never happen
+        case GL_ELEMENT_ARRAY_BUFFER:       index = 4; break;
+        case GL_PIXEL_PACK_BUFFER:          index = 5; break;
+        case GL_PIXEL_UNPACK_BUFFER:        index = 6; break;
+        default: break;
     }
     assert_invariant(index < sizeof(state.buffers.genericBinding)/sizeof(state.buffers.genericBinding[0])); // NOLINT(misc-redundant-expression)
     return index;
@@ -501,21 +514,21 @@ void OpenGLContext::bindSampler(GLuint unit, GLuint sampler) noexcept {
 }
 
 void OpenGLContext::setScissor(GLint left, GLint bottom, GLsizei width, GLsizei height) noexcept {
-    vec4gli scissor(left, bottom, width, height);
+    vec4gli const scissor(left, bottom, width, height); // new value handed to update_state()
     update_state(state.window.scissor, scissor, [&]() {
         glScissor(left, bottom, width, height);
     });
 }
 
 void OpenGLContext::viewport(GLint left, GLint bottom, GLsizei width, GLsizei height) noexcept {
-    vec4gli viewport(left, bottom, width, height);
+    vec4gli const viewport(left, bottom, width, height); // new value handed to update_state()
     update_state(state.window.viewport, viewport, [&]() {
         glViewport(left, bottom, width, height);
     });
 }
 
 void OpenGLContext::depthRange(GLclampf near, GLclampf far) noexcept {
-    vec2glf depthRange(near, far);
+    vec2glf const depthRange(near, far); // new value handed to update_state()
     update_state(state.window.depthRange, depthRange, [&]() {
         glDepthRangef(near, far);
     });
@@ -526,7 +539,7 @@ void OpenGLContext::bindVertexArray(RenderPrimitive const* p) noexcept {
     update_state(state.vao.p, vao, [&]() {
         glBindVertexArray(vao->vao);
         // update GL_ELEMENT_ARRAY_BUFFER, which is updated by glBindVertexArray
-        size_t targetIndex = getIndexForBufferTarget(GL_ELEMENT_ARRAY_BUFFER);
+        size_t const targetIndex = getIndexForBufferTarget(GL_ELEMENT_ARRAY_BUFFER);
         state.buffers.genericBinding[targetIndex] = vao->elementArray;
         if (UTILS_UNLIKELY(bugs.vao_doesnt_store_element_array_buffer_binding)) {
             // This shouldn't be needed, but it looks like some drivers don't do the implicit
@@ -538,8 +551,10 @@ void OpenGLContext::bindVertexArray(RenderPrimitive const* p) noexcept {
 
 void OpenGLContext::bindBufferRange(GLenum target, GLuint index, GLuint buffer,
         GLintptr offset, GLsizeiptr size) noexcept {
-    size_t targetIndex = getIndexForBufferTarget(target);
-    assert_invariant(targetIndex <= 2); // validity check
+    size_t const targetIndex = getIndexForBufferTarget(target);
+
+    // validity check
+    assert_invariant(targetIndex < sizeof(state.buffers.targets) / sizeof(*state.buffers.targets));
 
     // this ALSO sets the generic binding
     if (   state.buffers.targets[targetIndex].buffers[index].name != buffer
@@ -616,7 +631,7 @@ void OpenGLContext::disableVertexAttribArray(GLuint index) noexcept {
 }
 
 void OpenGLContext::enable(GLenum cap) noexcept {
-    size_t index = getIndexForCap(cap);
+    size_t const index = getIndexForCap(cap);
     if (UTILS_UNLIKELY(!state.enables.caps[index])) {
         state.enables.caps.set(index);
         glEnable(cap);
@@ -624,7 +639,7 @@ void OpenGLContext::enable(GLenum cap) noexcept {
 }
 
 void OpenGLContext::disable(GLenum cap) noexcept {
-    size_t index = getIndexForCap(cap);
+    size_t const index = getIndexForCap(cap);
     if (UTILS_UNLIKELY(state.enables.caps[index])) {
         state.enables.caps.unset(index);
         glDisable(cap);
@@ -723,41 +738,6 @@ void OpenGLContext::polygonOffset(GLfloat factor, GLfloat units) noexcept {
     });
 }
 
-void OpenGLContext::beginQuery(GLenum target, GLuint query) noexcept {
-    switch (target) {
-        case GL_TIME_ELAPSED:
-            if (state.queries.timer != -1u) {
-                // this is an error
-                break;
-            }
-            state.queries.timer = query;
-            break;
-        default:
-            return;
-    }
-    glBeginQuery(target, query);
-}
-
-void OpenGLContext::endQuery(GLenum target) noexcept {
-    switch (target) {
-        case GL_TIME_ELAPSED:
-            state.queries.timer = -1u;
-            break;
-        default:
-            return;
-    }
-    glEndQuery(target);
-}
-
-GLuint OpenGLContext::getQuery(GLenum target) const noexcept {
-    switch (target) {
-        case GL_TIME_ELAPSED:
-            return state.queries.timer;
-        default:
-            return 0;
-    }
-}
-
 } // namespace filament
 
 #endif //TNT_FILAMENT_BACKEND_OPENGLCONTEXT_H
diff --git a/filament/backend/src/opengl/OpenGLDriver.cpp b/filament/backend/src/opengl/OpenGLDriver.cpp
index a13ac558155c..0a3ab8ac9a2e 100644
--- a/filament/backend/src/opengl/OpenGLDriver.cpp
+++ b/filament/backend/src/opengl/OpenGLDriver.cpp
@@ -69,7 +69,9 @@ using namespace utils;
 namespace filament::backend {
 
 Driver* OpenGLDriverFactory::create(
-        OpenGLPlatform* const platform, void* const sharedGLContext, const Platform::DriverConfig& driverConfig) noexcept {
+        OpenGLPlatform* const platform, // all arguments are forwarded to OpenGLDriver::create()
+        void* const sharedGLContext,
+        const Platform::DriverConfig& driverConfig) noexcept {
     return OpenGLDriver::create(platform, sharedGLContext, driverConfig);
 }
 
@@ -177,6 +179,10 @@ OpenGLDriver::OpenGLDriver(OpenGLPlatform* platform, const Platform::DriverConfi
     slog.i << "OS version: " << mPlatform.getOSVersion() << io::endl;
 #endif
 
+    // Timer queries are core in GL 3.3, otherwise we need EXT_disjoint_timer_query
+    // iOS headers don't define GL_EXT_disjoint_timer_query, so make absolutely sure
+    // we won't use it.
+#if defined(GL_VERSION_3_3) || defined(GL_EXT_disjoint_timer_query)
     if (mContext.ext.EXT_disjoint_timer_query ||
             BACKEND_OPENGL_VERSION == BACKEND_OPENGL_VERSION_GL) {
         // timer queries are available
@@ -187,7 +193,9 @@ OpenGLDriver::OpenGLDriver(OpenGLPlatform* platform, const Platform::DriverConfi
             mTimerQueryImpl = new TimerQueryNative(mContext);
         }
         mFrameTimeSupported = true;
-    } else if (mPlatform.canCreateFence()) {
+    } else
+#endif
+    if (mPlatform.canCreateFence()) {
         // no timer queries, but we can use fences
         mTimerQueryImpl = new OpenGLTimerQueryFence(mPlatform);
         mFrameTimeSupported = true;
@@ -537,6 +545,7 @@ void OpenGLDriver::textureStorage(OpenGLDriver::GLTexture* t,
                     GLsizei(width), GLsizei(height), GLsizei(depth) * 6);
             break;
         }
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
         case GL_TEXTURE_2D_MULTISAMPLE:
             if constexpr (TEXTURE_2D_MULTISAMPLE_SUPPORTED) {
                 // NOTE: if there is a mix of texture and renderbuffers, "fixed_sample_locations" must be true
@@ -554,6 +563,7 @@ void OpenGLDriver::textureStorage(OpenGLDriver::GLTexture* t,
                 PANIC_LOG("GL_TEXTURE_2D_MULTISAMPLE is not supported");
             }
             break;
+#endif
         default: // cannot happen
             break;
     }
@@ -621,6 +631,7 @@ void OpenGLDriver::createTextureR(Handle<HwTexture> th, SamplerType target, uint
             if (t->samples > 1) {
                 // Note: we can't be here in practice because filament's user API doesn't
                 // allow the creation of multi-sampled textures.
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
                 if (gl.features.multisample_texture) {
                     // multi-sample texture on GL 3.2 / GLES 3.1 and above
                     t->gl.target = GL_TEXTURE_2D_MULTISAMPLE;
@@ -629,6 +640,7 @@ void OpenGLDriver::createTextureR(Handle<HwTexture> th, SamplerType target, uint
                 } else {
                     // Turn off multi-sampling for that texture. It's just not supported.
                 }
+#endif
             }
             textureStorage(t, w, h, depth);
         }
@@ -721,6 +733,7 @@ void OpenGLDriver::importTextureR(Handle<HwTexture> th, intptr_t id,
     if (t->samples > 1) {
         // Note: we can't be here in practice because filament's user API doesn't
         // allow the creation of multi-sampled textures.
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
         if (gl.features.multisample_texture) {
             // multi-sample texture on GL 3.2 / GLES 3.1 and above
             t->gl.target = GL_TEXTURE_2D_MULTISAMPLE;
@@ -728,6 +741,7 @@ void OpenGLDriver::importTextureR(Handle<HwTexture> th, intptr_t id,
         } else {
             // Turn off multi-sampling for that texture. It's just not supported.
         }
+#endif
     }
 
     CHECK_GL_ERROR(utils::slog.e)
@@ -800,6 +814,7 @@ void OpenGLDriver::framebufferTexture(TargetBufferInfo const& binfo,
 
     GLTexture* t = handle_cast<GLTexture*>(binfo.handle);
 
+    assert_invariant(t);
     assert_invariant(t->target != SamplerType::SAMPLER_EXTERNAL);
     assert_invariant(rt->width  <= valueForLevel(binfo.level, t->width) &&
            rt->height <= valueForLevel(binfo.level, t->height));
@@ -907,7 +922,9 @@ void OpenGLDriver::framebufferTexture(TargetBufferInfo const& binfo,
             case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
             case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
             case GL_TEXTURE_2D:
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
             case GL_TEXTURE_2D_MULTISAMPLE:
+#endif
                 if (any(t->usage & TextureUsage::SAMPLEABLE)) {
                     glFramebufferTexture2D(GL_FRAMEBUFFER, attachment,
                             target, t->gl.id, binfo.level);
@@ -1135,21 +1152,26 @@ void OpenGLDriver::createRenderTargetR(Handle<HwRenderTarget> rth,
     rt->gl.samples = samples;
     rt->targets = targets;
 
-    UTILS_UNUSED_IN_RELEASE math::vec2<uint32_t> tmin = {std::numeric_limits<uint32_t>::max()};
-    UTILS_UNUSED_IN_RELEASE math::vec2<uint32_t> tmax = {0};
+    UTILS_UNUSED_IN_RELEASE math::vec2<uint32_t> tmin = { std::numeric_limits<uint32_t>::max() };
+    UTILS_UNUSED_IN_RELEASE math::vec2<uint32_t> tmax = { 0 };
+    auto checkDimensions = [&tmin, &tmax](GLTexture* t, uint8_t level) {
+        const auto twidth = std::max(1u, t->width >> level);
+        const auto theight = std::max(1u, t->height >> level);
+        tmin = { std::min(tmin.x, twidth), std::min(tmin.y, theight) };
+        tmax = { std::max(tmax.x, twidth), std::max(tmax.y, theight) };
+    };
+
 
     if (any(targets & TargetBufferFlags::COLOR_ALL)) {
         GLenum bufs[MRT::MAX_SUPPORTED_RENDER_TARGET_COUNT] = { GL_NONE };
         const size_t maxDrawBuffers = getMaxDrawBuffers();
         for (size_t i = 0; i < maxDrawBuffers; i++) {
             if (any(targets & getTargetBufferFlagsAt(i))) {
-                auto t = rt->gl.color[i] = handle_cast<GLTexture*>(color[i].handle);
-                const auto twidth = std::max(1u, t->width >> color[i].level);
-                const auto theight = std::max(1u, t->height >> color[i].level);
-                tmin = { std::min(tmin.x, twidth), std::min(tmin.y, theight) };
-                tmax = { std::max(tmax.x, twidth), std::max(tmax.y, theight) };
+                assert_invariant(color[i].handle);
+                rt->gl.color[i] = handle_cast<GLTexture*>(color[i].handle);
                 framebufferTexture(color[i], rt, GL_COLOR_ATTACHMENT0 + i);
                 bufs[i] = GL_COLOR_ATTACHMENT0 + i;
+                checkDimensions(rt->gl.color[i], color[i].level);
             }
         }
         glDrawBuffers((GLsizei)maxDrawBuffers, bufs);
@@ -1159,37 +1181,28 @@ void OpenGLDriver::createRenderTargetR(Handle<HwRenderTarget> rth,
     // handle special cases first (where depth/stencil are packed)
     bool specialCased = false;
     if ((targets & TargetBufferFlags::DEPTH_AND_STENCIL) == TargetBufferFlags::DEPTH_AND_STENCIL) {
-        assert_invariant(!stencil.handle || stencil.handle == depth.handle);
-        auto t = rt->gl.depth = handle_cast<GLTexture*>(depth.handle);
-        const auto twidth = std::max(1u, t->width >> depth.level);
-        const auto theight = std::max(1u, t->height >> depth.level);
-        tmin = { std::min(tmin.x, twidth), std::min(tmin.y, theight) };
-        tmax = { std::max(tmax.x, twidth), std::max(tmax.y, theight) };
-        if (any(rt->gl.depth->usage & TextureUsage::SAMPLEABLE) ||
-            (!depth.handle && !stencil.handle)) {
-            // special case: depth & stencil requested, and both provided as the same texture
-            // special case: depth & stencil requested, but both not provided
-            specialCased = true;
+        assert_invariant(depth.handle);
+        // either we supplied only the depth handle or both depth/stencil are identical and not null
+        if (depth.handle && (stencil.handle == depth.handle || !stencil.handle)) {
+            rt->gl.depth = handle_cast<GLTexture*>(depth.handle);
             framebufferTexture(depth, rt, GL_DEPTH_STENCIL_ATTACHMENT);
+            specialCased = true;
+            checkDimensions(rt->gl.depth, depth.level);
         }
     }
 
     if (!specialCased) {
         if (any(targets & TargetBufferFlags::DEPTH)) {
-            auto t = rt->gl.depth = handle_cast<GLTexture*>(depth.handle);
-            const auto twidth = std::max(1u, t->width >> depth.level);
-            const auto theight = std::max(1u, t->height >> depth.level);
-            tmin = { std::min(tmin.x, twidth), std::min(tmin.y, theight) };
-            tmax = { std::max(tmax.x, twidth), std::max(tmax.y, theight) };
+            assert_invariant(depth.handle);
+            rt->gl.depth = handle_cast<GLTexture*>(depth.handle);
             framebufferTexture(depth, rt, GL_DEPTH_ATTACHMENT);
+            checkDimensions(rt->gl.depth, depth.level);
         }
         if (any(targets & TargetBufferFlags::STENCIL)) {
-            auto t = rt->gl.stencil = handle_cast<GLTexture*>(stencil.handle);
-            const auto twidth = std::max(1u, t->width >> stencil.level);
-            const auto theight = std::max(1u, t->height >> stencil.level);
-            tmin = { std::min(tmin.x, twidth), std::min(tmin.y, theight) };
-            tmax = { std::max(tmax.x, twidth), std::max(tmax.y, theight) };
+            assert_invariant(stencil.handle);
+            rt->gl.stencil = handle_cast<GLTexture*>(stencil.handle);
             framebufferTexture(stencil, rt, GL_STENCIL_ATTACHMENT);
+            checkDimensions(rt->gl.stencil, stencil.level);
         }
     }
 
@@ -1211,19 +1224,19 @@ void OpenGLDriver::createSyncR(Handle<HwSync> fh, int) {
     DEBUG_MARKER()
 
     GLSync* f = handle_cast<GLSync *>(fh);
-    f->gl.sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-    CHECK_GL_ERROR(utils::slog.e)
+    f->handle = mContext.createFenceSync(mPlatform);
 
     // check the status of the sync once a frame, since we must do this from our thread
     std::weak_ptr<GLSync::State> const weak = f->result;
-    runEveryNowAndThen([sync = f->gl.sync, weak]() -> bool {
+    runEveryNowAndThen(
+            [&platform = mPlatform, &context = mContext, handle = f->handle, weak]() -> bool {
         auto result = weak.lock();
         if (result) {
-            GLenum const status = glClientWaitSync(sync, 0, 0u);
+            auto const status = context.clientWaitSync(platform, handle);
             result->status.store(status, std::memory_order_relaxed);
-            return (status != GL_TIMEOUT_EXPIRED);
+            return (status != OpenGLContext::FenceSync::Status::TIMEOUT_EXPIRED);
         }
-        return true; // we're done
+        return true; // the sync's state is gone, nobody can observe it anymore: stop polling
     });
 }
 
@@ -1332,9 +1345,6 @@ void OpenGLDriver::destroyTexture(Handle<HwTexture> th) {
                 assert_invariant(t->gl.target == GL_RENDERBUFFER);
                 glDeleteRenderbuffers(1, &t->gl.id);
             }
-            if (t->gl.fence) {
-                glDeleteSync(t->gl.fence);
-            }
             if (t->gl.sidecarRenderBufferMS) {
                 glDeleteRenderbuffers(1, &t->gl.sidecarRenderBufferMS);
             }
@@ -1413,10 +1423,9 @@ void OpenGLDriver::destroyTimerQuery(Handle<HwTimerQuery> tqh) {
 
 void OpenGLDriver::destroySync(Handle<HwSync> sh) {
     DEBUG_MARKER()
-
     if (sh) {
         GLSync* s = handle_cast<GLSync*>(sh);
-        glDeleteSync(s->gl.sync);
+        mContext.destroyFenceSync(mPlatform, s->handle);
         destruct(sh, s);
     }
 }
@@ -1970,7 +1979,9 @@ void OpenGLDriver::generateMipmaps(Handle<HwTexture> th) {
 
     auto& gl = mContext;
     GLTexture* t = handle_cast<GLTexture *>(th);
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
     assert_invariant(t->gl.target != GL_TEXTURE_2D_MULTISAMPLE);
+#endif
     // Note: glGenerateMimap can also fail if the internal format is not both
     // color-renderable and filterable (i.e.: doesn't work for depth)
     bindTexture(OpenGLContext::DUMMY_TEXTURE_BINDING, t);
@@ -2011,8 +2022,13 @@ void OpenGLDriver::setTextureData(GLTexture* t, uint32_t level,
 
     gl.pixelStore(GL_UNPACK_ROW_LENGTH, GLint(p.stride));
     gl.pixelStore(GL_UNPACK_ALIGNMENT, GLint(p.alignment));
-    gl.pixelStore(GL_UNPACK_SKIP_PIXELS, GLint(p.left));
-    gl.pixelStore(GL_UNPACK_SKIP_ROWS, GLint(p.top));
+
+    // This is equivalent to using GL_UNPACK_SKIP_PIXELS and GL_UNPACK_SKIP_ROWS
+    using PBD = PixelBufferDescriptor;
+    size_t const stride = p.stride ? p.stride : width;
+    size_t const bpp = PBD::computeDataSize(p.format, p.type, 1, 1, 1);
+    size_t const bpr = PBD::computeDataSize(p.format, p.type, stride, 1, p.alignment);
+    void const* const buffer = static_cast<char const*>(p.buffer) + p.left * bpp + bpr * p.top;
 
     switch (t->target) {
         case SamplerType::SAMPLER_EXTERNAL:
@@ -2026,7 +2042,7 @@ void OpenGLDriver::setTextureData(GLTexture* t, uint32_t level,
             assert_invariant(t->gl.target == GL_TEXTURE_2D);
             glTexSubImage2D(t->gl.target, GLint(level),
                     GLint(xoffset), GLint(yoffset),
-                    GLsizei(width), GLsizei(height), glFormat, glType, p.buffer);
+                    GLsizei(width), GLsizei(height), glFormat, glType, buffer);
             break;
         case SamplerType::SAMPLER_3D:
             assert_invariant(zoffset + depth <= std::max(1u, t->depth >> level));
@@ -2035,7 +2051,7 @@ void OpenGLDriver::setTextureData(GLTexture* t, uint32_t level,
             assert_invariant(t->gl.target == GL_TEXTURE_3D);
             glTexSubImage3D(t->gl.target, GLint(level),
                     GLint(xoffset), GLint(yoffset), GLint(zoffset),
-                    GLsizei(width), GLsizei(height), GLsizei(depth), glFormat, glType, p.buffer);
+                    GLsizei(width), GLsizei(height), GLsizei(depth), glFormat, glType, buffer);
             break;
         case SamplerType::SAMPLER_2D_ARRAY:
         case SamplerType::SAMPLER_CUBEMAP_ARRAY:
@@ -2047,7 +2063,7 @@ void OpenGLDriver::setTextureData(GLTexture* t, uint32_t level,
                     t->gl.target == GL_TEXTURE_CUBE_MAP_ARRAY);
             glTexSubImage3D(t->gl.target, GLint(level),
                     GLint(xoffset), GLint(yoffset), GLint(zoffset),
-                    GLsizei(width), GLsizei(height), GLsizei(depth), glFormat, glType, p.buffer);
+                    GLsizei(width), GLsizei(height), GLsizei(depth), glFormat, glType, buffer);
             break;
         case SamplerType::SAMPLER_CUBEMAP: {
             assert_invariant(t->gl.target == GL_TEXTURE_CUBE_MAP);
@@ -2063,7 +2079,7 @@ void OpenGLDriver::setTextureData(GLTexture* t, uint32_t level,
                 GLenum const target = getCubemapTarget(zoffset + face);
                 glTexSubImage2D(target, GLint(level), GLint(xoffset), GLint(yoffset),
                         GLsizei(width), GLsizei(height), glFormat, glType,
-                        static_cast<uint8_t const*>(p.buffer) + faceSize * face);
+                        static_cast<uint8_t const*>(buffer) + faceSize * face);
             }
             break;
         }
@@ -2337,13 +2353,14 @@ SyncStatus OpenGLDriver::getSyncStatus(Handle<HwSync> sh) {
         return SyncStatus::NOT_SIGNALED;
     }
     auto status = s->result->status.load(std::memory_order_relaxed);
+    using Status = OpenGLContext::FenceSync::Status;
     switch (status) {
-        case GL_CONDITION_SATISFIED:
-        case GL_ALREADY_SIGNALED:
+        case Status::CONDITION_SATISFIED:
+        case Status::ALREADY_SIGNALED:
             return SyncStatus::SIGNALED;
-        case GL_TIMEOUT_EXPIRED:
+        case Status::TIMEOUT_EXPIRED:
             return SyncStatus::NOT_SIGNALED;
-        case GL_WAIT_FAILED:
+        case Status::FAILURE:
         default:
             return SyncStatus::ERROR;
     }
@@ -2714,10 +2731,7 @@ void OpenGLDriver::readPixels(Handle<HwRenderTarget> src,
     GLenum const glFormat = getFormat(p.format);
     GLenum const glType = getType(p.type);
 
-    gl.pixelStore(GL_PACK_ROW_LENGTH,   (GLint)p.stride);
-    gl.pixelStore(GL_PACK_ALIGNMENT,    (GLint)p.alignment);
-    gl.pixelStore(GL_PACK_SKIP_PIXELS,  (GLint)p.left);
-    gl.pixelStore(GL_PACK_SKIP_ROWS,    (GLint)p.top);
+    gl.pixelStore(GL_PACK_ALIGNMENT, (GLint)p.alignment);
 
     /*
      * glReadPixel() operation...
@@ -2745,44 +2759,54 @@ void OpenGLDriver::readPixels(Handle<HwRenderTarget> src,
      */
 
     GLRenderTarget const* s = handle_cast<GLRenderTarget const*>(src);
-    gl.bindFramebuffer(GL_READ_FRAMEBUFFER, s->gl.fbo);
+
+    // glReadPixels doesn't resolve automatically, but it does with the auto-resolve extension,
+    // which we're always emulating. So if we have a resolved fbo (fbo_read), use that instead.
+    gl.bindFramebuffer(GL_READ_FRAMEBUFFER, s->gl.fbo_read ? s->gl.fbo_read : s->gl.fbo);
+
+    using PBD = PixelBufferDescriptor;
+
+    // The PBO only needs to accommodate the area we're reading, with alignment.
+    auto const pboSize = (GLsizeiptr)PBD::computeDataSize(
+            p.format, p.type, width, height, p.alignment);
 
     GLuint pbo;
     glGenBuffers(1, &pbo);
     gl.bindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
-    glBufferData(GL_PIXEL_PACK_BUFFER, (GLsizeiptr)p.size, nullptr, GL_STATIC_DRAW);
+    glBufferData(GL_PIXEL_PACK_BUFFER, pboSize, nullptr, GL_STATIC_DRAW);
     glReadPixels(GLint(x), GLint(y), GLint(width), GLint(height), glFormat, glType, nullptr);
     gl.bindBuffer(GL_PIXEL_PACK_BUFFER, 0);
     CHECK_GL_ERROR(utils::slog.e)
 
     // we're forced to make a copy on the heap because otherwise it deletes std::function<> copy
     // constructor.
-    auto* pUserBuffer = new PixelBufferDescriptor(std::move(p));
-    whenGpuCommandsComplete([this, width, height, pbo, pUserBuffer]() mutable {
+    auto* const pUserBuffer = new PixelBufferDescriptor(std::move(p));
+    whenGpuCommandsComplete([this, width, height, pbo, pboSize, pUserBuffer]() mutable {
         PixelBufferDescriptor& p = *pUserBuffer;
         auto& gl = mContext;
         gl.bindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
         void* vaddr = nullptr;
 #if defined(__EMSCRIPTEN__)
-        std::unique_ptr<uint8_t> clientBuffer = std::make_unique<uint8_t>(p.size);
-        glGetBufferSubData(GL_PIXEL_PACK_BUFFER, 0, p.size, clientBuffer.get());
+        auto clientBuffer = std::make_unique<uint8_t[]>(size_t(pboSize)); // pboSize-byte array
+        glGetBufferSubData(GL_PIXEL_PACK_BUFFER, 0, pboSize, clientBuffer.get());
         vaddr = clientBuffer.get();
 #else
-        vaddr = glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0,  (GLsizeiptr)p.size, GL_MAP_READ_BIT);
+        vaddr = glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, pboSize, GL_MAP_READ_BIT);
 #endif
         if (vaddr) {
             // now we need to flip the buffer vertically to match our API
             size_t const stride = p.stride ? p.stride : width;
-            size_t const bpp = PixelBufferDescriptor::computeDataSize(
-                    p.format, p.type, 1, 1, 1);
-            size_t const bpr = PixelBufferDescriptor::computeDataSize(
-                    p.format, p.type, stride, 1, p.alignment);
-            char const* head = (char const*)vaddr + p.left * bpp + bpr * p.top;
-            char* tail = (char*)p.buffer + p.left * bpp + bpr * (p.top + height - 1);
+            size_t const bpp = PBD::computeDataSize(p.format, p.type, 1, 1, 1);
+            size_t const dstBpr = PBD::computeDataSize(p.format, p.type, stride, 1, p.alignment);
+            char* pDst = (char*)p.buffer + p.left * bpp + dstBpr * (p.top + height - 1);
+
+            size_t const srcBpr = PBD::computeDataSize(p.format, p.type, width, 1, p.alignment);
+            char const* pSrc = (char const*)vaddr;
+
             for (size_t i = 0; i < height; ++i) {
-                memcpy(tail, head, bpp * width);
-                head += bpr;
-                tail -= bpr;
+                memcpy(pDst, pSrc, bpp * width);
+                pSrc += srcBpr;
+                pDst -= dstBpr;
             }
 #if !defined(__EMSCRIPTEN__)
             glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
@@ -2849,7 +2873,7 @@ void OpenGLDriver::readBufferSubData(backend::BufferObjectHandle boh,
 }
 
 void OpenGLDriver::whenGpuCommandsComplete(std::function<void()> fn) noexcept {
-    GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+    OpenGLContext::FenceSync sync = mContext.createFenceSync(mPlatform);
     mGpuCommandCompleteOps.emplace_back(sync, std::move(fn));
     CHECK_GL_ERROR(utils::slog.e)
 }
@@ -2862,15 +2886,16 @@ void OpenGLDriver::executeGpuCommandsCompleteOps() noexcept {
     auto& v = mGpuCommandCompleteOps;
     auto it = v.begin();
     while (it != v.end()) {
-        GLenum const status = glClientWaitSync(it->first, 0, 0);
-        if (status == GL_ALREADY_SIGNALED || status == GL_CONDITION_SATISFIED) {
+        using Status = OpenGLContext::FenceSync::Status;
+        auto const status = mContext.clientWaitSync(mPlatform, it->first);
+        if (status == Status::ALREADY_SIGNALED || status == Status::CONDITION_SATISFIED) {
             it->second();
-            glDeleteSync(it->first);
+            mContext.destroyFenceSync(mPlatform, it->first);
             it = v.erase(it);
-        } else if (UTILS_UNLIKELY(status == GL_WAIT_FAILED)) {
+        } else if (UTILS_UNLIKELY(status == Status::FAILURE)) {
             // This should never happen, but is very problematic if it does, as we might leak
             // some data depending on what the callback does. However, we clean up our own state.
-            glDeleteSync(it->first);
+            mContext.destroyFenceSync(mPlatform, it->first);
             it = v.erase(it);
         } else {
             ++it;
diff --git a/filament/backend/src/opengl/OpenGLDriver.h b/filament/backend/src/opengl/OpenGLDriver.h
index d9c8b9ccff0c..7df05e84854a 100644
--- a/filament/backend/src/opengl/OpenGLDriver.h
+++ b/filament/backend/src/opengl/OpenGLDriver.h
@@ -122,7 +122,6 @@ class OpenGLDriver final : public DriverBase {
             GLenum target = 0;
             GLenum internalFormat = 0;
             GLuint sidecarRenderBufferMS = 0;  // multi-sample sidecar renderbuffer
-            mutable GLsync fence = {};
 
             // texture parameters go here too
             GLfloat anisotropy = 1.0;
@@ -186,11 +185,10 @@ class OpenGLDriver final : public DriverBase {
     struct GLSync : public HwSync {
         using HwSync::HwSync;
         struct State {
-            std::atomic<GLenum> status{ GL_TIMEOUT_EXPIRED };
+            std::atomic<OpenGLContext::FenceSync::Status> status{
+                OpenGLContext::FenceSync::Status::TIMEOUT_EXPIRED };
         };
-        struct {
-            GLsync sync;
-        } gl;
+        OpenGLContext::FenceSync handle{};
         std::shared_ptr<State> result{ std::make_shared<GLSync::State>() };
     };
 
@@ -364,7 +362,7 @@ class OpenGLDriver final : public DriverBase {
     // tasks executed on the main thread after the fence signaled
     void whenGpuCommandsComplete(std::function<void()> fn) noexcept;
     void executeGpuCommandsCompleteOps() noexcept;
-    std::vector<std::pair<GLsync, std::function<void()>>> mGpuCommandCompleteOps;
+    std::vector<std::pair<OpenGLContext::FenceSync, std::function<void()>>> mGpuCommandCompleteOps;
 
     // tasks regularly executed on the main thread at until they return true
     void runEveryNowAndThen(std::function<bool()> fn) noexcept;
diff --git a/filament/backend/src/opengl/OpenGLProgram.cpp b/filament/backend/src/opengl/OpenGLProgram.cpp
index 274c052f5ec0..2605306dac1d 100644
--- a/filament/backend/src/opengl/OpenGLProgram.cpp
+++ b/filament/backend/src/opengl/OpenGLProgram.cpp
@@ -121,11 +121,21 @@ void OpenGLProgram::compileShaders(OpenGLContext& context,
     UTILS_NOUNROLL
     for (size_t i = 0; i < Program::SHADER_TYPE_COUNT; i++) {
         const ShaderStage stage = static_cast<ShaderStage>(i);
-        GLenum glShaderType;
+        GLenum glShaderType{};
         switch (stage) {
-            case ShaderStage::VERTEX:    glShaderType = GL_VERTEX_SHADER;    break;
-            case ShaderStage::FRAGMENT:  glShaderType = GL_FRAGMENT_SHADER;  break;
-            case ShaderStage::COMPUTE:   glShaderType = GL_COMPUTE_SHADER;   break;
+            case ShaderStage::VERTEX:
+                glShaderType = GL_VERTEX_SHADER;
+                break;
+            case ShaderStage::FRAGMENT:
+                glShaderType = GL_FRAGMENT_SHADER;
+                break;
+            case ShaderStage::COMPUTE:
+#if defined(GL_VERSION_4_1) || defined(GL_ES_VERSION_3_1)
+                glShaderType = GL_COMPUTE_SHADER;
+#else
+                continue;
+#endif
+                break;
         }
 
         if (UTILS_LIKELY(!shadersSource[i].empty())) {
@@ -406,11 +416,6 @@ void OpenGLProgram::updateSamplers(OpenGLDriver* gld) const noexcept {
             const GLTexture* const t = sb->textureUnitEntries[j].texture;
             GLuint const s = sb->textureUnitEntries[j].sampler;
             if (t) { // program may not use all samplers of sampler group
-                if (UTILS_UNLIKELY(t->gl.fence)) {
-                    glWaitSync(t->gl.fence, 0, GL_TIMEOUT_IGNORED);
-                    glDeleteSync(t->gl.fence);
-                    t->gl.fence = nullptr;
-                }
                 gld->bindTexture(tmu, t);
                 gld->bindSampler(tmu, s);
             }
diff --git a/filament/backend/src/opengl/OpenGLTimerQuery.cpp b/filament/backend/src/opengl/OpenGLTimerQuery.cpp
index 4c5857498965..dd07ac30f891 100644
--- a/filament/backend/src/opengl/OpenGLTimerQuery.cpp
+++ b/filament/backend/src/opengl/OpenGLTimerQuery.cpp
@@ -34,8 +34,9 @@ OpenGLTimerQueryInterface::~OpenGLTimerQueryInterface() = default;
 
 // ------------------------------------------------------------------------------------------------
 
-TimerQueryNative::TimerQueryNative(OpenGLContext& context)
-        : gl(context) {
+#if defined(GL_VERSION_3_3) || defined(GL_EXT_disjoint_timer_query)
+
+TimerQueryNative::TimerQueryNative(OpenGLContext&) {
 }
 
 TimerQueryNative::~TimerQueryNative() = default;
@@ -44,12 +45,12 @@ void TimerQueryNative::flush() {
 }
 
 void TimerQueryNative::beginTimeElapsedQuery(GLTimerQuery* query) {
-    gl.beginQuery(GL_TIME_ELAPSED, query->gl.query);
+    glBeginQuery(GL_TIME_ELAPSED, query->gl.query);
     CHECK_GL_ERROR(utils::slog.e)
 }
 
 void TimerQueryNative::endTimeElapsedQuery(GLTimerQuery*) {
-    gl.endQuery(GL_TIME_ELAPSED);
+    glEndQuery(GL_TIME_ELAPSED);
     CHECK_GL_ERROR(utils::slog.e)
 }
 
@@ -62,14 +63,14 @@ bool TimerQueryNative::queryResultAvailable(GLTimerQuery* query) {
 
 uint64_t TimerQueryNative::queryResult(GLTimerQuery* query) {
     GLuint64 elapsedTime = 0;
-    // IOS doesn't have glGetQueryObjectui64v, we'll never end-up here on ios anyways
-#ifndef IOS
+    // we won't end up here if we're on ES and don't have GL_EXT_disjoint_timer_query
     glGetQueryObjectui64v(query->gl.query, GL_QUERY_RESULT, &elapsedTime);
-#endif
     CHECK_GL_ERROR(utils::slog.e)
     return elapsedTime;
 }
 
+#endif
+
 // ------------------------------------------------------------------------------------------------
 
 OpenGLTimerQueryFence::OpenGLTimerQueryFence(OpenGLPlatform& platform)
@@ -85,7 +86,7 @@ OpenGLTimerQueryFence::OpenGLTimerQueryFence(OpenGLPlatform& platform)
             });
             exitRequested = mExitRequested;
             if (!queue.empty()) {
-                Job job(queue.front());
+                Job const job(queue.front());
                 queue.erase(queue.begin());
                 lock.unlock();
                 job();
@@ -105,7 +106,7 @@ OpenGLTimerQueryFence::~OpenGLTimerQueryFence() {
 }
 
 void OpenGLTimerQueryFence::enqueue(OpenGLTimerQueryFence::Job&& job) {
-    std::unique_lock<utils::Mutex> lock(mLock);
+    std::unique_lock<utils::Mutex> const lock(mLock);
     mQueue.push_back(std::forward<Job>(job));
     mCondition.notify_one();
 }
@@ -114,9 +115,9 @@ void OpenGLTimerQueryFence::flush() {
     // Use calls to flush() as a proxy for when the GPU work started.
     GLTimerQuery* query = mActiveQuery;
     if (query) {
-        uint64_t elapsed = query->gl.emulation->elapsed.load(std::memory_order_relaxed);
+        uint64_t const elapsed = query->gl.emulation->elapsed.load(std::memory_order_relaxed);
         if (!elapsed) {
-            uint64_t now = clock::now().time_since_epoch().count();
+            uint64_t const now = clock::now().time_since_epoch().count();
             query->gl.emulation->elapsed.store(now, std::memory_order_relaxed);
             //SYSTRACE_CONTEXT();
             //SYSTRACE_ASYNC_BEGIN("gpu", query->gl.query);
@@ -139,7 +140,7 @@ void OpenGLTimerQueryFence::beginTimeElapsedQuery(GLTimerQuery* query) {
 void OpenGLTimerQueryFence::endTimeElapsedQuery(GLTimerQuery* query) {
     assert_invariant(mActiveQuery);
     Platform::Fence* fence = mPlatform.createFence();
-    std::weak_ptr<GLTimerQuery::State> weak = query->gl.emulation;
+    std::weak_ptr<GLTimerQuery::State> const weak = query->gl.emulation;
     mActiveQuery = nullptr;
     //uint32_t cookie = cookie = query->gl.query;
     push([&platform = mPlatform, fence, weak]() {
diff --git a/filament/backend/src/opengl/OpenGLTimerQuery.h b/filament/backend/src/opengl/OpenGLTimerQuery.h
index 69cb08490712..e6fc88e2c2fa 100644
--- a/filament/backend/src/opengl/OpenGLTimerQuery.h
+++ b/filament/backend/src/opengl/OpenGLTimerQuery.h
@@ -48,6 +48,8 @@ class OpenGLTimerQueryInterface {
     virtual uint64_t queryResult(GLTimerQuery* query) = 0;
 };
 
+#if defined(GL_VERSION_3_3) || defined(GL_EXT_disjoint_timer_query)
+
 class TimerQueryNative : public OpenGLTimerQueryInterface {
 public:
     explicit TimerQueryNative(OpenGLContext& context);
@@ -58,9 +60,10 @@ class TimerQueryNative : public OpenGLTimerQueryInterface {
     void endTimeElapsedQuery(GLTimerQuery* query) override;
     bool queryResultAvailable(GLTimerQuery* query) override;
     uint64_t queryResult(GLTimerQuery* query) override;
-    OpenGLContext& gl;
 };
 
+#endif
+
 class OpenGLTimerQueryFence : public OpenGLTimerQueryInterface {
 public:
     explicit OpenGLTimerQueryFence(OpenGLPlatform& platform);
diff --git a/filament/backend/src/opengl/gl_headers.cpp b/filament/backend/src/opengl/gl_headers.cpp
index fa1b1414f7b4..2c29aae796f8 100644
--- a/filament/backend/src/opengl/gl_headers.cpp
+++ b/filament/backend/src/opengl/gl_headers.cpp
@@ -16,11 +16,17 @@
 
 #include "gl_headers.h"
 
-#if defined(__ANDROID__) || defined(FILAMENT_USE_EXTERNAL_GLES3) || defined(__EMSCRIPTEN__)
+#if defined(FILAMENT_IMPORT_ENTRY_POINTS)
 
 #include <EGL/egl.h>
 #include <mutex>
 
+// for non EGL platforms, we'd need to implement this differently. Currently, it's not a problem.
+template<typename T>
+static void getProcAddress(T& pfn, const char* name) noexcept {
+    pfn = (T)eglGetProcAddress(name);
+}
+
 namespace glext {
 #ifdef GL_QCOM_tiled_rendering
 PFNGLSTARTTILINGQCOMPROC glStartTilingQCOM;
@@ -58,71 +64,39 @@ PFNGLDISPATCHCOMPUTEPROC glDispatchCompute;
 static std::once_flag sGlExtInitialized;
 
 void importGLESExtensionsEntryPoints() {
-    std::call_once(sGlExtInitialized, []() {
+    std::call_once(sGlExtInitialized, +[]() {
 #ifdef GL_QCOM_tiled_rendering
-        glStartTilingQCOM =
-                (PFNGLSTARTTILINGQCOMPROC)eglGetProcAddress(
-                        "glStartTilingQCOM");
-
-        glEndTilingQCOM =
-                (PFNGLENDTILINGQCOMPROC)eglGetProcAddress(
-                        "glEndTilingQCOM");
+        getProcAddress(glStartTilingQCOM, "glStartTilingQCOM");
+        getProcAddress(glEndTilingQCOM, "glEndTilingQCOM");
 #endif
-
 #ifdef GL_OES_EGL_image
-        glEGLImageTargetTexture2DOES =
-                (PFNGLEGLIMAGETARGETTEXTURE2DOESPROC)eglGetProcAddress(
-                        "glEGLImageTargetTexture2DOES");
+        getProcAddress(glEGLImageTargetTexture2DOES, "glEGLImageTargetTexture2DOES");
 #endif
-
 #if GL_EXT_debug_marker
-        glInsertEventMarkerEXT =
-                (PFNGLINSERTEVENTMARKEREXTPROC)eglGetProcAddress(
-                        "glInsertEventMarkerEXT");
-
-        glPushGroupMarkerEXT =
-                (PFNGLPUSHGROUPMARKEREXTPROC)eglGetProcAddress(
-                        "glPushGroupMarkerEXT");
-
-        glPopGroupMarkerEXT =
-                (PFNGLPOPGROUPMARKEREXTPROC)eglGetProcAddress(
-                        "glPopGroupMarkerEXT");
+        getProcAddress(glInsertEventMarkerEXT, "glInsertEventMarkerEXT");
+        getProcAddress(glPushGroupMarkerEXT, "glPushGroupMarkerEXT");
+        getProcAddress(glPopGroupMarkerEXT, "glPopGroupMarkerEXT");
 #endif
 #if GL_EXT_multisampled_render_to_texture
-        glFramebufferTexture2DMultisampleEXT =
-                (PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC)eglGetProcAddress(
-                        "glFramebufferTexture2DMultisampleEXT");
-        glRenderbufferStorageMultisampleEXT =
-                (PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC)eglGetProcAddress(
-                        "glRenderbufferStorageMultisampleEXT");
+        getProcAddress(glFramebufferTexture2DMultisampleEXT, "glFramebufferTexture2DMultisampleEXT");
+        getProcAddress(glRenderbufferStorageMultisampleEXT, "glRenderbufferStorageMultisampleEXT");
 #endif
 #ifdef GL_KHR_debug
-        glDebugMessageCallbackKHR =
-                (PFNGLDEBUGMESSAGECALLBACKKHRPROC)eglGetProcAddress(
-                        "glDebugMessageCallbackKHR");
-        glGetDebugMessageLogKHR =
-                (PFNGLGETDEBUGMESSAGELOGKHRPROC)eglGetProcAddress(
-                        "glGetDebugMessageLogKHR");
+        getProcAddress(glDebugMessageCallbackKHR, "glDebugMessageCallbackKHR");
+        getProcAddress(glGetDebugMessageLogKHR, "glGetDebugMessageLogKHR");
 #endif
 #ifdef GL_EXT_disjoint_timer_query
-        glGetQueryObjectui64v =
-                (PFNGLGETQUERYOBJECTUI64VEXTPROC)eglGetProcAddress(
-                        "glGetQueryObjectui64vEXT");
+        getProcAddress(glGetQueryObjectui64v, "glGetQueryObjectui64vEXT");
 #endif
-    });
 #ifdef GL_EXT_clip_control
-    glClipControl =
-            (PFNGLCLIPCONTROLEXTPROC)eglGetProcAddress(
-                    "glClipControlEXT");
+        getProcAddress(glClipControl, "glClipControlEXT");
 #endif
-
 #if defined(__ANDROID__)
-    glDispatchCompute =
-            (PFNGLDISPATCHCOMPUTEPROC)eglGetProcAddress(
-                    "glDispatchCompute");
+        getProcAddress(glDispatchCompute, "glDispatchCompute");
 #endif
+    });
 }
 
 } // namespace glext
 
-#endif
+#endif // defined(FILAMENT_IMPORT_ENTRY_POINTS)
diff --git a/filament/backend/src/opengl/gl_headers.h b/filament/backend/src/opengl/gl_headers.h
index 68fc38491032..e59d341cc29d 100644
--- a/filament/backend/src/opengl/gl_headers.h
+++ b/filament/backend/src/opengl/gl_headers.h
@@ -26,45 +26,6 @@
     #endif
     #include <GLES2/gl2ext.h>
 
-    /* The Android NDK doesn't expose extensions, fake it with eglGetProcAddress */
-    namespace glext {
-        // importGLESExtensionsEntryPoints is thread-safe and can be called multiple times.
-        // it is currently called from PlatformEGL.
-        void importGLESExtensionsEntryPoints();
-
-#ifdef GL_QCOM_tiled_rendering
-        extern PFNGLSTARTTILINGQCOMPROC glStartTilingQCOM;
-        extern PFNGLENDTILINGQCOMPROC glEndTilingQCOM;
-#endif
-#ifdef GL_OES_EGL_image
-        extern PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES;
-#endif
-#ifdef GL_EXT_debug_marker
-        extern PFNGLINSERTEVENTMARKEREXTPROC glInsertEventMarkerEXT;
-        extern PFNGLPUSHGROUPMARKEREXTPROC glPushGroupMarkerEXT;
-        extern PFNGLPOPGROUPMARKEREXTPROC glPopGroupMarkerEXT;
-#endif
-#ifdef GL_EXT_multisampled_render_to_texture
-        extern PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glRenderbufferStorageMultisampleEXT;
-        extern PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC glFramebufferTexture2DMultisampleEXT;
-#endif
-#ifdef GL_KHR_debug
-        extern PFNGLDEBUGMESSAGECALLBACKKHRPROC glDebugMessageCallbackKHR;
-        extern PFNGLGETDEBUGMESSAGELOGKHRPROC glGetDebugMessageLogKHR;
-#endif
-#ifdef GL_EXT_disjoint_timer_query
-        extern PFNGLGETQUERYOBJECTUI64VEXTPROC glGetQueryObjectui64v;
-#endif
-#ifdef GL_EXT_clip_control
-        extern PFNGLCLIPCONTROLEXTPROC glClipControl;
-#endif
-#if defined(__ANDROID__)
-        extern PFNGLDISPATCHCOMPUTEPROC glDispatchCompute;
-#endif
-    }
-
-    using namespace glext;
-
 #elif defined(IOS)
 
     #define GLES_SILENCE_DEPRECATION
@@ -85,111 +46,110 @@
 
 #endif
 
+
 #if (!defined(GL_ES_VERSION_2_0) && !defined(GL_VERSION_4_1))
 #error "Minimum header version must be OpenGL ES 2.0 or OpenGL 4.1"
 #endif
 
 
 /*
- * Since we need ES3.1 headers and iOS only has ES3.0, we also define the constants we
- * need to avoid many #ifdef in the actual code.
+ * GLES extensions
  */
 
-#if defined(GL_ES_VERSION_2_0)
+#if defined(GL_ES_VERSION_2_0)  // this basically means all versions of GLES
 
-#ifdef GL_EXT_disjoint_timer_query
-#    ifndef GL_TIME_ELAPSED
-#        define GL_TIME_ELAPSED             GL_TIME_ELAPSED_EXT
-#    endif
-#endif
+#if defined(IOS)
+
+// iOS headers only provide prototypes, nothing to do.
 
+#else
+
+#define FILAMENT_IMPORT_ENTRY_POINTS
+
+/* The Android NDK doesn't expose extensions, fake it with eglGetProcAddress */
+namespace glext {
+// importGLESExtensionsEntryPoints is thread-safe and can be called multiple times.
+// it is currently called from PlatformEGL.
+void importGLESExtensionsEntryPoints();
+
+#ifdef GL_QCOM_tiled_rendering
+extern PFNGLSTARTTILINGQCOMPROC glStartTilingQCOM;
+extern PFNGLENDTILINGQCOMPROC glEndTilingQCOM;
+#endif
+#ifdef GL_OES_EGL_image
+extern PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES;
+#endif
+#ifdef GL_EXT_debug_marker
+extern PFNGLINSERTEVENTMARKEREXTPROC glInsertEventMarkerEXT;
+extern PFNGLPUSHGROUPMARKEREXTPROC glPushGroupMarkerEXT;
+extern PFNGLPOPGROUPMARKEREXTPROC glPopGroupMarkerEXT;
+#endif
+#ifdef GL_EXT_multisampled_render_to_texture
+extern PFNGLRENDERBUFFERSTORAGEMULTISAMPLEEXTPROC glRenderbufferStorageMultisampleEXT;
+extern PFNGLFRAMEBUFFERTEXTURE2DMULTISAMPLEEXTPROC glFramebufferTexture2DMultisampleEXT;
+#endif
+#ifdef GL_KHR_debug
+extern PFNGLDEBUGMESSAGECALLBACKKHRPROC glDebugMessageCallbackKHR;
+extern PFNGLGETDEBUGMESSAGELOGKHRPROC glGetDebugMessageLogKHR;
+#endif
 #ifdef GL_EXT_clip_control
-#   ifndef GL_LOWER_LEFT
-#      define GL_LOWER_LEFT                 GL_LOWER_LEFT_EXT
-#   endif
-#   ifndef GL_ZERO_TO_ONE
-#      define GL_ZERO_TO_ONE                GL_ZERO_TO_ONE_EXT
-#   endif
+extern PFNGLCLIPCONTROLEXTPROC glClipControl;
+#endif
+#ifdef GL_EXT_disjoint_timer_query
+extern PFNGLGETQUERYOBJECTUI64VEXTPROC glGetQueryObjectui64v;
 #endif
+#if defined(__ANDROID__)
+extern PFNGLDISPATCHCOMPUTEPROC glDispatchCompute;
+#endif
+} // namespace glext
+
+using namespace glext;
 
-#ifndef GL_TEXTURE_CUBE_MAP_ARRAY
-#   define GL_TEXTURE_CUBE_MAP_ARRAY        0x9009
 #endif
 
 // Prevent lots of #ifdef's between desktop and mobile
 
-#if defined(GL_KHR_debug)
-#   ifndef GL_DEBUG_OUTPUT
-#      define GL_DEBUG_OUTPUT                   GL_DEBUG_OUTPUT_KHR
-#   endif
-#   ifndef GL_DEBUG_OUTPUT_SYNCHRONOUS
-#      define GL_DEBUG_OUTPUT_SYNCHRONOUS       GL_DEBUG_OUTPUT_SYNCHRONOUS_KHR
-#   endif
-
-#   ifndef GL_DEBUG_SEVERITY_HIGH
-#      define GL_DEBUG_SEVERITY_HIGH            GL_DEBUG_SEVERITY_HIGH_KHR
-#   endif
-#   ifndef GL_DEBUG_SEVERITY_MEDIUM
-#      define GL_DEBUG_SEVERITY_MEDIUM          GL_DEBUG_SEVERITY_MEDIUM_KHR
-#   endif
-#   ifndef GL_DEBUG_SEVERITY_LOW
-#      define GL_DEBUG_SEVERITY_LOW             GL_DEBUG_SEVERITY_LOW_KHR
-#   endif
-#   ifndef GL_DEBUG_SEVERITY_NOTIFICATION
-#      define GL_DEBUG_SEVERITY_NOTIFICATION    GL_DEBUG_SEVERITY_NOTIFICATION_KHR
-#   endif
-
-#   ifndef GL_DEBUG_TYPE_MARKER
-#      define GL_DEBUG_TYPE_MARKER              GL_DEBUG_TYPE_MARKER_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_ERROR
-#      define GL_DEBUG_TYPE_ERROR               GL_DEBUG_TYPE_ERROR_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR
-#      define GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR
-#      define GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR  GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_PORTABILITY
-#      define GL_DEBUG_TYPE_PORTABILITY         GL_DEBUG_TYPE_PORTABILITY_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_PERFORMANCE
-#      define GL_DEBUG_TYPE_PERFORMANCE         GL_DEBUG_TYPE_PERFORMANCE_KHR
-#   endif
-#   ifndef GL_DEBUG_TYPE_OTHER
-#      define GL_DEBUG_TYPE_OTHER               GL_DEBUG_TYPE_OTHER_KHR
-#   endif
-
-#   define glDebugMessageCallback            glDebugMessageCallbackKHR
-#endif
-
-/* The iOS SDK only provides OpenGL ES headers up to 3.0. Filament works with OpenGL 3.0, but
- * requires ES3.1 headers */
-#if !defined(GL_ES_VERSION_3_1)
-    #define GL_SHADER_STORAGE_BUFFER                0x90D2
-    #define GL_COMPUTE_SHADER                       0x91B9
-
-    #define GL_TEXTURE_2D_MULTISAMPLE               0x9100
-
-// FIXME: The GL_TIME_ELAPSED define is used unconditionally in Filament, but
-// requires extension support.
-#ifndef GL_TIME_ELAPSED
-    #define GL_TIME_ELAPSED                         0x88BF
-#endif
-
-    #define GL_TEXTURE_BINDING_CUBE_MAP_ARRAY       0x900A
-    #define GL_SAMPLER_CUBE_MAP_ARRAY               0x900C
-    #define GL_SAMPLER_CUBE_MAP_ARRAY_SHADOW        0x900D
-    #define GL_INT_SAMPLER_CUBE_MAP_ARRAY           0x900E
-    #define GL_UNSIGNED_INT_SAMPLER_CUBE_MAP_ARRAY  0x900F
-    #define GL_IMAGE_CUBE_MAP_ARRAY                 0x9054
-    #define GL_INT_IMAGE_CUBE_MAP_ARRAY             0x905F
-    #define GL_UNSIGNED_INT_IMAGE_CUBE_MAP_ARRAY    0x906A
+#ifdef GL_EXT_disjoint_timer_query
+#   define GL_TIME_ELAPSED                          GL_TIME_ELAPSED_EXT
+#endif
+
+#ifdef GL_EXT_clip_control
+#   define GL_LOWER_LEFT                            GL_LOWER_LEFT_EXT
+#   define GL_ZERO_TO_ONE                           GL_ZERO_TO_ONE_EXT
+#endif
+
+// we need GL_TEXTURE_CUBE_MAP_ARRAY defined, but we won't use it if the extension/feature
+// is not available.
+#if defined(GL_EXT_texture_cube_map_array)
+#   define GL_TEXTURE_CUBE_MAP_ARRAY                GL_TEXTURE_CUBE_MAP_ARRAY_EXT
+#else
+#   define GL_TEXTURE_CUBE_MAP_ARRAY                0x9009
+#endif
 
+#if defined(GL_KHR_debug)
+#   define GL_DEBUG_OUTPUT                          GL_DEBUG_OUTPUT_KHR
+#   define GL_DEBUG_OUTPUT_SYNCHRONOUS              GL_DEBUG_OUTPUT_SYNCHRONOUS_KHR
+#   define GL_DEBUG_SEVERITY_HIGH                   GL_DEBUG_SEVERITY_HIGH_KHR
+#   define GL_DEBUG_SEVERITY_MEDIUM                 GL_DEBUG_SEVERITY_MEDIUM_KHR
+#   define GL_DEBUG_SEVERITY_LOW                    GL_DEBUG_SEVERITY_LOW_KHR
+#   define GL_DEBUG_SEVERITY_NOTIFICATION           GL_DEBUG_SEVERITY_NOTIFICATION_KHR
+#   define GL_DEBUG_TYPE_MARKER                     GL_DEBUG_TYPE_MARKER_KHR
+#   define GL_DEBUG_TYPE_ERROR                      GL_DEBUG_TYPE_ERROR_KHR
+#   define GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR        GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR_KHR
+#   define GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR         GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR_KHR
+#   define GL_DEBUG_TYPE_PORTABILITY                GL_DEBUG_TYPE_PORTABILITY_KHR
+#   define GL_DEBUG_TYPE_PERFORMANCE                GL_DEBUG_TYPE_PERFORMANCE_KHR
+#   define GL_DEBUG_TYPE_OTHER                      GL_DEBUG_TYPE_OTHER_KHR
+#   define glDebugMessageCallback                   glDebugMessageCallbackKHR
 #endif
+
 #endif // GL_ES_VERSION_2_0
 
+// This is just to simplify the implementation (i.e. so we don't have to have #ifdefs everywhere)
+#ifndef GL_OES_EGL_image_external
+#define GL_TEXTURE_EXTERNAL_OES           0x8D65
+#endif
+
 // This is an odd duck function that exists in WebGL 2.0 but not in OpenGL ES.
 #if defined(__EMSCRIPTEN__)
 extern "C" {
@@ -222,11 +182,6 @@ void glGetBufferSubData(GLenum target, GLintptr offset, GLsizeiptr size, void *d
 #   define BACKEND_OPENGL_LEVEL        BACKEND_OPENGL_LEVEL_GLES20
 #endif
 
-// This is just to simplify the implementation (i.e. so we don't have to have #ifdefs everywhere)
-#ifndef GL_OES_EGL_image_external
-#define GL_TEXTURE_EXTERNAL_OES           0x8D65
-#endif
-
 #include "NullGLES.h"
 
 #endif // TNT_FILAMENT_BACKEND_OPENGL_GL_HEADERS_H
diff --git a/filament/backend/src/opengl/platforms/PlatformEGL.cpp b/filament/backend/src/opengl/platforms/PlatformEGL.cpp
index bfff2f6cae58..3d54ef2ffb81 100644
--- a/filament/backend/src/opengl/platforms/PlatformEGL.cpp
+++ b/filament/backend/src/opengl/platforms/PlatformEGL.cpp
@@ -94,9 +94,7 @@ Driver* PlatformEGL::createDriver(void* sharedContext, const Platform::DriverCon
         return nullptr;
     }
 
-#if defined(__ANDROID__) || defined(FILAMENT_USE_EXTERNAL_GLES3) || defined(__EMSCRIPTEN__)
-    // PlatofrmEGL is used with and without GLES, but this function is only
-    // meaningful when GLES is used.
+#if defined(FILAMENT_IMPORT_ENTRY_POINTS)
     importGLESExtensionsEntryPoints();
 #endif
 
diff --git a/filament/backend/test/test_StencilBuffer.cpp b/filament/backend/test/test_StencilBuffer.cpp
index caa23862563f..622237131009 100644
--- a/filament/backend/test/test_StencilBuffer.cpp
+++ b/filament/backend/test/test_StencilBuffer.cpp
@@ -262,7 +262,7 @@ TEST_F(BasicStencilBufferTest, StencilBufferMSAA) {
     api.stopCapture(0);
     api.endFrame(0);
 
-    readPixelsAndAssertHash("StencilBufferAutoResolve", 512, 512, renderTarget1, 0xC7E34C43, true);
+    readPixelsAndAssertHash("StencilBufferAutoResolve", 512, 512, renderTarget1, 0x6CEFAC8F, true);
 
     flushAndWait();
     getDriver().purge();
diff --git a/filament/include/filament/Box.h b/filament/include/filament/Box.h
index f4cdb5fab6e4..36f19924a164 100644
--- a/filament/include/filament/Box.h
+++ b/filament/include/filament/Box.h
@@ -104,22 +104,24 @@ class UTILS_PUBLIC Box {
     }
 
     /**
-     * Computes the bounding box of a box transformed by a rigid transform
+     * Transforms a Box by a linear transform and a translation.
+     *
+     * @param m a 3x3 matrix, the linear transform
+     * @param t a float3, the translation
      * @param box the box to transform
-     * @param m a 4x4 matrix that must be a rigid transform
-     * @return the bounding box of the transformed box.
-     *         Result is undefined if \p m is not a rigid transform
+     * @return the bounding box of the transformed box
      */
-    friend Box rigidTransform(Box const& box, const math::mat4f& m) noexcept;
+    static Box transform(const math::mat3f& m, math::float3 const& t, const Box& box) noexcept {
+        return { m * box.center + t, abs(m) * box.halfExtent };
+    }
 
     /**
-     * Computes the bounding box of a box transformed by a rigid transform
-     * @param box the box to transform
-     * @param m a 3x3 matrix that must be a rigid transform
-     * @return the bounding box of the transformed box.
-     *         Result is undefined if \p m is not a rigid transform
+     * @deprecated Use transform() instead
+     * @see transform()
      */
-    friend Box rigidTransform(Box const& box, const math::mat3f& m) noexcept;
+    friend Box rigidTransform(Box const& box, const math::mat4f& m) noexcept {
+        return transform(m.upperLeft(), m[3].xyz, box);
+    }
 };
 
 /**
@@ -174,7 +176,18 @@ struct UTILS_PUBLIC Aabb {
     /**
      * Returns the 8 corner vertices of the AABB.
      */
-    Corners getCorners() const;
+    Corners getCorners() const {
+        return Aabb::Corners{ .vertices = {
+                { min.x, min.y, min.z },
+                { max.x, min.y, min.z },
+                { min.x, max.y, min.z },
+                { max.x, max.y, min.z },
+                { min.x, min.y, max.z },
+                { max.x, min.y, max.z },
+                { min.x, max.y, max.z },
+                { max.x, max.y, max.z },
+        }};
+    }
 
     /**
      * Returns whether the box contains a given point.
@@ -182,15 +195,44 @@ struct UTILS_PUBLIC Aabb {
      * @param p the point to test
      * @return the maximum signed distance to the box. Negative if p is in the box
      */
-    float contains(math::float3 p) const noexcept;
+    float contains(math::float3 p) const noexcept {
+        float d = min.x - p.x;
+        d = std::max(d, min.y - p.y);
+        d = std::max(d, min.z - p.z);
+        d = std::max(d, p.x - max.x);
+        d = std::max(d, p.y - max.y);
+        d = std::max(d, p.z - max.z);
+        return d;
+    }
 
     /**
      * Applies an affine transformation to the AABB.
      *
-     * @param m the 4x4 transformation to apply
+     * @param m the 3x3 transformation to apply
+     * @param t the translation
      * @return the transformed box
      */
-    Aabb transform(const math::mat4f& m) const noexcept;
+    static Aabb transform(const math::mat3f& m, math::float3 const& t, const Aabb& box) noexcept {
+        // Fast AABB transformation per Jim Arvo in Graphics Gems (1990).
+        Aabb result{ t, t };
+        for (size_t col = 0; col < 3; ++col) {
+            for (size_t row = 0; row < 3; ++row) {
+                const float a = m[col][row] * box.min[col];
+                const float b = m[col][row] * box.max[col];
+                result.min[row] += a < b ? a : b;
+                result.max[row] += a < b ? b : a;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @deprecated Use transform() instead
+     * @see transform()
+     */
+    Aabb transform(const math::mat4f& m) const noexcept {
+        return transform(m.upperLeft(), m[3].xyz, *this);
+    }
 };
 
 } // namespace filament
diff --git a/filament/src/Box.cpp b/filament/src/Box.cpp
deleted file mode 100644
index a19cf3d79fd5..000000000000
--- a/filament/src/Box.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2015 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <filament/Box.h>
-
-using namespace filament::math;
-
-namespace filament {
-
-Box rigidTransform(Box const& UTILS_RESTRICT box, const mat4f& UTILS_RESTRICT m) noexcept {
-    const mat3f u(m.upperLeft());
-    return { u * box.center + m[3].xyz, abs(u) * box.halfExtent };
-}
-
-Box rigidTransform(Box const& UTILS_RESTRICT box, const mat3f& UTILS_RESTRICT u) noexcept {
-    return { u * box.center, abs(u) * box.halfExtent };
-}
-
-Aabb::Corners Aabb::getCorners() const {
-    return Aabb::Corners{ .vertices = {
-                { min.x, min.y, min.z },
-                { max.x, min.y, min.z },
-                { min.x, max.y, min.z },
-                { max.x, max.y, min.z },
-                { min.x, min.y, max.z },
-                { max.x, min.y, max.z },
-                { min.x, max.y, max.z },
-                { max.x, max.y, max.z },
-            }};
-}
-
-float Aabb::contains(float3 p) const noexcept {
-    float d = min.x - p.x;
-    d = std::max(d, min.y - p.y);
-    d = std::max(d, min.z - p.z);
-    d = std::max(d, p.x - max.x);
-    d = std::max(d, p.y - max.y);
-    d = std::max(d, p.z - max.z);
-    return d;
-}
-
-// Fast AABB transformation per Jim Arvo in Graphics Gems (1990).
-Aabb Aabb::transform(const mat4f& mat) const noexcept {
-    const float3 translation = mat[3].xyz;
-    const mat3f upperLeft = mat.upperLeft();
-    Aabb result = { translation, translation };
-    for (size_t col = 0; col < upperLeft.getColumnCount(); ++col) {
-        for (size_t row = 0; row < upperLeft.getRowCount(); ++row) {
-            const float a = upperLeft[col][row] * min[col];
-            const float b = upperLeft[col][row] * max[col];
-            result.min[row] += a < b ? a : b;
-            result.max[row] += a < b ? b : a;
-        }
-    }
-    return result;
-}
-
-} // namespace filament
diff --git a/filament/src/RenderPass.cpp b/filament/src/RenderPass.cpp
index 99643f152cc7..28fcb4edaf2f 100644
--- a/filament/src/RenderPass.cpp
+++ b/filament/src/RenderPass.cpp
@@ -86,11 +86,12 @@ void RenderPass::setScissorViewport(backend::Viewport viewport) noexcept {
 }
 
 void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandTypeFlags) noexcept {
+    SYSTRACE_CALL();
     SYSTRACE_CONTEXT();
 
     assert_invariant(mRenderableSoa);
 
-    utils::Range<uint32_t> vr = mVisibleRenderables;
+    utils::Range<uint32_t> const vr = mVisibleRenderables;
     // trace the number of visible renderables
     SYSTRACE_VALUE32("visibleRenderables", vr.size());
     if (UTILS_UNLIKELY(vr.empty())) {
@@ -129,7 +130,7 @@ void RenderPass::appendCommands(FEngine& engine, CommandTypeFlags const commandT
         work(vr.first, vr.size());
     } else {
         auto* jobCommandsParallel = jobs::parallel_for(js, nullptr, vr.first, (uint32_t)vr.size(),
-                std::cref(work), jobs::CountSplitter<JOBS_PARALLEL_FOR_COMMANDS_COUNT, 4>());
+                std::cref(work), jobs::CountSplitter<JOBS_PARALLEL_FOR_COMMANDS_COUNT, 5>());
         js.runAndWait(jobCommandsParallel);
     }
 
@@ -375,6 +376,8 @@ void RenderPass::generateCommands(uint32_t commandTypeFlags, Command* const comm
         FScene::VisibleMaskType visibilityMask,
         float3 cameraPosition, float3 cameraForward) noexcept {
 
+    SYSTRACE_CALL();
+
     // generateCommands() writes both the draw and depth commands simultaneously such that
     // we go throw the list of renderables just once.
     // (in principle, we could have split this method into two, at the cost of going through
@@ -723,6 +726,7 @@ UTILS_NOINLINE // no need to be inlined
 void RenderPass::Executor::execute(backend::DriverApi& driver,
         const Command* first, const Command* last) const noexcept {
     SYSTRACE_CALL();
+    SYSTRACE_CONTEXT();
 
     if (first != last) {
         SYSTRACE_VALUE32("commandCount", last - first);
diff --git a/filament/src/RenderPass.h b/filament/src/RenderPass.h
index 5d4476c76721..2ef3e553ae1e 100644
--- a/filament/src/RenderPass.h
+++ b/filament/src/RenderPass.h
@@ -389,9 +389,9 @@ class RenderPass {
     void resize(size_t count) noexcept;
     void instanceify(FEngine& engine) noexcept;
 
-    // on 64-bits systems, we process batches of 256 (64 bytes) cache-lines, or 512 (32 bytes) commands
-    // on 32-bits systems, we process batches of 512 (32 bytes) cache-lines, or 512 (32 bytes) commands
-    static constexpr size_t JOBS_PARALLEL_FOR_COMMANDS_COUNT = 512;
+    // we choose the command count per job to minimize JobSystem overhead.
+    // on a Pixel 4, 2048 commands is about half a millisecond of processing.
+    static constexpr size_t JOBS_PARALLEL_FOR_COMMANDS_COUNT = 2048;
     static constexpr size_t JOBS_PARALLEL_FOR_COMMANDS_SIZE  =
             sizeof(Command) * JOBS_PARALLEL_FOR_COMMANDS_COUNT;
 
diff --git a/filament/src/ShadowMap.cpp b/filament/src/ShadowMap.cpp
index 2038dfba80a3..b6690a08e532 100644
--- a/filament/src/ShadowMap.cpp
+++ b/filament/src/ShadowMap.cpp
@@ -58,7 +58,7 @@ ShadowMap::ShadowMap(FEngine& engine) noexcept
 
 void ShadowMap::terminate(FEngine& engine) {
     Entity entities[] = { mCamera->getEntity(), mDebugCamera->getEntity() };
-    for (Entity e : entities) {
+    for (Entity const e : entities) {
         engine.destroyCameraComponent(e);
     }
     engine.getEntityManager().destroy(sizeof(entities) / sizeof(Entity), entities);
@@ -112,12 +112,12 @@ math::mat4f ShadowMap::getPointLightViewMatrix(backend::TextureCubemapFace face,
 ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine,
         const FScene::LightSoa& lightData, size_t index,
         filament::CameraInfo const& camera,
-        ShadowMapInfo const& shadowMapInfo, FScene const& scene,
-        SceneInfo& sceneInfo) noexcept {
+        ShadowMapInfo const& shadowMapInfo,
+        SceneInfo const& sceneInfo) noexcept {
 
     // Note: we keep the polygon offset even with VSM as it seems to help.
     auto& lcm = engine.getLightManager();
-    FLightManager::Instance li = lightData.elementAt<FScene::LIGHT_INSTANCE>(index);
+    FLightManager::Instance const li = lightData.elementAt<FScene::LIGHT_INSTANCE>(index);
     FLightManager::ShadowParams params = lcm.getShadowParams(li);
 
 #ifndef NDEBUG
@@ -134,8 +134,8 @@ ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine,
     // Adjust the camera's projection for the light's shadowFar
     const mat4f cullingProjection{ [&](auto p) {
         if (params.options.shadowFar > 0.0f) {
-            float n = camera.zn;
-            float f = params.options.shadowFar;
+            float const n = camera.zn;
+            float const f = params.options.shadowFar;
             // orthographic projection
             assert_invariant(std::abs(p[2].w) <= std::numeric_limits<float>::epsilon());
             p[2].z =    2.0f / (n - f);
@@ -153,10 +153,7 @@ ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine,
     // We compute the directional light's model matrix using the origin's as the light position.
     // The choice of the light's origin initially doesn't matter for a directional light.
     // This will be adjusted later because of how we compute the depth metric for VSM.
-    const mat4f MvAtOrigin = getDirectionalLightViewMatrix(direction);
-
-    // Compute scene-dependent values shared across all cascades
-    ShadowMap::updateSceneInfoDirectional(MvAtOrigin, scene, sceneInfo);
+    const mat4f MvAtOrigin = ShadowMap::getDirectionalLightViewMatrix(direction);
 
     const Aabb wsShadowCastersVolume = sceneInfo.wsShadowCastersVolume;
     const Aabb wsShadowReceiversVolume = sceneInfo.wsShadowReceiversVolume;
@@ -177,7 +174,7 @@ ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine,
 
     // compute the intersection of the shadow receivers' volume with the view volume
     // in world space. This returns a set of points on the convex-hull of the intersection.
-    size_t vertexCount = intersectFrustumWithBox(wsClippedShadowReceiverVolume,
+    size_t const vertexCount = intersectFrustumWithBox(wsClippedShadowReceiverVolume,
             wsFrustum, wsViewFrustumVertices, wsShadowReceiversVolume);
 
     if (UTILS_UNLIKELY(vertexCount < 4)) {
@@ -206,7 +203,7 @@ ShadowMap::ShaderParameters ShadowMap::updateDirectional(FEngine& engine,
     }
     for (size_t i = 0; i < vertexCount; ++i) {
         // far: figure out the farthest shadow receivers
-        float3 v = mat4f::project(MvAtOrigin, wsClippedShadowReceiverVolume[i]);
+        float3 const v = mat4f::project(MvAtOrigin, wsClippedShadowReceiverVolume[i]);
         lsLightFrustumBounds.min.z = std::min(lsLightFrustumBounds.min.z, v.z);
         if constexpr (USE_DEPTH_CLAMP) {
             // tighten the shadow receiver volume further
@@ -487,7 +484,7 @@ ShadowMap::ShaderParameters ShadowMap::updatePunctual(
 
 ShadowMap::ShaderParameters ShadowMap::updateSpot(FEngine& engine,
         const FScene::LightSoa& lightData, size_t index,
-        filament::CameraInfo const& camera,
+        filament::CameraInfo const&,
         const ShadowMapInfo& shadowMapInfo,
         FScene const& scene, SceneInfo sceneInfo) noexcept {
 
@@ -499,8 +496,17 @@ ShadowMap::ShaderParameters ShadowMap::updateSpot(FEngine& engine,
     const FLightManager::ShadowParams& params = lcm.getShadowParams(li);
     const mat4f Mv = getDirectionalLightViewMatrix(direction, position);
 
-    // find decent near/far
-    ShadowMap::updateSceneInfoSpot(Mv, scene, sceneInfo);
+    // The updateSceneInfoSpot() call below is kept only for reference: it is currently quite
+    // expensive on large scenes and is only needed to find a near/far. Instead, we just use a
+    // small near and the radius as far.
+    // TODO: Another potential solution would be to visit only the part of the scene that's visible
+    //       by the light -- which should be much smaller.
+    if constexpr (false) {
+        // find decent near/far
+        ShadowMap::updateSceneInfoSpot(Mv, scene, sceneInfo);
+    } else {
+        sceneInfo.lsNearFar = { -0.01f, -radius };
+    }
 
     // if the scene was empty, near > far
     mHasVisibleShadows = -sceneInfo.lsNearFar[0] < -sceneInfo.lsNearFar[1];
@@ -509,22 +515,22 @@ ShadowMap::ShaderParameters ShadowMap::updateSpot(FEngine& engine,
     }
 
     // FIXME: we need a configuration for minimum near plane (for now hardcoded to 1cm)
-    float nearPlane = std::max(0.01f, -sceneInfo.lsNearFar[0]);
-    float farPlane  = std::min(radius, -sceneInfo.lsNearFar[1]);
+    float const nearPlane = std::max(0.01f, -sceneInfo.lsNearFar[0]);
+    float const farPlane  = std::min(radius, -sceneInfo.lsNearFar[1]);
+
     auto outerConeAngle = lcm.getSpotLightOuterCone(li);
     return updatePunctual(Mv, outerConeAngle, nearPlane, farPlane, shadowMapInfo, params);
 }
 
 ShadowMap::ShaderParameters ShadowMap::updatePoint(FEngine& engine,
-        const FScene::LightSoa& lightData, size_t index,
-        filament::CameraInfo const& camera, const ShadowMapInfo& shadowMapInfo, FScene const& scene,
-        SceneInfo, uint8_t face) noexcept {
+        const FScene::LightSoa& lightData, size_t index, filament::CameraInfo const&,
+        const ShadowMapInfo& shadowMapInfo, FScene const& scene, uint8_t face) noexcept {
 
     // check if this shadow map has anything to render
     mHasVisibleShadows = false;
     FScene::RenderableSoa const& UTILS_RESTRICT soa = scene.getRenderableData();
     auto const* const UTILS_RESTRICT visibleMasks = soa.data<FScene::VISIBLE_MASK>();
-    size_t c = soa.size();
+    size_t const c = soa.size();
     for (size_t i = 0; i < c; i++) {
         if (visibleMasks[i] & VISIBLE_DYN_SHADOW_RENDERABLE) {
             mHasVisibleShadows = true;
@@ -702,18 +708,12 @@ mat4f ShadowMap::directionalLightFrustum(float near, float far) noexcept {
     return m;
 }
 
-float2 ShadowMap::computeNearFar(const mat4f& view,
-        Aabb const& wsShadowCastersVolume) noexcept {
-    const Aabb::Corners wsSceneCastersCorners = wsShadowCastersVolume.getCorners();
-    return computeNearFar(view, wsSceneCastersCorners.data(), wsSceneCastersCorners.size());
-}
-
 float2 ShadowMap::computeNearFar(const mat4f& view,
         float3 const* wsVertices, size_t count) noexcept {
     float2 nearFar = { std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max() };
     for (size_t i = 0; i < count; i++) {
         // we're on the z axis in light space (looking down to -z)
-        float c = mat4f::project(view, wsVertices[i]).z;
+        float const c = mat4f::project(view, wsVertices[i]).z;
         nearFar.x = std::max(nearFar.x, c);  // near
         nearFar.y = std::min(nearFar.y, c);  // far
     }
@@ -726,7 +726,7 @@ float2 ShadowMap::computeNearFarOfWarpSpace(mat4f const& lightView,
     #pragma nounroll
     for (size_t i = 0; i < count; i++) {
         // we're on the y-axis in light space (looking down to +y)
-        float c = mat4f::project(lightView, wsVertices[i]).y;
+        float const c = mat4f::project(lightView, wsVertices[i]).y;
         nearFar.x = std::min(nearFar.x, c);
         nearFar.y = std::max(nearFar.y, c);
     }
@@ -791,11 +791,11 @@ void ShadowMap::intersectWithShadowCasters(Aabb& UTILS_RESTRICT lightFrustum,
     // first intersect wsShadowCastersVolume with the light's frustum, otherwise we end-up
     // transforming vertices that are "outside" the frustum, and that's forbidden.
     FrustumBoxIntersection wsClippedShadowCasterVolumeVertices;
-    size_t vertexCount = intersectFrustumWithBox(wsClippedShadowCasterVolumeVertices,
+    size_t const vertexCount = intersectFrustumWithBox(wsClippedShadowCasterVolumeVertices,
             wsLightFrustum, wsLightFrustumCorners, wsShadowCastersVolume);
 
     // compute shadow-caster bounds in light space
-    Aabb box = compute2DBounds(lightView, wsClippedShadowCasterVolumeVertices.data(), vertexCount);
+    Aabb const box = compute2DBounds(lightView, wsClippedShadowCasterVolumeVertices.data(), vertexCount);
 
     // intersect shadow-caster and current light frustum bounds
     lightFrustum.min.xy = max(box.min.xy, lightFrustum.min.xy);
@@ -809,7 +809,7 @@ void ShadowMap::computeFrustumCorners(float3* UTILS_RESTRICT out,
     // matrix to convert: ndc -> camera -> world
     float near = csNearFar.x;
     float far = csNearFar.y;
-    float3 csViewFrustumCorners[8] = {
+    float3 const csViewFrustumCorners[8] = {
             { -1, -1,  far },
             {  1, -1,  far },
             { -1,  1,  far },
@@ -819,7 +819,7 @@ void ShadowMap::computeFrustumCorners(float3* UTILS_RESTRICT out,
             { -1,  1,  near },
             {  1,  1,  near },
     };
-    for (float3 c : csViewFrustumCorners) {
+    for (float3 const c : csViewFrustumCorners) {
         *out++ = mat4f::project(projectionViewInverse, c);
     }
 }
@@ -869,7 +869,7 @@ size_t ShadowMap::intersectFrustumWithBox(
     // a) Keep the frustum's vertices that are known to be inside the scene's box
     UTILS_NOUNROLL
     for (size_t i = 0; i < 8; i++) {
-        float3 p = wsFrustumCorners[i];
+        float3 const p = wsFrustumCorners[i];
         outVertices[vertexCount] = p;
         if ((p.x >= wsBox.min.x && p.x <= wsBox.max.x) &&
             (p.y >= wsBox.min.y && p.y <= wsBox.max.y) &&
@@ -889,14 +889,14 @@ size_t ShadowMap::intersectFrustumWithBox(
         // the frustum. This actually often happens due to fitting light-space
         // We fudge the distance to the plane by a small amount.
         #pragma nounroll
-        for (float3 p : wsSceneReceiversCorners) {
+        for (float3 const p : wsSceneReceiversCorners) {
             outVertices[vertexCount] = p;
-            float l = dot(wsFrustumPlanes[0].xyz, p) + wsFrustumPlanes[0].w;
-            float b = dot(wsFrustumPlanes[1].xyz, p) + wsFrustumPlanes[1].w;
-            float r = dot(wsFrustumPlanes[2].xyz, p) + wsFrustumPlanes[2].w;
-            float t = dot(wsFrustumPlanes[3].xyz, p) + wsFrustumPlanes[3].w;
-            float f = dot(wsFrustumPlanes[4].xyz, p) + wsFrustumPlanes[4].w;
-            float n = dot(wsFrustumPlanes[5].xyz, p) + wsFrustumPlanes[5].w;
+            float const l = dot(wsFrustumPlanes[0].xyz, p) + wsFrustumPlanes[0].w;
+            float const b = dot(wsFrustumPlanes[1].xyz, p) + wsFrustumPlanes[1].w;
+            float const r = dot(wsFrustumPlanes[2].xyz, p) + wsFrustumPlanes[2].w;
+            float const t = dot(wsFrustumPlanes[3].xyz, p) + wsFrustumPlanes[3].w;
+            float const f = dot(wsFrustumPlanes[4].xyz, p) + wsFrustumPlanes[4].w;
+            float const n = dot(wsFrustumPlanes[5].xyz, p) + wsFrustumPlanes[5].w;
             if ((l <= EPSILON) && (b <= EPSILON) &&
                 (r <= EPSILON) && (t <= EPSILON) &&
                 (f <= EPSILON) && (n <= EPSILON)) {
@@ -981,7 +981,7 @@ size_t ShadowMap::intersectFrustum(
         const float3 s0{ segmentsVertices[segment.v0] };
         const float3 s1{ segmentsVertices[segment.v1] };
         // each segment should only intersect with 2 quads at most
-        size_t maxVertexCount = vertexCount + 2;
+        size_t const maxVertexCount = vertexCount + 2;
         for (size_t j = 0; j < 6 && vertexCount < maxVertexCount; ++j) {
             const Quad quad = sBoxQuads[j];
             const float3 t0{ quadsVertices[quad.v0] };
@@ -1036,8 +1036,8 @@ inline bool ShadowMap::intersectSegmentWithTriangle(float3& UTILS_RESTRICT p,
 
 bool ShadowMap::intersectSegmentWithPlanarQuad(float3& UTILS_RESTRICT p,
         float3 s0, float3 s1, float3 t0, float3 t1, float3 t2, float3 t3) noexcept {
-    bool hit = intersectSegmentWithTriangle(p, s0, s1, t0, t1, t2) ||
-               intersectSegmentWithTriangle(p, s0, s1, t0, t2, t3);
+    bool const hit = intersectSegmentWithTriangle(p, s0, s1, t0, t1, t2) ||
+                     intersectSegmentWithTriangle(p, s0, s1, t0, t2, t3);
     return hit;
 }
 
@@ -1070,7 +1070,7 @@ float ShadowMap::texelSizeWorldSpace(const mat4f& Wp, const mat4f& MbMtF,
 
     // The Jacobian is not constant, so we evaluate it in the center of the shadow-map texture.
     // It might be better to do this computation in the vertex shader.
-    float3 p = {0.5, 0.5, 0.0};
+    float3 const p = { 0.5f, 0.5f, 0.0f };
 
     const float ures = 1.0f / float(shadowDimension);
     const float vres = 1.0f / float(shadowDimension);
@@ -1115,9 +1115,9 @@ float ShadowMap::texelSizeWorldSpace(const mat4f& Wp, const mat4f& MbMtF,
             0.0f,           j * Z * sx,     j * dz * sx
     });
 
-    float3 Jx = J[0] * ures;
-    float3 Jy = J[1] * vres;
-    UTILS_UNUSED float3 Jz = J[2] * dres;
+    float3 const Jx = J[0] * ures;
+    float3 const Jy = J[1] * vres;
+    UTILS_UNUSED float3 const Jz = J[2] * dres;
     const float s = std::max(length(Jx), length(Jy));
     return s;
 }
@@ -1128,13 +1128,13 @@ void ShadowMap::visitScene(const FScene& scene, uint32_t visibleLayers,
     SYSTRACE_CALL();
 
     using State = FRenderableManager::Visibility;
-    FScene::RenderableSoa const& UTILS_RESTRICT soa = scene.getRenderableData();
-    float3 const* const UTILS_RESTRICT worldAABBCenter = soa.data<FScene::WORLD_AABB_CENTER>();
-    float3 const* const UTILS_RESTRICT worldAABBExtent = soa.data<FScene::WORLD_AABB_EXTENT>();
-    uint8_t const* const UTILS_RESTRICT layers = soa.data<FScene::LAYERS>();
-    State const* const UTILS_RESTRICT visibility = soa.data<FScene::VISIBILITY_STATE>();
-    auto const* const UTILS_RESTRICT visibleMasks = soa.data<FScene::VISIBLE_MASK>();
-    size_t c = soa.size();
+    FScene::RenderableSoa const& soa = scene.getRenderableData();
+    float3 const* const worldAABBCenter = soa.data<FScene::WORLD_AABB_CENTER>();
+    float3 const* const worldAABBExtent = soa.data<FScene::WORLD_AABB_EXTENT>();
+    uint8_t const* const layers = soa.data<FScene::LAYERS>();
+    State const* const visibility = soa.data<FScene::VISIBILITY_STATE>();
+    auto const* const visibleMasks = soa.data<FScene::VISIBLE_MASK>();
+    size_t const c = soa.size();
     for (size_t i = 0; i < c; i++) {
         if (layers[i] & visibleLayers) {
             const Aabb aabb{ worldAABBCenter[i] - worldAABBExtent[i],
@@ -1149,46 +1149,47 @@ void ShadowMap::visitScene(const FScene& scene, uint32_t visibleLayers,
     }
 }
 
-void ShadowMap::initSceneInfo(ShadowMap::SceneInfo& sceneInfo,
-        uint8_t visibleLayers, FScene const& scene, mat4f const& viewMatrix) {
-    sceneInfo.csNearFar = { -1.0f, 1.0f };
-    sceneInfo.lsNearFar = {};
-    sceneInfo.visibleLayers = visibleLayers;
-    sceneInfo.vsNearFar = { std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max() };
+ShadowMap::SceneInfo::SceneInfo(
+        FScene const& scene, uint8_t visibleLayers, mat4f const& viewMatrix) noexcept
+        : vsNearFar(std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()),
+          visibleLayers(visibleLayers) {
+
+    // the code below only works with affine transforms
+    assert_invariant(transpose(viewMatrix)[3] == float4(0, 0, 0, 1));
 
     // We assume the light is at the origin to compute the SceneInfo. This is consumed later by
     // computeShadowCameraDirectional() which takes this into account.
 
     // Compute scene bounds in world space, as well as the light-space and view-space near/far planes
-    sceneInfo.wsShadowCastersVolume = {};
-    sceneInfo.wsShadowReceiversVolume = {};
-    visitScene(scene, sceneInfo.visibleLayers,
+    wsShadowCastersVolume = {};
+    wsShadowReceiversVolume = {};
+    ShadowMap::visitScene(scene, visibleLayers,
             [&](Aabb caster, Culler::result_type) {
-                sceneInfo.wsShadowCastersVolume.min =
-                        min(sceneInfo.wsShadowCastersVolume.min, caster.min);
-                sceneInfo.wsShadowCastersVolume.max =
-                        max(sceneInfo.wsShadowCastersVolume.max, caster.max);
+                wsShadowCastersVolume.min = min(wsShadowCastersVolume.min, caster.min);
+                wsShadowCastersVolume.max = max(wsShadowCastersVolume.max, caster.max);
             },
             [&](Aabb receiver, Culler::result_type) {
-                sceneInfo.wsShadowReceiversVolume.min =
-                        min(sceneInfo.wsShadowReceiversVolume.min, receiver.min);
-                sceneInfo.wsShadowReceiversVolume.max =
-                        max(sceneInfo.wsShadowReceiversVolume.max, receiver.max);
-                float2 nf = ShadowMap::computeNearFar(viewMatrix, receiver);
-                sceneInfo.vsNearFar.x = std::max(sceneInfo.vsNearFar.x, nf.x);
-                sceneInfo.vsNearFar.y = std::min(sceneInfo.vsNearFar.y, nf.y);
+                wsShadowReceiversVolume.min = min(wsShadowReceiversVolume.min, receiver.min);
+                wsShadowReceiversVolume.max = max(wsShadowReceiversVolume.max, receiver.max);
+                auto r = Aabb::transform(viewMatrix.upperLeft(), viewMatrix[3].xyz, receiver);
+                vsNearFar.x = std::max(vsNearFar.x, r.max.z);
+                vsNearFar.y = std::min(vsNearFar.y, r.min.z);
             }
     );
 }
 
 void ShadowMap::updateSceneInfoDirectional(mat4f const& Mv, FScene const& scene,
         SceneInfo& sceneInfo) {
+
+    // the code below only works with affine transforms
+    assert_invariant(transpose(Mv)[3] == float4(0, 0, 0, 1));
+
     sceneInfo.lsNearFar = { std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max() };
     visitScene(scene, sceneInfo.visibleLayers,
             [&](Aabb caster, Culler::result_type) {
-                float2 nf = ShadowMap::computeNearFar(Mv, caster);
-                sceneInfo.lsNearFar.x = std::max(sceneInfo.lsNearFar.x, nf.x);  // near
-                sceneInfo.lsNearFar.y = std::min(sceneInfo.lsNearFar.y, nf.y);  // far
+                auto r = Aabb::transform(Mv.upperLeft(), Mv[3].xyz, caster);
+                sceneInfo.lsNearFar.x = std::max(sceneInfo.lsNearFar.x, r.max.z);  // near
+                sceneInfo.lsNearFar.y = std::min(sceneInfo.lsNearFar.y, r.min.z);  // far
             },
             [&](Aabb receiver, Culler::result_type) {
             }
@@ -1197,14 +1198,17 @@ void ShadowMap::updateSceneInfoDirectional(mat4f const& Mv, FScene const& scene,
 
 void ShadowMap::updateSceneInfoSpot(mat4f const& Mv, FScene const& scene,
         SceneInfo& sceneInfo) {
+
+    // the code below only works with affine transforms
+    assert_invariant(transpose(Mv)[3] == float4(0, 0, 0, 1));
+
     sceneInfo.lsNearFar = { std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max() };
-    sceneInfo.vsNearFar = { std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max() };
     visitScene(scene, sceneInfo.visibleLayers,
             [&](Aabb caster, Culler::result_type mask) {
                 if (mask & VISIBLE_DYN_SHADOW_RENDERABLE) {
-                    float2 nf = ShadowMap::computeNearFar(Mv, caster);
-                    sceneInfo.lsNearFar.x = std::max(sceneInfo.lsNearFar.x, nf.x);  // near
-                    sceneInfo.lsNearFar.y = std::min(sceneInfo.lsNearFar.y, nf.y);  // far
+                    auto r = Aabb::transform(Mv.upperLeft(), Mv[3].xyz, caster);
+                    sceneInfo.lsNearFar.x = std::max(sceneInfo.lsNearFar.x, r.max.z);  // near
+                    sceneInfo.lsNearFar.y = std::min(sceneInfo.lsNearFar.y, r.min.z);  // far
                 }
             },
             [&](Aabb receiver, Culler::result_type) {
diff --git a/filament/src/ShadowMap.h b/filament/src/ShadowMap.h
index 214ed4a31dcb..de198387c62c 100644
--- a/filament/src/ShadowMap.h
+++ b/filament/src/ShadowMap.h
@@ -95,16 +95,20 @@ class ShadowMap {
     };
 
     struct SceneInfo {
+
+        SceneInfo() noexcept = default;
+        SceneInfo(FScene const& scene, uint8_t visibleLayers, math::mat4f const& viewMatrix) noexcept;
+
         // scratch data: The near and far planes, in clip space, to use for this shadow map
         math::float2 csNearFar = { -1.0f, 1.0f };
 
         // scratch data: light's near/far expressed in light-space, calculated from the scene's
         // content assuming the light is at the origin.
-        math::float2 lsNearFar{};
+        math::float2 lsNearFar{};  // filled later by updateSceneInfoDirectional/Spot()
 
-        // scratch data: Viewing camera's near/far expressed in view-space, calculated from the
+        // Viewing camera's near/far expressed in view-space, calculated from the
         // scene's content.
-        math::float2 vsNearFar{};
+        math::float2 vsNearFar;
 
         // World-space shadow-casters volume
         Aabb wsShadowCastersVolume;
@@ -136,8 +140,8 @@ class ShadowMap {
     ShaderParameters updateDirectional(FEngine& engine,
             const FScene::LightSoa& lightData, size_t index,
             filament::CameraInfo const& camera,
-            ShadowMapInfo const& shadowMapInfo, FScene const& scene,
-            SceneInfo& sceneInfo) noexcept;
+            ShadowMapInfo const& shadowMapInfo,
+            SceneInfo const& sceneInfo) noexcept;
 
     ShaderParameters updateSpot(FEngine& engine,
             const FScene::LightSoa& lightData, size_t index,
@@ -145,11 +149,9 @@ class ShadowMap {
             const ShadowMapInfo& shadowMapInfo, FScene const& scene,
             SceneInfo sceneInfo) noexcept;
 
-    ShaderParameters updatePoint(FEngine& engine,
-            const FScene::LightSoa& lightData, size_t index,
-            filament::CameraInfo const& camera, const ShadowMapInfo& shadowMapInfo,
-            FScene const& scene,
-            SceneInfo sceneInfo, uint8_t face) noexcept;
+    ShadowMap::ShaderParameters updatePoint(FEngine& engine,
+            const FScene::LightSoa& lightData, size_t index, filament::CameraInfo const& camera,
+            const ShadowMapInfo& shadowMapInfo, FScene const& scene, uint8_t face) noexcept;
 
     // Do we have visible shadows. Valid after calling update().
     bool hasVisibleShadows() const noexcept { return mHasVisibleShadows; }
@@ -160,11 +162,6 @@ class ShadowMap {
     // use only for debugging
     FCamera const& getDebugCamera() const noexcept { return *mDebugCamera; }
 
-    // Call once per frame to populate the SceneInfo struct, then pass to update().
-    // This computes values constant across all shadow maps.
-    static void initSceneInfo(ShadowMap::SceneInfo& sceneInfo, uint8_t visibleLayers,
-            FScene const& scene, math::mat4f const& viewMatrix);
-
     // Update SceneInfo struct for a given light
     static void updateSceneInfoDirectional(const math::mat4f& Mv, FScene const& scene,
             SceneInfo& sceneInfo);
@@ -230,9 +227,6 @@ class ShadowMap {
     static inline void computeFrustumCorners(math::float3* out,
             const math::mat4f& projectionViewInverse, math::float2 csNearFar = { -1.0f, 1.0f }) noexcept;
 
-    static inline math::float2 computeNearFar(math::mat4f const& view,
-            Aabb const& wsShadowCastersVolume) noexcept;
-
     static inline math::float2 computeNearFar(math::mat4f const& view,
             math::float3 const* wsVertices, size_t count) noexcept;
 
diff --git a/filament/src/ShadowMapManager.cpp b/filament/src/ShadowMapManager.cpp
index b6a11cea8144..725cf4424b89 100644
--- a/filament/src/ShadowMapManager.cpp
+++ b/filament/src/ShadowMapManager.cpp
@@ -76,15 +76,16 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::update(FEngine& engine, FVie
     calculateTextureRequirements(engine, view, lightData);
 
     // Compute scene-dependent values shared across all shadow maps
-    ShadowMap::initSceneInfo(mSceneInfo,
-            view.getVisibleLayers(), *view.getScene(), cameraInfo.view);
+    ShadowMap::SceneInfo const info{ *view.getScene(), view.getVisibleLayers(), cameraInfo.view };
 
     shadowTechnique |= updateCascadeShadowMaps(
-            engine, view, cameraInfo, renderableData, lightData, mSceneInfo);
+            engine, view, cameraInfo, renderableData, lightData, info);
 
     shadowTechnique |= updateSpotShadowMaps(
             engine, lightData);
 
+    mSceneInfo = info;
+
     return shadowTechnique;
 }
 
@@ -207,7 +208,7 @@ FrameGraphId<FrameGraphTexture> ShadowMapManager::render(FEngine& engine, FrameG
             },
             [this, &engine, &view, vsmShadowOptions,
                 scene, mainCameraInfo, userTime, passTemplate = pass](
-                    FrameGraphResources const& resources, auto const& data, DriverApi& driver) {
+                    FrameGraphResources const&, auto const& data, DriverApi& driver) {
 
                 // Note: we could almost parallel_for the loop below, the problem currently is
                 // that updatePrimitivesLod() updates temporary global state.
@@ -435,11 +436,11 @@ FrameGraphId<FrameGraphTexture> ShadowMapManager::render(FEngine& engine, FrameG
 
 ShadowMapManager::ShadowTechnique ShadowMapManager::updateCascadeShadowMaps(FEngine& engine,
         FView& view, CameraInfo const& cameraInfo, FScene::RenderableSoa& renderableData,
-        FScene::LightSoa& lightData, ShadowMap::SceneInfo& sceneInfo) noexcept {
+        FScene::LightSoa& lightData, ShadowMap::SceneInfo sceneInfo) noexcept {
     FScene* scene = view.getScene();
     auto& lcm = engine.getLightManager();
 
-    FLightManager::Instance directionalLight = lightData.elementAt<FScene::LIGHT_INSTANCE>(0);
+    FLightManager::Instance const directionalLight = lightData.elementAt<FScene::LIGHT_INSTANCE>(0);
     FLightManager::ShadowOptions const& options = lcm.getShadowOptions(directionalLight);
     FLightManager::ShadowParams const& params = lcm.getShadowParams(directionalLight);
 
@@ -458,8 +459,18 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::updateCascadeShadowMaps(FEng
         // entire camera frustum, as if we only had a single cascade.
         ShadowMap& shadowMap = *mCascadeShadowMaps[0];
 
+        const auto direction = lightData.elementAt<FScene::DIRECTION>(0);
+
+        // We compute the directional light's view matrix using the origin as the light position.
+        // The initial choice of the light's position doesn't matter for a directional light.
+        // This will be adjusted later because of how we compute the depth metric for VSM.
+        const mat4f MvAtOrigin = ShadowMap::getDirectionalLightViewMatrix(direction);
+
+        // Compute scene-dependent values shared across all cascades
+        ShadowMap::updateSceneInfoDirectional(MvAtOrigin, *scene, sceneInfo);
+
         shadowMap.updateDirectional(mEngine,
-                lightData, 0, cameraInfo, shadowMapInfo, *scene, sceneInfo);
+                lightData, 0, cameraInfo, shadowMapInfo, sceneInfo);
 
         hasVisibleShadows = shadowMap.hasVisibleShadows();
 
@@ -533,7 +544,7 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::updateCascadeShadowMaps(FEng
             sceneInfo.csNearFar = { csSplitPosition[i], csSplitPosition[i + 1] };
 
             auto shaderParameters = shadowMap.updateDirectional(mEngine,
-                    lightData, 0, cameraInfo, shadowMapInfo, *scene, sceneInfo);
+                    lightData, 0, cameraInfo, shadowMapInfo, sceneInfo);
 
             if (shadowMap.hasVisibleShadows()) {
                 const size_t shadowIndex = shadowMap.getShadowIndex();
@@ -560,7 +571,7 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::updateCascadeShadowMaps(FEng
     }
 
     // screen-space contact shadows for the directional light
-    float screenSpaceShadowDistance = options.maxShadowDistance;
+    float const screenSpaceShadowDistance = options.maxShadowDistance;
     if (options.screenSpaceContactShadows) {
         shadowTechnique |= ShadowTechnique::SCREEN_SPACE;
     }
@@ -749,7 +760,7 @@ void ShadowMapManager::preparePointShadowMap(ShadowMap& shadowMap,
     };
 
     auto shaderParameters = shadowMap.updatePoint(mEngine, lightData, lightIndex,
-            mainCameraInfo, shadowMapInfo, *view.getScene(), sceneInfo, face);
+            mainCameraInfo, shadowMapInfo, *view.getScene(), face);
 
 
     // and if we need to generate it, update all the UBO data
@@ -812,8 +823,8 @@ ShadowMapManager::ShadowTechnique ShadowMapManager::updateSpotShadowMaps(FEngine
     return shadowTechnique;
 }
 
-void ShadowMapManager::calculateTextureRequirements(FEngine& engine, FView& view,
-        FScene::LightSoa& lightData) noexcept {
+void ShadowMapManager::calculateTextureRequirements(FEngine&, FView& view,
+        FScene::LightSoa&) noexcept {
 
     // Lay out the shadow maps. For now, we take the largest requested dimension and allocate a
     // texture of that size. Each cascade / shadow map gets its own layer in the array texture.
@@ -867,7 +878,7 @@ void ShadowMapManager::calculateTextureRequirements(FEngine& engine, FView& view
     if (useMipmapping) {
         // Limit the lowest mipmap level to 256x256.
         // This avoids artifacts on high derivative tangent surfaces.
-        int lowMipmapLevel = 7;    // log2(256) - 1
+        int const lowMipmapLevel = 7;    // log2(256) - 1
         mipLevels = std::max(1, FTexture::maxLevelCount(maxDimension) - lowMipmapLevel);
     }
 
diff --git a/filament/src/ShadowMapManager.h b/filament/src/ShadowMapManager.h
index 39d106d9af22..952cb9e0d465 100644
--- a/filament/src/ShadowMapManager.h
+++ b/filament/src/ShadowMapManager.h
@@ -110,7 +110,7 @@ class ShadowMapManager {
 private:
     ShadowMapManager::ShadowTechnique updateCascadeShadowMaps(FEngine& engine,
             FView& view, CameraInfo const& cameraInfo, FScene::RenderableSoa& renderableData,
-            FScene::LightSoa& lightData, ShadowMap::SceneInfo& sceneInfo) noexcept;
+            FScene::LightSoa& lightData, ShadowMap::SceneInfo sceneInfo) noexcept;
 
     ShadowMapManager::ShadowTechnique updateSpotShadowMaps(FEngine& engine,
             FScene::LightSoa& lightData) noexcept;
diff --git a/filament/src/details/Scene.cpp b/filament/src/details/Scene.cpp
index 07d4d61fa4d5..0475e268f1cf 100644
--- a/filament/src/details/Scene.cpp
+++ b/filament/src/details/Scene.cpp
@@ -49,76 +49,136 @@ FScene::FScene(FEngine& engine) :
 FScene::~FScene() noexcept = default;
 
 
-void FScene::prepare(const mat4& worldOriginTransform, bool shadowReceiversAreCasters) noexcept {
+void FScene::prepare(utils::JobSystem& js,
+        LinearAllocatorArena& allocator,
+        const mat4& worldOriginTransform,
+        bool shadowReceiversAreCasters) noexcept {
     // TODO: can we skip this in most cases? Since we rely on indices staying the same,
     //       we could only skip, if nothing changed in the RCM.
 
     SYSTRACE_CALL();
 
+    SYSTRACE_CONTEXT();
+
+    // This will reset the allocator upon exiting
+    ArenaScope const arena(allocator);
+
     FEngine& engine = mEngine;
-    EntityManager& em = engine.getEntityManager();
-    FRenderableManager& rcm = engine.getRenderableManager();
-    FTransformManager& tcm = engine.getTransformManager();
-    FLightManager& lcm = engine.getLightManager();
+    EntityManager const& em = engine.getEntityManager();
+    FRenderableManager const& rcm = engine.getRenderableManager();
+    FTransformManager const& tcm = engine.getTransformManager();
+    FLightManager const& lcm = engine.getLightManager();
     // go through the list of entities, and gather the data of those that are renderables
     auto& sceneData = mRenderableData;
     auto& lightData = mLightData;
     auto const& entities = mEntities;
 
+    using RenderableContainerData = std::pair<RenderableManager::Instance, TransformManager::Instance>;
+    using RenderableInstanceContainer = FixedCapacityVector<RenderableContainerData,
+            utils::STLAllocator< RenderableContainerData, LinearAllocatorArena >, false>;
 
-    // NOTE: we can't know in advance how many entities are renderable or lights because the corresponding
-    // component can be added after the entity is added to the scene.
+    using LightContainerData = std::pair<LightManager::Instance, TransformManager::Instance>;
+    using LightInstanceContainer = FixedCapacityVector<LightContainerData,
+            utils::STLAllocator< LightContainerData, LinearAllocatorArena >, false>;
+
+    RenderableInstanceContainer renderableInstances{
+            RenderableInstanceContainer::with_capacity(entities.size(), allocator) };
+
+    LightInstanceContainer lightInstances{
+            LightInstanceContainer::with_capacity(entities.size(), allocator) };
+
+    SYSTRACE_NAME_BEGIN("InstanceLoop");
+
+    // track the most intense directional light seen so far (its light and transform instances)
+    float maxIntensity = 0.0f;
+    std::pair<LightManager::Instance, TransformManager::Instance> directionalLightInstances{};
+
+    /*
+     * First gather the renderable and light instances (and thus their exact counts).
+     * Also find the main directional light.
+     */
+
+    for (Entity const e: entities) {
+        if (UTILS_LIKELY(em.isAlive(e))) {
+            auto ti = tcm.getInstance(e);
+            auto li = lcm.getInstance(e);
+            auto ri = rcm.getInstance(e);
+            if (li) {
+                // we handle the directional light here because it'd prevent multithreading below
+                if (UTILS_UNLIKELY(lcm.isDirectionalLight(li))) {
+                    // we don't store the directional lights, because we only have a single one
+                    if (lcm.getIntensity(li) >= maxIntensity) {
+                        maxIntensity = lcm.getIntensity(li);
+                        directionalLightInstances = { li, ti };
+                    }
+                } else {
+                    lightInstances.emplace_back(li, ti);
+                }
+            }
+            if (ri) {
+                renderableInstances.emplace_back(ri, ti);
+            }
+        }
+    }
+
+    SYSTRACE_NAME_END();
+
+    /*
+     * Evaluate the capacity needed for the renderable and light SoAs
+     */
 
-    size_t renderableDataCapacity = entities.size();
     // we need the capacity to be multiple of 16 for SIMD loops
-    renderableDataCapacity = (renderableDataCapacity + 0xFu) & ~0xFu;
     // we need 1 extra entry at the end for the summed primitive count
+    size_t renderableDataCapacity = entities.size();
+    renderableDataCapacity = (renderableDataCapacity + 0xFu) & ~0xFu;
     renderableDataCapacity = renderableDataCapacity + 1;
 
-    sceneData.clear();
-    if (sceneData.capacity() < renderableDataCapacity) {
-        sceneData.setCapacity(renderableDataCapacity);
-    }
-
     // The light data list will always contain at least one entry for the
     // dominating directional light, even if there are no entities.
-    size_t lightDataCapacity = std::max<size_t>(1, entities.size());
     // we need the capacity to be multiple of 16 for SIMD loops
+    size_t lightDataCapacity = DIRECTIONAL_LIGHTS_COUNT + entities.size(); // every entity may be a light
     lightDataCapacity = (lightDataCapacity + 0xFu) & ~0xFu;
 
-    lightData.clear();
-    if (lightData.capacity() < lightDataCapacity) {
-        lightData.setCapacity(lightDataCapacity);
-    }
-    // the first entries are reserved for the directional lights (currently only one)
-    lightData.resize(DIRECTIONAL_LIGHTS_COUNT);
-
+    /*
+     * Now resize the SoAs if needed
+     */
 
-    // find the max intensity directional light index in our local array
-    float maxIntensity = 0.0f;
+    // TODO: the resize below could happen in a job
 
-    for (Entity e : entities) {
-        if (!em.isAlive(e)) {
-            continue;
+    if (sceneData.size() != renderableInstances.size()) {
+        sceneData.clear();
+        if (sceneData.capacity() < renderableDataCapacity) {
+            sceneData.setCapacity(renderableDataCapacity);
         }
+        assert_invariant(renderableInstances.size() <= sceneData.capacity());
+        sceneData.resize(renderableInstances.size());
+    }
 
-        // getInstance() always returns null if the entity is the Null entity,
-        // so we don't need to check for that, but we need to check it's alive
-        auto ri = rcm.getInstance(e);
-        auto li = lcm.getInstance(e);
-        if (!ri && !li) {
-            continue;
+    if (lightData.size() != lightInstances.size() + DIRECTIONAL_LIGHTS_COUNT) {
+        lightData.clear();
+        if (lightData.capacity() < lightDataCapacity) {
+            lightData.setCapacity(lightDataCapacity);
         }
+        assert_invariant(lightInstances.size() + DIRECTIONAL_LIGHTS_COUNT <= lightData.capacity());
+        lightData.resize(lightInstances.size() + DIRECTIONAL_LIGHTS_COUNT);
+    }
 
-        // get the world transform
-        auto ti = tcm.getInstance(e);
-        // this is where we go from double to float for our transforms
-        const mat4f worldTransform{ worldOriginTransform * tcm.getWorldTransformAccurate(ti) };
-        const bool reversedWindingOrder = det(worldTransform.upperLeft()) < 0;
+    /*
+     * Fill the SoA with the JobSystem
+     */
+
+    auto renderableWork = [first = renderableInstances.data(), &rcm, &tcm, &worldOriginTransform,
+                 &sceneData, shadowReceiversAreCasters](auto* p, auto c) {
+        SYSTRACE_NAME("renderableWork");
+
+        for (size_t i = 0; i < c; i++) {
+            auto [ri, ti] = p[i];
+
+            // this is where we go from double to float for our transforms
+            const mat4f worldTransform{
+                    worldOriginTransform * tcm.getWorldTransformAccurate(ti) };
+            const bool reversedWindingOrder = det(worldTransform.upperLeft()) < 0;
 
-        // don't even draw this object if it doesn't have a transform (which shouldn't happen
-        // because one is always created when creating a Renderable component).
-        if (ri && ti) {
             // compute the world AABB so we can perform culling
             const Box worldAABB = rigidTransform(rcm.getAABB(ri), worldTransform);
 
@@ -131,81 +191,117 @@ void FScene::prepare(const mat4& worldOriginTransform, bool shadowReceiversAreCa
             // FIXME: We compute and store the local scale because it's needed for glTF but
             //        we need a better way to handle this
             const mat4f& transform = tcm.getTransform(ti);
-            float scale = (length(transform[0].xyz) + length(transform[1].xyz) +
-                    length(transform[2].xyz)) / 3.0f;
-
-            // we know there is enough space in the array
-            sceneData.push_back_unsafe(
-                    ri,                             // RENDERABLE_INSTANCE
-                    worldTransform,                 // WORLD_TRANSFORM
-                    visibility,                     // VISIBILITY_STATE
-                    rcm.getSkinningBufferInfo(ri),  // SKINNING_BUFFER
-                    rcm.getMorphingBufferInfo(ri),  // MORPHING_BUFFER
-                    worldAABB.center,               // WORLD_AABB_CENTER
-                    0,                              // VISIBLE_MASK
-                    rcm.getChannels(ri),            // CHANNELS
-                    rcm.getInstanceCount(ri),       // INSTANCE_COUNT
-                    rcm.getLayerMask(ri),           // LAYERS
-                    worldAABB.halfExtent,           // WORLD_AABB_EXTENT
-                    {},                             // PRIMITIVES
-                    0,                              // SUMMED_PRIMITIVE_COUNT
-                    {},                             // UBO
-                    scale                           // USER_DATA
-            );
+            float const scale = (length(transform[0].xyz) + length(transform[1].xyz) +
+                                 length(transform[2].xyz)) / 3.0f;
+
+            size_t const index = std::distance(first, p) + i;
+            assert_invariant(index < sceneData.size());
+
+            sceneData.elementAt<RENDERABLE_INSTANCE>(index) = ri;
+            sceneData.elementAt<WORLD_TRANSFORM>(index)     = worldTransform;
+            sceneData.elementAt<VISIBILITY_STATE>(index)    = visibility;
+            sceneData.elementAt<SKINNING_BUFFER>(index)     = rcm.getSkinningBufferInfo(ri);
+            sceneData.elementAt<MORPHING_BUFFER>(index)     = rcm.getMorphingBufferInfo(ri);
+            sceneData.elementAt<WORLD_AABB_CENTER>(index)   = worldAABB.center;
+            sceneData.elementAt<VISIBLE_MASK>(index)        = 0;
+            sceneData.elementAt<CHANNELS>(index)            = rcm.getChannels(ri);
+            sceneData.elementAt<INSTANCE_COUNT>(index)      = rcm.getInstanceCount(ri);
+            sceneData.elementAt<LAYERS>(index)              = rcm.getLayerMask(ri);
+            sceneData.elementAt<WORLD_AABB_EXTENT>(index)   = worldAABB.halfExtent;
+            //sceneData.elementAt<PRIMITIVES>(index)          = {}; // already initialized, Slice<>
+            sceneData.elementAt<SUMMED_PRIMITIVE_COUNT>(index) = 0;
+            //sceneData.elementAt<UBO>(index)                 = {}; // not needed here
+            sceneData.elementAt<USER_DATA>(index)           = scale;
         }
-
-        if (li) {
-            // find the dominant directional light
-            if (UTILS_UNLIKELY(lcm.isDirectionalLight(li))) {
-                // we don't store the directional lights, because we only have a single one
-                if (lcm.getIntensity(li) >= maxIntensity) {
-                    maxIntensity = lcm.getIntensity(li);
-                    float3 d = lcm.getLocalDirection(li);
-                    // using mat3f::getTransformForNormals handles non-uniform scaling
-                    d = normalize(mat3f::getTransformForNormals(worldTransform.upperLeft()) * d);
-                    lightData.elementAt<FScene::POSITION_RADIUS>(0) =
-                            float4{ 0, 0, 0, std::numeric_limits<float>::infinity() };
-                    lightData.elementAt<FScene::DIRECTION>(0)       = d;
-                    lightData.elementAt<FScene::LIGHT_INSTANCE>(0)  = li;
-                }
-            } else {
-                const float4 p = worldTransform * float4{ lcm.getLocalPosition(li), 1 };
-                float3 d = 0;
-                if (!lcm.isPointLight(li) || lcm.isIESLight(li)) {
-                    d = lcm.getLocalDirection(li);
-                    // using mat3f::getTransformForNormals handles non-uniform scaling
-                    d = normalize(mat3f::getTransformForNormals(worldTransform.upperLeft()) * d);
-                }
-                lightData.push_back_unsafe(
-                        float4{ p.xyz, lcm.getRadius(li) }, d, li, {}, {}, {});
+    };
+
+    auto lightWork = [first = lightInstances.data(), &lcm, &tcm, &worldOriginTransform,
+            &lightData](auto* p, auto c) {
+        SYSTRACE_NAME("lightWork");
+        for (size_t i = 0; i < c; i++) {
+            auto [li, ti] = p[i];
+            // this is where we go from double to float for our transforms
+            const mat4f worldTransform{ worldOriginTransform * tcm.getWorldTransformAccurate(ti) };
+            const float4 position = worldTransform * float4{ lcm.getLocalPosition(li), 1 };
+            float3 d = 0;
+            if (!lcm.isPointLight(li) || lcm.isIESLight(li)) {
+                d = lcm.getLocalDirection(li);
+                // using mat3f::getTransformForNormals handles non-uniform scaling
+                d = normalize(mat3f::getTransformForNormals(worldTransform.upperLeft()) * d);
             }
+            size_t const index = DIRECTIONAL_LIGHTS_COUNT + std::distance(first, p) + i;
+            assert_invariant(index < lightData.size());
+            lightData.elementAt<POSITION_RADIUS>(index) = float4{ position.xyz, lcm.getRadius(li) };
+            lightData.elementAt<DIRECTION>(index) = d;
+            lightData.elementAt<LIGHT_INSTANCE>(index) = li;
         }
+    };
+
+
+    SYSTRACE_NAME_BEGIN("Renderable and Light jobs");
+
+    JobSystem::Job* rootJob = js.createJob();
+
+    auto* renderableJob = jobs::parallel_for(js, rootJob,
+            renderableInstances.data(), renderableInstances.size(),
+            std::cref(renderableWork), jobs::CountSplitter<128, 5>());
+
+    auto* lightJob = jobs::parallel_for(js, rootJob,
+            lightInstances.data(), lightInstances.size(),
+            std::cref(lightWork), jobs::CountSplitter<32, 5>());
+
+    js.run(renderableJob);
+    js.run(lightJob);
+
+    // Everything below overlaps with the jobs started above; they're joined by runAndWait().
+
+    /*
+     * Handle the directional light separately
+     */
+
+    if (auto [li, ti] = directionalLightInstances ; li) {
+        const mat4f worldTransform{
+                worldOriginTransform * tcm.getWorldTransformAccurate(ti) };
+        // using mat3f::getTransformForNormals handles non-uniform scaling
+        float3 d = lcm.getLocalDirection(li);
+        d = normalize(mat3f::getTransformForNormals(worldTransform.upperLeft()) * d);
+        constexpr float inf = std::numeric_limits<float>::infinity();
+        lightData.elementAt<POSITION_RADIUS>(0) = float4{ 0, 0, 0, inf };
+        lightData.elementAt<DIRECTION>(0) = d;
+        lightData.elementAt<LIGHT_INSTANCE>(0) = li;
+    } else {
+        lightData.elementAt<LIGHT_INSTANCE>(0) = 0;
     }
 
     // some elements past the end of the array will be accessed by SIMD code, we need to make
     // sure the data is valid enough as not to produce errors such as divide-by-zero
     // (e.g. in computeLightRanges())
-    for (size_t i = lightData.size(), e = lightDataCapacity; i < e; i++) {
+    for (size_t i = lightData.size(), e = lightData.capacity(); i < e; i++) {
         new(lightData.data<POSITION_RADIUS>() + i) float4{ 0, 0, 0, 1 };
     }
 
     // Purely for the benefit of MSAN, we can avoid uninitialized reads by zeroing out the
     // unused scene elements between the end of the array and the rounded-up count.
     if (UTILS_HAS_SANITIZE_MEMORY) {
-        for (size_t i = sceneData.size(), e = renderableDataCapacity; i < e; i++) {
+        for (size_t i = sceneData.size(), e = sceneData.capacity(); i < e; i++) {
             sceneData.data<LAYERS>()[i] = 0;
             sceneData.data<VISIBLE_MASK>()[i] = 0;
             sceneData.data<VISIBILITY_STATE>()[i] = {};
         }
     }
+
+    js.runAndWait(rootJob);
+
+    SYSTRACE_NAME_END();
 }
 
 void FScene::prepareVisibleRenderables(Range<uint32_t> visibleRenderables) noexcept {
+    SYSTRACE_CALL();
     RenderableSoa& sceneData = mRenderableData;
-    FRenderableManager& rcm = mEngine.getRenderableManager();
+    FRenderableManager const& rcm = mEngine.getRenderableManager();
 
     mHasContactShadows = false;
-    for (uint32_t i : visibleRenderables) {
+    for (uint32_t const i : visibleRenderables) {
         PerRenderableData& uboData = sceneData.elementAt<UBO>(i);
 
         auto const visibility = sceneData.elementAt<VISIBILITY_STATE>(i);
@@ -257,6 +353,7 @@ void FScene::prepareVisibleRenderables(Range<uint32_t> visibleRenderables) noexc
 void FScene::updateUBOs(
         Range<uint32_t> visibleRenderables,
         Handle<HwBufferObject> renderableUbh) noexcept {
+    SYSTRACE_CALL();
     FEngine::DriverApi& driver = mEngine.getDriverApi();
 
     // store the UBO handle
@@ -278,7 +375,7 @@ void FScene::updateUBOs(
 
     // copy our data into the UBO for each visible renderable
     PerRenderableData const* const uboData = mRenderableData.data<UBO>();
-    for (uint32_t i : visibleRenderables) {
+    for (uint32_t const i : visibleRenderables) {
         buffer[i] = uboData[i];
     }
 
@@ -308,15 +405,15 @@ void FScene::updateUBOs(
     }
 }
 
-void FScene::terminate(FEngine& engine) {
+void FScene::terminate(FEngine&) {
     // DO NOT destroy this UBO, it's owned by the View
     mRenderableViewUbh.clear();
 }
 
-void FScene::prepareDynamicLights(const CameraInfo& camera, ArenaScope& rootArena,
+void FScene::prepareDynamicLights(const CameraInfo& camera, ArenaScope&,
         Handle<HwBufferObject> lightUbh) noexcept {
     FEngine::DriverApi& driver = mEngine.getDriverApi();
-    FLightManager& lcm = mEngine.getLightManager();
+    FLightManager const& lcm = mEngine.getLightManager();
     FScene::LightSoa& lightData = getLightData();
 
     /*
@@ -325,7 +422,7 @@ void FScene::prepareDynamicLights(const CameraInfo& camera, ArenaScope& rootAren
 
     size_t const size = lightData.size();
     // number of point-light/spotlights
-    size_t positionalLightCount = size - DIRECTIONAL_LIGHTS_COUNT;
+    size_t const positionalLightCount = size - DIRECTIONAL_LIGHTS_COUNT;
     assert_invariant(positionalLightCount);
 
     float4 const* const UTILS_RESTRICT spheres = lightData.data<FScene::POSITION_RADIUS>();
@@ -418,11 +515,11 @@ void FScene::removeEntities(const Entity* entities, size_t count) {
 UTILS_NOINLINE
 size_t FScene::getRenderableCount() const noexcept {
     FEngine& engine = mEngine;
-    EntityManager& em = engine.getEntityManager();
-    FRenderableManager& rcm = engine.getRenderableManager();
+    EntityManager const& em = engine.getEntityManager();
+    FRenderableManager const& rcm = engine.getRenderableManager();
     size_t count = 0;
     auto const& entities = mEntities;
-    for (Entity e : entities) {
+    for (Entity const e : entities) {
         count += em.isAlive(e) && rcm.getInstance(e) ? 1 : 0;
     }
     return count;
@@ -431,11 +528,11 @@ size_t FScene::getRenderableCount() const noexcept {
 UTILS_NOINLINE
 size_t FScene::getLightCount() const noexcept {
     FEngine& engine = mEngine;
-    EntityManager& em = engine.getEntityManager();
-    FLightManager& lcm = engine.getLightManager();
+    EntityManager const& em = engine.getEntityManager();
+    FLightManager const& lcm = engine.getLightManager();
     size_t count = 0;
     auto const& entities = mEntities;
-    for (Entity e : entities) {
+    for (Entity const e : entities) {
         count += em.isAlive(e) && lcm.getInstance(e) ? 1 : 0;
     }
     return count;
diff --git a/filament/src/details/Scene.h b/filament/src/details/Scene.h
index 87da462a7156..1f05c9e3af6c 100644
--- a/filament/src/details/Scene.h
+++ b/filament/src/details/Scene.h
@@ -70,7 +70,8 @@ class FScene : public Scene {
     ~FScene() noexcept;
     void terminate(FEngine& engine);
 
-    void prepare(const math::mat4& worldOriginTransform, bool shadowReceiversAreCasters) noexcept;
+    void prepare(utils::JobSystem& js, LinearAllocatorArena& allocator,
+            math::mat4 const& worldOriginTransform, bool shadowReceiversAreCasters) noexcept;
 
     void prepareVisibleRenderables(utils::Range<uint32_t> visibleRenderables) noexcept;
 
diff --git a/filament/src/details/View.cpp b/filament/src/details/View.cpp
index ae8923ed1655..bd3ee5747dfe 100644
--- a/filament/src/details/View.cpp
+++ b/filament/src/details/View.cpp
@@ -268,9 +268,8 @@ bool FView::isSkyboxVisible() const noexcept {
     return skybox != nullptr && (skybox->getLayerMask() & mVisibleLayers);
 }
 
-void FView::prepareShadowing(FEngine& engine, DriverApi& driver,
-        FScene::RenderableSoa& renderableData, FScene::LightSoa& lightData,
-        CameraInfo const& cameraInfo) noexcept {
+void FView::prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableData,
+        FScene::LightSoa& lightData, CameraInfo const& cameraInfo) noexcept {
     SYSTRACE_CALL();
 
     mHasShadowing = false;
@@ -284,7 +283,7 @@ void FView::prepareShadowing(FEngine& engine, DriverApi& driver,
     auto& lcm = engine.getLightManager();
 
     // dominant directional light is always as index 0
-    FLightManager::Instance directionalLight = lightData.elementAt<FScene::LIGHT_INSTANCE>(0);
+    FLightManager::Instance const directionalLight = lightData.elementAt<FScene::LIGHT_INSTANCE>(0);
     const bool hasDirectionalShadows = directionalLight && lcm.isShadowCaster(directionalLight);
     if (UTILS_UNLIKELY(hasDirectionalShadows)) {
         const auto& shadowOptions = lcm.getShadowOptions(directionalLight);
@@ -338,6 +337,7 @@ void FView::prepareShadowing(FEngine& engine, DriverApi& driver,
 void FView::prepareLighting(FEngine& engine, FEngine::DriverApi& driver, ArenaScope& arena,
         filament::Viewport const& viewport, CameraInfo const& cameraInfo) noexcept {
     SYSTRACE_CALL();
+    SYSTRACE_CONTEXT();
 
     FScene* const scene = mScene;
     auto const& lightData = scene->getLightData();
@@ -433,6 +433,9 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena,
         filament::Viewport const& viewport, CameraInfo const& cameraInfo,
         float4 const& userTime, bool needsAlphaChannel) noexcept {
 
+        SYSTRACE_CALL();
+        SYSTRACE_CONTEXT();
+
     JobSystem& js = engine.getJobSystem();
 
     /*
@@ -464,7 +467,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena,
      * Gather all information needed to render this scene. Apply the world origin to all
      * objects in the scene.
      */
-    scene->prepare(cameraInfo.worldOrigin, hasVSM());
+    scene->prepare(js, arena.getAllocator(), cameraInfo.worldOrigin, hasVSM());
 
     /*
      * Light culling: runs in parallel with Renderable culling (below)
@@ -504,7 +507,7 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena,
         if (prepareVisibleLightsJob) {
             js.waitAndRelease(prepareVisibleLightsJob);
         }
-        prepareShadowing(engine, driver, renderableData, scene->getLightData(), cameraInfo);
+        prepareShadowing(engine, renderableData, scene->getLightData(), cameraInfo);
 
         /*
          * Partition the SoA so that renderables are partitioned w.r.t their visibility into the
@@ -528,6 +531,8 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena,
         // TODO: we need to compare performance of doing this partitioning vs not doing it.
         //       and rely on checking visibility in the loops
 
+        SYSTRACE_NAME_BEGIN("Partitioning");
+
         // calculate the sorting key for all elements, based on their visibility
         uint8_t const* layers = renderableData.data<FScene::LAYERS>();
         auto const* visibility = renderableData.data<FScene::VISIBILITY_STATE>();
@@ -568,6 +573,8 @@ void FView::prepare(FEngine& engine, DriverApi& driver, ArenaScope& arena,
 
         mSpotLightShadowCasters = merged;
 
+        SYSTRACE_NAME_END();
+
         // TODO: when any spotlight is used, `merged` ends-up being the whole list. However,
         //       some of the items will end-up not being visible by any light. Can we do better?
         //       e.g. could we deffer some of the prepareVisibleRenderables() to later?
diff --git a/filament/src/details/View.h b/filament/src/details/View.h
index 8c9f42a9be30..28e8cfbf3c24 100644
--- a/filament/src/details/View.h
+++ b/filament/src/details/View.h
@@ -140,9 +140,8 @@ class FView : public View {
             const Viewport& physicalViewport,
             const filament::Viewport& logicalViewport) const noexcept;
 
-    void prepareShadowing(FEngine& engine, backend::DriverApi& driver,
-            FScene::RenderableSoa& renderableData, FScene::LightSoa& lightData,
-            CameraInfo const& cameraInfo) noexcept;
+    void prepareShadowing(FEngine& engine, FScene::RenderableSoa& renderableData,
+            FScene::LightSoa& lightData, CameraInfo const& cameraInfo) noexcept;
     void prepareLighting(FEngine& engine, FEngine::DriverApi& driver, ArenaScope& arena,
             filament::Viewport const& viewport, CameraInfo const &cameraInfo) noexcept;
 
diff --git a/ios/CocoaPods/Filament.podspec b/ios/CocoaPods/Filament.podspec
index 5581bd0372c4..a47b27ad2a2a 100644
--- a/ios/CocoaPods/Filament.podspec
+++ b/ios/CocoaPods/Filament.podspec
@@ -1,12 +1,12 @@
 Pod::Spec.new do |spec|
   spec.name = "Filament"
-  spec.version = "1.31.6"
+  spec.version = "1.31.7"
   spec.license = { :type => "Apache 2.0", :file => "LICENSE" }
   spec.homepage = "https://google.github.io/filament"
   spec.authors = "Google LLC."
   spec.summary = "Filament is a real-time physically based rendering engine for Android, iOS, Windows, Linux, macOS, and WASM/WebGL."
   spec.platform = :ios, "11.0"
-  spec.source = { :http => "https://github.com/google/filament/releases/download/v1.31.6/filament-v1.31.6-ios.tgz" }
+  spec.source = { :http => "https://github.com/google/filament/releases/download/v1.31.7/filament-v1.31.7-ios.tgz" }
 
   # Fix linking error with Xcode 12; we do not yet support the simulator on Apple silicon.
   spec.pod_target_xcconfig = {
diff --git a/libs/filabridge/include/private/filament/UibStructs.h b/libs/filabridge/include/private/filament/UibStructs.h
index 03e1282d6a5d..7073769ef388 100644
--- a/libs/filabridge/include/private/filament/UibStructs.h
+++ b/libs/filabridge/include/private/filament/UibStructs.h
@@ -175,6 +175,7 @@ static_assert(sizeof(PerViewUib) == sizeof(math::float4) * 128,
 struct PerRenderableData {
 
     struct alignas(16) vec3_std140 : public std::array<float, 3> { };
+    struct alignas(16) vec4_std140 : public std::array<float, 4> { };
     struct mat33_std140 : public std::array<vec3_std140, 3> {
         mat33_std140& operator=(math::mat3f const& rhs) noexcept {
             for (int i = 0; i < 3; i++) {
@@ -185,8 +186,19 @@ struct PerRenderableData {
             return *this;
         }
     };
+    struct mat44_std140 : public std::array<vec4_std140, 4> {
+        mat44_std140& operator=(math::mat4f const& rhs) noexcept {
+            // element-wise copy of the 4x4 matrix into the 16-byte-aligned vec4 rows of this array
+            for (int outer = 0; outer < 4; outer++) {
+                for (int inner = 0; inner < 4; inner++) {
+                    (*this)[outer][inner] = rhs[outer][inner];
+                }
+            }
+            return *this;
+        }
+    };
 
-    math::mat4f worldFromModelMatrix;
+    mat44_std140 worldFromModelMatrix;
     mat33_std140 worldFromModelNormalMatrix;
     uint32_t morphTargetCount;
     uint32_t flagsChannels;                   // see packFlags() below (0x00000fll)
@@ -204,6 +216,13 @@ struct PerRenderableData {
                channels;
     }
 };
+
+#ifndef _MSC_VER
+// not sure why this static_assert fails on MSVC
+static_assert(std::is_trivially_default_constructible_v<PerRenderableData>,
+        "make sure PerRenderableData stays trivially_default_constructible");
+#endif
+
 static_assert(sizeof(PerRenderableData) == 256,
         "sizeof(PerRenderableData) must be 256 bytes");
 
diff --git a/libs/geometry/CMakeLists.txt b/libs/geometry/CMakeLists.txt
index 77286760b957..38e8a9e878b0 100644
--- a/libs/geometry/CMakeLists.txt
+++ b/libs/geometry/CMakeLists.txt
@@ -14,6 +14,7 @@ set(PUBLIC_HDRS
 )
 
 set(SRCS
+        src/MikktspaceImpl.cpp
         src/SurfaceOrientation.cpp
         src/TangentSpaceMesh.cpp
         src/Transcoder.cpp
@@ -26,7 +27,13 @@ include_directories(${PUBLIC_HDR_DIR})
 
 add_library(${TARGET} STATIC ${PUBLIC_HDRS} ${SRCS})
 
+set(GEOMETRY_DEPS
+    meshoptimizer
+    mikktspace
+)
+
 target_link_libraries(${TARGET} PUBLIC math utils)
+target_link_libraries(${TARGET} PRIVATE ${GEOMETRY_DEPS})
 
 target_include_directories(${TARGET} PUBLIC ${PUBLIC_HDR_DIR})
 set_target_properties(${TARGET} PROPERTIES FOLDER Libs)
@@ -47,6 +54,19 @@ endif()
 install(TARGETS ${TARGET} ARCHIVE DESTINATION lib/${DIST_DIR})
 install(DIRECTORY ${PUBLIC_HDR_DIR}/geometry DESTINATION include)
 
+set(COMBINED_DEPS
+    ${TARGET}
+    ${GEOMETRY_DEPS}
+)
+
+# Combine the deps into a single static lib so that clients only have to link this lib and do not
+# have to link its dependencies.
+set(GEOMETRY_COMBINED_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/libgeometry_combined.a")
+combine_static_libs(${TARGET} "${GEOMETRY_COMBINED_OUTPUT}" "${COMBINED_DEPS}")
+
+set(GEOMETRY_LIB_NAME ${CMAKE_STATIC_LIBRARY_PREFIX}geometry${CMAKE_STATIC_LIBRARY_SUFFIX})
+install(FILES "${GEOMETRY_COMBINED_OUTPUT}" DESTINATION lib/${DIST_DIR} RENAME ${GEOMETRY_LIB_NAME})
+
 # ==================================================================================================
 # Tests
 # ==================================================================================================
diff --git a/libs/geometry/include/geometry/TangentSpaceMesh.h b/libs/geometry/include/geometry/TangentSpaceMesh.h
index c8f1502f77b8..bb47c030a834 100644
--- a/libs/geometry/include/geometry/TangentSpaceMesh.h
+++ b/libs/geometry/include/geometry/TangentSpaceMesh.h
@@ -20,7 +20,6 @@
 #include <math/quat.h>
 #include <math/vec3.h>
 #include <math/vec4.h>
-#include <utils/compiler.h>
 
 namespace filament {
 namespace geometry {
diff --git a/libs/geometry/src/MikktspaceImpl.cpp b/libs/geometry/src/MikktspaceImpl.cpp
new file mode 100644
index 000000000000..6c35647f6ba3
--- /dev/null
+++ b/libs/geometry/src/MikktspaceImpl.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MikktspaceImpl.h"
+
+#include <math/mat3.h>
+#include <math/norm.h>
+
+
+#include <meshoptimizer.h>
+#include <mikktspace/mikktspace.h>
+
+#include <vector>
+
+namespace filament::geometry {
+
+using namespace filament::math;
+
+int MikktspaceImpl::getNumFaces(SMikkTSpaceContext const* context) noexcept {
+    // MikkTSpace callback: reports the triangle count captured from the input mesh.
+    return getThis(context)->mFaceCount;
+}
+
+int MikktspaceImpl::getNumVerticesOfFace(SMikkTSpaceContext const* /*context*/,
+        int const /*iFace*/) noexcept {
+    return 3; // MikkTSpace callback: every face is reported as a triangle
+}
+
+void MikktspaceImpl::getPosition(SMikkTSpaceContext const* context, float fvPosOut[],
+        const int iFace, const int iVert) noexcept {
+    auto const self = getThis(context);
+    uint32_t const index = self->getTriangle(iFace)[iVert]; // (face, corner) -> vertex index
+    float3 const p = *pointerAdd(self->mPositions, index, self->mPositionStride);
+    fvPosOut[0] = p.x;
+    fvPosOut[1] = p.y;
+    fvPosOut[2] = p.z;
+}
+
+void MikktspaceImpl::getNormal(SMikkTSpaceContext const* context, float fvNormOut[],
+        int const iFace, int const iVert) noexcept {
+    auto const self = getThis(context);
+    uint32_t const index = self->getTriangle(iFace)[iVert]; // (face, corner) -> vertex index
+    float3 const nrm = *pointerAdd(self->mNormals, index, self->mNormalStride);
+    fvNormOut[0] = nrm.x;
+    fvNormOut[1] = nrm.y;
+    fvNormOut[2] = nrm.z;
+}
+
+void MikktspaceImpl::getTexCoord(SMikkTSpaceContext const* context, float fvTexcOut[],
+        const int iFace, const int iVert) noexcept {
+    auto const self = getThis(context);
+    uint32_t const index = self->getTriangle(iFace)[iVert]; // (face, corner) -> vertex index
+    float2 const uv = *pointerAdd(self->mUVs, index, self->mUVStride);
+    fvTexcOut[0] = uv.x;
+    fvTexcOut[1] = uv.y;
+}
+
+void MikktspaceImpl::setTSpaceBasic(SMikkTSpaceContext const* context, float const fvTangent[],
+        float const fSign, int const iFace, int const iVert) noexcept {
+    auto const self = getThis(context);
+    uint32_t const index = self->getTriangle(iFace)[iVert];
+    float3 const position = *pointerAdd(self->mPositions, index, self->mPositionStride);
+    float2 const texcoord = *pointerAdd(self->mUVs, index, self->mUVStride);
+    float3 const normal = normalize(*pointerAdd(self->mNormals, index, self->mNormalStride));
+    float3 const tangent{fvTangent[0], fvTangent[1], fvTangent[2]};
+    float3 const bitangent = fSign * normalize(cross(normal, tangent));
+
+    // TODO: packTangentFrame actually changes the orientation of the bitangent.
+    quatf const frame = mat3f::packTangentFrame({tangent, bitangent, normal}, sizeof(int32_t));
+    // One output vertex is appended per (face, corner) callback; run() consumes the list.
+    self->mOutVertices.push_back({position, texcoord, frame});
+}
+
+MikktspaceImpl::MikktspaceImpl(const TangentSpaceMeshInput* input) noexcept
+    : mFaceCount((int) input->triangleCount),
+      mPositions(input->positions), // a stride of 0 selects the tightly packed element size
+      mPositionStride(input->positionStride ? input->positionStride : sizeof(float3)),
+      mNormals(input->normals),
+      mNormalStride(input->normalStride ? input->normalStride : sizeof(float3)),
+      mUVs(input->uvs),
+      mUVStride(input->uvStride ? input->uvStride : sizeof(float2)),
+      mTriangles(input->triangles16 ? reinterpret_cast<uint8_t const*>(input->triangles16)
+                                    : reinterpret_cast<uint8_t const*>(input->triangles32)),
+      mIsTriangle16(input->triangles16 != nullptr) { // initializers follow declaration order
+    mOutVertices.reserve(size_t(mFaceCount) * 3); // setTSpaceBasic() adds one entry per corner
+}
+
+MikktspaceImpl* MikktspaceImpl::getThis(SMikkTSpaceContext const* context) noexcept {
+    return static_cast<MikktspaceImpl*>(context->m_pUserData); // set to `this` in run()
+}
+
+inline const uint3 MikktspaceImpl::getTriangle(int triangleIndex) const noexcept {
+    // Indices are tightly packed; 16-bit triangles are widened to 32 bits on the way out.
+    if (mIsTriangle16) return uint3(((ushort3 const*) mTriangles)[triangleIndex]);
+    return ((uint3 const*) mTriangles)[triangleIndex];
+}
+
+void MikktspaceImpl::run(TangentSpaceMeshOutput* output) noexcept {
+    SMikkTSpaceInterface callbacks { // not named `interface`: that is a macro in Windows headers
+        .m_getNumFaces = MikktspaceImpl::getNumFaces,
+        .m_getNumVerticesOfFace = MikktspaceImpl::getNumVerticesOfFace,
+        .m_getPosition = MikktspaceImpl::getPosition,
+        .m_getNormal = MikktspaceImpl::getNormal,
+        .m_getTexCoord = MikktspaceImpl::getTexCoord,
+        .m_setTSpaceBasic = MikktspaceImpl::setTSpaceBasic
+    };
+    SMikkTSpaceContext context{.m_pInterface = &callbacks, .m_pUserData = this};
+    genTangSpaceDefault(&context);
+    // setTSpaceBasic() appended one vertex per corner; build a remap table over those vertices.
+    std::vector<unsigned int> remap(mOutVertices.size());
+    size_t const vertexCount = meshopt_generateVertexRemap(remap.data(), nullptr,
+            mOutVertices.size(), mOutVertices.data(), mOutVertices.size(), sizeof(IOVertex));
+    // Gather the vertices selected by the remap table into a compact array.
+    std::vector<IOVertex> newVertices(vertexCount);
+    meshopt_remapVertexBuffer((void*) newVertices.data(), mOutVertices.data(), mOutVertices.size(),
+            sizeof(IOVertex), remap.data());
+    // No source index buffer is passed (nullptr): the corners are treated as unindexed.
+    uint3* triangles32 = new uint3[mFaceCount];
+    meshopt_remapIndexBuffer((uint32_t*) triangles32, nullptr, mOutVertices.size(), remap.data());
+    // The arrays below are handed out through `output`; this function does not free them.
+    float3* outPositions = new float3[vertexCount];
+    float2* outUVs = new float2[vertexCount];
+    quatf* outQuats = new quatf[vertexCount];
+
+    for (size_t i = 0; i < vertexCount; ++i) {
+        outPositions[i] = newVertices[i].position;
+        outUVs[i] = newVertices[i].uv;
+        outQuats[i] = newVertices[i].tangentSpace;
+    }
+
+    output->vertexCount = vertexCount;
+    output->positions = outPositions;
+    output->uvs = outUVs;
+    output->tangentSpace = outQuats;
+    output->triangles32 = triangles32;
+    output->triangleCount = mFaceCount;
+}
+
+}// namespace filament::geometry
diff --git a/libs/geometry/src/MikktspaceImpl.h b/libs/geometry/src/MikktspaceImpl.h
new file mode 100644
index 000000000000..cc4831d4d662
--- /dev/null
+++ b/libs/geometry/src/MikktspaceImpl.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TNT_GEOMETRY_MIKKTSPACEIMPL_H
+#define TNT_GEOMETRY_MIKKTSPACEIMPL_H
+
+#include "TangentSpaceMeshInternal.h"
+
+#include <math/quat.h>
+#include <math/vec2.h>
+#include <math/vec3.h>
+
+#include <vector>
+
+struct SMikkTSpaceContext;
+
+namespace filament::geometry {
+
+using namespace filament::math;
+
+class MikktspaceImpl { // adapter that feeds a TangentSpaceMeshInput to the MikkTSpace library
+public:
+    struct IOVertex { // one entry is appended per (face, corner) by setTSpaceBasic()
+        float3 position;
+        float2 uv;
+        quatf tangentSpace;
+    };
+
+    MikktspaceImpl(const TangentSpaceMeshInput* input) noexcept;
+
+    MikktspaceImpl(const MikktspaceImpl&) = delete;
+    MikktspaceImpl& operator=(const MikktspaceImpl&) = delete;
+
+    void run(TangentSpaceMeshOutput* output) noexcept; // fills *output with newly allocated arrays
+
+private: // the static functions below are the callbacks run() installs in SMikkTSpaceInterface
+    static int getNumFaces(SMikkTSpaceContext const* context) noexcept;
+    static int getNumVerticesOfFace(SMikkTSpaceContext const* context, int const iFace) noexcept;
+    static void getPosition(SMikkTSpaceContext const* context, float fvPosOut[], const int iFace,
+            const int iVert) noexcept;
+    static void getNormal(SMikkTSpaceContext const* context, float fvNormOut[], int const iFace,
+            int const iVert) noexcept;
+    static void getTexCoord(SMikkTSpaceContext const* context, float fvTexcOut[], const int iFace,
+            const int iVert) noexcept;
+    static void setTSpaceBasic(SMikkTSpaceContext const* context, float const fvTangent[],
+            float const fSign, int const iFace, int const iVert) noexcept;
+
+    static MikktspaceImpl* getThis(SMikkTSpaceContext const* context) noexcept; // from m_pUserData
+
+    inline const uint3 getTriangle(int triangleIndex) const noexcept; // always 32-bit indices
+
+    int const mFaceCount; // input->triangleCount
+    float3 const* mPositions;
+    size_t const mPositionStride; // bytes; sizeof(float3) when the input stride is 0
+    float3 const* mNormals;
+    size_t const mNormalStride; // bytes; sizeof(float3) when the input stride is 0
+    float2 const* mUVs;
+    size_t const mUVStride; // bytes; sizeof(float2) when the input stride is 0
+    uint8_t const* mTriangles; // input->triangles16 or input->triangles32, viewed as raw bytes
+    bool mIsTriangle16; // true when input->triangles16 was provided
+
+    std::vector<IOVertex> mOutVertices; // filled by setTSpaceBasic(), consumed by run()
+};
+
+}// namespace filament::geometry
+
+#endif//TNT_GEOMETRY_MIKKTSPACEIMPL_H
diff --git a/libs/geometry/src/TangentSpaceMesh.cpp b/libs/geometry/src/TangentSpaceMesh.cpp
index 62e503a6229b..8e21c9e3b95b 100644
--- a/libs/geometry/src/TangentSpaceMesh.cpp
+++ b/libs/geometry/src/TangentSpaceMesh.cpp
@@ -16,61 +16,36 @@
 
 #include <geometry/TangentSpaceMesh.h>
 
+#include "MikktspaceImpl.h"
+#include "TangentSpaceMeshInternal.h"
+
 #include <math/mat3.h>
 #include <math/norm.h>
 
 #include <utils/Log.h>
 #include <utils/Panic.h>
 
+#include <vector>
+
 namespace filament {
 namespace geometry {
 
 using namespace filament::math;
 using Builder = TangentSpaceMesh::Builder;
 using Algorithm = TangentSpaceMesh::Algorithm;
-using MethodPtr = void(*)(const TangentSpaceMeshInput*, TangentSpaceMeshOutput*);
-using NormalsOnlyKernelPtr = void(*)(const float3& N, float3& T, float3& B);
-
-struct TangentSpaceMeshInput {
-    size_t vertexCount = 0;
-    const float3* normals = nullptr;
-    const float2* uvs = nullptr;
-    const float3* positions = nullptr;
-    const ushort3* triangles16 = nullptr;
-    const uint3* triangles32 = nullptr;
-
-    size_t normalStride = 0;
-    size_t uvStride = 0;
-    size_t positionStride = 0;
-    size_t triangleCount = 0;
-
-    Algorithm algorithm;
-};
-
-struct TangentSpaceMeshOutput {
-    Algorithm algorithm;
-
-    size_t triangleCount = 0;
-    size_t vertexCount = 0;
-
-    quatf const* tangentSpace = nullptr;
-    float2 const* uvs = nullptr;
-    float3 const* positions = nullptr;
-    uint3 const* triangles32 = nullptr;
-    ushort3 const* triangles16 = nullptr;
-};
+using MethodPtr = void(*)(TangentSpaceMeshInput const*, TangentSpaceMeshOutput*);
 
 namespace {
 
-const uint8_t NORMALS_BIT = 0x01;
-const uint8_t UVS_BIT = 0x02;
-const uint8_t POSITIONS_BIT = 0x04;
-const uint8_t INDICES_BIT = 0x08;
+uint8_t const NORMALS_BIT = 0x01;
+uint8_t const UVS_BIT = 0x02;
+uint8_t const POSITIONS_BIT = 0x04;
+uint8_t const INDICES_BIT = 0x08;
 
 // Input types
-const uint8_t NORMALS = NORMALS_BIT;
-const uint8_t POSITIONS_INDICES = POSITIONS_BIT | INDICES_BIT;
-const uint8_t NORMALS_UVS_POSITIONS_INDICES = NORMALS_BIT | UVS_BIT | POSITIONS_BIT | INDICES_BIT;
+uint8_t const NORMALS = NORMALS_BIT;
+uint8_t const POSITIONS_INDICES = POSITIONS_BIT | INDICES_BIT;
+uint8_t const NORMALS_UVS_POSITIONS_INDICES = NORMALS_BIT | UVS_BIT | POSITIONS_BIT | INDICES_BIT;
 
 std::string_view to_string(Algorithm algorithm) noexcept {
     switch (algorithm) {
@@ -89,20 +64,10 @@ std::string_view to_string(Algorithm algorithm) noexcept {
     }
 }
 
-inline bool isInputType(const uint8_t inputType, const uint8_t checkType) noexcept {
+inline bool isInputType(uint8_t const inputType, uint8_t const checkType) noexcept {
     return ((inputType & checkType) == checkType);
 }
 
-template<typename InputType>
-inline const InputType* pointerAdd(const InputType* ptr, size_t index, size_t stride) noexcept {
-    return (InputType*) (((const uint8_t*) ptr) + (index * stride));
-}
-
-template<typename InputType>
-inline InputType* pointerAdd(InputType* ptr, size_t index, size_t stride) noexcept {
-    return (InputType*) (((uint8_t*) ptr) + (index * stride));
-}
-
 template <typename InputType>
 inline void takeStride(InputType*& out, size_t stride) noexcept {
     out = pointerAdd(out, 1, stride);
@@ -179,7 +144,7 @@ Algorithm selectAlgorithm(TangentSpaceMeshInput *input) noexcept {
     if (!foundAlgo) {
         outAlgo = selectBestDefaultAlgorithm(inputType);
         utils::slog.w << "Cannot satisfy algorithm=" << to_string(input->algorithm)
-            << ". Selected algorithm=" << to_string(input->algorithm) << " instead"
+            << ". Selected algorithm=" << to_string(outAlgo) << " instead"
             << utils::io::endl;
     }
 
@@ -187,32 +152,32 @@ Algorithm selectAlgorithm(TangentSpaceMeshInput *input) noexcept {
 }
 
 // The paper uses a Z-up world basis, which has been converted to Y-up here
-inline std::pair<float3, float3> frisvadKernel(const float3& n) {
+inline std::pair<float3, float3> frisvadKernel(float3 const& n) {
     float3 b, t;
     if (n.y < -1.0f + std::numeric_limits<float>::epsilon()) {
         // Handle the singularity
         t = float3{-1.0f, 0.0f, 0.0f};
         b = float3{0.0f, 0.0f, -1.0f};
     } else {
-        const float va = 1.0f / (1.0f + n.y);
-        const float vb = -n.z * n.x * va;
+        float const va = 1.0f / (1.0f + n.y);
+        float const vb = -n.z * n.x * va;
         t = float3{vb, -n.z, 1.0f - n.z * n.z * va};
         b = float3{1.0f - n.x * n.x * va, -n.x, vb};
     }
     return {b, t};
 }
 
-void frisvadMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutput* output)
+void frisvadMethod(TangentSpaceMeshInput const* input, TangentSpaceMeshOutput* output)
         noexcept {
-    const size_t vertexCount = input->vertexCount;
+    size_t const vertexCount = input->vertexCount;
     quatf* quats = new quatf[vertexCount];
 
-    const float3* UTILS_RESTRICT normals = input->normals;
+    float3 const* UTILS_RESTRICT normals = input->normals;
     size_t nstride = input->normalStride ? input->normalStride : sizeof(float3);
 
     for (size_t qindex = 0; qindex < vertexCount; ++qindex) {
-        const float3 n = *normals;
-        const auto [b, t] = frisvadKernel(n);
+        float3 const n = *normals;
+        auto const [b, t] = frisvadKernel(n);
         quats[qindex] = mat3f::packTangentFrame({t, b, n}, sizeof(int32_t));
         normals = pointerAdd(normals, 1, nstride);
     }
@@ -225,17 +190,16 @@ void frisvadMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutput* o
     output->triangles16 = input->triangles16;
 }
 
-
-void hughesMollerMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutput* output)
+void hughesMollerMethod(TangentSpaceMeshInput const* input, TangentSpaceMeshOutput* output)
         noexcept {
-    const size_t vertexCount = input->vertexCount;
+    size_t const vertexCount = input->vertexCount;
     quatf* quats = new quatf[vertexCount];
 
-    const float3* UTILS_RESTRICT normals = input->normals;
+    float3 const* UTILS_RESTRICT normals = input->normals;
     size_t nstride = input->normalStride ? input->normalStride : sizeof(float3);
 
     for (size_t qindex = 0; qindex < vertexCount; ++qindex) {
-        const float3 n = *normals;
+        float3 const n = *normals;
         float3 b, t;
 
         if (abs(n.x) > abs(n.z) + std::numeric_limits<float>::epsilon()) {
@@ -258,7 +222,7 @@ void hughesMollerMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutp
     output->triangles16 = input->triangles16;
 }
 
-void flatShadingMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutput* output)
+void flatShadingMethod(TangentSpaceMeshInput const* input, TangentSpaceMeshOutput* output)
         noexcept {
     const float3* positions = input->positions;
     const size_t pstride = input->positionStride ? input->positionStride : sizeof(float3);
@@ -321,6 +285,102 @@ void flatShadingMethod(const TangentSpaceMeshInput* input, TangentSpaceMeshOutpu
     output->triangleCount = outTriangleCount;
 }
 
+void mikktspaceMethod(TangentSpaceMeshInput const* input, TangentSpaceMeshOutput* output) {
+    // Thin entry point: the whole computation is delegated to MikktspaceImpl.
+    MikktspaceImpl(input).run(output);
+}
+
+inline float3 randomPerp(const float3& n) {
+    // Pick a direction perpendicular to n: cross with X, or with Y when n is ~parallel to X.
+    float3 perp = cross(n, float3{1, 0, 0});
+    if (dot(perp, perp) <= std::numeric_limits<float>::epsilon()) {
+        perp = cross(n, float3{0, 1, 0});
+    }
+    // Unit length (not perp / |perp|^2) so a near-degenerate pick cannot swamp summed tangents.
+    return normalize(perp);
+}
+
+void lengyelMethod(TangentSpaceMeshInput const* input, TangentSpaceMeshOutput* output) {
+    size_t const vertexCount = input->vertexCount;
+    size_t const triangleCount = input->triangleCount;
+    size_t const positionStride = input->positionStride ? input->positionStride : sizeof(float3);
+    size_t const normalStride = input->normalStride ? input->normalStride : sizeof(float3);
+    size_t const uvStride = input->uvStride ? input->uvStride : sizeof(float2);
+    auto const* triangles16 = input->triangles16;
+    auto const* triangles32 = input->triangles32;
+    auto const* positions = input->positions;
+    auto const* uvs = input->uvs;
+    auto const* normals = input->normals;
+    // Lengyel's method: sum per-triangle S/T directions per vertex, then orthonormalize them.
+    std::vector<float3> tan1(vertexCount, float3{0.0f});
+    std::vector<float3> tan2(vertexCount, float3{0.0f});
+    for (size_t a = 0; a < triangleCount; ++a) {
+        uint3 tri = triangles16 ? uint3(triangles16[a]) : triangles32[a];
+        assert_invariant(tri.x < vertexCount && tri.y < vertexCount && tri.z < vertexCount);
+        float3 const& v1 = *pointerAdd(positions, tri.x, positionStride);
+        float3 const& v2 = *pointerAdd(positions, tri.y, positionStride);
+        float3 const& v3 = *pointerAdd(positions, tri.z, positionStride);
+        float2 const& w1 = *pointerAdd(uvs, tri.x, uvStride);
+        float2 const& w2 = *pointerAdd(uvs, tri.y, uvStride);
+        float2 const& w3 = *pointerAdd(uvs, tri.z, uvStride);
+        float const x1 = v2.x - v1.x;
+        float const x2 = v3.x - v1.x;
+        float const y1 = v2.y - v1.y;
+        float const y2 = v3.y - v1.y;
+        float const z1 = v2.z - v1.z;
+        float const z2 = v3.z - v1.z;
+        float const s1 = w2.x - w1.x;
+        float const s2 = w3.x - w1.x;
+        float const t1 = w2.y - w1.y;
+        float const t2 = w3.y - w1.y;
+        float const d = s1 * t2 - s2 * t1;
+        float3 sdir, tdir;
+        // In general we can't guarantee smooth tangents when the UV's are non-smooth, but let's at
+        // least avoid divide-by-zero and fall back to normals-only method.
+        if (d == 0.0) {
+            float3 const& n1 = *pointerAdd(normals, tri.x, normalStride);
+            sdir = randomPerp(n1);
+            tdir = cross(n1, sdir);
+        } else {
+            sdir = {t2 * x1 - t1 * x2, t2 * y1 - t1 * y2, t2 * z1 - t1 * z2};
+            tdir = {s1 * x2 - s2 * x1, s1 * y2 - s2 * y1, s1 * z2 - s2 * z1};
+            float const r = 1.0f / d;
+            sdir *= r;
+            tdir *= r;
+        }
+        tan1[tri.x] += sdir;
+        tan1[tri.y] += sdir;
+        tan1[tri.z] += sdir;
+        tan2[tri.x] += tdir;
+        tan2[tri.y] += tdir;
+        tan2[tri.z] += tdir;
+    }
+
+    quatf* quats = new quatf[vertexCount];
+    for (size_t a = 0; a < vertexCount; a++) {
+        float3 const& t1 = tan1[a];
+        float3 const& t2 = tan2[a];
+        // Step through normals by normalStride (as the triangle loop does), not by sizeof(float3).
+        float3 const& n = *pointerAdd(normals, a, normalStride);
+        // Gram-Schmidt orthogonalize
+        float3 t = normalize(t1 - n * dot(n, t1));
+
+        // Calculate handedness
+        float w = (dot(cross(n, t1), t2) < 0.0f) ? -1.0f : 1.0f;
+
+        float3 b = w < 0 ? cross(t, n) : cross(n, t);
+        quats[a] = mat3f::packTangentFrame({t, b, n}, sizeof(int32_t));
+    }
+
+    output->tangentSpace = quats;
+    output->vertexCount = vertexCount;
+    output->triangleCount = triangleCount;
+    output->uvs = uvs;
+    output->positions = positions;
+    output->triangles32 = triangles32;
+    output->triangles16 = triangles16;
+}
+
 template<typename DataType, typename InputType>
 inline void cleanOutputPointer(DataType*& ptr, InputType inputPtr) noexcept {
     if (ptr && ptr != (const DataType*) inputPtr) {
@@ -397,12 +457,18 @@ TangentSpaceMesh* Builder::build() {
     mMesh->mOutput->algorithm = selectAlgorithm(mMesh->mInput);
     MethodPtr method = nullptr;
     switch (mMesh->mOutput->algorithm) {
-        case Algorithm::FRISVAD:
-            method = frisvadMethod;
+        case Algorithm::MIKKTSPACE:
+            method = mikktspaceMethod;
+            break;
+        case Algorithm::LENGYEL:
+            method = lengyelMethod;
             break;
         case Algorithm::HUGHES_MOLLER:
             method = hughesMollerMethod;
             break;
+        case Algorithm::FRISVAD:
+            method = frisvadMethod;
+            break;
         case Algorithm::FLAT_SHADING:
             method = flatShadingMethod;
             break;
diff --git a/libs/geometry/src/TangentSpaceMeshInternal.h b/libs/geometry/src/TangentSpaceMeshInternal.h
new file mode 100644
index 000000000000..f9beb0684fea
--- /dev/null
+++ b/libs/geometry/src/TangentSpaceMeshInternal.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TNT_GEOMETRY_TANGENTSPACEMESHIMPL_H
+#define TNT_GEOMETRY_TANGENTSPACEMESHIMPL_H
+
+#include <geometry/TangentSpaceMesh.h>
+
+#include <math/mat3.h>
+#include <math/norm.h>
+
+namespace filament::geometry {
+
+using namespace filament::math;
+using Algorithm = TangentSpaceMesh::Algorithm;
+
+struct TangentSpaceMeshInput {
+    size_t vertexCount = 0;
+    float3 const* normals = nullptr;
+    float2 const* uvs = nullptr;
+    float3 const* positions = nullptr;
+    ushort3 const* triangles16 = nullptr; // 16-bit index triples; consulted before triangles32
+    uint3 const* triangles32 = nullptr;
+
+    size_t normalStride = 0; // strides are in bytes; 0 is read as "tightly packed"
+    size_t uvStride = 0;
+    size_t positionStride = 0;
+    size_t triangleCount = 0;
+
+    Algorithm algorithm{}; // value-initialized so a fresh input never holds an indeterminate enum
+};
+
+struct TangentSpaceMeshOutput {
+    Algorithm algorithm{}; // value-initialized so a fresh output never holds an indeterminate enum
+
+    size_t triangleCount = 0;
+    size_t vertexCount = 0;
+
+    quatf const* tangentSpace = nullptr; // packed tangent frames, one quaternion per vertex
+    float2 const* uvs = nullptr;         // the pointers below may simply alias the input arrays
+    float3 const* positions = nullptr;
+    uint3 const* triangles32 = nullptr;
+    ushort3 const* triangles16 = nullptr;
+};
+
+template<typename InputType> // const overload: returns ptr advanced by index * stride bytes
+inline const InputType* pointerAdd(InputType const* ptr, size_t index, size_t stride) noexcept {
+    return (InputType const*) (((uint8_t const*) ptr) + (index * stride));
+}
+
+template<typename InputType> // mutable overload: returns ptr advanced by index * stride bytes
+inline InputType* pointerAdd(InputType* ptr, size_t index, size_t stride) noexcept {
+    return reinterpret_cast<InputType*>(reinterpret_cast<uint8_t*>(ptr) + index * stride);
+}
+
+}// namespace filament::geometry
+
+#endif//TNT_GEOMETRY_TANGENTSPACEMESHIMPL_H
diff --git a/libs/geometry/tests/test_tangent_space_mesh.cpp b/libs/geometry/tests/test_tangent_space_mesh.cpp
index ba31e2c01d40..b048235fb77a 100644
--- a/libs/geometry/tests/test_tangent_space_mesh.cpp
+++ b/libs/geometry/tests/test_tangent_space_mesh.cpp
@@ -18,7 +18,6 @@
 
 #include <math/quat.h>
 #include <math/vec3.h>
-#include <utils/Log.h>
 
 #include <gtest/gtest.h>
 
@@ -39,15 +38,27 @@ const std::vector<float3> CUBE_VERTS {
         float3{1, 1, 1}
 };
 
-const std::vector<float2> CUBE_UVS{
+const std::vector<float2> CUBE_UVS {
+        float2{0, 0},
         float2{0, 0},
         float2{1, 0},
+        float2{1, 1},
+        float2{0, 1},
         float2{0, 1},
         float2{1, 1},
-        float2{.5, 0},
-        float2{0, .5},
-        float2{.5, .5},
-        float2{0, 0}
+        float2{0, 1}
+};
+
+const float3 CUBE_CENTER{.5, .5, .5}; // reference point the test normals radiate from
+const std::vector<float3> CUBE_NORMALS { // one unit normal per CUBE_VERTS entry, same order
+    normalize(CUBE_VERTS[0] - CUBE_CENTER),
+    normalize(CUBE_VERTS[1] - CUBE_CENTER),
+    normalize(CUBE_VERTS[2] - CUBE_CENTER),
+    normalize(CUBE_VERTS[3] - CUBE_CENTER),
+    normalize(CUBE_VERTS[4] - CUBE_CENTER),
+    normalize(CUBE_VERTS[5] - CUBE_CENTER),
+    normalize(CUBE_VERTS[6] - CUBE_CENTER),
+    normalize(CUBE_VERTS[7] - CUBE_CENTER),
 };
 
 const std::vector<ushort3> CUBE_TRIANGLES {
@@ -121,6 +132,17 @@ TEST_F(TangentSpaceMeshTest, BuilderDefaultAlgorithms) {
             .build();
     EXPECT_EQ(mesh->getAlgorithm(), TangentSpaceMesh::Algorithm::FRISVAD);
     TangentSpaceMesh::destroy(mesh);
+
+    mesh = TangentSpaceMesh::Builder()
+            .vertexCount(CUBE_VERTS.size())
+            .positions(CUBE_VERTS.data())
+            .uvs(CUBE_UVS.data())
+            .normals(CUBE_NORMALS.data())
+            .triangleCount(CUBE_TRIANGLES.size())
+            .triangles(CUBE_TRIANGLES.data())
+            .build();
+    EXPECT_EQ(mesh->getAlgorithm(), TangentSpaceMesh::Algorithm::MIKKTSPACE);
+    TangentSpaceMesh::destroy(mesh);
 }
 
 // Remeshed vertices/uvs should map to input vertices/uvs
@@ -189,7 +211,6 @@ TEST_F(TangentSpaceMeshTest, FlatShading) {
     TangentSpaceMesh::destroy(mesh);
 }
 
-
 TEST_F(TangentSpaceMeshTest, Frisvad) {
     TangentSpaceMesh* mesh = TangentSpaceMesh::Builder()
             .vertexCount(TEST_NORMALS.size())
@@ -244,6 +265,103 @@ TEST_F(TangentSpaceMeshTest, HughesMoller) {
     TangentSpaceMesh::destroy(mesh);
 }
 
+TEST_F(TangentSpaceMeshTest, MikktspaceRemesh) {
+    TangentSpaceMesh* mesh = TangentSpaceMesh::Builder()
+            .vertexCount(CUBE_VERTS.size())
+            .normals(CUBE_NORMALS.data())
+            .positions(CUBE_VERTS.data())
+            .uvs(CUBE_UVS.data())
+            .triangleCount(CUBE_TRIANGLES.size())
+            .triangles(CUBE_TRIANGLES.data())
+            .algorithm(TangentSpaceMesh::Algorithm::MIKKTSPACE)
+            .build();
+
+    size_t const vertexCount = mesh->getVertexCount();
+
+    std::vector<float3> outPositions(vertexCount);
+    mesh->getPositions(outPositions.data());
+
+    std::vector<float2> outUVs(vertexCount);
+    mesh->getUVs(outUVs.data());
+    // Each output vertex has to sit on some input position and carry that input vertex's UV.
+    for (size_t i = 0; i < outPositions.size(); ++i) {
+        auto const& outPos = outPositions[i];
+        auto const& outUV = outUVs[i];
+
+        bool found = false;
+        for (size_t j = 0; j < CUBE_VERTS.size(); ++j) {
+            auto const& inPos = CUBE_VERTS[j];
+            auto const& inUV = CUBE_UVS[j];
+            if (isAlmostEqual3(outPos, inPos)) {
+                found = true;
+                EXPECT_PRED2(isAlmostEqual2, outUV, inUV);
+                break;
+            }
+        }
+        EXPECT_TRUE(found);
+    }
+    TangentSpaceMesh::destroy(mesh);
+}
+
+TEST_F(TangentSpaceMeshTest, Mikktspace) {
+    // It's unclear why the dot product between n and b is greater than epsilon, but since we
+    // don't control the implementation of mikktspace, we simply add a little slack to the test.
+    constexpr float MAGIC_SLACK = 1.00001;
+    TangentSpaceMesh* mesh = TangentSpaceMesh::Builder()
+            .vertexCount(CUBE_VERTS.size())
+            .normals(CUBE_NORMALS.data())
+            .positions(CUBE_VERTS.data())
+            .uvs(CUBE_UVS.data())
+            .triangleCount(CUBE_TRIANGLES.size())
+            .triangles(CUBE_TRIANGLES.data())
+            .algorithm(TangentSpaceMesh::Algorithm::MIKKTSPACE)
+            .build();
+
+    size_t const vertexCount = mesh->getVertexCount();
+    std::vector<quatf> quats(vertexCount);
+    mesh->getQuats(quats.data());
+    for (size_t i = 0; i < vertexCount; ++i) {
+        float3 const n = quats[i] * NORMAL_AXIS;
+        float3 const b = quats[i] * BITANGENT_AXIS;
+        float3 const t = quats[i] * TANGENT_AXIS;
+        // The decoded frame must be mutually orthogonal, with b equal to n x t.
+        EXPECT_LT(abs(dot(b, t)), std::numeric_limits<float>::epsilon());
+        EXPECT_LT(abs(dot(n, t)), std::numeric_limits<float>::epsilon());
+        EXPECT_LT(abs(dot(n, b)), std::numeric_limits<float>::epsilon() * MAGIC_SLACK);
+        EXPECT_PRED2(isAlmostEqual3, cross(n, t), b);
+    }
+    TangentSpaceMesh::destroy(mesh);
+}
+
+TEST_F(TangentSpaceMeshTest, Lengyel) {
+    TangentSpaceMesh* mesh = TangentSpaceMesh::Builder()
+            .vertexCount(CUBE_VERTS.size())
+            .normals(CUBE_NORMALS.data())
+            .positions(CUBE_VERTS.data())
+            .uvs(CUBE_UVS.data())
+            .triangleCount(CUBE_TRIANGLES.size())
+            .triangles(CUBE_TRIANGLES.data())
+            .algorithm(TangentSpaceMesh::Algorithm::LENGYEL)
+            .build();
+
+    size_t const vertexCount = mesh->getVertexCount();
+    std::vector<quatf> quats(vertexCount);
+    mesh->getQuats(quats.data());
+    for (size_t i = 0; i < vertexCount; ++i) {
+        float3 const n = quats[i] * NORMAL_AXIS;
+        EXPECT_PRED2(isAlmostEqual3, n, CUBE_NORMALS[i]); // the input normal must be preserved
+        // The rest of the decoded frame must be orthogonal to n, with b equal to n x t.
+        float3 const b = quats[i] * BITANGENT_AXIS;
+        float3 const t = quats[i] * TANGENT_AXIS;
+
+        EXPECT_LT(abs(dot(b, t)), std::numeric_limits<float>::epsilon());
+        EXPECT_LT(abs(dot(n, t)), std::numeric_limits<float>::epsilon());
+        EXPECT_LT(abs(dot(n, b)), std::numeric_limits<float>::epsilon());
+        EXPECT_PRED2(isAlmostEqual3, cross(n, t), b);
+    }
+    TangentSpaceMesh::destroy(mesh);
+}
+
 int main(int argc, char** argv) {
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
diff --git a/libs/utils/include/utils/SingleInstanceComponentManager.h b/libs/utils/include/utils/SingleInstanceComponentManager.h
index 2ed7f2c9dd92..c03ec5f100c6 100644
--- a/libs/utils/include/utils/SingleInstanceComponentManager.h
+++ b/libs/utils/include/utils/SingleInstanceComponentManager.h
@@ -60,16 +60,19 @@ class UTILS_PUBLIC SingleInstanceComponentManager {
 protected:
     static constexpr size_t ENTITY_INDEX = sizeof ... (Elements);
 
+
 public:
     using SoA = StructureOfArrays<Elements ..., Entity>;
 
+    using Structure = typename SoA::Structure;
+
     using Instance = EntityInstanceBase::Type;
 
     SingleInstanceComponentManager() noexcept {
         // We always start with a dummy entry because index=0 is reserved. The component
         // at index = 0, is guaranteed to be default-initialized.
         // Sub-classes can use this to their advantage.
-        mData.push_back();
+        mData.push_back(Structure{});
     }
 
     SingleInstanceComponentManager(SingleInstanceComponentManager&&) noexcept {/* = default */}
@@ -269,7 +272,7 @@ SingleInstanceComponentManager<Elements ...>::addComponent(Entity e) {
     if (!e.isNull()) {
         if (!hasComponent(e)) {
             // this is like a push_back(e);
-            mData.push_back().template back<ENTITY_INDEX>() = e;
+            mData.push_back(Structure{}).template back<ENTITY_INDEX>() = e;
             // index 0 is used when the component doesn't exist
             ci = Instance(mData.size() - 1);
             mInstanceMap[e] = ci;
diff --git a/libs/utils/include/utils/StructureOfArrays.h b/libs/utils/include/utils/StructureOfArrays.h
index b4623e341637..b6ea3bfbe478 100644
--- a/libs/utils/include/utils/StructureOfArrays.h
+++ b/libs/utils/include/utils/StructureOfArrays.h
@@ -41,11 +41,13 @@ class StructureOfArraysBase {
     static constexpr const size_t kArrayCount = sizeof...(Elements);
 
 public:
-    using SoA = StructureOfArraysBase<Allocator, Elements ...>;
+    using SoA = StructureOfArraysBase<Allocator, Elements...>;
+
+    using Structure = std::tuple<Elements...>;
 
     // Type of the Nth array
     template<size_t N>
-    using TypeAt = typename std::tuple_element_t<N, std::tuple<Elements...>>;
+    using TypeAt = typename std::tuple_element_t<N, Structure>;
 
     // Number of arrays
     static constexpr size_t getArrayCount() noexcept { return kArrayCount; }
@@ -57,7 +59,7 @@ class StructureOfArraysBase {
 
     // --------------------------------------------------------------------------------------------
 
-    class Structure;
+    class IteratorValue;
     template<typename T> class Iterator;
     using iterator = Iterator<StructureOfArraysBase*>;
     using const_iterator = Iterator<StructureOfArraysBase const*>;
@@ -69,45 +71,45 @@ class StructureOfArraysBase {
      * In other words, it's the return type of iterator::operator*(), and since it
      * cannot be a C++ reference (&), it's an object that acts like it.
      */
-    class StructureRef {
-        friend class Structure;
+    class IteratorValueRef {
+        friend class IteratorValue;
         friend iterator;
         friend const_iterator;
         StructureOfArraysBase* const UTILS_RESTRICT soa;
         size_t const index;
 
-        StructureRef(StructureOfArraysBase* soa, size_t index) : soa(soa), index(index) { }
+        IteratorValueRef(StructureOfArraysBase* soa, size_t index) : soa(soa), index(index) { }
 
         // assigns a value_type to a reference (i.e. assigns to what's pointed to by the reference)
         template<size_t ... Is>
-        StructureRef& assign(Structure const& rhs, std::index_sequence<Is...>);
+        IteratorValueRef& assign(IteratorValue const& rhs, std::index_sequence<Is...>);
 
         // assigns a value_type to a reference (i.e. assigns to what's pointed to by the reference)
         template<size_t ... Is>
-        StructureRef& assign(Structure&& rhs, std::index_sequence<Is...>) noexcept;
+        IteratorValueRef& assign(IteratorValue&& rhs, std::index_sequence<Is...>) noexcept;
 
         // objects pointed to by reference can be swapped, so provide the special swap() function.
-        friend void swap(StructureRef lhs, StructureRef rhs) {
+        friend void swap(IteratorValueRef lhs, IteratorValueRef rhs) {
             lhs.soa->swap(lhs.index, rhs.index);
         }
 
     public:
         // references can be created by copy-assignment only
-        StructureRef(StructureRef const& rhs) noexcept : soa(rhs.soa), index(rhs.index) { }
+        IteratorValueRef(IteratorValueRef const& rhs) noexcept : soa(rhs.soa), index(rhs.index) { }
 
         // copy the content of a reference to the content of this one
-        StructureRef& operator=(StructureRef const& rhs);
+        IteratorValueRef& operator=(IteratorValueRef const& rhs);
 
         // move the content of a reference to the content of this one
-        StructureRef& operator=(StructureRef&& rhs) noexcept;
+        IteratorValueRef& operator=(IteratorValueRef&& rhs) noexcept;
 
         // copy a value_type to the content of this reference
-        StructureRef& operator=(Structure const& rhs) {
+        IteratorValueRef& operator=(IteratorValue const& rhs) {
             return assign(rhs, std::make_index_sequence<kArrayCount>());
         }
 
         // move a value_type to the content of this reference
-        StructureRef& operator=(Structure&& rhs) noexcept {
+        IteratorValueRef& operator=(IteratorValue&& rhs) noexcept {
             return assign(rhs, std::make_index_sequence<kArrayCount>());
         }
 
@@ -122,36 +124,36 @@ class StructureOfArraysBase {
      * Internally we're using a tuple<> to store the data.
      * This object is not trivial to construct, as it copies an entry of the SoA.
      */
-    class Structure {
-        friend class StructureRef;
+    class IteratorValue {
+        friend class IteratorValueRef;
         friend iterator;
         friend const_iterator;
         using Type = std::tuple<typename std::decay<Elements>::type...>;
         Type elements;
 
         template<size_t ... Is>
-        static Type init(StructureRef const& rhs, std::index_sequence<Is...>) {
+        static Type init(IteratorValueRef const& rhs, std::index_sequence<Is...>) {
             return Type{ rhs.soa->template elementAt<Is>(rhs.index)... };
         }
 
         template<size_t ... Is>
-        static Type init(StructureRef&& rhs, std::index_sequence<Is...>) noexcept {
+        static Type init(IteratorValueRef&& rhs, std::index_sequence<Is...>) noexcept {
             return Type{ std::move(rhs.soa->template elementAt<Is>(rhs.index))... };
         }
 
     public:
-        Structure(Structure const& rhs) = default;
-        Structure(Structure&& rhs) noexcept = default;
-        Structure& operator=(Structure const& rhs) = default;
-        Structure& operator=(Structure&& rhs) noexcept = default;
+        IteratorValue(IteratorValue const& rhs) = default;
+        IteratorValue(IteratorValue&& rhs) noexcept = default;
+        IteratorValue& operator=(IteratorValue const& rhs) = default;
+        IteratorValue& operator=(IteratorValue&& rhs) noexcept = default;
 
         // initialize and assign from a StructureRef
-        Structure(StructureRef const& rhs)
+        IteratorValue(IteratorValueRef const& rhs)
                 : elements(init(rhs, std::make_index_sequence<kArrayCount>())) {}
-        Structure(StructureRef&& rhs) noexcept
+        IteratorValue(IteratorValueRef&& rhs) noexcept
                 : elements(init(rhs, std::make_index_sequence<kArrayCount>())) {}
-        Structure& operator=(StructureRef const& rhs) { return operator=(Structure(rhs)); }
-        Structure& operator=(StructureRef&& rhs) noexcept { return operator=(Structure(rhs)); }
+        IteratorValue& operator=(IteratorValueRef const& rhs) { return operator=(IteratorValue(rhs)); }
+        IteratorValue& operator=(IteratorValueRef&& rhs) noexcept { return operator=(IteratorValue(rhs)); }
 
         // access the elements of this value_Type (i.e. the "fields" of the structure)
         template<size_t I> TypeAt<I> const& get() const { return std::get<I>(elements); }
@@ -174,9 +176,9 @@ class StructureOfArraysBase {
         Iterator(CVQualifiedSOAPointer soa, size_t index) : soa(soa), index(index) {}
 
     public:
-        using value_type = Structure;
-        using reference = StructureRef;
-        using pointer = StructureRef*;    // FIXME: this should be a StructurePtr type
+        using value_type = IteratorValue;
+        using reference = IteratorValueRef;
+        using pointer = IteratorValueRef*;    // FIXME: this should be a StructurePtr type
         using difference_type = ptrdiff_t;
         using iterator_category = std::random_access_iterator_tag;
 
@@ -335,6 +337,11 @@ class StructureOfArraysBase {
         return *this;
     }
 
+    StructureOfArraysBase& push_back(Structure&& args) noexcept {
+        ensureCapacity(mSize + 1); // grow first: push_back_unsafe() does no capacity check
+        return push_back_unsafe(std::move(args)); // args is a plain rvalue reference
+    }
+
     StructureOfArraysBase& push_back(Elements const& ... args) noexcept {
         ensureCapacity(mSize + 1);
         return push_back_unsafe(args...);
@@ -349,23 +356,29 @@ class StructureOfArraysBase {
     struct PushBackUnsafeClosure {
         size_t last;
         std::tuple<Elements...> args;
-        inline explicit PushBackUnsafeClosure(size_t last, Elements&& ... args)
-                : last(last), args(std::forward<Elements>(args)...) {}
-        inline explicit PushBackUnsafeClosure(size_t last, Elements const& ... args)
-                : last(last), args(args...) {}
+        inline explicit PushBackUnsafeClosure(size_t last, Structure&& args)
+                : last(last), args(std::forward<Structure>(args)) {}
         template<size_t I>
         inline void operator()(TypeAt<I>* p) {
             new(p + last) TypeAt<I>{ std::get<I>(args) };
         }
     };
 
+    StructureOfArraysBase& push_back_unsafe(Structure&& args) noexcept {
+        // One element is constructed in every array at the old mSize; mSize grows by one.
+        for_each_index(mArrays, PushBackUnsafeClosure{ mSize++, std::move(args) });
+        return *this;
+    }
+
     StructureOfArraysBase& push_back_unsafe(Elements const& ... args) noexcept {
-        for_each_index(mArrays, PushBackUnsafeClosure{ mSize++, args... });
+        for_each_index(mArrays,
+                PushBackUnsafeClosure{ mSize++, { args... } });
         return *this;
     }
 
     StructureOfArraysBase& push_back_unsafe(Elements&& ... args) noexcept {
-        for_each_index(mArrays, PushBackUnsafeClosure{ mSize++, std::forward<Elements>(args)... });
+        for_each_index(mArrays,
+                PushBackUnsafeClosure{ mSize++, { std::forward<Elements>(args)... }});
         return *this;
     }
 
@@ -562,8 +575,10 @@ class StructureOfArraysBase {
         forEach([from, to](auto p) {
             using T = typename std::decay<decltype(*p)>::type;
             // note: scalar types like int/float get initialized to zero
-            for (size_t i = from; i < to; i++) {
-                new(p + i) T();
+            if constexpr (!std::is_trivially_default_constructible_v<T>) {
+                for (size_t i = from; i < to; i++) {
+                    new(p + i) T();
+                }
             }
         });
     }
@@ -571,8 +586,10 @@ class StructureOfArraysBase {
     void destroy_each(size_t from, size_t to) noexcept {
         forEach([from, to](auto p) {
             using T = typename std::decay<decltype(*p)>::type;
-            for (size_t i = from; i < to; i++) {
-                p[i].~T();
+            if constexpr (!std::is_trivially_destructible_v<T>) {
+                for (size_t i = from; i < to; i++) {
+                    p[i].~T();
+                }
             }
         });
     }
@@ -592,15 +609,17 @@ class StructureOfArraysBase {
                         reinterpret_cast<T*>(uintptr_t(b) + offsets[index]);
 
                 // for trivial cases, just call memcpy()
-                if (std::is_trivially_copyable<T>::value &&
-                    std::is_trivially_destructible<T>::value) {
+                if constexpr (std::is_trivially_copyable_v<T> &&
+                              std::is_trivially_destructible_v<T>) {
                     memcpy(arrayPointer, p, size * sizeof(T));
                 } else {
                     for (size_t i = 0; i < size; i++) {
                         // we move an element by using the in-place move-constructor
                         new(arrayPointer + i) T(std::move(p[i]));
-                        // and delete them by calling the destructor directly
-                        p[i].~T();
+                        if constexpr (!std::is_trivially_destructible_v<T>) {
+                            // and delete them by calling the destructor directly
+                            p[i].~T();
+                        }
                     }
                 }
                 index++;
@@ -626,27 +645,27 @@ class StructureOfArraysBase {
 
 template<typename Allocator, typename... Elements>
 inline
-typename StructureOfArraysBase<Allocator, Elements...>::StructureRef&
-StructureOfArraysBase<Allocator, Elements...>::StructureRef::operator=(
-        StructureOfArraysBase::StructureRef const& rhs) {
-    return operator=(Structure(rhs));
+typename StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef&
+StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef::operator=(
+        StructureOfArraysBase::IteratorValueRef const& rhs) {
+    return operator=(IteratorValue(rhs));
 }
 
 template<typename Allocator, typename... Elements>
 inline
-typename StructureOfArraysBase<Allocator, Elements...>::StructureRef&
-StructureOfArraysBase<Allocator, Elements...>::StructureRef::operator=(
-        StructureOfArraysBase::StructureRef&& rhs) noexcept {
-    return operator=(Structure(rhs));
+typename StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef&
+StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef::operator=(
+        StructureOfArraysBase::IteratorValueRef&& rhs) noexcept {
+    return operator=(IteratorValue(rhs));
 }
 
 template<typename Allocator, typename... Elements>
 template<size_t... Is>
 inline
-typename StructureOfArraysBase<Allocator, Elements...>::StructureRef&
-StructureOfArraysBase<Allocator, Elements...>::StructureRef::assign(
-        StructureOfArraysBase::Structure const& rhs, std::index_sequence<Is...>) {
-    // implements StructureRef& StructureRef::operator=(Structure const& rhs)
+typename StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef&
+StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef::assign(
+        StructureOfArraysBase::IteratorValue const& rhs, std::index_sequence<Is...>) {
+    // implements IteratorValueRef& IteratorValueRef::operator=(IteratorValue const& rhs)
     auto UTILS_UNUSED l = { (soa->elementAt<Is>(index) = std::get<Is>(rhs.elements), 0)... };
     return *this;
 }
@@ -654,10 +673,10 @@ StructureOfArraysBase<Allocator, Elements...>::StructureRef::assign(
 template<typename Allocator, typename... Elements>
 template<size_t... Is>
 inline
-typename StructureOfArraysBase<Allocator, Elements...>::StructureRef&
-StructureOfArraysBase<Allocator, Elements...>::StructureRef::assign(
-        StructureOfArraysBase::Structure&& rhs, std::index_sequence<Is...>) noexcept {
-    // implements StructureRef& StructureRef::operator=(Structure&& rhs) noexcept
+typename StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef&
+StructureOfArraysBase<Allocator, Elements...>::IteratorValueRef::assign(
+        StructureOfArraysBase::IteratorValue&& rhs, std::index_sequence<Is...>) noexcept {
+    // implements IteratorValueRef& IteratorValueRef::operator=(IteratorValue&& rhs) noexcept
     auto UTILS_UNUSED l = {
             (soa->elementAt<Is>(index) = std::move(std::get<Is>(rhs.elements)), 0)... };
     return *this;
diff --git a/libs/utils/include/utils/android/Systrace.h b/libs/utils/include/utils/android/Systrace.h
index 8af96aeaf5ad..41b64f167231 100644
--- a/libs/utils/include/utils/android/Systrace.h
+++ b/libs/utils/include/utils/android/Systrace.h
@@ -36,7 +36,7 @@
  * Creates a Systrace context in the current scope. needed for calling all other systrace
  * commands below.
  */
-#define SYSTRACE_CONTEXT() ::utils::details::Systrace ___tracer(SYSTRACE_TAG)
+#define SYSTRACE_CONTEXT() ::utils::details::Systrace ___trctx(SYSTRACE_TAG)
 
 
 // SYSTRACE_NAME traces the beginning and end of the current scope.  To trace
@@ -57,10 +57,10 @@
 #define SYSTRACE_CALL() SYSTRACE_NAME(__FUNCTION__)
 
 #define SYSTRACE_NAME_BEGIN(name) \
-        ___tracer.traceBegin(SYSTRACE_TAG, name)
+        ___trctx.traceBegin(SYSTRACE_TAG, name)
 
 #define SYSTRACE_NAME_END() \
-        ___tracer.traceEnd(SYSTRACE_TAG)
+        ___trctx.traceEnd(SYSTRACE_TAG)
 
 
 /**
@@ -71,24 +71,24 @@
  * used to end it.
  */
 #define SYSTRACE_ASYNC_BEGIN(name, cookie) \
-        ___tracer.asyncBegin(SYSTRACE_TAG, name, cookie)
+        ___trctx.asyncBegin(SYSTRACE_TAG, name, cookie)
 
 /**
  * Trace the end of an asynchronous event.
  * This should have a corresponding SYSTRACE_ASYNC_BEGIN.
  */
 #define SYSTRACE_ASYNC_END(name, cookie) \
-        ___tracer.asyncEnd(SYSTRACE_TAG, name, cookie)
+        ___trctx.asyncEnd(SYSTRACE_TAG, name, cookie)
 
 /**
  * Traces an integer counter value.  name is used to identify the counter.
  * This can be used to track how a value changes over time.
  */
 #define SYSTRACE_VALUE32(name, val) \
-        ___tracer.value(SYSTRACE_TAG, name, int32_t(val))
+        ___trctx.value(SYSTRACE_TAG, name, int32_t(val))
 
 #define SYSTRACE_VALUE64(name, val) \
-        ___tracer.value(SYSTRACE_TAG, name, int64_t(val))
+        ___trctx.value(SYSTRACE_TAG, name, int64_t(val))
 
 // ------------------------------------------------------------------------------------------------
 // No user serviceable code below...
@@ -221,9 +221,9 @@ class Systrace {
 // ------------------------------------------------------------------------------------------------
 
 class ScopedTrace {
-   public:
+public:
     // we don't inline this because it's relatively heavy due to a global check
-    ScopedTrace(uint32_t tag, const char* name) noexcept : mTrace(tag), mTag(tag) {
+    ScopedTrace(uint32_t tag, const char* name) noexcept: mTrace(tag), mTag(tag) {
         mTrace.traceBegin(tag, name);
     }
 
@@ -231,15 +231,7 @@ class ScopedTrace {
         mTrace.traceEnd(mTag);
     }
 
-    inline void value(uint32_t tag, const char* name, int32_t v) noexcept {
-        mTrace.value(tag, name, v);
-    }
-
-    inline void value(uint32_t tag, const char* name, int64_t v) noexcept {
-        mTrace.value(tag, name, v);
-    }
-
-   private:
+private:
     Systrace mTrace;
     const uint32_t mTag;
 };
diff --git a/third_party/mikktspace/CMakeLists.txt b/third_party/mikktspace/CMakeLists.txt
new file mode 100644
index 000000000000..120013568daf
--- /dev/null
+++ b/third_party/mikktspace/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 3.19)
+project(mikktspace)
+
+set(TARGET mikktspace)
+set(PUBLIC_HDR_DIR include)
+
+# ==================================================================================================
+# Sources and headers
+# ==================================================================================================
+set(PUBLIC_HDRS
+    ${PUBLIC_HDR_DIR}/mikktspace/mikktspace.h
+)
+
+set(SRCS
+    src/mikktspace.c
+)
+
+# ==================================================================================================
+# Target definitions
+# ==================================================================================================
+include_directories(${PUBLIC_HDR_DIR})
+
+add_library(${TARGET} ${PUBLIC_HDRS} ${PRIVATE_HDRS} ${SRCS})
+target_include_directories(${TARGET} PUBLIC ${PUBLIC_HDR_DIR})
+set_target_properties(${TARGET} PROPERTIES FOLDER Libs)
+
+# ==================================================================================================
+# Compile options and optimizations
+# ==================================================================================================
+if (MSVC)
+    target_compile_options(${TARGET} PRIVATE /fp:fast)
+else()
+    target_compile_options(${TARGET} PRIVATE -ffast-math -fno-finite-math-only) # mikktspace.c relies on isfinite() to reject NaN vertices
+endif()
+
+# ==================================================================================================
+# Installation
+# ==================================================================================================
+install(TARGETS ${TARGET} ARCHIVE DESTINATION lib/${DIST_DIR})
+install(DIRECTORY ${PUBLIC_HDR_DIR}/mikktspace DESTINATION include)
diff --git a/third_party/mikktspace/include/mikktspace/mikktspace.h b/third_party/mikktspace/include/mikktspace/mikktspace.h
new file mode 100644
index 000000000000..52c44a713c60
--- /dev/null
+++ b/third_party/mikktspace/include/mikktspace/mikktspace.h
@@ -0,0 +1,145 @@
+/** \file mikktspace/mikktspace.h
+ *  \ingroup mikktspace
+ */
+/**
+ *  Copyright (C) 2011 by Morten S. Mikkelsen
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __MIKKTSPACE_H__
+#define __MIKKTSPACE_H__
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Author: Morten S. Mikkelsen
+ * Version: 1.0
+ *
+ * The files mikktspace.h and mikktspace.c are designed to be
+ * stand-alone files and it is important that they are kept this way.
+ * Not having dependencies on structures/classes/libraries specific
+ * to the program, in which they are used, allows them to be copied
+ * and used as is into any tool, program or plugin.
+ * The code is designed to consistently generate the same
+ * tangent spaces, for a given mesh, in any tool in which it is used.
+ * This is done by performing an internal welding step and subsequently an order-independent evaluation
+ * of tangent space for meshes consisting of triangles and quads.
+ * This means faces can be received in any order and the same is true for
+ * the order of vertices of each face. The generated result will not be affected
+ * by such reordering. Additionally, whether degenerate (vertices or texture coordinates)
+ * primitives are present or not will not affect the generated results either.
+ * Once tangent space calculation is done the vertices of degenerate primitives will simply
+ * inherit tangent space from neighboring non degenerate primitives.
+ * The analysis behind this implementation can be found in my master's thesis
+ * which is available for download --> http://image.diku.dk/projects/media/morten.mikkelsen.08.pdf
+ * Note that though the tangent spaces at the vertices are generated in an order-independent way,
+ * by this implementation, the interpolated tangent space is still affected by which diagonal is
+ * chosen to split each quad. A sensible solution is to have your tools pipeline always
+ * split quads by the shortest diagonal. This choice is order-independent and works with mirroring.
+ * If these have the same length then compare the diagonals defined by the texture coordinates.
+ * XNormal which is a tool for baking normal maps allows you to write your own tangent space plugin
+ * and also quad triangulator plugin.
+ */
+
+
+typedef int tbool;
+typedef struct SMikkTSpaceContext SMikkTSpaceContext;
+
+typedef struct {
+	// Returns the number of faces (triangles/quads) on the mesh to be processed.
+	int (*m_getNumFaces)(const SMikkTSpaceContext * pContext);
+
+	// Returns the number of vertices on face number iFace
+	// iFace is a number in the range {0, 1, ..., getNumFaces()-1}
+	int (*m_getNumVerticesOfFace)(const SMikkTSpaceContext * pContext, const int iFace);
+
+	// returns the position/normal/texcoord of the referenced face of vertex number iVert.
+	// iVert is in the range {0,1,2} for triangles and {0,1,2,3} for quads.
+	void (*m_getPosition)(const SMikkTSpaceContext * pContext, float fvPosOut[], const int iFace, const int iVert);
+	void (*m_getNormal)(const SMikkTSpaceContext * pContext, float fvNormOut[], const int iFace, const int iVert);
+	void (*m_getTexCoord)(const SMikkTSpaceContext * pContext, float fvTexcOut[], const int iFace, const int iVert);
+
+	// either (or both) of the two setTSpace callbacks can be set.
+	// The call-back m_setTSpaceBasic() is sufficient for basic normal mapping.
+
+	// This function is used to return the tangent and fSign to the application.
+	// fvTangent is a unit length vector.
+	// For normal maps it is sufficient to use the following simplified version of the bitangent which is generated at pixel/vertex level.
+	// bitangent = fSign * cross(vN, tangent);
+	// Note that the results are returned unindexed. It is possible to generate a new index list
+	// But averaging/overwriting tangent spaces by using an already existing index list WILL produce INCORRECT results.
+	// DO NOT! use an already existing index list.
+	void (*m_setTSpaceBasic)(const SMikkTSpaceContext * pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert);
+
+	// This function is used to return tangent space results to the application.
+	// fvTangent and fvBiTangent are unit length vectors and fMagS and fMagT are their
+	// true magnitudes which can be used for relief mapping effects.
+	// fvBiTangent is the "real" bitangent and thus may not be perpendicular to fvTangent.
+	// However, both are perpendicular to the vertex normal.
+	// For normal maps it is sufficient to use the following simplified version of the bitangent which is generated at pixel/vertex level.
+	// fSign = bIsOrientationPreserving ? 1.0f : (-1.0f);
+	// bitangent = fSign * cross(vN, tangent);
+	// Note that the results are returned unindexed. It is possible to generate a new index list
+	// But averaging/overwriting tangent spaces by using an already existing index list WILL produce INCORRECT results.
+	// DO NOT! use an already existing index list.
+	void (*m_setTSpace)(const SMikkTSpaceContext * pContext, const float fvTangent[], const float fvBiTangent[], const float fMagS, const float fMagT,
+						const tbool bIsOrientationPreserving, const int iFace, const int iVert);
+} SMikkTSpaceInterface;
+
+struct SMikkTSpaceContext
+{
+	SMikkTSpaceInterface * m_pInterface;	// initialized with callback functions
+	void * m_pUserData;						// pointer to client side mesh data etc. (passed as the first parameter with every interface call)
+};
+
+// these are both thread safe!
+tbool genTangSpaceDefault(const SMikkTSpaceContext * pContext);	// Default (recommended) fAngularThreshold is 180 degrees (which means threshold disabled)
+tbool genTangSpace(const SMikkTSpaceContext * pContext, const float fAngularThreshold);
+
+
+// To avoid visual errors (distortions/unwanted hard edges in lighting), when using sampled normal maps, the
+// normal map sampler must use the exact inverse of the pixel shader transformation.
+// The most efficient transformation we can possibly do in the pixel shader is
+// achieved by using, directly, the "unnormalized" interpolated tangent, bitangent and vertex normal: vT, vB and vN.
+// pixel shader (fast transform out)
+// vNout = normalize( vNt.x * vT + vNt.y * vB + vNt.z * vN );
+// where vNt is the tangent space normal. The normal map sampler must likewise use the
+// interpolated and "unnormalized" tangent, bitangent and vertex normal to be compliant with the pixel shader.
+// sampler does (exact inverse of pixel shader):
+// float3 row0 = cross(vB, vN);
+// float3 row1 = cross(vN, vT);
+// float3 row2 = cross(vT, vB);
+// float fSign = dot(vT, row0)<0 ? -1 : 1;
+// vNt = normalize( fSign * float3(dot(vNout,row0), dot(vNout,row1), dot(vNout,row2)) );
+// where vNout is the sampled normal in some chosen 3D space.
+//
+// Should you choose to reconstruct the bitangent in the pixel shader instead
+// of the vertex shader, as explained earlier, then be sure to do this in the normal map sampler also.
+// Finally, beware of quad triangulations. If the normal map sampler doesn't use the same triangulation of
+// quads as your renderer then problems will occur since the interpolated tangent spaces will differ
+// even though the vertex level tangent spaces match. This can be solved either by triangulating before
+// sampling/exporting or by using the order-independent choice of diagonal for splitting quads suggested earlier.
+// However, this must be used both by the sampler and your tools/rendering pipeline.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/third_party/mikktspace/src/mikktspace.c b/third_party/mikktspace/src/mikktspace.c
new file mode 100644
index 000000000000..4e27620b2779
--- /dev/null
+++ b/third_party/mikktspace/src/mikktspace.c
@@ -0,0 +1,1899 @@
+/** \file mikktspace/mikktspace.c
+ *  \ingroup mikktspace
+ */
+/**
+ *  Copyright (C) 2011 by Morten S. Mikkelsen
+ *
+ *  This software is provided 'as-is', without any express or implied
+ *  warranty.  In no event will the authors be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this software must not be misrepresented; you must not
+ *     claim that you wrote the original software. If you use this software
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original software.
+ *  3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include <float.h>
+#include <stdlib.h>
+
+#include <mikktspace/mikktspace.h>
+
+#define TFALSE		0
+#define TTRUE		1
+
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#define INTERNAL_RND_SORT_SEED		39871946
+
+// internal structure
+typedef struct {
+	float x, y, z;
+} SVec3;
+
+static tbool			veq( const SVec3 v1, const SVec3 v2 )
+{
+	return (v1.x == v2.x) && (v1.y == v2.y) && (v1.z == v2.z);
+}
+
+static SVec3		vadd( const SVec3 v1, const SVec3 v2 )
+{
+	SVec3 vRes;
+
+	vRes.x = v1.x + v2.x;
+	vRes.y = v1.y + v2.y;
+	vRes.z = v1.z + v2.z;
+
+	return vRes;
+}
+
+
+static SVec3		vsub( const SVec3 v1, const SVec3 v2 )
+{
+	SVec3 vRes;
+
+	vRes.x = v1.x - v2.x;
+	vRes.y = v1.y - v2.y;
+	vRes.z = v1.z - v2.z;
+
+	return vRes;
+}
+
+static SVec3		vscale(const float fS, const SVec3 v)
+{
+	SVec3 vRes;
+
+	vRes.x = fS * v.x;
+	vRes.y = fS * v.y;
+	vRes.z = fS * v.z;
+
+	return vRes;
+}
+
+static float			LengthSquared( const SVec3 v )
+{
+	return v.x*v.x + v.y*v.y + v.z*v.z;
+}
+
+static float			Length( const SVec3 v )
+{
+	return sqrtf(LengthSquared(v));
+}
+
+static SVec3		Normalize( const SVec3 v )
+{
+	return vscale(1 / Length(v), v);
+}
+
+static float		vdot( const SVec3 v1, const SVec3 v2)
+{
+	return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z;
+}
+
+
+static tbool NotZero(const float fX)
+{
+	// could possibly use FLT_EPSILON instead
+	return fabsf(fX) > FLT_MIN;
+}
+
+static tbool VNotZero(const SVec3 v)
+{
+	// might change this to an epsilon based test
+	return NotZero(v.x) || NotZero(v.y) || NotZero(v.z);
+}
+
+
+
+typedef struct {
+	int iNrFaces;
+	int * pTriMembers;
+} SSubGroup;
+
+typedef struct {
+	int iNrFaces;
+	int * pFaceIndices;
+	int iVertexRepresentitive;
+	tbool bOrientPreservering;
+} SGroup;
+
+// 
+#define MARK_DEGENERATE				1
+#define QUAD_ONE_DEGEN_TRI			2
+#define GROUP_WITH_ANY				4
+#define ORIENT_PRESERVING			8
+
+
+
+typedef struct {
+	int FaceNeighbors[3];
+	SGroup * AssignedGroup[3];
+	
+	// normalized first order face derivatives
+	SVec3 vOs, vOt;
+	float fMagS, fMagT;	// original magnitudes
+
+	// determines if the current and the next triangle are a quad.
+	int iOrgFaceNumber;
+	int iFlag, iTSpacesOffs;
+	unsigned char vert_num[4];
+} STriInfo;
+
+typedef struct {
+	SVec3 vOs;
+	float fMagS;
+	SVec3 vOt;
+	float fMagT;
+	int iCounter;	// this is to average back into quads.
+	tbool bOrient;
+} STSpace;
+
+static int GenerateInitialVerticesIndexList(STriInfo pTriInfos[], int piTriList_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static void GenerateSharedVerticesIndexList(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static void InitTriInfo(STriInfo pTriInfos[], const int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+static int Build4RuleGroups(STriInfo pTriInfos[], SGroup pGroups[], int piGroupTrianglesBuffer[], const int piTriListIn[], const int iNrTrianglesIn);
+static tbool GenerateTSpaces(STSpace psTspace[], const STriInfo pTriInfos[], const SGroup pGroups[],
+                             const int iNrActiveGroups, const int piTriListIn[], const float fThresCos,
+                             const SMikkTSpaceContext * pContext);
+
+static int MakeIndex(const int iFace, const int iVert)
+{
+	assert(iVert>=0 && iVert<4 && iFace>=0);
+	return (iFace<<2) | (iVert&0x3);
+}
+
+static void IndexToData(int * piFace, int * piVert, const int iIndexIn)
+{
+	piVert[0] = iIndexIn&0x3;
+	piFace[0] = iIndexIn>>2;
+}
+
+static STSpace AvgTSpace(const STSpace * pTS0, const STSpace * pTS1)
+{
+	STSpace ts_res;
+
+	// this if is important. Due to floating point precision
+	// averaging when ts0==ts1 will cause a slight difference
+	// which results in tangent space splits later on
+	if (pTS0->fMagS==pTS1->fMagS && pTS0->fMagT==pTS1->fMagT &&
+	   veq(pTS0->vOs,pTS1->vOs)	&& veq(pTS0->vOt, pTS1->vOt))
+	{
+		ts_res.fMagS = pTS0->fMagS;
+		ts_res.fMagT = pTS0->fMagT;
+		ts_res.vOs = pTS0->vOs;
+		ts_res.vOt = pTS0->vOt;
+	}
+	else
+	{
+		ts_res.fMagS = 0.5f*(pTS0->fMagS+pTS1->fMagS);
+		ts_res.fMagT = 0.5f*(pTS0->fMagT+pTS1->fMagT);
+		ts_res.vOs = vadd(pTS0->vOs,pTS1->vOs);
+		ts_res.vOt = vadd(pTS0->vOt,pTS1->vOt);
+		if ( VNotZero(ts_res.vOs) ) ts_res.vOs = Normalize(ts_res.vOs);
+		if ( VNotZero(ts_res.vOt) ) ts_res.vOt = Normalize(ts_res.vOt);
+	}
+
+	return ts_res;
+}
+
+
+
+static SVec3 GetPosition(const SMikkTSpaceContext * pContext, const int index);
+static SVec3 GetNormal(const SMikkTSpaceContext * pContext, const int index);
+static SVec3 GetTexCoord(const SMikkTSpaceContext * pContext, const int index);
+
+
+// degen triangles
+static void DegenPrologue(STriInfo pTriInfos[], int piTriList_out[], const int iNrTrianglesIn, const int iTotTris);
+static void DegenEpilogue(STSpace psTspace[], STriInfo pTriInfos[], int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn, const int iTotTris);
+
+
+tbool genTangSpaceDefault(const SMikkTSpaceContext * pContext)
+{
+	return genTangSpace(pContext, 180.0f);
+}
+
+tbool genTangSpace(const SMikkTSpaceContext * pContext, const float fAngularThreshold)
+{
+	// Builds a tangent space per face vertex (triangles/quads only) and hands it to whichever of m_setTSpace/m_setTSpaceBasic is set; returns TFALSE on missing call-backs, no supported faces or failed allocation.
+	int * piTriListIn = NULL, * piGroupTrianglesBuffer = NULL;
+	STriInfo * pTriInfos = NULL;
+	SGroup * pGroups = NULL;
+	STSpace * psTspace = NULL;
+	int iNrTrianglesIn = 0, f=0, t=0, i=0;
+	int iNrTSPaces = 0, iTotTris = 0, iDegenTriangles = 0, iNrMaxGroups = 0;
+	int iNrActiveGroups = 0, index = 0;
+	int iNrFaces = 0;	// queried only after the call-backs have been validated below (local change vs. upstream)
+	tbool bRes = TFALSE;
+	const float fThresCos = (float) cos((fAngularThreshold*(float)M_PI)/180.0f);
+
+	// verify all call-backs have been set (must happen before any of them is invoked)
+	if ( pContext->m_pInterface->m_getNumFaces==NULL ||
+		pContext->m_pInterface->m_getNumVerticesOfFace==NULL ||
+		pContext->m_pInterface->m_getPosition==NULL ||
+		pContext->m_pInterface->m_getNormal==NULL ||
+		pContext->m_pInterface->m_getTexCoord==NULL )
+		return TFALSE;
+	iNrFaces = pContext->m_pInterface->m_getNumFaces(pContext);
+	// count triangles on supported faces
+	for (f=0; f<iNrFaces; f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		if (verts==3) ++iNrTrianglesIn;
+		else if (verts==4) iNrTrianglesIn += 2;
+	}
+	if (iNrTrianglesIn<=0) return TFALSE;
+
+	// allocate memory for an index list
+	piTriListIn = (int *) malloc(sizeof(int)*3*iNrTrianglesIn);
+	pTriInfos = (STriInfo *) malloc(sizeof(STriInfo)*iNrTrianglesIn);
+	if (piTriListIn==NULL || pTriInfos==NULL)
+	{
+		if (piTriListIn!=NULL) free(piTriListIn);
+		if (pTriInfos!=NULL) free(pTriInfos);
+		return TFALSE;
+	}
+
+	// make an initial triangle --> face index list
+	iNrTSPaces = GenerateInitialVerticesIndexList(pTriInfos, piTriListIn, pContext, iNrTrianglesIn);
+
+	// make a welded index list of identical positions and attributes (pos, norm, texc)
+	//printf("gen welded index list begin\n");
+	GenerateSharedVerticesIndexList(piTriListIn, pContext, iNrTrianglesIn);
+	//printf("gen welded index list end\n");
+
+	// Mark all degenerate triangles
+	iTotTris = iNrTrianglesIn;
+	iDegenTriangles = 0;
+	for (t=0; t<iTotTris; t++)
+	{
+		const int i0 = piTriListIn[t*3+0];
+		const int i1 = piTriListIn[t*3+1];
+		const int i2 = piTriListIn[t*3+2];
+		const SVec3 p0 = GetPosition(pContext, i0);
+		const SVec3 p1 = GetPosition(pContext, i1);
+		const SVec3 p2 = GetPosition(pContext, i2);
+		if (veq(p0,p1) || veq(p0,p2) || veq(p1,p2))	// degenerate
+		{
+			pTriInfos[t].iFlag |= MARK_DEGENERATE;
+			++iDegenTriangles;
+		}
+	}
+	iNrTrianglesIn = iTotTris - iDegenTriangles;
+
+	// mark all triangle pairs that belong to a quad with only one
+	// good triangle. These need special treatment in DegenEpilogue().
+	// Additionally, move all good triangles to the start of
+	// pTriInfos[] and piTriListIn[] without changing order and
+	// put the degenerate triangles last.
+	DegenPrologue(pTriInfos, piTriListIn, iNrTrianglesIn, iTotTris);
+
+	
+	// evaluate triangle level attributes and neighbor list
+	//printf("gen neighbors list begin\n");
+	InitTriInfo(pTriInfos, piTriListIn, pContext, iNrTrianglesIn);
+	//printf("gen neighbors list end\n");
+
+	
+	// based on the 4 rules, identify groups based on connectivity
+	iNrMaxGroups = iNrTrianglesIn*3;
+	pGroups = (SGroup *) malloc(sizeof(SGroup)*iNrMaxGroups);
+	piGroupTrianglesBuffer = (int *) malloc(sizeof(int)*iNrTrianglesIn*3);
+	if (pGroups==NULL || piGroupTrianglesBuffer==NULL)
+	{
+		if (pGroups!=NULL) free(pGroups);
+		if (piGroupTrianglesBuffer!=NULL) free(piGroupTrianglesBuffer);
+		free(piTriListIn);
+		free(pTriInfos);
+		return TFALSE;
+	}
+	//printf("gen 4rule groups begin\n");
+	iNrActiveGroups =
+		Build4RuleGroups(pTriInfos, pGroups, piGroupTrianglesBuffer, piTriListIn, iNrTrianglesIn);
+	//printf("gen 4rule groups end\n");
+
+	//
+
+	psTspace = (STSpace *) malloc(sizeof(STSpace)*iNrTSPaces);
+	if (psTspace==NULL)
+	{
+		free(piTriListIn);
+		free(pTriInfos);
+		free(pGroups);
+		free(piGroupTrianglesBuffer);
+		return TFALSE;
+	}
+	memset(psTspace, 0, sizeof(STSpace)*iNrTSPaces);
+	for (t=0; t<iNrTSPaces; t++)
+	{
+		psTspace[t].vOs.x=1.0f; psTspace[t].vOs.y=0.0f; psTspace[t].vOs.z=0.0f; psTspace[t].fMagS = 1.0f;
+		psTspace[t].vOt.x=0.0f; psTspace[t].vOt.y=1.0f; psTspace[t].vOt.z=0.0f; psTspace[t].fMagT = 1.0f;
+	}
+
+	// make tspaces, each group is split up into subgroups if necessary
+	// based on fAngularThreshold. Finally a tangent space is made for
+	// every resulting subgroup
+	//printf("gen tspaces begin\n");
+	bRes = GenerateTSpaces(psTspace, pTriInfos, pGroups, iNrActiveGroups, piTriListIn, fThresCos, pContext);
+	//printf("gen tspaces end\n");
+	
+	// clean up
+	free(pGroups);
+	free(piGroupTrianglesBuffer);
+
+	if (!bRes)	// if an allocation in GenerateTSpaces() failed
+	{
+		// clean up and return false
+		free(pTriInfos); free(piTriListIn); free(psTspace);
+		return TFALSE;
+	}
+
+
+	// degenerate quads with one good triangle will be fixed by copying a space from
+	// the good triangle to the coinciding vertex.
+	// all other degenerate triangles will just copy a space from any good triangle
+	// with the same welded index in piTriListIn[].
+	DegenEpilogue(psTspace, pTriInfos, piTriListIn, pContext, iNrTrianglesIn, iTotTris);
+
+	free(pTriInfos); free(piTriListIn);
+
+	index = 0;
+	for (f=0; f<iNrFaces; f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		if (verts!=3 && verts!=4) continue;
+		
+
+		// I've decided to let degenerate triangles and group-with-anythings
+		// vary between left/right hand coordinate systems at the vertices.
+		// All healthy triangles on the other hand are built to always be either or.
+
+		/*// force the coordinate system orientation to be uniform for every face.
+		// (this is already the case for good triangles but not for
+		// degenerate ones and those with bGroupWithAnything==true)
+		bool bOrient = psTspace[index].bOrient;
+		if (psTspace[index].iCounter == 0)	// tspace was not derived from a group
+		{
+			// look for a space created in GenerateTSpaces() by iCounter>0
+			bool bNotFound = true;
+			int i=1;
+			while (i<verts && bNotFound)
+			{
+				if (psTspace[index+i].iCounter > 0) bNotFound=false;
+				else ++i;
+			}
+			if (!bNotFound) bOrient = psTspace[index+i].bOrient;
+		}*/
+
+		// set data
+		for (i=0; i<verts; i++)
+		{
+			const STSpace * pTSpace = &psTspace[index];
+			float tang[] = {pTSpace->vOs.x, pTSpace->vOs.y, pTSpace->vOs.z};
+			float bitang[] = {pTSpace->vOt.x, pTSpace->vOt.y, pTSpace->vOt.z};
+			if (pContext->m_pInterface->m_setTSpace!=NULL)
+				pContext->m_pInterface->m_setTSpace(pContext, tang, bitang, pTSpace->fMagS, pTSpace->fMagT, pTSpace->bOrient, f, i);
+			if (pContext->m_pInterface->m_setTSpaceBasic!=NULL)
+				pContext->m_pInterface->m_setTSpaceBasic(pContext, tang, pTSpace->bOrient==TTRUE ? 1.0f : (-1.0f), f, i);
+
+			++index;
+		}
+	}
+
+	free(psTspace);
+
+	
+	return TTRUE;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct {
+	float vert[3];
+	int index;
+} STmpVert;
+
+static const int g_iCells = 2048;
+
+#ifdef _MSC_VER
+#  define NOINLINE __declspec(noinline)
+#else
+#  define NOINLINE __attribute__ ((noinline))
+#endif
+
+// it is IMPORTANT that this function is called to evaluate the hash since
+// inlining could potentially reorder instructions and generate different
+// results for the same effective input value fVal.
+static NOINLINE int FindGridCell(const float fMin, const float fMax, const float fVal)
+{
+	const float fIndex = g_iCells * ((fVal-fMin)/(fMax-fMin));
+	const int iIndex = (int)fIndex;
+	return iIndex < g_iCells ? (iIndex >= 0 ? iIndex : 0) : (g_iCells - 1);
+}
+
+static void MergeVertsFast(int piTriList_in_and_out[], STmpVert pTmpVert[], const SMikkTSpaceContext * pContext, const int iL_in, const int iR_in);
+static void MergeVertsSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int pTable[], const int iEntries);
+static void GenerateSharedVerticesIndexListSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn);
+
+static void GenerateSharedVerticesIndexList(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+
+	// Generate bounding box
+	int * piHashTable=NULL, * piHashCount=NULL, * piHashOffsets=NULL, * piHashCount2=NULL;
+	STmpVert * pTmpVert = NULL;
+	int i=0, iChannel=0, k=0, e=0;
+	int iMaxCount=0;
+	SVec3 vMin = GetPosition(pContext, 0), vMax = vMin, vDim;
+	float fMin, fMax;
+	for (i=1; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+
+		const SVec3 vP = GetPosition(pContext, index);
+		if (vMin.x > vP.x) vMin.x = vP.x;
+		else if (vMax.x < vP.x) vMax.x = vP.x;
+		if (vMin.y > vP.y) vMin.y = vP.y;
+		else if (vMax.y < vP.y) vMax.y = vP.y;
+		if (vMin.z > vP.z) vMin.z = vP.z;
+		else if (vMax.z < vP.z) vMax.z = vP.z;
+	}
+
+	vDim = vsub(vMax,vMin);
+	iChannel = 0;
+	fMin = vMin.x; fMax=vMax.x;
+	if (vDim.y>vDim.x && vDim.y>vDim.z)
+	{
+		iChannel=1;
+		fMin = vMin.y;
+		fMax = vMax.y;
+	}
+	else if (vDim.z>vDim.x)
+	{
+		iChannel=2;
+		fMin = vMin.z;
+		fMax = vMax.z;
+	}
+
+	// make allocations
+	piHashTable = (int *) malloc(sizeof(int)*iNrTrianglesIn*3);
+	piHashCount = (int *) malloc(sizeof(int)*g_iCells);
+	piHashOffsets = (int *) malloc(sizeof(int)*g_iCells);
+	piHashCount2 = (int *) malloc(sizeof(int)*g_iCells);
+
+	if (piHashTable==NULL || piHashCount==NULL || piHashOffsets==NULL || piHashCount2==NULL)
+	{
+		if (piHashTable!=NULL) free(piHashTable);
+		if (piHashCount!=NULL) free(piHashCount);
+		if (piHashOffsets!=NULL) free(piHashOffsets);
+		if (piHashCount2!=NULL) free(piHashCount2);
+		GenerateSharedVerticesIndexListSlow(piTriList_in_and_out, pContext, iNrTrianglesIn);
+		return;
+	}
+	memset(piHashCount, 0, sizeof(int)*g_iCells);
+	memset(piHashCount2, 0, sizeof(int)*g_iCells);
+
+	// count amount of elements in each cell unit
+	for (i=0; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+		const SVec3 vP = GetPosition(pContext, index);
+		const float fVal = iChannel==0 ? vP.x : (iChannel==1 ? vP.y : vP.z);
+		const int iCell = FindGridCell(fMin, fMax, fVal);
+		++piHashCount[iCell];
+	}
+
+	// evaluate start index of each cell.
+	piHashOffsets[0]=0;
+	for (k=1; k<g_iCells; k++)
+		piHashOffsets[k]=piHashOffsets[k-1]+piHashCount[k-1];
+
+	// insert vertices
+	for (i=0; i<(iNrTrianglesIn*3); i++)
+	{
+		const int index = piTriList_in_and_out[i];
+		const SVec3 vP = GetPosition(pContext, index);
+		const float fVal = iChannel==0 ? vP.x : (iChannel==1 ? vP.y : vP.z);
+		const int iCell = FindGridCell(fMin, fMax, fVal);
+		int * pTable = NULL;
+
+		assert(piHashCount2[iCell]<piHashCount[iCell]);
+		pTable = &piHashTable[piHashOffsets[iCell]];
+		pTable[piHashCount2[iCell]] = i;	// vertex i has been inserted.
+		++piHashCount2[iCell];
+	}
+	for (k=0; k<g_iCells; k++)
+		assert(piHashCount2[k] == piHashCount[k]);	// verify the count
+	free(piHashCount2);
+
+	// find maximum amount of entries in any hash entry
+	iMaxCount = piHashCount[0];
+	for (k=1; k<g_iCells; k++)
+		if (iMaxCount<piHashCount[k])
+			iMaxCount=piHashCount[k];
+	pTmpVert = (STmpVert *) malloc(sizeof(STmpVert)*iMaxCount);
+	
+
+	// complete the merge
+	for (k=0; k<g_iCells; k++)
+	{
+		// extract table of cell k and amount of entries in it
+		int * pTable = &piHashTable[piHashOffsets[k]];
+		const int iEntries = piHashCount[k];
+		if (iEntries < 2) continue;
+
+		if (pTmpVert!=NULL)
+		{
+			for (e=0; e<iEntries; e++)
+			{
+				int i = pTable[e];
+				const SVec3 vP = GetPosition(pContext, piTriList_in_and_out[i]);
+				pTmpVert[e].vert[0] = vP.x; pTmpVert[e].vert[1] = vP.y;
+				pTmpVert[e].vert[2] = vP.z; pTmpVert[e].index = i;
+			}
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, 0, iEntries-1);
+		}
+		else
+			MergeVertsSlow(piTriList_in_and_out, pContext, pTable, iEntries);
+	}
+
+	if (pTmpVert!=NULL) { free(pTmpVert); }
+	free(piHashTable);
+	free(piHashCount);
+	free(piHashOffsets);
+}
+
+static void MergeVertsFast(int piTriList_in_and_out[], STmpVert pTmpVert[], const SMikkTSpaceContext * pContext, const int iL_in, const int iR_in)
+{
+	// make bbox
+	int c=0, l=0, channel=0;
+	float fvMin[3], fvMax[3];
+	float dx=0, dy=0, dz=0, fSep=0;
+	for (c=0; c<3; c++)
+	{	fvMin[c]=pTmpVert[iL_in].vert[c]; fvMax[c]=fvMin[c];	}
+	for (l=(iL_in+1); l<=iR_in; l++) {
+		for (c=0; c<3; c++) {
+			if (fvMin[c]>pTmpVert[l].vert[c]) fvMin[c]=pTmpVert[l].vert[c];
+			if (fvMax[c]<pTmpVert[l].vert[c]) fvMax[c]=pTmpVert[l].vert[c];
+		}
+	}
+
+	dx = fvMax[0]-fvMin[0];
+	dy = fvMax[1]-fvMin[1];
+	dz = fvMax[2]-fvMin[2];
+
+	channel = 0;
+	if (dy>dx && dy>dz) channel=1;
+	else if (dz>dx) channel=2;
+
+	fSep = 0.5f*(fvMax[channel]+fvMin[channel]);
+
+	// stop if all vertices are NaNs
+	if (!isfinite(fSep))
+		return;
+
+	// terminate recursion when the separation/average value
+	// is no longer strictly between fMin and fMax values.
+	if (fSep>=fvMax[channel] || fSep<=fvMin[channel])
+	{
+		// complete the weld
+		for (l=iL_in; l<=iR_in; l++)
+		{
+			int i = pTmpVert[l].index;
+			const int index = piTriList_in_and_out[i];
+			const SVec3 vP = GetPosition(pContext, index);
+			const SVec3 vN = GetNormal(pContext, index);
+			const SVec3 vT = GetTexCoord(pContext, index);
+
+			tbool bNotFound = TTRUE;
+			int l2=iL_in, i2rec=-1;
+			while (l2<l && bNotFound)
+			{
+				const int i2 = pTmpVert[l2].index;
+				const int index2 = piTriList_in_and_out[i2];
+				const SVec3 vP2 = GetPosition(pContext, index2);
+				const SVec3 vN2 = GetNormal(pContext, index2);
+				const SVec3 vT2 = GetTexCoord(pContext, index2);
+				i2rec=i2;
+
+				//if (vP==vP2 && vN==vN2 && vT==vT2)
+				if (vP.x==vP2.x && vP.y==vP2.y && vP.z==vP2.z &&
+					vN.x==vN2.x && vN.y==vN2.y && vN.z==vN2.z &&
+					vT.x==vT2.x && vT.y==vT2.y && vT.z==vT2.z)
+					bNotFound = TFALSE;
+				else
+					++l2;
+			}
+			
+			// merge if previously found
+			if (!bNotFound)
+				piTriList_in_and_out[i] = piTriList_in_and_out[i2rec];
+		}
+	}
+	else
+	{
+		int iL=iL_in, iR=iR_in;
+		assert((iR_in-iL_in)>0);	// at least 2 entries
+
+		// separate (by fSep) all points between iL_in and iR_in in pTmpVert[]
+		while (iL < iR)
+		{
+			tbool bReadyLeftSwap = TFALSE, bReadyRightSwap = TFALSE;
+			while ((!bReadyLeftSwap) && iL<iR)
+			{
+				assert(iL>=iL_in && iL<=iR_in);
+				bReadyLeftSwap = !(pTmpVert[iL].vert[channel]<fSep);
+				if (!bReadyLeftSwap) ++iL;
+			}
+			while ((!bReadyRightSwap) && iL<iR)
+			{
+				assert(iR>=iL_in && iR<=iR_in);
+				bReadyRightSwap = pTmpVert[iR].vert[channel]<fSep;
+				if (!bReadyRightSwap) --iR;
+			}
+			assert( (iL<iR) || !(bReadyLeftSwap && bReadyRightSwap) );
+
+			if (bReadyLeftSwap && bReadyRightSwap)
+			{
+				const STmpVert sTmp = pTmpVert[iL];
+				assert(iL<iR);
+				pTmpVert[iL] = pTmpVert[iR];
+				pTmpVert[iR] = sTmp;
+				++iL; --iR;
+			}
+		}
+
+		assert(iL==(iR+1) || (iL==iR));
+		if (iL==iR)
+		{
+			const tbool bReadyRightSwap = pTmpVert[iR].vert[channel]<fSep;
+			if (bReadyRightSwap) ++iL;
+			else --iR;
+		}
+
+		// only need to weld when there is more than 1 instance of the (x,y,z)
+		if (iL_in < iR)
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, iL_in, iR);	// weld all left of fSep
+		if (iL < iR_in)
+			MergeVertsFast(piTriList_in_and_out, pTmpVert, pContext, iL, iR_in);	// weld all right of (or equal to) fSep
+	}
+}
+
+// Brute-force weld over the slots listed in pTable[0..iEntries-1].
+// Every slot is compared (position, normal and texture coordinate) against
+// all slots that precede it in pTable; on the first exact match the slot's
+// entry in piTriList_in_and_out is overwritten with the matching entry.
+static void MergeVertsSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int pTable[], const int iEntries)
+{
+	int iCur;
+	for (iCur=0; iCur<iEntries; ++iCur)
+	{
+		const int iSlot = pTable[iCur];
+		const int iVert = piTriList_in_and_out[iSlot];
+		const SVec3 vPos = GetPosition(pContext, iVert);
+		const SVec3 vNrm = GetNormal(pContext, iVert);
+		const SVec3 vTex = GetTexCoord(pContext, iVert);
+		int iPrev;
+
+		// scan the earlier table entries for an identical vertex
+		for (iPrev=0; iPrev<iCur; ++iPrev)
+		{
+			const int iPrevSlot = pTable[iPrev];
+			const int iPrevVert = piTriList_in_and_out[iPrevSlot];
+			const SVec3 vPrevPos = GetPosition(pContext, iPrevVert);
+			const SVec3 vPrevNrm = GetNormal(pContext, iPrevVert);
+			const SVec3 vPrevTex = GetTexCoord(pContext, iPrevVert);
+
+			if (veq(vPos,vPrevPos) && veq(vNrm,vPrevNrm) && veq(vTex,vPrevTex))
+			{
+				// merge with the first earlier duplicate and stop searching
+				piTriList_in_and_out[iSlot] = piTriList_in_and_out[iPrevSlot];
+				break;
+			}
+		}
+	}
+}
+
+// Brute-force vertex weld over the whole triangle list.
+// For every corner (triangle t, corner i) the list is scanned from the start,
+// up to and including triangle t, for the first corner whose position, normal
+// and texture coordinate compare equal; the corner's entry is then replaced by
+// that first occurrence's index. Since the scan includes the corner itself a
+// match is always expected (it can only fail if a vertex does not compare
+// equal to itself, e.g. because it contains NaNs).
+//
+// piTriList_in_and_out: 3*iNrTrianglesIn packed vertex indices, updated in place.
+// pContext:             client context used to fetch the vertex attributes.
+// iNrTrianglesIn:       number of triangles in the list.
+static void GenerateSharedVerticesIndexListSlow(int piTriList_in_and_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	int t=0, i=0;
+	for (t=0; t<iNrTrianglesIn; t++)
+	{
+		for (i=0; i<3; i++)
+		{
+			const int offs = t*3 + i;
+			const int index = piTriList_in_and_out[offs];
+
+			const SVec3 vP = GetPosition(pContext, index);
+			const SVec3 vN = GetNormal(pContext, index);
+			const SVec3 vT = GetTexCoord(pContext, index);
+
+			tbool bFound = TFALSE;
+			int t2=0, index2rec=-1;
+			while (!bFound && t2<=t)
+			{
+				int j=0;
+				while (!bFound && j<3)
+				{
+					const int index2 = piTriList_in_and_out[t2*3 + j];
+					const SVec3 vP2 = GetPosition(pContext, index2);
+					const SVec3 vN2 = GetNormal(pContext, index2);
+					const SVec3 vT2 = GetTexCoord(pContext, index2);
+
+					if (veq(vP,vP2) && veq(vN,vN2) && veq(vT,vT2))
+					{
+						bFound = TTRUE;
+						// remember the matching index; without this every
+						// entry of the list would be overwritten with -1
+						index2rec = index2;
+					}
+					else
+						++j;
+				}
+				if (!bFound) ++t2;
+			}
+
+			assert(bFound);
+			// leave the entry untouched if, against expectation, nothing matched
+			if (bFound)
+				piTriList_in_and_out[offs] = index2rec;
+		}
+	}
+}
+
+// Walks all client faces and emits the initial (unwelded) triangle list.
+// Triangles are copied as-is; quads are split into two triangles along the
+// shorter diagonal (measured in texture space first, then in position space
+// on a tie). Faces with any other vertex count are skipped.
+//
+// pTriInfos:      receives, per emitted triangle, the original face number,
+//                 the tspace offset of that face and which face corners
+//                 (vert_num) the triangle uses; iFlag is cleared for all
+//                 iNrTrianglesIn entries at the end.
+// piTriList_out:  receives 3 packed indices (from MakeIndex) per triangle.
+// iNrTrianglesIn: number of triangles the caller expects to be emitted.
+// Returns the total number of tspaces, i.e. the sum of the vertex counts of
+// all accepted faces.
+static int GenerateInitialVerticesIndexList(STriInfo pTriInfos[], int piTriList_out[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	int iTSpacesOffs = 0, f=0, t=0;
+	int iDstTriIndex = 0;
+	for (f=0; f<pContext->m_pInterface->m_getNumFaces(pContext); f++)
+	{
+		const int verts = pContext->m_pInterface->m_getNumVerticesOfFace(pContext, f);
+		// only triangles and quads are processed
+		if (verts!=3 && verts!=4) continue;
+
+		pTriInfos[iDstTriIndex].iOrgFaceNumber = f;
+		pTriInfos[iDstTriIndex].iTSpacesOffs = iTSpacesOffs;
+
+		if (verts==3)
+		{
+			unsigned char * pVerts = pTriInfos[iDstTriIndex].vert_num;
+			pVerts[0]=0; pVerts[1]=1; pVerts[2]=2;
+			piTriList_out[iDstTriIndex*3+0] = MakeIndex(f, 0);
+			piTriList_out[iDstTriIndex*3+1] = MakeIndex(f, 1);
+			piTriList_out[iDstTriIndex*3+2] = MakeIndex(f, 2);
+			++iDstTriIndex;	// next
+		}
+		else
+		{
+			// a quad yields two triangles; the second one shares the
+			// face number and tspace offset of the first
+			{
+				pTriInfos[iDstTriIndex+1].iOrgFaceNumber = f;
+				pTriInfos[iDstTriIndex+1].iTSpacesOffs = iTSpacesOffs;
+			}
+
+			{
+				// need an order independent way to evaluate
+				// tspace on quads. This is done by splitting
+				// along the shortest diagonal.
+				const int i0 = MakeIndex(f, 0);
+				const int i1 = MakeIndex(f, 1);
+				const int i2 = MakeIndex(f, 2);
+				const int i3 = MakeIndex(f, 3);
+				const SVec3 T0 = GetTexCoord(pContext, i0);
+				const SVec3 T1 = GetTexCoord(pContext, i1);
+				const SVec3 T2 = GetTexCoord(pContext, i2);
+				const SVec3 T3 = GetTexCoord(pContext, i3);
+				const float distSQ_02 = LengthSquared(vsub(T2,T0));
+				const float distSQ_13 = LengthSquared(vsub(T3,T1));
+				tbool bQuadDiagIs_02;
+				if (distSQ_02<distSQ_13)
+					bQuadDiagIs_02 = TTRUE;
+				else if (distSQ_13<distSQ_02)
+					bQuadDiagIs_02 = TFALSE;
+				else
+				{
+					// texture-space diagonals tie: decide on the position-space
+					// diagonals instead (these locals shadow the outer ones);
+					// the 0-2 diagonal wins a second tie
+					const SVec3 P0 = GetPosition(pContext, i0);
+					const SVec3 P1 = GetPosition(pContext, i1);
+					const SVec3 P2 = GetPosition(pContext, i2);
+					const SVec3 P3 = GetPosition(pContext, i3);
+					const float distSQ_02 = LengthSquared(vsub(P2,P0));
+					const float distSQ_13 = LengthSquared(vsub(P3,P1));
+
+					bQuadDiagIs_02 = distSQ_13<distSQ_02 ? TFALSE : TTRUE;
+				}
+
+				if (bQuadDiagIs_02)
+				{
+					// split along 0-2: triangles (0,1,2) and (0,2,3)
+					{
+						unsigned char * pVerts_A = pTriInfos[iDstTriIndex].vert_num;
+						pVerts_A[0]=0; pVerts_A[1]=1; pVerts_A[2]=2;
+					}
+					piTriList_out[iDstTriIndex*3+0] = i0;
+					piTriList_out[iDstTriIndex*3+1] = i1;
+					piTriList_out[iDstTriIndex*3+2] = i2;
+					++iDstTriIndex;	// next
+					{
+						unsigned char * pVerts_B = pTriInfos[iDstTriIndex].vert_num;
+						pVerts_B[0]=0; pVerts_B[1]=2; pVerts_B[2]=3;
+					}
+					piTriList_out[iDstTriIndex*3+0] = i0;
+					piTriList_out[iDstTriIndex*3+1] = i2;
+					piTriList_out[iDstTriIndex*3+2] = i3;
+					++iDstTriIndex;	// next
+				}
+				else
+				{
+					// split along 1-3: triangles (0,1,3) and (1,2,3)
+					{
+						unsigned char * pVerts_A = pTriInfos[iDstTriIndex].vert_num;
+						pVerts_A[0]=0; pVerts_A[1]=1; pVerts_A[2]=3;
+					}
+					piTriList_out[iDstTriIndex*3+0] = i0;
+					piTriList_out[iDstTriIndex*3+1] = i1;
+					piTriList_out[iDstTriIndex*3+2] = i3;
+					++iDstTriIndex;	// next
+					{
+						unsigned char * pVerts_B = pTriInfos[iDstTriIndex].vert_num;
+						pVerts_B[0]=1; pVerts_B[1]=2; pVerts_B[2]=3;
+					}
+					piTriList_out[iDstTriIndex*3+0] = i1;
+					piTriList_out[iDstTriIndex*3+1] = i2;
+					piTriList_out[iDstTriIndex*3+2] = i3;
+					++iDstTriIndex;	// next
+				}
+			}
+		}
+
+		// one tspace per face corner
+		iTSpacesOffs += verts;
+		assert(iDstTriIndex<=iNrTrianglesIn);
+	}
+
+	// InitTriInfo() relies on the flags being cleared here
+	for (t=0; t<iNrTrianglesIn; t++)
+		pTriInfos[t].iFlag = 0;
+
+	// return total amount of tspaces
+	return iTSpacesOffs;
+}
+
+// Fetches the position of a vertex through the client's m_getPosition callback.
+// `index` is a packed vertex index; IndexToData() unpacks it into the two
+// integers that are handed to the callback.
+static SVec3 GetPosition(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afPos[3];
+	int iFace, iCorner;
+	SVec3 vOut;
+
+	IndexToData(&iFace, &iCorner, index);
+	pContext->m_pInterface->m_getPosition(pContext, afPos, iFace, iCorner);
+
+	vOut.x = afPos[0];
+	vOut.y = afPos[1];
+	vOut.z = afPos[2];
+	return vOut;
+}
+
+// Fetches the normal of a vertex through the client's m_getNormal callback.
+// `index` is a packed vertex index; IndexToData() unpacks it into the two
+// integers that are handed to the callback.
+static SVec3 GetNormal(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afNormal[3];
+	int iFace, iCorner;
+	SVec3 vOut;
+
+	IndexToData(&iFace, &iCorner, index);
+	pContext->m_pInterface->m_getNormal(pContext, afNormal, iFace, iCorner);
+
+	vOut.x = afNormal[0];
+	vOut.y = afNormal[1];
+	vOut.z = afNormal[2];
+	return vOut;
+}
+
+// Fetches the 2D texture coordinate of a vertex through the client's
+// m_getTexCoord callback and returns it as an SVec3 with z fixed to 1.
+// `index` is a packed vertex index; IndexToData() unpacks it into the two
+// integers that are handed to the callback.
+static SVec3 GetTexCoord(const SMikkTSpaceContext * pContext, const int index)
+{
+	float afUV[2];
+	int iFace, iCorner;
+	SVec3 vOut;
+
+	IndexToData(&iFace, &iCorner, index);
+	pContext->m_pInterface->m_getTexCoord(pContext, afUV, iFace, iCorner);
+
+	vOut.x = afUV[0];
+	vOut.y = afUV[1];
+	vOut.z = 1.0f;
+	return vOut;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// One triangle edge as used by the neighbor search.
+// BuildNeighborsFast() stores the smaller vertex index in i0, the larger one
+// in i1 and the owning triangle in f. The same three ints are also reachable
+// as array[0..2] so that QuickSortEdges() can pick its sort key by channel
+// number (0 = i0, 1 = i1, 2 = f).
+typedef union {
+	struct
+	{
+		int i0, i1, f;
+	};
+	int array[3];
+} SEdge;
+
+// Neighbor builders: the fast variant needs a caller-provided edge buffer,
+// the slow variant is the fallback InitTriInfo() uses when that buffer
+// cannot be allocated.
+static void BuildNeighborsFast(STriInfo pTriInfos[], SEdge * pEdges, const int piTriListIn[], const int iNrTrianglesIn);
+static void BuildNeighborsSlow(STriInfo pTriInfos[], const int piTriListIn[], const int iNrTrianglesIn);
+
+// Returns twice the (unsigned) area, measured in texture space, of the
+// triangle whose three packed vertex indices are given in indices[0..2].
+static float CalcTexArea(const SMikkTSpaceContext * pContext, const int indices[])
+{
+	const SVec3 uvA = GetTexCoord(pContext, indices[0]);
+	const SVec3 uvB = GetTexCoord(pContext, indices[1]);
+	const SVec3 uvC = GetTexCoord(pContext, indices[2]);
+
+	// the two edges leaving the first corner
+	const float fABx = uvB.x-uvA.x;
+	const float fABy = uvB.y-uvA.y;
+	const float fACx = uvC.x-uvA.x;
+	const float fACy = uvC.y-uvA.y;
+
+	// 2D cross product = signed area times 2
+	const float fSignedAreaSTx2 = fABx*fACy - fABy*fACx;
+
+	if (fSignedAreaSTx2<0)
+		return -fSignedAreaSTx2;
+	return fSignedAreaSTx2;
+}
+
+// Fills in the per-triangle data used by the later grouping passes:
+//  1. resets neighbors, group assignments and derivatives and marks every
+//     triangle GROUP_WITH_ANY ("assumed bad"),
+//  2. computes the first order derivatives vOs/vOt, their magnitudes and the
+//     ORIENT_PRESERVING flag, clearing GROUP_WITH_ANY on healthy triangles,
+//  3. forces both triangles of a quad to the same orientation flag,
+//  4. builds the face neighbor table (fast path if the edge buffer can be
+//     allocated, slow path otherwise).
+//
+// pTriInfos:      per-triangle records, iNrTrianglesIn entries, updated in place.
+// piTriListIn:    3 packed vertex indices per triangle.
+// pContext:       client context used to fetch positions and texture coordinates.
+// iNrTrianglesIn: number of triangles.
+static void InitTriInfo(STriInfo pTriInfos[], const int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn)
+{
+	int f=0, i=0, t=0;
+	// pTriInfos[f].iFlag is cleared in GenerateInitialVerticesIndexList() which is called before this function.
+
+	// generate neighbor info list
+	for (f=0; f<iNrTrianglesIn; f++)
+		for (i=0; i<3; i++)
+		{
+			pTriInfos[f].FaceNeighbors[i] = -1;
+			pTriInfos[f].AssignedGroup[i] = NULL;
+
+			// the per-triangle fields below do not depend on i; they are
+			// simply rewritten with the same values on each pass
+			pTriInfos[f].vOs.x=0.0f; pTriInfos[f].vOs.y=0.0f; pTriInfos[f].vOs.z=0.0f;
+			pTriInfos[f].vOt.x=0.0f; pTriInfos[f].vOt.y=0.0f; pTriInfos[f].vOt.z=0.0f;
+			pTriInfos[f].fMagS = 0;
+			pTriInfos[f].fMagT = 0;
+
+			// assumed bad
+			pTriInfos[f].iFlag |= GROUP_WITH_ANY;
+		}
+
+	// evaluate first order derivatives
+	for (f=0; f<iNrTrianglesIn; f++)
+	{
+		// initial values
+		const SVec3 v1 = GetPosition(pContext, piTriListIn[f*3+0]);
+		const SVec3 v2 = GetPosition(pContext, piTriListIn[f*3+1]);
+		const SVec3 v3 = GetPosition(pContext, piTriListIn[f*3+2]);
+		const SVec3 t1 = GetTexCoord(pContext, piTriListIn[f*3+0]);
+		const SVec3 t2 = GetTexCoord(pContext, piTriListIn[f*3+1]);
+		const SVec3 t3 = GetTexCoord(pContext, piTriListIn[f*3+2]);
+
+		// texture-space and position-space edges leaving the first corner
+		const float t21x = t2.x-t1.x;
+		const float t21y = t2.y-t1.y;
+		const float t31x = t3.x-t1.x;
+		const float t31y = t3.y-t1.y;
+		const SVec3 d1 = vsub(v2,v1);
+		const SVec3 d2 = vsub(v3,v1);
+
+		const float fSignedAreaSTx2 = t21x*t31y - t21y*t31x;
+		//assert(fSignedAreaSTx2!=0);
+		SVec3 vOs = vsub(vscale(t31y,d1), vscale(t21y,d2));	// eq 18
+		SVec3 vOt = vadd(vscale(-t31x,d1), vscale(t21x,d2)); // eq 19
+
+		// a positive signed texture area marks the mapping as orientation preserving
+		pTriInfos[f].iFlag |= (fSignedAreaSTx2>0 ? ORIENT_PRESERVING : 0);
+
+		if ( NotZero(fSignedAreaSTx2) )
+		{
+			const float fAbsArea = fabsf(fSignedAreaSTx2);
+			const float fLenOs = Length(vOs);
+			const float fLenOt = Length(vOt);
+			// flip the derivatives when the mapping is not orientation preserving
+			const float fS = (pTriInfos[f].iFlag&ORIENT_PRESERVING)==0 ? (-1.0f) : 1.0f;
+			if ( NotZero(fLenOs) ) pTriInfos[f].vOs = vscale(fS/fLenOs, vOs);
+			if ( NotZero(fLenOt) ) pTriInfos[f].vOt = vscale(fS/fLenOt, vOt);
+
+			// evaluate magnitudes prior to normalization of vOs and vOt
+			pTriInfos[f].fMagS = fLenOs / fAbsArea;
+			pTriInfos[f].fMagT = fLenOt / fAbsArea;
+
+			// if this is a good triangle
+			if ( NotZero(pTriInfos[f].fMagS) && NotZero(pTriInfos[f].fMagT))
+				pTriInfos[f].iFlag &= (~GROUP_WITH_ANY);
+		}
+	}
+
+	// force otherwise healthy quads to a fixed orientation
+	while (t<(iNrTrianglesIn-1))
+	{
+		// two consecutive triangles with the same original face form a quad
+		const int iFO_a = pTriInfos[t].iOrgFaceNumber;
+		const int iFO_b = pTriInfos[t+1].iOrgFaceNumber;
+		if (iFO_a==iFO_b)	// this is a quad
+		{
+			const tbool bIsDeg_a = (pTriInfos[t].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			const tbool bIsDeg_b = (pTriInfos[t+1].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			
+			// bad triangles should already have been removed by
+			// DegenPrologue(), but just in case check bIsDeg_a and bIsDeg_b are false
+			if ((bIsDeg_a||bIsDeg_b)==TFALSE)
+			{
+				const tbool bOrientA = (pTriInfos[t].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+				const tbool bOrientB = (pTriInfos[t+1].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+				// if this happens the quad has extremely bad mapping!!
+				if (bOrientA!=bOrientB)
+				{
+					//printf("found quad with bad mapping\n");
+					// the first triangle's orientation wins if the second one is
+					// "bad" or if the first covers at least as much texture area
+					tbool bChooseOrientFirstTri = TFALSE;
+					if ((pTriInfos[t+1].iFlag&GROUP_WITH_ANY)!=0) bChooseOrientFirstTri = TTRUE;
+					else if ( CalcTexArea(pContext, &piTriListIn[t*3+0]) >= CalcTexArea(pContext, &piTriListIn[(t+1)*3+0]) )
+						bChooseOrientFirstTri = TTRUE;
+
+					// force match
+					{
+						// t0 is the triangle whose flag is copied, t1 the one overwritten
+						const int t0 = bChooseOrientFirstTri ? t : (t+1);
+						const int t1 = bChooseOrientFirstTri ? (t+1) : t;
+						pTriInfos[t1].iFlag &= (~ORIENT_PRESERVING);	// clear first
+						pTriInfos[t1].iFlag |= (pTriInfos[t0].iFlag&ORIENT_PRESERVING);	// copy bit
+					}
+				}
+			}
+			t += 2;
+		}
+		else
+			++t;
+	}
+	
+	// match up edge pairs
+	{
+		// three edges per triangle; fall back to the buffer-free search
+		// when the allocation fails
+		SEdge * pEdges = (SEdge *) malloc(sizeof(SEdge)*iNrTrianglesIn*3);
+		if (pEdges==NULL)
+			BuildNeighborsSlow(pTriInfos, piTriListIn, iNrTrianglesIn);
+		else
+		{
+			BuildNeighborsFast(pTriInfos, pEdges, piTriListIn, iNrTrianglesIn);
+	
+			free(pEdges);
+		}
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static tbool AssignRecur(const int piTriListIn[], STriInfo psTriInfos[], const int iMyTriIndex, SGroup * pGroup);
+static void AddTriToGroup(SGroup * pGroup, const int iTriIndex);
+
+// Partitions the triangle corners into groups. Each corner of every triangle
+// that is not GROUP_WITH_ANY and not yet assigned starts a new group, which
+// is then grown with AssignRecur() across the two edges that meet at that
+// corner.
+//
+// pTriInfos:              per-triangle records; AssignedGroup[] is filled in.
+// pGroups:                storage for the groups (the asserts expect room for
+//                         up to 3*iNrTrianglesIn entries).
+// piGroupTrianglesBuffer: backing store for the groups' face lists; each new
+//                         group's list starts where the previous group's ended.
+// piTriListIn:            3 packed vertex indices per triangle.
+// iNrTrianglesIn:         number of triangles.
+// Returns the number of groups created.
+static int Build4RuleGroups(STriInfo pTriInfos[], SGroup pGroups[], int piGroupTrianglesBuffer[], const int piTriListIn[], const int iNrTrianglesIn)
+{
+	const int iNrMaxGroups = iNrTrianglesIn*3;
+	int iNrActiveGroups = 0;
+	int iOffset = 0, f=0, i=0;
+	(void)iNrMaxGroups;  /* quiet warnings in non debug mode */
+	for (f=0; f<iNrTrianglesIn; f++)
+	{
+		for (i=0; i<3; i++)
+		{
+			// if not assigned to a group
+			if ((pTriInfos[f].iFlag&GROUP_WITH_ANY)==0 && pTriInfos[f].AssignedGroup[i]==NULL)
+			{
+				tbool bOrPre;
+				int neigh_indexL, neigh_indexR;
+				const int vert_index = piTriListIn[f*3+i];
+				assert(iNrActiveGroups<iNrMaxGroups);
+				// open a new group seeded with this corner
+				pTriInfos[f].AssignedGroup[i] = &pGroups[iNrActiveGroups];
+				pTriInfos[f].AssignedGroup[i]->iVertexRepresentitive = vert_index;
+				pTriInfos[f].AssignedGroup[i]->bOrientPreservering = (pTriInfos[f].iFlag&ORIENT_PRESERVING)!=0;
+				pTriInfos[f].AssignedGroup[i]->iNrFaces = 0;
+				pTriInfos[f].AssignedGroup[i]->pFaceIndices = &piGroupTrianglesBuffer[iOffset];
+				++iNrActiveGroups;
+
+				AddTriToGroup(pTriInfos[f].AssignedGroup[i], f);
+				bOrPre = (pTriInfos[f].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+				// neighbors across edge i and across the preceding edge
+				// (i-1, wrapping to 2), i.e. the two edges touching corner i
+				neigh_indexL = pTriInfos[f].FaceNeighbors[i];
+				neigh_indexR = pTriInfos[f].FaceNeighbors[i>0?(i-1):2];
+				if (neigh_indexL>=0) // neighbor
+				{
+					const tbool bAnswer =
+						AssignRecur(piTriListIn, pTriInfos, neigh_indexL,
+									pTriInfos[f].AssignedGroup[i] );
+					
+					// a refusal is only expected when the orientations differ
+					const tbool bOrPre2 = (pTriInfos[neigh_indexL].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+					const tbool bDiff = bOrPre!=bOrPre2 ? TTRUE : TFALSE;
+					assert(bAnswer || bDiff);
+					(void)bAnswer, (void)bDiff;  /* quiet warnings in non debug mode */
+				}
+				if (neigh_indexR>=0) // neighbor
+				{
+					const tbool bAnswer =
+						AssignRecur(piTriListIn, pTriInfos, neigh_indexR,
+									pTriInfos[f].AssignedGroup[i] );
+
+					const tbool bOrPre2 = (pTriInfos[neigh_indexR].iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+					const tbool bDiff = bOrPre!=bOrPre2 ? TTRUE : TFALSE;
+					assert(bAnswer || bDiff);
+					(void)bAnswer, (void)bDiff;  /* quiet warnings in non debug mode */
+				}
+
+				// update offset
+				iOffset += pTriInfos[f].AssignedGroup[i]->iNrFaces;
+				// since the groups are disjoint a triangle can never
+				// belong to more than 3 groups. Subsequently something
+				// is completely screwed if this assertion ever hits.
+				assert(iOffset <= iNrMaxGroups);
+			}
+		}
+	}
+
+	return iNrActiveGroups;
+}
+
+// Appends triangle iTriIndex to the group's face list and bumps the face
+// count. The caller must guarantee that pFaceIndices has room for the entry.
+static void AddTriToGroup(SGroup * pGroup, const int iTriIndex)
+{
+	pGroup->pFaceIndices[pGroup->iNrFaces++] = iTriIndex;
+}
+
+// Tries to add triangle iMyTriIndex to pGroup and, on success, continues
+// recursively with the neighbors across the two edges that meet at the
+// group's representative vertex.
+//
+// Returns TTRUE if the triangle's corner at the representative vertex is (or
+// already was) a member of pGroup; TFALSE if that corner already belongs to
+// another group or the triangle's orientation flag differs from the group's.
+// The return values of the nested recursive calls are ignored.
+static tbool AssignRecur(const int piTriListIn[], STriInfo psTriInfos[],
+				 const int iMyTriIndex, SGroup * pGroup)
+{
+	STriInfo * pMyTriInfo = &psTriInfos[iMyTriIndex];
+
+	// track down vertex
+	const int iVertRep = pGroup->iVertexRepresentitive;
+	const int * pVerts = &piTriListIn[3*iMyTriIndex+0];
+	int i=-1;
+	if (pVerts[0]==iVertRep) i=0;
+	else if (pVerts[1]==iVertRep) i=1;
+	else if (pVerts[2]==iVertRep) i=2;
+	// the triangle is expected to contain the representative vertex
+	assert(i>=0 && i<3);
+
+	// early out
+	if (pMyTriInfo->AssignedGroup[i] == pGroup) return TTRUE;
+	else if (pMyTriInfo->AssignedGroup[i]!=NULL) return TFALSE;
+	if ((pMyTriInfo->iFlag&GROUP_WITH_ANY)!=0)
+	{
+		// first to group with a group-with-anything triangle
+		// determines it's orientation.
+		// This is the only existing order dependency in the code!!
+		if ( pMyTriInfo->AssignedGroup[0] == NULL &&
+			pMyTriInfo->AssignedGroup[1] == NULL &&
+			pMyTriInfo->AssignedGroup[2] == NULL )
+		{
+			pMyTriInfo->iFlag &= (~ORIENT_PRESERVING);
+			pMyTriInfo->iFlag |= (pGroup->bOrientPreservering ? ORIENT_PRESERVING : 0);
+		}
+	}
+	{
+		// triangles of opposite orientation never share a group
+		const tbool bOrient = (pMyTriInfo->iFlag&ORIENT_PRESERVING)!=0 ? TTRUE : TFALSE;
+		if (bOrient != pGroup->bOrientPreservering) return TFALSE;
+	}
+
+	AddTriToGroup(pGroup, iMyTriIndex);
+	pMyTriInfo->AssignedGroup[i] = pGroup;
+
+	{
+		// continue across edge i and the preceding edge (i-1, wrapping to 2)
+		const int neigh_indexL = pMyTriInfo->FaceNeighbors[i];
+		const int neigh_indexR = pMyTriInfo->FaceNeighbors[i>0?(i-1):2];
+		if (neigh_indexL>=0)
+			AssignRecur(piTriListIn, psTriInfos, neigh_indexL, pGroup);
+		if (neigh_indexR>=0)
+			AssignRecur(piTriListIn, psTriInfos, neigh_indexR, pGroup);
+	}
+
+
+
+	return TTRUE;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static tbool CompareSubGroups(const SSubGroup * pg1, const SSubGroup * pg2);
+static void QuickSort(int* pSortBuffer, int iLeft, int iRight, unsigned int uSeed);
+static STSpace EvalTspace(int face_indices[], const int iFaces, const int piTriListIn[], const STriInfo pTriInfos[], const SMikkTSpaceContext * pContext, const int iVertexRepresentitive);
+
+// Computes the tangent spaces for all groups and writes them to psTspace.
+// For every triangle of a group, the members of the group whose projected
+// derivatives are close enough (cosine above fThresCos), that come from the
+// same original face, or where either triangle is GROUP_WITH_ANY, form a
+// sub-group. Identical sub-groups share one tangent space evaluated by
+// EvalTspace(). A tspace slot that is written a second time receives the
+// average (AvgTSpace) of both contributions.
+//
+// psTspace:        output tspaces, addressed by iTSpacesOffs + vert_num; the
+//                  asserts expect iCounter to be 0 in unwritten slots.
+// pTriInfos:       per-triangle records.
+// pGroups:         the groups, iNrActiveGroups entries.
+// piTriListIn:     3 packed vertex indices per triangle.
+// fThresCos:       cosine threshold used when forming sub-groups.
+// pContext:        client context used to fetch normals.
+// Returns TFALSE if a temporary allocation fails (everything allocated here
+// is released first), TTRUE otherwise.
+static tbool GenerateTSpaces(STSpace psTspace[], const STriInfo pTriInfos[], const SGroup pGroups[],
+                             const int iNrActiveGroups, const int piTriListIn[], const float fThresCos,
+                             const SMikkTSpaceContext * pContext)
+{
+	STSpace * pSubGroupTspace = NULL;
+	SSubGroup * pUniSubGroups = NULL;
+	int * pTmpMembers = NULL;
+	int iMaxNrFaces=0, iUniqueTspaces=0, g=0, i=0;
+	// the largest group determines the size of the scratch buffers
+	for (g=0; g<iNrActiveGroups; g++)
+		if (iMaxNrFaces < pGroups[g].iNrFaces)
+			iMaxNrFaces = pGroups[g].iNrFaces;
+
+	if (iMaxNrFaces == 0) return TTRUE;
+
+	// make initial allocations
+	pSubGroupTspace = (STSpace *) malloc(sizeof(STSpace)*iMaxNrFaces);
+	pUniSubGroups = (SSubGroup *) malloc(sizeof(SSubGroup)*iMaxNrFaces);
+	pTmpMembers = (int *) malloc(sizeof(int)*iMaxNrFaces);
+	if (pSubGroupTspace==NULL || pUniSubGroups==NULL || pTmpMembers==NULL)
+	{
+		if (pSubGroupTspace!=NULL) free(pSubGroupTspace);
+		if (pUniSubGroups!=NULL) free(pUniSubGroups);
+		if (pTmpMembers!=NULL) free(pTmpMembers);
+		return TFALSE;
+	}
+
+
+	iUniqueTspaces = 0;
+	for (g=0; g<iNrActiveGroups; g++)
+	{
+		const SGroup * pGroup = &pGroups[g];
+		int iUniqueSubGroups = 0, s=0;
+
+		for (i=0; i<pGroup->iNrFaces; i++)	// triangles
+		{
+			const int f = pGroup->pFaceIndices[i];	// triangle number
+			int index=-1, iVertIndex=-1, iOF_1=-1, iMembers=0, j=0, l=0;
+			SSubGroup tmp_group;
+			tbool bFound;
+			SVec3 n, vOs, vOt;
+			// find the corner of f that belongs to this group
+			if (pTriInfos[f].AssignedGroup[0]==pGroup) index=0;
+			else if (pTriInfos[f].AssignedGroup[1]==pGroup) index=1;
+			else if (pTriInfos[f].AssignedGroup[2]==pGroup) index=2;
+			assert(index>=0 && index<3);
+
+			iVertIndex = piTriListIn[f*3+index];
+			assert(iVertIndex==pGroup->iVertexRepresentitive);
+
+			// is normalized already
+			n = GetNormal(pContext, iVertIndex);
+			
+			// project
+			vOs = vsub(pTriInfos[f].vOs, vscale(vdot(n,pTriInfos[f].vOs), n));
+			vOt = vsub(pTriInfos[f].vOt, vscale(vdot(n,pTriInfos[f].vOt), n));
+			if ( VNotZero(vOs) ) vOs = Normalize(vOs);
+			if ( VNotZero(vOt) ) vOt = Normalize(vOt);
+
+			// original face number
+			iOF_1 = pTriInfos[f].iOrgFaceNumber;
+			
+			// collect the sub-group of f: every group member t that is
+			// compatible with f (t==f always qualifies via bSameOrgFace)
+			iMembers = 0;
+			for (j=0; j<pGroup->iNrFaces; j++)
+			{
+				const int t = pGroup->pFaceIndices[j];	// triangle number
+				const int iOF_2 = pTriInfos[t].iOrgFaceNumber;
+
+				// project
+				SVec3 vOs2 = vsub(pTriInfos[t].vOs, vscale(vdot(n,pTriInfos[t].vOs), n));
+				SVec3 vOt2 = vsub(pTriInfos[t].vOt, vscale(vdot(n,pTriInfos[t].vOt), n));
+				if ( VNotZero(vOs2) ) vOs2 = Normalize(vOs2);
+				if ( VNotZero(vOt2) ) vOt2 = Normalize(vOt2);
+
+				{
+					const tbool bAny = ( (pTriInfos[f].iFlag | pTriInfos[t].iFlag) & GROUP_WITH_ANY )!=0 ? TTRUE : TFALSE;
+					// make sure triangles which belong to the same quad are joined.
+					const tbool bSameOrgFace = iOF_1==iOF_2 ? TTRUE : TFALSE;
+
+					const float fCosS = vdot(vOs,vOs2);
+					const float fCosT = vdot(vOt,vOt2);
+
+					assert(f!=t || bSameOrgFace);	// sanity check
+					if (bAny || bSameOrgFace || (fCosS>fThresCos && fCosT>fThresCos))
+						pTmpMembers[iMembers++] = t;
+				}
+			}
+
+			// sort pTmpMembers
+			// (sorted member lists make CompareSubGroups() an element-wise compare)
+			tmp_group.iNrFaces = iMembers;
+			tmp_group.pTriMembers = pTmpMembers;
+			if (iMembers>1)
+			{
+				unsigned int uSeed = INTERNAL_RND_SORT_SEED;	// could replace with a random seed?
+				QuickSort(pTmpMembers, 0, iMembers-1, uSeed);
+			}
+
+			// look for an existing match
+			bFound = TFALSE;
+			l=0;
+			while (l<iUniqueSubGroups && !bFound)
+			{
+				bFound = CompareSubGroups(&tmp_group, &pUniSubGroups[l]);
+				if (!bFound) ++l;
+			}
+			
+			// assign tangent space index
+			assert(bFound || l==iUniqueSubGroups);
+			//piTempTangIndices[f*3+index] = iUniqueTspaces+l;
+
+			// if no match was found we allocate a new subgroup
+			if (!bFound)
+			{
+				// insert new subgroup
+				int * pIndices = (int *) malloc(sizeof(int)*iMembers);
+				if (pIndices==NULL)
+				{
+					// clean up and return false
+					int s=0;
+					for (s=0; s<iUniqueSubGroups; s++)
+						free(pUniSubGroups[s].pTriMembers);
+					free(pUniSubGroups);
+					free(pTmpMembers);
+					free(pSubGroupTspace);
+					return TFALSE;
+				}
+				// keep a private copy of the member list; pTmpMembers is
+				// reused for the next triangle
+				pUniSubGroups[iUniqueSubGroups].iNrFaces = iMembers;
+				pUniSubGroups[iUniqueSubGroups].pTriMembers = pIndices;
+				memcpy(pIndices, tmp_group.pTriMembers, iMembers*sizeof(int));
+				pSubGroupTspace[iUniqueSubGroups] =
+					EvalTspace(tmp_group.pTriMembers, iMembers, piTriListIn, pTriInfos, pContext, pGroup->iVertexRepresentitive);
+				++iUniqueSubGroups;
+			}
+
+			// output tspace
+			{
+				// l now indexes the (found or newly inserted) sub-group
+				const int iOffs = pTriInfos[f].iTSpacesOffs;
+				const int iVert = pTriInfos[f].vert_num[index];
+				STSpace * pTS_out = &psTspace[iOffs+iVert];
+				assert(pTS_out->iCounter<2);
+				assert(((pTriInfos[f].iFlag&ORIENT_PRESERVING)!=0) == pGroup->bOrientPreservering);
+				if (pTS_out->iCounter==1)
+				{
+					// second write to this slot: average with the first
+					*pTS_out = AvgTSpace(pTS_out, &pSubGroupTspace[l]);
+					pTS_out->iCounter = 2;	// update counter
+					pTS_out->bOrient = pGroup->bOrientPreservering;
+				}
+				else
+				{
+					assert(pTS_out->iCounter==0);
+					*pTS_out = pSubGroupTspace[l];
+					pTS_out->iCounter = 1;	// update counter
+					pTS_out->bOrient = pGroup->bOrientPreservering;
+				}
+			}
+		}
+
+		// clean up and offset iUniqueTspaces
+		for (s=0; s<iUniqueSubGroups; s++)
+			free(pUniSubGroups[s].pTriMembers);
+		iUniqueTspaces += iUniqueSubGroups;
+	}
+
+	// clean up
+	free(pUniSubGroups);
+	free(pTmpMembers);
+	free(pSubGroupTspace);
+
+	return TTRUE;
+}
+
+// Evaluates one tangent space for a sub-group of triangles that all share
+// the vertex iVertexRepresentitive. Each triangle that is not GROUP_WITH_ANY
+// contributes its derivatives, projected into the plane of the vertex normal
+// and weighted by the angle between its two edges at that vertex.
+//
+// face_indices / iFaces: the triangles of the sub-group.
+// Returns the accumulated tspace: vOs/vOt normalized when non-zero, fMagS and
+// fMagT divided by the angle sum when that sum is positive. Only vOs, vOt,
+// fMagS and fMagT of the result are set here.
+static STSpace EvalTspace(int face_indices[], const int iFaces, const int piTriListIn[], const STriInfo pTriInfos[],
+                          const SMikkTSpaceContext * pContext, const int iVertexRepresentitive)
+{
+	STSpace res;
+	float fAngleSum = 0;
+	int face=0;
+	res.vOs.x=0.0f; res.vOs.y=0.0f; res.vOs.z=0.0f;
+	res.vOt.x=0.0f; res.vOt.y=0.0f; res.vOt.z=0.0f;
+	res.fMagS = 0; res.fMagT = 0;
+
+	for (face=0; face<iFaces; face++)
+	{
+		const int f = face_indices[face];
+
+		// only valid triangles get to add their contribution
+		if ( (pTriInfos[f].iFlag&GROUP_WITH_ANY)==0 )
+		{
+			SVec3 n, vOs, vOt, p0, p1, p2, v1, v2;
+			float fCos, fAngle, fMagS, fMagT;
+			int i=-1, index=-1, i0=-1, i1=-1, i2=-1;
+			// locate the shared vertex within this triangle
+			if (piTriListIn[3*f+0]==iVertexRepresentitive) i=0;
+			else if (piTriListIn[3*f+1]==iVertexRepresentitive) i=1;
+			else if (piTriListIn[3*f+2]==iVertexRepresentitive) i=2;
+			assert(i>=0 && i<3);
+
+			// project
+			index = piTriListIn[3*f+i];
+			n = GetNormal(pContext, index);
+			vOs = vsub(pTriInfos[f].vOs, vscale(vdot(n,pTriInfos[f].vOs), n));
+			vOt = vsub(pTriInfos[f].vOt, vscale(vdot(n,pTriInfos[f].vOt), n));
+			if ( VNotZero(vOs) ) vOs = Normalize(vOs);
+			if ( VNotZero(vOt) ) vOt = Normalize(vOt);
+
+			// i1 is the shared vertex, i0 its predecessor and i2 its
+			// successor within the triangle
+			i2 = piTriListIn[3*f + (i<2?(i+1):0)];
+			i1 = piTriListIn[3*f + i];
+			i0 = piTriListIn[3*f + (i>0?(i-1):2)];
+
+			p0 = GetPosition(pContext, i0);
+			p1 = GetPosition(pContext, i1);
+			p2 = GetPosition(pContext, i2);
+			v1 = vsub(p0,p1);
+			v2 = vsub(p2,p1);
+
+			// project
+			v1 = vsub(v1, vscale(vdot(n,v1),n)); if ( VNotZero(v1) ) v1 = Normalize(v1);
+			v2 = vsub(v2, vscale(vdot(n,v2),n)); if ( VNotZero(v2) ) v2 = Normalize(v2);
+
+			// weight contribution by the angle
+			// between the two edge vectors
+			// (the cosine is clamped to [-1,1] before acos)
+			fCos = vdot(v1,v2); fCos=fCos>1?1:(fCos<(-1) ? (-1) : fCos);
+			fAngle = (float) acos(fCos);
+			fMagS = pTriInfos[f].fMagS;
+			fMagT = pTriInfos[f].fMagT;
+
+			res.vOs=vadd(res.vOs, vscale(fAngle,vOs));
+			res.vOt=vadd(res.vOt,vscale(fAngle,vOt));
+			res.fMagS+=(fAngle*fMagS);
+			res.fMagT+=(fAngle*fMagT);
+			fAngleSum += fAngle;
+		}
+	}
+
+	// normalize
+	if ( VNotZero(res.vOs) ) res.vOs = Normalize(res.vOs);
+	if ( VNotZero(res.vOt) ) res.vOt = Normalize(res.vOt);
+	if (fAngleSum>0)
+	{
+		res.fMagS /= fAngleSum;
+		res.fMagT /= fAngleSum;
+	}
+
+	return res;
+}
+
+// Returns TTRUE when both sub-groups have the same number of faces and list
+// the same triangle numbers in the same order, TFALSE otherwise.
+static tbool CompareSubGroups(const SSubGroup * pg1, const SSubGroup * pg2)
+{
+	int iFace;
+
+	// different sizes can never match
+	if (pg1->iNrFaces!=pg2->iNrFaces) return TFALSE;
+
+	for (iFace=0; iFace<pg1->iNrFaces; ++iFace)
+	{
+		if (pg1->pTriMembers[iFace]!=pg2->pTriMembers[iFace])
+			return TFALSE;
+	}
+	return TTRUE;
+}
+
+// Sorts pSortBuffer[iLeft..iRight] (both inclusive) into ascending order
+// using a quicksort whose pivot position is derived from uSeed; the seed is
+// scrambled on every call and handed on to the recursive calls.
+// Ranges with fewer than two elements are left untouched.
+static void QuickSort(int* pSortBuffer, int iLeft, int iRight, unsigned int uSeed)
+{
+	int iL, iR, n, index, iMid, iTmp;
+	unsigned int t;
+
+	// nothing to sort; this also keeps n below strictly positive for the modulo
+	if (iRight<=iLeft) return;
+
+	// Random
+	// rotate-left of uSeed by t bits. The right shift count is masked so that
+	// t==0 does not shift a 32 bit value by 32, which is undefined behaviour.
+	t=uSeed&31;
+	t=(uSeed<<t)|(uSeed>>((32-t)&31));
+	uSeed=uSeed+t+3;
+	// Random end
+
+	iL=iLeft; iR=iRight;
+	n = (iR-iL)+1;
+	assert(n>=0);
+	index = (int) (uSeed%n);
+
+	// pivot value
+	iMid=pSortBuffer[index + iL];
+
+
+	// partition: move values below the pivot to the left, above it to the right
+	do
+	{
+		while (pSortBuffer[iL] < iMid)
+			++iL;
+		while (pSortBuffer[iR] > iMid)
+			--iR;
+
+		if (iL <= iR)
+		{
+			iTmp = pSortBuffer[iL];
+			pSortBuffer[iL] = pSortBuffer[iR];
+			pSortBuffer[iR] = iTmp;
+			++iL; --iR;
+		}
+	}
+	while (iL <= iR);
+
+	if (iLeft < iR)
+		QuickSort(pSortBuffer, iLeft, iR, uSeed);
+	if (iL < iRight)
+		QuickSort(pSortBuffer, iL, iRight, uSeed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+static void QuickSortEdges(SEdge * pSortBuffer, int iLeft, int iRight, const int channel, unsigned int uSeed);
+static void GetEdge(int * i0_out, int * i1_out, int * edgenum_out, const int indices[], const int i0_in, const int i1_in);
+
+// Builds the face neighbor table (pTriInfos[].FaceNeighbors) using a sorted
+// edge list: all 3*iNrTrianglesIn edges are written to pEdges, sorted by
+// (i0, i1, f), and edges with equal end points are then paired up when they
+// run in opposite directions in their two triangles.
+//
+// pTriInfos:      per-triangle records; FaceNeighbors[] entries that are -1
+//                 are filled in where a neighbor is found.
+// pEdges:         scratch buffer with room for 3*iNrTrianglesIn edges.
+// piTriListIn:    3 packed vertex indices per triangle.
+// iNrTrianglesIn: number of triangles.
+static void BuildNeighborsFast(STriInfo pTriInfos[], SEdge * pEdges, const int piTriListIn[], const int iNrTrianglesIn)
+{
+	// build array of edges
+	unsigned int uSeed = INTERNAL_RND_SORT_SEED;				// could replace with a random seed?
+	int iEntries=0, iCurStartIndex=-1, f=0, i=0;
+	for (f=0; f<iNrTrianglesIn; f++)
+		for (i=0; i<3; i++)
+		{
+			const int i0 = piTriListIn[f*3+i];
+			const int i1 = piTriListIn[f*3+(i<2?(i+1):0)];
+			pEdges[f*3+i].i0 = i0 < i1 ? i0 : i1;			// put minimum index in i0
+			pEdges[f*3+i].i1 = !(i0 < i1) ? i0 : i1;		// put maximum index in i1
+			pEdges[f*3+i].f = f;							// record face number
+		}
+
+	// sort over all edges by i0, this is the pricy one.
+	QuickSortEdges(pEdges, 0, iNrTrianglesIn*3-1, 0, uSeed);	// sort channel 0 which is i0
+
+	// sub sort over i1, should be fast.
+	// could replace this with a 64 bit int sort over (i0,i1)
+	// with i0 as msb in the quicksort call above.
+	iEntries = iNrTrianglesIn*3;
+	iCurStartIndex = 0;
+	for (i=1; i<iEntries; i++)
+	{
+		if (pEdges[iCurStartIndex].i0 != pEdges[i].i0)
+		{
+			const int iL = iCurStartIndex;
+			const int iR = i-1;
+			//const int iElems = i-iL;
+			iCurStartIndex = i;
+			QuickSortEdges(pEdges, iL, iR, 1, uSeed);	// sort channel 1 which is i1
+		}
+	}
+	// the loop above only sorts a run once the next run starts, so the final
+	// run (the edges sharing the largest i0) still has to be sorted here;
+	// otherwise equal edges in it may end up non-adjacent and go unpaired.
+	QuickSortEdges(pEdges, iCurStartIndex, iEntries-1, 1, uSeed);
+
+	// sub sort over f, which should be fast.
+	// this step is to remain compliant with BuildNeighborsSlow() when
+	// more than 2 triangles use the same edge (such as a butterfly topology).
+	iCurStartIndex = 0;
+	for (i=1; i<iEntries; i++)
+	{
+		if (pEdges[iCurStartIndex].i0 != pEdges[i].i0 || pEdges[iCurStartIndex].i1 != pEdges[i].i1)
+		{
+			const int iL = iCurStartIndex;
+			const int iR = i-1;
+			//const int iElems = i-iL;
+			iCurStartIndex = i;
+			QuickSortEdges(pEdges, iL, iR, 2, uSeed);	// sort channel 2 which is f
+		}
+	}
+	// same as above: the final (i0,i1) run is not covered by the loop
+	QuickSortEdges(pEdges, iCurStartIndex, iEntries-1, 2, uSeed);
+
+	// pair up, adjacent triangles
+	for (i=0; i<iEntries; i++)
+	{
+		const int i0=pEdges[i].i0;
+		const int i1=pEdges[i].i1;
+		const int f = pEdges[i].f;
+		tbool bUnassigned_A;
+
+		int i0_A, i1_A;
+		int edgenum_A, edgenum_B=0;	// 0,1 or 2
+		GetEdge(&i0_A, &i1_A, &edgenum_A, &piTriListIn[f*3], i0, i1);	// resolve index ordering and edge_num
+		bUnassigned_A = pTriInfos[f].FaceNeighbors[edgenum_A] == -1 ? TTRUE : TFALSE;
+
+		if (bUnassigned_A)
+		{
+			// get true index ordering
+			// scan the following entries that describe the same undirected edge
+			int j=i+1, t;
+			tbool bNotFound = TTRUE;
+			while (j<iEntries && i0==pEdges[j].i0 && i1==pEdges[j].i1 && bNotFound)
+			{
+				tbool bUnassigned_B;
+				int i0_B, i1_B;
+				t = pEdges[j].f;
+				// flip i0_B and i1_B
+				GetEdge(&i1_B, &i0_B, &edgenum_B, &piTriListIn[t*3], pEdges[j].i0, pEdges[j].i1);	// resolve index ordering and edge_num
+				//assert(!(i0_A==i1_B && i1_A==i0_B));
+				bUnassigned_B =  pTriInfos[t].FaceNeighbors[edgenum_B]==-1 ? TTRUE : TFALSE;
+				// a neighbor traverses the edge in the opposite direction
+				// and must not have a neighbor on that edge yet
+				if (i0_A==i0_B && i1_A==i1_B && bUnassigned_B)
+					bNotFound = TFALSE;
+				else
+					++j;
+			}
+
+			if (!bNotFound)
+			{
+				// link both triangles to each other
+				int t = pEdges[j].f;
+				pTriInfos[f].FaceNeighbors[edgenum_A] = t;
+				//assert(pTriInfos[t].FaceNeighbors[edgenum_B]==-1);
+				pTriInfos[t].FaceNeighbors[edgenum_B] = f;
+			}
+		}
+	}
+}
+
+// Builds the face neighbor table without any scratch memory by comparing
+// every still unassigned edge against the edges of all other triangles.
+// Two triangles become neighbors across an edge when both use the same two
+// vertex indices but traverse them in opposite directions; the first such
+// triangle (lowest number, lowest edge) wins.
+static void BuildNeighborsSlow(STriInfo pTriInfos[], const int piTriListIn[], const int iNrTrianglesIn)
+{
+	int iTri, iEdge;
+	for (iTri=0; iTri<iNrTrianglesIn; ++iTri)
+	{
+		for (iEdge=0; iEdge<3; ++iEdge)
+		{
+			int iStartA, iEndA, iOther, iOtherEdge;
+			int iMatchTri=-1, iMatchEdge=-1;
+
+			// edges that already have a neighbor are left alone
+			if (pTriInfos[iTri].FaceNeighbors[iEdge] != -1) continue;
+
+			iStartA = piTriListIn[iTri*3+iEdge];
+			iEndA = piTriListIn[iTri*3+(iEdge+1)%3];
+
+			// search for a neighbor
+			for (iOther=0; iOther<iNrTrianglesIn && iMatchTri<0; ++iOther)
+			{
+				if (iOther==iTri) continue;
+
+				for (iOtherEdge=0; iOtherEdge<3; ++iOtherEdge)
+				{
+					// the candidate edge is read in reverse order
+					const int iEndB = piTriListIn[iOther*3+iOtherEdge];
+					const int iStartB = piTriListIn[iOther*3+(iOtherEdge+1)%3];
+					if (iStartA==iStartB && iEndA==iEndB)
+					{
+						iMatchTri = iOther;
+						iMatchEdge = iOtherEdge;
+						break;
+					}
+				}
+			}
+
+			// assign neighbors
+			if (iMatchTri>=0)
+			{
+				pTriInfos[iTri].FaceNeighbors[iEdge] = iMatchTri;
+				pTriInfos[iMatchTri].FaceNeighbors[iMatchEdge] = iTri;
+			}
+		}
+	}
+}
+
+// In-place quicksort of pSortBuffer[iLeft..iRight] (both inclusive), ascending by array[channel].
+// The pivot is picked pseudo-randomly from uSeed, which is re-mixed on every call and handed to the recursion.
+static void QuickSortEdges(SEdge * pSortBuffer, int iLeft, int iRight, const int channel, unsigned int uSeed)
+{
+	unsigned int t;
+	int iL, iR, n, index, iMid;
+	// early out: fewer than two elements need no work, exactly two need at most one swap
+	SEdge sTmp;
+	const int iElems = iRight-iLeft+1;
+	if (iElems<2) return;
+	else if (iElems==2)
+	{
+		if (pSortBuffer[iLeft].array[channel] > pSortBuffer[iRight].array[channel])
+		{
+			sTmp = pSortBuffer[iLeft];
+			pSortBuffer[iLeft] = pSortBuffer[iRight];
+			pSortBuffer[iRight] = sTmp;
+		}
+		return;
+	}
+
+	// Random: rotate the seed left by its low five bits. The right-shift count is masked with 31 because
+	// for t==0 it would otherwise be 32, which is undefined behavior for a 32-bit unsigned int.
+	t=uSeed&31;
+	t=(uSeed<<t)|(uSeed>>((32-t)&31));
+	uSeed=uSeed+t+3;
+
+	iL = iLeft;
+	iR = iRight;
+	n = (iR-iL)+1;
+	assert(n>=0);
+	index = (int) (uSeed%n);
+
+	iMid=pSortBuffer[index + iL].array[channel];	// pivot value
+	// partition: move keys below the pivot to the left and keys above it to the right
+	do
+	{
+		while (pSortBuffer[iL].array[channel] < iMid)
+			++iL;
+		while (pSortBuffer[iR].array[channel] > iMid)
+			--iR;
+
+		if (iL <= iR)
+		{
+			sTmp = pSortBuffer[iL];
+			pSortBuffer[iL] = pSortBuffer[iRight>=iR ? iR : iR];
+			pSortBuffer[iR] = sTmp;
+			++iL; --iR;
+		}
+	}
+	while (iL <= iR);
+
+	if (iLeft < iR)	// sort the part left of the crossing point
+		QuickSortEdges(pSortBuffer, iLeft, iR, channel, uSeed);
+	if (iL < iRight)	// sort the part right of the crossing point
+		QuickSortEdges(pSortBuffer, iL, iRight, channel, uSeed);
+}
+
+// Resolve which edge of a triangle a given pair of vertex indices forms.
+//
+//   i0_out, i1_out : receive the edge's end points in the triangle's own corner order
+//   edgenum_out    : receives the edge number: 0 = corners (0,1), 1 = (1,2), 2 = (2,0)
+//   indices        : the triangle's three vertex indices
+//   i0_in, i1_in   : the two vertex indices of the edge, in either order
+//
+// The pair is taken on trust to be an edge of the triangle: corner 2 is never compared,
+// so a pair that does not belong to the triangle still yields one of the three edges.
+// All three outputs are written on every call.
+static void GetEdge(int * i0_out, int * i1_out, int * edgenum_out, const int indices[], const int i0_in, const int i1_in)
+{
+	int iEdge;
+
+	// classify the edge by which of corners 0 and 1 lie on it;
+	// corner 1 is only looked at when corner 0 is on the edge
+	if (indices[0]!=i0_in && indices[0]!=i1_in)
+		iEdge = 1;	// corner 0 is off the edge: only corners 1 and 2 are left
+	else if (indices[1]==i0_in || indices[1]==i1_in)
+		iEdge = 0;	// corners 0 and 1 are both on the edge
+	else
+		iEdge = 2;	// corner 0 is on the edge, corner 1 is not: corners 2 and 0
+
+	// the edge runs from corner iEdge to the corner after it,
+	// wrapping from corner 2 back to corner 0
+	*edgenum_out = iEdge;
+	*i0_out = indices[iEdge];
+	*i1_out = indices[(iEdge+1)%3];
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////// Degenerate triangles ////////////////////////////////////
+// Flags quads with exactly one degenerate triangle, then swaps degenerate triangles out of the first iNrTrianglesIn slots.
+static void DegenPrologue(STriInfo pTriInfos[], int piTriList_out[], const int iNrTrianglesIn, const int iTotTris)
+{
+	int iNextGoodTriangleSearchIndex=-1;	// where the forward search for the next good triangle resumes
+	tbool bStillFindingGoodOnes;	// cleared once that search reaches iTotTris without a hit
+
+	// locate quads with only one good triangle (two consecutive entries sharing iOrgFaceNumber are a quad)
+	int t=0;
+	while (t<(iTotTris-1))
+	{
+		const int iFO_a = pTriInfos[t].iOrgFaceNumber;
+		const int iFO_b = pTriInfos[t+1].iOrgFaceNumber;
+		if (iFO_a==iFO_b)	// this is a quad
+		{
+			const tbool bIsDeg_a = (pTriInfos[t].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			const tbool bIsDeg_b = (pTriInfos[t+1].iFlag&MARK_DEGENERATE)!=0 ? TTRUE : TFALSE;
+			if ((bIsDeg_a^bIsDeg_b)!=0)	// exactly one of the two is degenerate
+			{
+				pTriInfos[t].iFlag |= QUAD_ONE_DEGEN_TRI;
+				pTriInfos[t+1].iFlag |= QUAD_ONE_DEGEN_TRI;
+			}
+			t += 2;	// both entries of the quad have been handled
+		}
+		else
+			++t;
+	}
+
+	// reorder list so all degen triangles are moved to the back
+	// without reordering the good triangles
+	iNextGoodTriangleSearchIndex = 1;
+	t=0;
+	bStillFindingGoodOnes = TTRUE;
+	while (t<iNrTrianglesIn && bStillFindingGoodOnes)
+	{
+		const tbool bIsGood = (pTriInfos[t].iFlag&MARK_DEGENERATE)==0 ? TTRUE : TFALSE;
+		if (bIsGood)
+		{
+			if (iNextGoodTriangleSearchIndex < (t+2))	// keep the search cursor past slot t+1, the slot examined next
+				iNextGoodTriangleSearchIndex = t+2;
+		}
+		else
+		{
+			int t0, t1;
+			// search for the first good triangle.
+			tbool bJustADegenerate = TTRUE;
+			while (bJustADegenerate && iNextGoodTriangleSearchIndex<iTotTris)
+			{
+				const tbool bIsGood = (pTriInfos[iNextGoodTriangleSearchIndex].iFlag&MARK_DEGENERATE)==0 ? TTRUE : TFALSE;
+				if (bIsGood) bJustADegenerate=TFALSE;
+				else ++iNextGoodTriangleSearchIndex;
+			}
+
+			t0 = t;
+			t1 = iNextGoodTriangleSearchIndex;	// is >= iTotTris when no good triangle was found
+			++iNextGoodTriangleSearchIndex;	// the next search starts after the triangle just taken
+			assert(iNextGoodTriangleSearchIndex > (t+1));
+
+			// swap triangle t0 and t1
+			if (!bJustADegenerate)
+			{
+				int i=0;
+				for (i=0; i<3; i++)	// exchange the three index entries of the two triangles
+				{
+					const int index = piTriList_out[t0*3+i];
+					piTriList_out[t0*3+i] = piTriList_out[t1*3+i];
+					piTriList_out[t1*3+i] = index;
+				}
+				{	// exchange the matching STriInfo records so both arrays stay in step
+					const STriInfo tri_info = pTriInfos[t0];
+					pTriInfos[t0] = pTriInfos[t1];
+					pTriInfos[t1] = tri_info;
+				}
+			}
+			else
+				bStillFindingGoodOnes = TFALSE;	// this is not supposed to happen
+		}
+
+		if (bStillFindingGoodOnes) ++t;	// t stays put after a failed search, which ends the loop
+	}
+
+	assert(bStillFindingGoodOnes);	// code will still work.
+	assert(iNrTrianglesIn == t);	// presumably iNrTrianglesIn is the count of good triangles - confirm against the caller
+}
+
+// Fills in tangent spaces for triangles in slots iNrTrianglesIn..iTotTris-1 by copying from the leading
+// iNrTrianglesIn triangles (matched by index), then patches the unused quad corner of QUAD_ONE_DEGEN_TRI triangles by position.
+static void DegenEpilogue(STSpace psTspace[], STriInfo pTriInfos[], int piTriListIn[], const SMikkTSpaceContext * pContext, const int iNrTrianglesIn, const int iTotTris)
+{
+	int iTri, iCorner;
+
+	// Pass 1: deal with degenerate triangles. Each corner looks up the first entry among the
+	// leading iNrTrianglesIn triangles that holds the same index and copies its tangent space.
+	// punishment for degenerate triangles is O(N^2)
+	for (iTri=iNrTrianglesIn; iTri<iTotTris; ++iTri)
+	{
+		// degenerate triangles on a quad with one good triangle are
+		// left to pass 2
+		if ((pTriInfos[iTri].iFlag&QUAD_ONE_DEGEN_TRI)!=0) continue;
+
+		for (iCorner=0; iCorner<3; ++iCorner)
+		{
+			const int iWanted = piTriListIn[iTri*3+iCorner];
+			const int iNrGoodCorners = 3*iNrTrianglesIn;
+			int iHit = 0;
+
+			// linear scan through the good triangles' index entries
+			while (iHit<iNrGoodCorners && piTriListIn[iHit]!=iWanted)
+				++iHit;
+
+			if (iHit<iNrGoodCorners)
+			{
+				// iHit/3 is the source triangle, iHit%3 the corner within it
+				const STriInfo * pSrc = &pTriInfos[iHit/3];
+				const STriInfo * pDst = &pTriInfos[iTri];
+				const int iSrc = pSrc->iTSpacesOffs + pSrc->vert_num[iHit%3];
+				const int iDst = pDst->iTSpacesOffs + pDst->vert_num[iCorner];
+				// copy tspace
+				psTspace[iDst] = psTspace[iSrc];
+			}
+		}
+	}
+
+	// Pass 2: deal with degenerate quads with one good triangle. For a triangle flagged
+	// QUAD_ONE_DEGEN_TRI, the vert_num value it does not use receives the tangent space of
+	// the triangle corner that has the same position.
+	for (iTri=0; iTri<iNrTrianglesIn; ++iTri)
+	{
+		const unsigned char * pCorners;
+		SVec3 vMissingPos;
+		int iFace, iUsedMask, iMissing, iBit, iSlot;
+		tbool bCopied;
+
+		if ((pTriInfos[iTri].iFlag&QUAD_ONE_DEGEN_TRI)==0) continue;
+
+		// one bit per vert_num value referenced by this triangle
+		pCorners = pTriInfos[iTri].vert_num;
+		iUsedMask = (1<<pCorners[0]) | (1<<pCorners[1]) | (1<<pCorners[2]);
+
+		// the lowest clear bit among 1..3 names the unused corner; otherwise it is corner 0
+		iMissing = 0;
+		for (iBit=1; iBit<4; ++iBit)
+		{
+			if ((iUsedMask&(1<<iBit))==0) { iMissing = iBit; break; }
+		}
+
+		iFace = pTriInfos[iTri].iOrgFaceNumber;
+		vMissingPos = GetPosition(pContext, MakeIndex(iFace, iMissing));
+
+		// stop at the first used corner located at the same position
+		bCopied = TFALSE;
+		for (iSlot=0; iSlot<3 && !bCopied; ++iSlot)
+		{
+			const int iVert = pCorners[iSlot];
+			const SVec3 vSrcP = GetPosition(pContext, MakeIndex(iFace, iVert));
+			if (veq(vSrcP, vMissingPos)==TTRUE)
+			{
+				const int iOffs = pTriInfos[iTri].iTSpacesOffs;
+				psTspace[iOffs+iMissing] = psTspace[iOffs+iVert];
+				bCopied = TTRUE;
+			}
+		}
+		assert(bCopied);
+	}
+}
diff --git a/web/filament-js/package.json b/web/filament-js/package.json
index b382e29482d0..095887d6757a 100644
--- a/web/filament-js/package.json
+++ b/web/filament-js/package.json
@@ -1,6 +1,6 @@
 {
   "name": "filament",
-  "version": "1.31.6",
+  "version": "1.31.7",
   "description": "Real-time physically based rendering engine",
   "main": "filament.js",
   "module": "filament.js",