diff --git a/.deny.toml b/.deny.toml index a8b6db974e..7e000d6f82 100644 --- a/.deny.toml +++ b/.deny.toml @@ -2,12 +2,15 @@ multiple-versions = "deny" skip-tree = [ { name = "windows-sys", version = "0.45" }, - { name = "winit", version = "0.27.5" }, + { name = "winit", version = "0.27" }, + { name = "winit", version = "0.29" }, { name = "rustc_version", version = "0.2.3" }, { name = "sourcemap", version = "7.1.1" }, ] skip = [ { name = "hlsl-snapshots", version = "0.1.0" }, + # Strum uses an old version + { name = "heck", version = "0.4.0" }, ] wildcards = "deny" allow-wildcard-paths = true @@ -20,6 +23,7 @@ allow = [ "BSD-3-Clause", "CC0-1.0", "ISC", + "MPL-2.0", "MIT", "MIT-0", "Unicode-DFS-2016", diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..1d953f4bd7 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use nix diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 972d02caff..dba0cd1228 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,6 +78,7 @@ jobs: # runtime is normally 2-8 minutes # # currently high due to documentation time problems on mac. + # https://github.com/rust-lang/rust/issues/114891 timeout-minutes: 30 strategy: @@ -229,6 +230,14 @@ jobs: # build docs cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps + - name: check private item docs + if: matrix.kind == 'native' + shell: bash + run: | + set -e + + # wgpu_core package + cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps --package wgpu-core --package wgpu-hal --document-private-items # We run minimal checks on the MSRV of the core crates, ensuring that # its dependency tree does not cause issues for firefox. @@ -614,7 +623,7 @@ jobs: cargo fmt --manifest-path xtask/Cargo.toml -- --check - name: Check for typos - uses: crate-ci/typos@v1.20.8 + uses: crate-ci/typos@v1.20.10 check-cts-runner: # runtime is normally 2 minutes diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ceecbb703f..9017220fe5 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -41,7 +41,7 @@ jobs: if: ${{ failure() }} - name: Deploy the docs - uses: JamesIves/github-pages-deploy-action@v4.5.0 + uses: JamesIves/github-pages-deploy-action@v4.6.0 if: github.ref == 'refs/heads/trunk' with: token: ${{ secrets.WEB_DEPLOY }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6dfed56f6a..258c788a4e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -41,7 +41,7 @@ jobs: run: cargo xtask run-wasm --no-serve - name: Deploy WebGPU examples - uses: JamesIves/github-pages-deploy-action@v4.5.0 + uses: JamesIves/github-pages-deploy-action@v4.6.0 if: github.ref == 'refs/heads/trunk' with: token: ${{ secrets.WEB_DEPLOY }} diff --git a/CHANGELOG.md b/CHANGELOG.md index cb7c17a6b9..538546e4c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,100 +41,174 @@ Bottom level categories: ### Major Changes -### Documentation +#### Querying shader compilation errors + +Wgpu now supports querying [shader compilation info](https://www.w3.org/TR/webgpu/#dom-gpushadermodule-getcompilationinfo). + +This allows you to get more structured information about compilation errors, warnings and info: +```rust +... 
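+// `ctx` is assumed to be an application object that exposes the `wgpu::Device` as `ctx.device`.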
+let lighting_shader = ctx.device.create_shader_module(include_wgsl!("lighting.wgsl"));
+let compilation_info = lighting_shader.get_compilation_info().await;
+for message in compilation_info
+    .messages
+    .iter()
+    .filter(|m| m.message_type == wgpu::CompilationMessageType::Error)
+{
+    let line = message.location.map(|l| l.line_number).unwrap_or(1);
+    println!("Compile error at line {line}");
+}
+```
+
+By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
+
-- Add mention of primitive restart in the description of `PrimitiveState::strip_index_format`. By @cpsdqs in [#5350](https://github.com/gfx-rs/wgpu/pull/5350)
-- Document precise behaviour of `SourceLocation`. By @stefnotch in [#5386](https://github.com/gfx-rs/wgpu/pull/5386)
-- Give short example of WGSL `push_constant` syntax. By @waywardmonkeys in [#5393](https://github.com/gfx-rs/wgpu/pull/5393)

### New features

#### General

-- Implemented the `Unorm10_10_10_2` VertexFormat.
-- Many numeric built-ins have had a constant evaluation implementation added for them, which allows them to be used in a `const` context:
-  - [#4879](https://github.com/gfx-rs/wgpu/pull/4879) by @ErichDonGubler:
-    - `abs`
-    - `acos`
-    - `acosh`
-    - `asin`
-    - `asinh`
-    - `atan`
-    - `atanh`
-    - `cos`
-    - `cosh`
-    - `round`
-    - `saturate`
-    - `sin`
-    - `sinh`
-    - `sqrt`
-    - `step`
-    - `tan`
-    - `tanh`
-  - [#5098](https://github.com/gfx-rs/wgpu/pull/5098) by @ErichDonGubler:
-    - `ceil`
-    - `countLeadingZeros`
-    - `countOneBits`
-    - `countTrailingZeros`
-    - `degrees`
-    - `exp`
-    - `exp2`
-    - `floor`
-    - `fract`
-    - `fma`
-    - `inverseSqrt`
-    - `log`
-    - `log2`
-    - `max`
-    - `min`
-    - `radians`
-    - `reverseBits`
-    - `sign`
-    - `trunc`
-- Eager release of GPU resources comes from device.trackers. By @bradwerth in [#5075](https://github.com/gfx-rs/wgpu/pull/5075)
+#### Naga
+
+### Bug Fixes
+
+## v0.20.0 (2024-04-28)
+
+### Major Changes
+
+#### Pipeline overridable constants
+
+Wgpu now supports [pipeline-overridable constants](https://www.w3.org/TR/webgpu/#dom-gpuprogrammablestage-constants).
+
+This allows you to define constants in wgsl like this:
+```wgsl
+override some_factor: f32 = 42.1337; // Specifies a default of 42.1337 if it's not set.
+```
+You can then set them at runtime on the pipeline that consumes this shader:
+```rust
+// ...
+fragment: Some(wgpu::FragmentState {
+    compilation_options: wgpu::PipelineCompilationOptions {
+        constants: &[("some_factor".to_owned(), 0.1234)].into(), // Sets `some_factor` to 0.1234.
+        ..Default::default()
+    },
+    // ...
+}),
+// ...
+```
+
+By @teoxoy & @jimblandy in [#5500](https://github.com/gfx-rs/wgpu/pull/5500)
+
+#### Changed feature requirements for timestamps
+
+Due to a specification change, `write_timestamp` is no longer supported on WebGPU.
+`wgpu::CommandEncoder::write_timestamp` now requires the new `wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS` feature, which is available on all native backends but not on WebGPU.
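+
+For example, with both `wgpu::Features::TIMESTAMP_QUERY` and `wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS` enabled at device creation, encoder-level timestamp writes keep working on native (a minimal sketch; `device`, `encoder`, and the query-set label are illustrative):
+```rust
+// A query set with room for a start and an end timestamp.
+let query_set = device.create_query_set(&wgpu::QuerySetDescriptor {
+    label: Some("encoder timing"),
+    ty: wgpu::QueryType::Timestamp,
+    count: 2,
+});
+
+// Timestamps written directly on the encoder (rather than inside a pass)
+// are what now require TIMESTAMP_QUERY_INSIDE_ENCODERS.
+encoder.write_timestamp(&query_set, 0);
+// ... encode the work to be timed ...
+encoder.write_timestamp(&query_set, 1);
+```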
+
+By @wumpf in [#5188](https://github.com/gfx-rs/wgpu/pull/5188)
+
+
+#### Wgsl const evaluation for many more built-ins
+
+Many numeric built-ins have had a constant evaluation implementation added for them, which allows them to be used in a `const` context:
+
+`abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atanh`, `cos`, `cosh`, `round`, `saturate`, `sin`, `sinh`, `sqrt`, `step`, `tan`, `tanh`, `ceil`, `countLeadingZeros`, `countOneBits`, `countTrailingZeros`, `degrees`, `exp`, `exp2`, `floor`, `fract`, `fma`, `inverseSqrt`, `log`, `log2`, `max`, `min`, `radians`, `reverseBits`, `sign`, `trunc`
+
+By @ErichDonGubler in [#4879](https://github.com/gfx-rs/wgpu/pull/4879), [#5098](https://github.com/gfx-rs/wgpu/pull/5098)
+
+#### New **native-only** wgsl features
+
+##### Subgroup operations
+
+The following subgroup operations are now available in wgsl:
+
+`subgroupBallot`, `subgroupAll`, `subgroupAny`, `subgroupAdd`, `subgroupMul`, `subgroupMin`, `subgroupMax`, `subgroupAnd`, `subgroupOr`, `subgroupXor`, `subgroupExclusiveAdd`, `subgroupExclusiveMul`, `subgroupInclusiveAdd`, `subgroupInclusiveMul`, `subgroupBroadcastFirst`, `subgroupBroadcast`, `subgroupShuffle`, `subgroupShuffleDown`, `subgroupShuffleUp`, `subgroupShuffleXor`
+
+Availability is governed by the following feature flags:
+* `wgpu::Features::SUBGROUP`, for all operations except `subgroupBarrier` in fragment & compute shaders, supported on Vulkan, DX12 and Metal.
+* `wgpu::Features::SUBGROUP_VERTEX`, for all operations except `subgroupBarrier` in vertex shaders, supported on Vulkan.
+* `wgpu::Features::SUBGROUP_BARRIER`, for support of the `subgroupBarrier` operation, supported on Vulkan & Metal.
+
+Note that there are currently [some differences](https://github.com/gfx-rs/wgpu/issues/5555) between wgpu's native-only implementation and the [open WebGPU proposal](https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md).
+
+By @exrook and @lichtso in [#5301](https://github.com/gfx-rs/wgpu/pull/5301)
+
+##### Signed and unsigned 64 bit integer support in shaders
+
+`wgpu::Features::SHADER_INT64` enables signed and unsigned 64 bit integer variables in wgsl (`i64` and `u64` respectively).
+Supported on Vulkan, DX12 (requires DXC) and Metal (with MSL 2.3+ support).
+
+By @atlv24 and @cwfitzgerald in [#5154](https://github.com/gfx-rs/wgpu/pull/5154)
+
+### New features
+
+#### General
+
+- Implemented the `Unorm10_10_10_2` VertexFormat. By @McMackety in [#5477](https://github.com/gfx-rs/wgpu/pull/5477)
- `wgpu-types`'s `trace` and `replay` features have been replaced by the `serde` feature. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
- `wgpu-core`'s `serial-pass` feature has been removed. Use `serde` instead. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
-- Added `InstanceFlags::GPU_BASED_VALIDATION`, which enables GPU-based validation for shaders. This is currently only supported on the DX12 and Vulkan backends; other platforms ignore this flag, for now.
+- Added `InstanceFlags::GPU_BASED_VALIDATION`, which enables GPU-based validation for shaders. This is currently only supported on the DX12 and Vulkan backends; other platforms ignore this flag, for now. By @ErichDonGubler in [#5146](https://github.com/gfx-rs/wgpu/pull/5146), [#5046](https://github.com/gfx-rs/wgpu/pull/5046).
  - When set, this flag implies `InstanceFlags::VALIDATION`.
  - This has been added to the set of flags set by `InstanceFlags::advanced_debugging`. Since the overhead is potentially very large, the flag is not enabled by default in debug builds when using `InstanceFlags::from_build_config`.
  - As with other instance flags, this flag can be changed in calls to `InstanceFlags::with_env` with the new `WGPU_GPU_BASED_VALIDATION` environment variable.
-  - By @ErichDonGubler in [#5146](https://github.com/gfx-rs/wgpu/pull/5146), [#5046](https://github.com/gfx-rs/wgpu/pull/5046).
-- Signed and unsigned 64 bit integer support in shaders. By @rodolphito and @cwfitzgerald in [#5154](https://github.com/gfx-rs/wgpu/pull/5154)
- `wgpu::Instance` can now report which `wgpu::Backends` are available based on the build configuration. By @wumpf [#5167](https://github.com/gfx-rs/wgpu/pull/5167)
  ```diff
-wgpu::Instance::any_backend_feature_enabled()
+!wgpu::Instance::enabled_backend_features().is_empty()
  ```
-
-- `wgpu::CommandEncoder::write_timestamp` requires now the new `wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS` feature which is available on all native backends but not on WebGPU (due to a spec change `write_timestamp` is no longer supported on WebGPU). By @wumpf in [#5188](https://github.com/gfx-rs/wgpu/pull/5188)
- Breaking change: [`wgpu_core::pipeline::ProgrammableStageDescriptor`](https://docs.rs/wgpu-core/latest/wgpu_core/pipeline/struct.ProgrammableStageDescriptor.html#structfield.entry_point) is now optional. By @ErichDonGubler in [#5305](https://github.com/gfx-rs/wgpu/pull/5305).
- `Features::downlevel{_webgl2,}_features` was made const by @MultisampledNight in [#5343](https://github.com/gfx-rs/wgpu/pull/5343)
-
+- Breaking change: [`wgpu_core::pipeline::ShaderError`](https://docs.rs/wgpu-core/latest/wgpu_core/pipeline/struct.ShaderError.html) has been moved to `naga`. By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
- More as_hal methods and improvements by @JMS55 in [#5452](https://github.com/gfx-rs/wgpu/pull/5452)
  - Added `wgpu::CommandEncoder::as_hal_mut`
  - Added `wgpu::TextureView::as_hal`
  - `wgpu::Texture::as_hal` now returns a user-defined type to match the other as_hal functions
-- Added support for pipeline-overridable constants. By @teoxoy & @jimblandy in [#5500](https://github.com/gfx-rs/wgpu/pull/5500)
-
-#### GLES
-
-- Log an error when GLES texture format heuristics fail. By @PolyMeilex in [#5266](https://github.com/gfx-rs/wgpu/issues/5266)
-- Cache the sample count to keep `get_texture_format_features` cheap. By @Dinnerbone in [#5346](https://github.com/gfx-rs/wgpu/pull/5346)
-- Mark `DEPTH32FLOAT_STENCIL8` as supported in GLES. By @Dinnerbone in [#5370](https://github.com/gfx-rs/wgpu/pull/5370)
-
#### Naga

- Allow user to select which MSL version to use via `--metal-version` with Naga CLI. By @pcleavelin in [#5392](https://github.com/gfx-rs/wgpu/pull/5392)
- Support `arrayLength` for runtime-sized arrays inside binding arrays (for WGSL input and SPIR-V output). By @kvark in [#5428](https://github.com/gfx-rs/wgpu/pull/5428)
+- Added `--shader-stage` and `--input-kind` options to naga-cli for specifying vertex/fragment/compute shaders, and the input frontend. By @ratmice in [#5411](https://github.com/gfx-rs/wgpu/pull/5411)
+- Added a `create_validator` function to wgpu_core `Device` to create naga `Validator`s. By @atlv24 in [#5606](https://github.com/gfx-rs/wgpu/pull/5606)

#### WebGPU

- Implement the `device_set_device_lost_callback` method for `ContextWebGpu`. By @suti in [#5438](https://github.com/gfx-rs/wgpu/pull/5438)
- Add support for storage texture access modes `ReadOnly` and `ReadWrite`. By @JolifantoBambla in [#5434](https://github.com/gfx-rs/wgpu/pull/5434)

+#### GLES / OpenGL
+
+- Log an error when GLES texture format heuristics fail. By @PolyMeilex in [#5266](https://github.com/gfx-rs/wgpu/issues/5266)
+- Cache the sample count to keep `get_texture_format_features` cheap. By @Dinnerbone in [#5346](https://github.com/gfx-rs/wgpu/pull/5346)
+- Mark `DEPTH32FLOAT_STENCIL8` as supported in GLES. By @Dinnerbone in [#5370](https://github.com/gfx-rs/wgpu/pull/5370)
+- Desktop GL now also supports `TEXTURE_COMPRESSION_ETC2`. By @Valaphee in [#5568](https://github.com/gfx-rs/wgpu/pull/5568)
+- Don't create a program for shader-clearing if that workaround isn't required. By @Dinnerbone in [#5348](https://github.com/gfx-rs/wgpu/pull/5348).
+- OpenGL will now be preferred over OpenGL ES on EGL, making it consistent with WGL. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)
+- Fill out `driver` and `driver_info` with the OpenGL flavor and version, similar to Vulkan. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)
+
+#### Metal
+
+- Metal 3.0 and 3.1 detection. By @atlv24 in [#5497](https://github.com/gfx-rs/wgpu/pull/5497)
+
+#### DX12
+
+- Shader Model 6.1-6.7 detection. By @atlv24 in [#5498](https://github.com/gfx-rs/wgpu/pull/5498)
+
+### Other performance improvements
+
+- Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
+- Use memory pooling for UsageScopes to avoid frequent large allocations. By @robtfm in [#5414](https://github.com/gfx-rs/wgpu/pull/5414)
+- Eager release of GPU resources now comes from `device.trackers`. By @bradwerth in [#5075](https://github.com/gfx-rs/wgpu/pull/5075)
+- Support disabling zero-initialization of workgroup local memory in compute shaders. By @DJMcNab in [#5508](https://github.com/gfx-rs/wgpu/pull/5508)
+
+### Documentation
+
+- Improved `wgpu_hal` documentation. By @jimblandy in [#5516](https://github.com/gfx-rs/wgpu/pull/5516), [#5524](https://github.com/gfx-rs/wgpu/pull/5524), [#5562](https://github.com/gfx-rs/wgpu/pull/5562), [#5563](https://github.com/gfx-rs/wgpu/pull/5563), [#5566](https://github.com/gfx-rs/wgpu/pull/5566), [#5617](https://github.com/gfx-rs/wgpu/pull/5617), [#5618](https://github.com/gfx-rs/wgpu/pull/5618)
+- Add mention of primitive restart in the description of `PrimitiveState::strip_index_format`. By @cpsdqs in [#5350](https://github.com/gfx-rs/wgpu/pull/5350)
+- Document and tweak precise behaviour of `SourceLocation`. By @stefnotch in [#5386](https://github.com/gfx-rs/wgpu/pull/5386) and [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
+- Give short example of WGSL `push_constant` syntax. By @waywardmonkeys in [#5393](https://github.com/gfx-rs/wgpu/pull/5393)
+- Fix incorrect documentation of `Limits::max_compute_workgroup_storage_size` default value. By @atlv24 in [#5601](https://github.com/gfx-rs/wgpu/pull/5601)
+
### Bug Fixes

#### General
@@ -143,48 +217,54 @@ Bottom level categories:
- Fix panic when creating a surface while no backend is available. By @wumpf [#5166](https://github.com/gfx-rs/wgpu/pull/5166)
- Correctly compute minimum buffer size for array-typed `storage` and `uniform` vars. By @jimblandy [#5222](https://github.com/gfx-rs/wgpu/pull/5222)
- Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200)
-- Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
-- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
- Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244)
-- Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
- Fix linking when targeting android. By @ashdnazg in [#5326](https://github.com/gfx-rs/wgpu/pull/5326).
-- fix resource leak for buffers/textures dropped while having pending writes. By @robtfm in [#5413](https://github.com/gfx-rs/wgpu/pull/5413)
- Failing to set the device lost closure will call the closure before returning. By @bradwerth in [#5358](https://github.com/gfx-rs/wgpu/pull/5358).
-- Use memory pooling for UsageScopes to avoid frequent large allocations. by @robtfm in [#5414](https://github.com/gfx-rs/wgpu/pull/5414)
- Fix deadlocks caused by recursive read-write lock acquisitions [#5426](https://github.com/gfx-rs/wgpu/pull/5426).
+- Remove exposed C symbols (`extern "C"` + `#[no_mangle]`) from RenderPass & ComputePass recording. By @wumpf in [#5409](https://github.com/gfx-rs/wgpu/pull/5409).
+- Fix surfaces being only compatible with the first backend enabled on an instance, causing failures when manually specifying an adapter. By @Wumpf in [#5535](https://github.com/gfx-rs/wgpu/pull/5535).

#### Naga

-- In spv-in, remove unnecessary "gl_PerVertex" name check so unused builtins will always be skipped. By @Imberflur in [#5227](https://github.com/gfx-rs/wgpu/pull/5227).
-- GLSL 410 does not support layout(binding = ...), enable only for GLSL 420. By @bes in [#5357](https://github.com/gfx-rs/wgpu/pull/5357)
+
+- In spv-in, remove unnecessary "gl_PerVertex" name check so unused builtins will always be skipped. Prevents validation errors caused by capability requirements of these builtins [#4915](https://github.com/gfx-rs/wgpu/issues/4915). By @Imberflur in [#5227](https://github.com/gfx-rs/wgpu/pull/5227).
- In spv-out, check for acceleration and ray-query types when enabling ray-query extension to prevent validation error. By @Vecvec in [#5463](https://github.com/gfx-rs/wgpu/pull/5463)
- Add a limit for curly brace nesting in WGSL parsing, plus a note about stack size requirements. By @ErichDonGubler in [#5447](https://github.com/gfx-rs/wgpu/pull/5447).
+- In hlsl-out, fix accesses on zero value expressions by generating helper functions for `Expression::ZeroValue`. By @Imberflur in [#5587](https://github.com/gfx-rs/wgpu/pull/5587).
+- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
+- Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
+- Fix `TypeInner::scalar_width` to be consistent with the rest of the codebase and return values in bytes, not bits. By @atlv24 in [#5532](https://github.com/gfx-rs/wgpu/pull/5532).

-#### Tests
-
-- Fix intermittent crashes on Linux in the `multithreaded_compute` test. By @jimblandy in [#5129](https://github.com/gfx-rs/wgpu/pull/5129).
-- Refactor tests to read feature flags by name instead of a hardcoded hexadecimal u64. By @rodolphito in [#5155](https://github.com/gfx-rs/wgpu/pull/5155).
-- Add test that verifies that we can drop the queue before using the device to create a command encoder. By @Davidster in [#5211](https://github.com/gfx-rs/wgpu/pull/5211)
-
-#### GLES
+#### GLES / OpenGL

+- GLSL 410 does not support layout(binding = ...); enable it only for GLSL 420. By @bes in [#5357](https://github.com/gfx-rs/wgpu/pull/5357)
- Fixes for being able to use an OpenGL 4.1 core context provided by macOS with wgpu. By @bes in [#5331](https://github.com/gfx-rs/wgpu/pull/5331).
-- Don't create a program for shader-clearing if that workaround isn't required. By @Dinnerbone in [#5348](https://github.com/gfx-rs/wgpu/pull/5348).
- Fix crash when holding multiple devices on wayland/surfaceless. By @ashdnazg in [#5351](https://github.com/gfx-rs/wgpu/pull/5351).
-- Don't depend on bind group and bind group layout entry order in HAL. This caused incorrect severely incorrect command execution and, in some cases, crashes. By @ErichDonGubler in [#5421](https://github.com/gfx-rs/wgpu/pull/5421).
+- Fix `first_instance` being ignored in indexed draws when the `ARB_shader_draw_parameters` feature is present and `base_vertex` is 0. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)

#### Vulkan

- Set object labels when the DEBUG flag is set, even if the VALIDATION flag is disabled. By @DJMcNab in [#5345](https://github.com/gfx-rs/wgpu/pull/5345).
+- Add safety check to `wgpu_hal::vulkan::CommandEncoder` to make sure `discard_encoding` is not called in the closed state. By @villuna in [#5557](https://github.com/gfx-rs/wgpu/pull/5557)
+- Fix SPIR-V type capability requests to not depend on `LocalType` caching. By @atlv24 in [#5590](https://github.com/gfx-rs/wgpu/pull/5590)

-#### Metal
+#### Tests

-- Don't depend on bind group and bind group layout entry order in HAL. This caused incorrect severely incorrect command execution and, in some cases, crashes. By @ErichDonGubler in [#5421](https://github.com/gfx-rs/wgpu/pull/5421).
-- Metal 3.0 and 3.1 detection. By @atlv24 in [#5497](https://github.com/gfx-rs/wgpu/pull/5497)
+- Fix intermittent crashes on Linux in the `multithreaded_compute` test. By @jimblandy in [#5129](https://github.com/gfx-rs/wgpu/pull/5129).
+- Refactor tests to read feature flags by name instead of a hardcoded hexadecimal u64. By @atlv24 in [#5155](https://github.com/gfx-rs/wgpu/pull/5155).
+- Add test that verifies that we can drop the queue before using the device to create a command encoder. By @Davidster in [#5211](https://github.com/gfx-rs/wgpu/pull/5211)

-#### DX12
+## v0.19.4 (2024-04-17)
+
+### Bug Fixes

-- Don't depend on bind group and bind group layout entry order in HAL. This caused incorrect severely incorrect command execution and, in some cases, crashes. By @ErichDonGubler in [#5421](https://github.com/gfx-rs/wgpu/pull/5421).
-- Shader Model 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, and 6.7 detection. By @atlv24 in [#5498](https://github.com/gfx-rs/wgpu/pull/5498)
+#### General
+
+- Don't depend on bind group and bind group layout entry order in backends. This caused severely incorrect command execution and, in some cases, crashes. By @ErichDonGubler in [#5421](https://github.com/gfx-rs/wgpu/pull/5421).
+- Properly clean up all `write_buffer`/`write_texture` temporary resources. By @robtfm in [#5413](https://github.com/gfx-rs/wgpu/pull/5413).
+- Fix deadlock in certain situations when mapping buffers using `wgpu-profiler`. By @cwfitzgerald in [#5517](https://github.com/gfx-rs/wgpu/pull/5517)
+
+#### WebGPU
+
+- Correctly pass through timestamp queries to WebGPU. 
By @cwfitzgerald in [#5527](https://github.com/gfx-rs/wgpu/pull/5527). ## v0.19.3 (2024-03-01) diff --git a/Cargo.lock b/Cargo.lock index 71bc0211e7..9d2c2baa56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "ab_glyph" -version = "0.2.24" +version = "0.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e08104bebc65a46f8bc7aa733d39ea6874bfa7156f41a46b805785e3af1587d" +checksum = "6f90148830dac590fac7ccfe78ec4a8ea404c60f75a24e16407a71f0f40de775" dependencies = [ "ab_glyph_rasterizer", "owned_ttf_parser", @@ -57,9 +57,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.16" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" [[package]] name = "android-activity" @@ -153,9 +153,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" [[package]] name = "arbitrary" @@ -185,7 +185,7 @@ dependencies = [ "argh_shared", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -235,13 +235,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.79" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507401cad91ec6a857ed5513a2073c82a9b9048762b885bb98655b306964681" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -384,7 +384,7 @@ checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -447,12 +447,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.92" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" +checksum = "d32a725bc159af97c3e629873bb9f88fb8cf8a4867175f76dc987815ea07c83b" dependencies = [ "jobserver", "libc", + "once_cell", ] [[package]] @@ -511,9 +512,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.0" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c21025abd42669a92efc996ef13cfb2c5c627858421ea58d5c3b331a6c134f" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", "clap_derive", @@ -521,9 +522,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.0" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458bf1f341769dfcf849846f65dffdf9146daa56bcd2a47cb4e1de9915567c99" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", @@ -533,14 +534,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.0" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +checksum = 
"528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -659,9 +660,9 @@ dependencies = [ [[package]] name = "combine" -version = "4.6.6" +version = "4.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" dependencies = [ "bytes", "memchr", @@ -669,9 +670,9 @@ dependencies = [ [[package]] name = "concurrent-queue" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d16048cd947b08fa32c24458a22f5dc5e835264f689f4f5653210c69fd107363" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" dependencies = [ "crossbeam-utils", ] @@ -880,12 +881,12 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "ctor" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad291aa74992b9b7a7e88c38acbbf6ad7e107f1d90ee8775b7bc1fc3394f485c" +checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -916,7 +917,7 @@ checksum = "96a6ac251f4a2aca6b3f91340350eab87ae57c3f127ffeb585e92bd336717991" [[package]] name = "d3d12" -version = "0.19.0" +version = "0.20.0" dependencies = [ "bitflags 2.5.0", "libloading 0.8.3", @@ -960,9 +961,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" +checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2" [[package]] name = "debugid" @@ -1032,15 +1033,15 @@ dependencies = [ "quote", "strum", "strum_macros", - "syn 2.0.58", + "syn 2.0.60", "thiserror", ] [[package]] name = "deno_unsync" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30dff7e03584dbae188dae96a0f1876740054809b2ad0cf7c9fc5d361f20e739" +checksum = "e3d79c7af81e0a5ac75cff7b2fff4d1896e2bff694c688258edf21ef8a519736" dependencies = [ "tokio", ] @@ -1080,7 +1081,7 @@ name = "deno_webgpu" version = "0.110.0" dependencies = [ "deno_core", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "serde", "tokio", "wgpu-core", @@ -1105,7 +1106,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1173,15 +1174,15 @@ dependencies = [ [[package]] name = "either" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" [[package]] name = "encase" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ed933078d2e659745df651f4c180511cd582e5b9414ff896e7d50d207e3103" +checksum = "5a9299a95fa5671ddf29ecc22b00e121843a65cb9ff24911e394b4ae556baf36" dependencies = [ "const_panic", "encase_derive", @@ -1191,22 +1192,22 @@ dependencies = [ [[package]] name = "encase_derive" -version = "0.7.0" +version = "0.8.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4ce1449c7d19eba6cc0abd231150ad81620a8dce29601d7f8d236e5d431d72a" +checksum = "07e09decb3beb1fe2db6940f598957b2e1f7df6206a804d438ff6cb2a9cddc10" dependencies = [ "encase_derive_impl", ] [[package]] name = "encase_derive_impl" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92959a9e8d13eaa13b8ae8c7b583c3bf1669ca7a8e7708a088d12587ba86effc" +checksum = "fd31dbbd9743684d339f907a87fe212cb7b51d75b9e8e74181fe363199ee9b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1269,9 +1270,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "fdeflate" @@ -1299,9 +1300,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.28" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +checksum = "4556222738635b7a3417ae6130d8f52201e45a0c4d1a907f0826383adb5f85e7" dependencies = [ "crc32fast", "miniz_oxide", @@ -1352,7 +1353,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1477,7 +1478,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -1552,9 +1553,9 @@ dependencies = [ [[package]] name = "glam" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "151665d9be52f9bb40fc7966565d39666f2d1e69233571b71b87791c7e0528b3" +checksum = "9e05e7e6723e3455f4818c7b26e855439f7546cf617ef669d1adedb8669e5cb9" [[package]] name = "glow" @@ -1676,9 +1677,9 @@ dependencies = [ [[package]] name = "gpu-descriptor" -version = "0.2.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc11df1ace8e7e564511f53af41f3e42ddc95b56fd07b3f4445d2a6048bc682c" +checksum = "9c08c1f623a8d0b722b8b99f821eb0ba672a1618f0d3b16ddbee1cedd2dd8557" dependencies = [ "bitflags 2.5.0", "gpu-descriptor-types", @@ -1687,9 +1688,9 @@ dependencies = [ [[package]] name = "gpu-descriptor-types" -version = "0.1.2" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf0b36e6f090b7e1d8a4b49c0cb81c1f8376f72198c65dd3ad9ff3556b8b78c" +checksum = "fdf242682df893b86f33a73828fb09ca4b2d3bb6cc95249707fc684d27484b91" dependencies = [ "bitflags 2.5.0", ] @@ -1715,9 +1716,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -1744,6 +1745,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = 
"0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.9" @@ -1899,9 +1906,9 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "jobserver" -version = "0.1.28" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" dependencies = [ "libc", ] @@ -1981,7 +1988,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -2020,9 +2027,9 @@ checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -2087,8 +2094,9 @@ dependencies = [ [[package]] name = "metal" -version = "0.27.0" -source = "git+https://github.com/gfx-rs/metal-rs?rev=ff8fd3d6dc7792852f8a015458d7e6d42d7fb352#ff8fd3d6dc7792852f8a015458d7e6d42d7fb352" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5637e166ea14be6063a3f8ba5ccb9a4159df7d8f6d61c02fc3d480b1f90dcfcb" dependencies = [ "bitflags 2.5.0", "block", @@ -2123,7 +2131,7 @@ dependencies = [ [[package]] name = "naga" -version = "0.19.2" +version = "0.20.0" dependencies = [ "arbitrary", "arrayvec 0.7.4", @@ -2153,8 +2161,9 @@ dependencies = [ [[package]] name = "naga-cli" -version = "0.19.0" +version = "0.20.0" dependencies = [ + "anyhow", "argh", "bincode", "codespan-reporting", @@ -2221,7 +2230,7 @@ dependencies = [ "log", "ndk-sys 0.5.0+25.2.9519653", "num_enum 0.7.2", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "thiserror", ] @@ -2392,7 +2401,7 @@ dependencies = [ "proc-macro-crate 3.1.0", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -2430,9 +2439,9 @@ dependencies = [ [[package]] name = "objc-sys" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c71324e4180d0899963fc83d9d241ac39e699609fc1025a850aadac8257459" +checksum = "da284c198fb9b7b0603f8635185e85fbd5b64ee154b1ed406d489077de2d6d60" [[package]] name = "objc2" @@ -2518,9 +2527,9 @@ checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" dependencies = [ "lock_api", "parking_lot_core", @@ -2528,18 +2537,18 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ 
"backtrace", "cfg-if", "libc", "petgraph", - "redox_syscall 0.4.1", + "redox_syscall 0.5.1", "smallvec", "thread-id", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -2587,7 +2596,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -2610,11 +2619,11 @@ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "player" -version = "0.19.3" +version = "0.20.0" dependencies = [ "env_logger", "log", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "ron", "serde", "wgpu-core", @@ -2665,9 +2674,9 @@ dependencies = [ [[package]] name = "polling" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c976a60b2d7e99d6f229e414670a9b85d13ac305cc6d1e9c134de58c5aaaf6" +checksum = "645493cf344456ef24219d02a768cf1fb92ddf8c92161679ae3d91b91a637be3" dependencies = [ "cfg-if", "concurrent-queue", @@ -2726,7 +2735,7 @@ checksum = "07c277e4e643ef00c1233393c673f655e3672cf7eb3ba08a00bdd0ea59139b5f" dependencies = [ "proc-macro-rules-macros", "proc-macro2", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -2738,14 +2747,14 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] @@ -2767,9 +2776,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -2821,9 +2830,9 @@ checksum = "f2ff9a1f06a88b01621b7ae906ef0211290d1c8a168a15542486a8f61c0833b9" [[package]] name = "raw-window-handle" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42a9830a0e1b9fb145ebb365b8bc4ccd75f290f98c0247deafbbe2c75cefb544" +checksum = "8cc3bcbdb1ddfc11e700e62968e6b4cc9c75bb466464ad28fb61c5b2c964418b" [[package]] name = "rayon" @@ -2863,6 +2872,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" +dependencies = [ + "bitflags 2.5.0", +] + [[package]] name = "regex" version = "1.10.4" @@ -2951,9 +2969,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ "bitflags 2.5.0", "errno", @@ -3052,29 +3070,29 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.197" +version = "1.0.199" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +checksum = 
"0c9f6e76df036c77cd94996771fb40db98187f096dd0b9af39c6c6e452ba966a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.197" +version = "1.0.199" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +checksum = "11bd257a6541e141e42ca6d24ae26f7714887b47e89aa739099104c7e4d3b7fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] name = "serde_json" -version = "1.0.115" +version = "1.0.116" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" +checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" dependencies = [ "indexmap", "itoa", @@ -3129,9 +3147,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -3321,11 +3339,11 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3341,9 +3359,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.58" +version = "2.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" dependencies = [ "proc-macro2", "quote", @@ -3361,22 +3379,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3500,7 +3518,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -3629,9 +3647,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" [[package]] name = "unicode-xid" @@ -3750,7 +3768,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", "wasm-bindgen-shared", ] @@ -3784,7 +3802,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", 
"wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3817,7 +3835,7 @@ checksum = "b7f89739351a2e03cb94beb799d47fb2cac01759b40ec441f7de39b00cbf7ef0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] @@ -4034,7 +4052,7 @@ dependencies = [ [[package]] name = "wgpu" -version = "0.19.3" +version = "0.20.0" dependencies = [ "arrayvec 0.7.4", "cfg-if", @@ -4045,7 +4063,7 @@ dependencies = [ "naga", "parking_lot", "profiling", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "serde", "smallvec", "static_assertions", @@ -4059,14 +4077,13 @@ dependencies = [ [[package]] name = "wgpu-core" -version = "0.19.3" +version = "0.20.0" dependencies = [ "arrayvec 0.7.4", "bit-vec", "bitflags 2.5.0", "bytemuck", "cfg_aliases", - "codespan-reporting", "document-features", "indexmap", "log", @@ -4074,7 +4091,7 @@ dependencies = [ "once_cell", "parking_lot", "profiling", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "ron", "rustc-hash", "serde", @@ -4087,7 +4104,7 @@ dependencies = [ [[package]] name = "wgpu-examples" -version = "0.19.3" +version = "0.20.0" dependencies = [ "bytemuck", "cfg-if", @@ -4120,7 +4137,7 @@ dependencies = [ [[package]] name = "wgpu-hal" -version = "0.19.3" +version = "0.20.0" dependencies = [ "android_system_properties", "arrayvec 0.7.4", @@ -4144,7 +4161,7 @@ dependencies = [ "js-sys", "khronos-egl", "libc", - "libloading 0.7.4", + "libloading 0.8.3", "log", "metal", "naga", @@ -4154,7 +4171,7 @@ dependencies = [ "parking_lot", "profiling", "range-alloc", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "renderdoc-sys", "rustc-hash", "smallvec", @@ -4168,7 +4185,7 @@ dependencies = [ [[package]] name = "wgpu-info" -version = "0.19.3" +version = "0.20.0" dependencies = [ "anyhow", "bitflags 2.5.0", @@ -4182,16 +4199,16 @@ dependencies = [ [[package]] name = "wgpu-macros" -version = "0.19.3" +version = "0.20.0" dependencies = [ - "heck", + "heck 0.5.0", "quote", - "syn 2.0.58", + "syn 2.0.60", ] [[package]] name = "wgpu-test" -version = "0.19.3" +version = "0.20.0" dependencies = [ "anyhow", "arrayvec 0.7.4", @@ -4203,7 +4220,7 @@ dependencies = [ "env_logger", "futures-lite", "glam", - "heck", + "heck 0.5.0", "image", "js-sys", "libtest-mimic", @@ -4214,7 +4231,7 @@ dependencies = [ "png", "pollster", "profiling", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "serde", "serde_json", "wasm-bindgen", @@ -4228,7 +4245,7 @@ dependencies = [ [[package]] name = "wgpu-types" -version = "0.19.2" +version = "0.20.0" dependencies = [ "bitflags 2.5.0", "js-sys", @@ -4274,11 +4291,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" dependencies = [ - "winapi", + "windows-sys 0.52.0", ] [[package]] @@ -4294,7 +4311,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -4303,7 +4320,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + 
"windows-targets 0.52.5", ] [[package]] @@ -4343,7 +4360,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -4378,17 +4395,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -4405,9 +4423,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -4429,9 +4447,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -4453,9 +4471,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -4477,9 +4501,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -4501,9 +4525,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" 
[[package]] name = "windows_x86_64_gnullvm" @@ -4519,9 +4543,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -4543,9 +4567,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "winit" @@ -4607,7 +4631,7 @@ dependencies = [ "once_cell", "orbclient", "percent-encoding", - "raw-window-handle 0.6.0", + "raw-window-handle 0.6.1", "redox_syscall 0.3.5", "rustix", "sctk-adwaita 0.8.1", @@ -4726,5 +4750,5 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.58", + "syn 2.0.60", ] diff --git a/Cargo.toml b/Cargo.toml index c992222cf4..fbc0dba87c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,30 +45,30 @@ keywords = ["graphics"] license = "MIT OR Apache-2.0" homepage = "https://wgpu.rs/" repository = "https://github.com/gfx-rs/wgpu" -version = "0.19.3" +version = "0.20.0" authors = ["gfx-rs developers"] [workspace.dependencies.wgc] package = "wgpu-core" path = "./wgpu-core" -version = "0.19.3" +version = "0.20.0" [workspace.dependencies.wgt] package = "wgpu-types" path = "./wgpu-types" -version = "0.19.2" +version = "0.20.0" [workspace.dependencies.hal] package = "wgpu-hal" path = "./wgpu-hal" -version = "0.19.3" +version = "0.20.0" [workspace.dependencies.naga] path = "./naga" -version = "0.19.2" +version = "0.20.0" [workspace.dependencies] -anyhow = "1.0" +anyhow = "1.0.23" arrayvec = "0.7" bit-vec = "0.6" bitflags = "2" @@ -78,14 +78,14 @@ cfg-if = "1" codespan-reporting = "0.11" ctor = "0.2" document-features = "0.2.8" -encase = "0.7" +encase = "0.8" env_logger = "0.11" fern = "0.6" flume = "0.11" futures-lite = "2" getrandom = "0.2" -glam = "0.25" -heck = "0.4.0" +glam = "0.27" +heck = "0.5.0" image = { version = "0.24", default-features = false, features = ["png"] } ktx2 = "0.3" libc = "0.2" @@ -114,29 +114,29 @@ renderdoc-sys = "1.1.0" ron = "0.8" rustc-hash = "1.1.0" serde = "1" -serde_json = "1.0.115" +serde_json = "1.0.116" smallvec = "1" static_assertions = "1.1.0" thiserror = "1" -wgpu = { version = "0.19.3", path = "./wgpu" } -wgpu-core = { version = "0.19.3", path = "./wgpu-core" } -wgpu-example = { version = "0.19.0", path = "./examples/common" } -wgpu-macros = { version = "0.19.0", path = "./wgpu-macros" } -wgpu-test = { version = "0.19.0", path = "./tests" } -wgpu-types = { version = "0.19.2", path = "./wgpu-types" } +wgpu = { version = "0.20.0", path = "./wgpu" } +wgpu-core = { version = "0.20.0", path = "./wgpu-core" } +wgpu-example = { version = "0.20.0", path = "./examples/common" } +wgpu-macros = { version = "0.20.0", path = "./wgpu-macros" } +wgpu-test = { version = "0.20.0", path = "./tests" } +wgpu-types = { version = "0.20.0", path = "./wgpu-types" } winit = { version = "0.29", features = ["android-native-activity"] } # Metal dependencies block = "0.1" 
core-graphics-types = "0.1" -metal = { version = "0.27.0", git = "https://github.com/gfx-rs/metal-rs", rev = "ff8fd3d6dc7792852f8a015458d7e6d42d7fb352" } +metal = { version = "0.28.0" } objc = "0.2.5" # Vulkan dependencies android_system_properties = "0.1.1" ash = "0.37.3" gpu-alloc = "0.6" -gpu-descriptor = "0.2" +gpu-descriptor = "0.3" # DX dependencies bit-set = "0.5" @@ -144,7 +144,7 @@ gpu-allocator = { version = "0.25", default_features = false, features = [ "d3d12", "public-winapi", ] } -d3d12 = { version = "0.7.0", path = "./d3d12/" } +d3d12 = { version = "0.20.0", path = "./d3d12/" } range-alloc = "0.1" winapi = "0.3" hassle-rs = "0.11.0" diff --git a/README.md b/README.md index bc0f01b302..c1635042f0 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ To run a given set of tests: ``` # Must be inside the `cts` folder we just checked out, else this will fail -cargo run --manifest-path ../Cargo.toml --bin cts_runner -- ./tools/run_deno --verbose "" +cargo run --manifest-path ../Cargo.toml -p cts_runner --bin cts_runner -- ./tools/run_deno --verbose "" ``` To find the full list of tests, go to the [online cts viewer](https://gpuweb.github.io/cts/standalone/?runnow=0&worker=0&debug=0&q=webgpu:*). diff --git a/d3d12/Cargo.toml b/d3d12/Cargo.toml index 44f5dc35e2..2c3f721525 100644 --- a/d3d12/Cargo.toml +++ b/d3d12/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "d3d12" -version = "0.19.0" +version = "0.20.0" authors = ["gfx-rs developers"] description = "Low level D3D12 API wrapper" repository = "https://github.com/gfx-rs/wgpu/tree/trunk/d3d12" diff --git a/deno_webgpu/01_webgpu.js b/deno_webgpu/01_webgpu.js index f1916e81ee..369d1cd9b9 100644 --- a/deno_webgpu/01_webgpu.js +++ b/deno_webgpu/01_webgpu.js @@ -92,7 +92,7 @@ const { ArrayBuffer, ArrayBufferPrototypeGetByteLength, ArrayIsArray, - ArrayPrototypeFilter, + ArrayPrototypeFindLast, ArrayPrototypeMap, ArrayPrototypePop, ArrayPrototypePush, @@ -103,12 +103,9 @@ const { ObjectHasOwn, ObjectPrototypeIsPrototypeOf, Promise, - PromisePrototypeCatch, - PromisePrototypeThen, PromiseReject, PromiseResolve, SafeArrayIterator, - SafePromiseAll, SafeSet, SafeWeakRef, SetPrototypeHas, @@ -908,7 +905,7 @@ function GPUObjectBaseMixin(name, type) { /** * @typedef ErrorScope * @property {string} filter - * @property {Promise[]} operations + * @property {GPUError[]} errors */ /** @@ -964,114 +961,47 @@ class InnerGPUDevice { ArrayPrototypePush(this.resources, new SafeWeakRef(resource)); } - /** @param {{ type: string, value: string | null } | undefined} err */ - pushError(err) { - this.pushErrorPromise(PromiseResolve(err)); - } - - /** @param {Promise<{ type: string, value: string | null } | undefined>} promise */ - pushErrorPromise(promise) { - const operation = PromisePrototypeThen(promise, (err) => { - if (err) { - switch (err.type) { - case "lost": - this.isLost = true; - this.resolveLost( - createGPUDeviceLostInfo(undefined, "device was lost"), - ); - break; - case "validation": - return PromiseReject( - new GPUValidationError(err.value ?? 
"validation error"), - ); - case "out-of-memory": - return PromiseReject(new GPUOutOfMemoryError()); - case "internal": - return PromiseReject(new GPUInternalError()); - } - } - }); + // Ref: https://gpuweb.github.io/gpuweb/#abstract-opdef-dispatch-error + /** @param {{ type: string, value: string | null } | undefined} error */ + pushError(error) { + if (!error) { + return; + } - const validationStack = ArrayPrototypeFilter( - this.errorScopeStack, - ({ filter }) => filter == "validation", - ); - const validationScope = validationStack[validationStack.length - 1]; - const validationFilteredPromise = PromisePrototypeCatch( - operation, - (err) => { - if (ObjectPrototypeIsPrototypeOf(GPUValidationErrorPrototype, err)) { - return PromiseReject(err); - } - return PromiseResolve(); - }, - ); - if (validationScope) { - ArrayPrototypePush( - validationScope.operations, - validationFilteredPromise, - ); - } else { - PromisePrototypeCatch(validationFilteredPromise, (err) => { - this.device.dispatchEvent( - new GPUUncapturedErrorEvent("uncapturederror", { - error: err, - }), + let constructedError; + switch (error.type) { + case "lost": + this.isLost = true; + this.resolveLost( + createGPUDeviceLostInfo(undefined, "device was lost"), ); - }); + return; + case "validation": + constructedError = new GPUValidationError(error.value ?? "validation error"); + break; + case "out-of-memory": + constructedError = new GPUOutOfMemoryError(); + break; + case "internal": + constructedError = new GPUInternalError(); + break; } - // prevent uncaptured promise rejections - PromisePrototypeCatch(validationFilteredPromise, (_err) => {}); - const oomStack = ArrayPrototypeFilter( - this.errorScopeStack, - ({ filter }) => filter == "out-of-memory", - ); - const oomScope = oomStack[oomStack.length - 1]; - const oomFilteredPromise = PromisePrototypeCatch(operation, (err) => { - if (ObjectPrototypeIsPrototypeOf(GPUOutOfMemoryErrorPrototype, err)) { - return PromiseReject(err); - } - return PromiseResolve(); - }); - if (oomScope) { - ArrayPrototypePush(oomScope.operations, oomFilteredPromise); - } else { - PromisePrototypeCatch(oomFilteredPromise, (err) => { - this.device.dispatchEvent( - new GPUUncapturedErrorEvent("uncapturederror", { - error: err, - }), - ); - }); + if (this.isLost) { + return; } - // prevent uncaptured promise rejections - PromisePrototypeCatch(oomFilteredPromise, (_err) => {}); - const internalStack = ArrayPrototypeFilter( + const scope = ArrayPrototypeFindLast( this.errorScopeStack, - ({ filter }) => filter == "internal", + ({ filter }) => filter === error.type, ); - const internalScope = internalStack[internalStack.length - 1]; - const internalFilteredPromise = PromisePrototypeCatch(operation, (err) => { - if (ObjectPrototypeIsPrototypeOf(GPUInternalErrorPrototype, err)) { - return PromiseReject(err); - } - return PromiseResolve(); - }); - if (internalScope) { - ArrayPrototypePush(internalScope.operations, internalFilteredPromise); + if (scope) { + scope.errors.push(constructedError); } else { - PromisePrototypeCatch(internalFilteredPromise, (err) => { - this.device.dispatchEvent( - new GPUUncapturedErrorEvent("uncapturederror", { - error: err, - }), - ); - }); + this.device.dispatchEvent(new GPUUncapturedErrorEvent("uncapturederror", { + error: constructedError, + })); } - // prevent uncaptured promise rejections - PromisePrototypeCatch(internalFilteredPromise, (_err) => {}); } } @@ -1359,11 +1289,6 @@ class GPUDevice extends EventTarget { const resource = entry.resource; if 
(ObjectPrototypeIsPrototypeOf(GPUSamplerPrototype, resource)) { const rid = assertResource(resource, prefix, context); - assertDeviceMatch(device, resource, { - prefix, - resourceContext: context, - selfContext: "this", - }); return { binding: entry.binding, kind: "GPUSampler", @@ -1374,11 +1299,6 @@ class GPUDevice extends EventTarget { ) { const rid = assertResource(resource, prefix, context); assertResource(resource[_texture], prefix, context); - assertDeviceMatch(device, resource[_texture], { - prefix, - resourceContext: context, - selfContext: "this", - }); return { binding: entry.binding, kind: "GPUTextureView", @@ -1388,11 +1308,6 @@ class GPUDevice extends EventTarget { // deno-lint-ignore prefer-primordials const rid = assertResource(resource.buffer, prefix, context); // deno-lint-ignore prefer-primordials - assertDeviceMatch(device, resource.buffer, { - prefix, - resourceContext: context, - selfContext: "this", - }); return { binding: entry.binding, kind: "GPUBufferBinding", @@ -1856,7 +1771,7 @@ class GPUDevice extends EventTarget { webidl.requiredArguments(arguments.length, 1, prefix); filter = webidl.converters.GPUErrorFilter(filter, prefix, "Argument 1"); const device = assertDevice(this, prefix, "this"); - ArrayPrototypePush(device.errorScopeStack, { filter, operations: [] }); + ArrayPrototypePush(device.errorScopeStack, { filter, errors: [] }); } /** @@ -1877,12 +1792,7 @@ class GPUDevice extends EventTarget { "OperationError", ); } - const operations = SafePromiseAll(scope.operations); - return PromisePrototypeThen( - operations, - () => PromiseResolve(null), - (err) => PromiseResolve(err), - ); + return PromiseResolve(scope.errors[0] ?? null); } [SymbolFor("Deno.privateCustomInspect")](inspect, inspectOptions) { @@ -2284,19 +2194,15 @@ class GPUBuffer { this[_mapMode] = mode; this[_state] = "pending"; - const promise = PromisePrototypeThen( - op_webgpu_buffer_get_map_async( - bufferRid, - device.rid, - mode, - offset, - rangeSize, - ), - ({ err }) => err, + const { err } = await op_webgpu_buffer_get_map_async( + bufferRid, + device.rid, + mode, + offset, + rangeSize, ); - device.pushErrorPromise(promise); - const err = await promise; if (err) { + device.pushError(err); throw new DOMException("validation error occurred", "OperationError"); } this[_state] = "mapped"; diff --git a/deno_webgpu/Cargo.toml b/deno_webgpu/Cargo.toml index 586eb90c85..cf05e00f96 100644 --- a/deno_webgpu/Cargo.toml +++ b/deno_webgpu/Cargo.toml @@ -24,7 +24,15 @@ raw-window-handle = { workspace = true } [target.'cfg(not(target_arch = "wasm32"))'.dependencies.wgpu-core] workspace = true -features = ["raw-window-handle", "trace", "replay", "serde", "strict_asserts", "wgsl", "gles"] +features = [ + "raw-window-handle", + "trace", + "replay", + "serde", + "strict_asserts", + "wgsl", + "gles", +] # We want the wgpu-core Metal backend on macOS and iOS. 
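Note on the `deno_webgpu/01_webgpu.js` hunks above: error handling moves from tracking a promise per operation to the spec's synchronous dispatch-error algorithm. An error is constructed eagerly, captured by the innermost scope whose filter matches its type, and `popErrorScope` now resolves with the first captured error (or `null`). The sketch below is a minimal Rust model of that bookkeeping, written only to illustrate the control flow; the types and names are illustrative, not wgpu or Deno APIs, and it omits spec details such as device-lost handling.

```rust
// Minimal model of the new error-scope bookkeeping (illustrative only).
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
enum GpuError {
    Validation(String),
    OutOfMemory,
    Internal,
}

struct ErrorScope {
    filter: &'static str,
    errors: Vec<GpuError>,
}

struct Device {
    scopes: Vec<ErrorScope>,
}

impl Device {
    fn push_error_scope(&mut self, filter: &'static str) {
        self.scopes.push(ErrorScope { filter, errors: Vec::new() });
    }

    // Walk the stack from the top; the innermost scope with a matching
    // filter captures the error, otherwise it is "uncaptured".
    fn dispatch_error(&mut self, kind: &'static str, error: GpuError) {
        match self.scopes.iter_mut().rev().find(|s| s.filter == kind) {
            Some(scope) => scope.errors.push(error),
            None => eprintln!("uncapturederror: {error:?}"),
        }
    }

    // Like the rewritten popErrorScope: resolve with the first captured
    // error, if any.
    fn pop_error_scope(&mut self) -> Option<GpuError> {
        self.scopes.pop().and_then(|s| s.errors.into_iter().next())
    }
}

fn main() {
    let mut device = Device { scopes: Vec::new() };
    device.push_error_scope("validation");
    device.dispatch_error("validation", GpuError::Validation("bad bind group".into()));
    assert!(matches!(device.pop_error_scope(), Some(GpuError::Validation(_))));
}
```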
[target.'cfg(any(target_os = "macos", target_os = "ios"))'.dependencies.wgpu-core] @@ -37,7 +45,7 @@ workspace = true features = ["dx12"] [target.'cfg(windows)'.dependencies.wgpu-hal] -version = "0.19.0" +version = "0.20.0" path = "../wgpu-hal" features = ["windows_rs"] diff --git a/deno_webgpu/compute_pass.rs b/deno_webgpu/compute_pass.rs index 65ac93d632..2cdea2c8f2 100644 --- a/deno_webgpu/compute_pass.rs +++ b/deno_webgpu/compute_pass.rs @@ -31,7 +31,7 @@ pub fn op_webgpu_compute_pass_set_pipeline( .resource_table .get::(compute_pass_rid)?; - wgpu_core::command::compute_ffi::wgpu_compute_pass_set_pipeline( + wgpu_core::command::compute_commands::wgpu_compute_pass_set_pipeline( &mut compute_pass_resource.0.borrow_mut(), compute_pipeline_resource.1, ); @@ -52,7 +52,7 @@ pub fn op_webgpu_compute_pass_dispatch_workgroups( .resource_table .get::(compute_pass_rid)?; - wgpu_core::command::compute_ffi::wgpu_compute_pass_dispatch_workgroups( + wgpu_core::command::compute_commands::wgpu_compute_pass_dispatch_workgroups( &mut compute_pass_resource.0.borrow_mut(), x, y, @@ -77,7 +77,7 @@ pub fn op_webgpu_compute_pass_dispatch_workgroups_indirect( .resource_table .get::(compute_pass_rid)?; - wgpu_core::command::compute_ffi::wgpu_compute_pass_dispatch_workgroups_indirect( + wgpu_core::command::compute_commands::wgpu_compute_pass_dispatch_workgroups_indirect( &mut compute_pass_resource.0.borrow_mut(), buffer_resource.1, indirect_offset, @@ -137,17 +137,12 @@ pub fn op_webgpu_compute_pass_set_bind_group( let dynamic_offsets_data: &[u32] = &dynamic_offsets_data[start..start + len]; - // SAFETY: the raw pointer and length are of the same slice, and that slice - // lives longer than the below function invocation. - unsafe { - wgpu_core::command::compute_ffi::wgpu_compute_pass_set_bind_group( - &mut compute_pass_resource.0.borrow_mut(), - index, - bind_group_resource.1, - dynamic_offsets_data.as_ptr(), - dynamic_offsets_data.len(), - ); - } + wgpu_core::command::compute_commands::wgpu_compute_pass_set_bind_group( + &mut compute_pass_resource.0.borrow_mut(), + index, + bind_group_resource.1, + dynamic_offsets_data, + ); Ok(WebGpuResult::empty()) } @@ -163,16 +158,11 @@ pub fn op_webgpu_compute_pass_push_debug_group( .resource_table .get::(compute_pass_rid)?; - let label = std::ffi::CString::new(group_label).unwrap(); - // SAFETY: the string the raw pointer points to lives longer than the below - // function invocation. - unsafe { - wgpu_core::command::compute_ffi::wgpu_compute_pass_push_debug_group( - &mut compute_pass_resource.0.borrow_mut(), - label.as_ptr(), - 0, // wgpu#975 - ); - } + wgpu_core::command::compute_commands::wgpu_compute_pass_push_debug_group( + &mut compute_pass_resource.0.borrow_mut(), + group_label, + 0, // wgpu#975 + ); Ok(WebGpuResult::empty()) } @@ -187,7 +177,7 @@ pub fn op_webgpu_compute_pass_pop_debug_group( .resource_table .get::(compute_pass_rid)?; - wgpu_core::command::compute_ffi::wgpu_compute_pass_pop_debug_group( + wgpu_core::command::compute_commands::wgpu_compute_pass_pop_debug_group( &mut compute_pass_resource.0.borrow_mut(), ); @@ -205,16 +195,11 @@ pub fn op_webgpu_compute_pass_insert_debug_marker( .resource_table .get::(compute_pass_rid)?; - let label = std::ffi::CString::new(marker_label).unwrap(); - // SAFETY: the string the raw pointer points to lives longer than the below - // function invocation. 
- unsafe { - wgpu_core::command::compute_ffi::wgpu_compute_pass_insert_debug_marker( - &mut compute_pass_resource.0.borrow_mut(), - label.as_ptr(), - 0, // wgpu#975 - ); - } + wgpu_core::command::compute_commands::wgpu_compute_pass_insert_debug_marker( + &mut compute_pass_resource.0.borrow_mut(), + marker_label, + 0, // wgpu#975 + ); Ok(WebGpuResult::empty()) } diff --git a/deno_webgpu/pipeline.rs b/deno_webgpu/pipeline.rs index 3031287607..e8b5a71cf0 100644 --- a/deno_webgpu/pipeline.rs +++ b/deno_webgpu/pipeline.rs @@ -113,6 +113,7 @@ pub fn op_webgpu_create_compute_pipeline( module: compute_shader_module_resource.1, entry_point: compute.entry_point.map(Cow::from), constants: Cow::Owned(compute.constants), + zero_initialize_workgroup_memory: true, }, }; let implicit_pipelines = match layout { @@ -359,6 +360,8 @@ pub fn op_webgpu_create_render_pipeline( module: fragment_shader_module_resource.1, entry_point: Some(Cow::from(fragment.entry_point)), constants: Cow::Owned(fragment.constants), + // Required to be true for WebGPU + zero_initialize_workgroup_memory: true, }, targets: Cow::Owned(fragment.targets), }) @@ -382,6 +385,8 @@ pub fn op_webgpu_create_render_pipeline( module: vertex_shader_module_resource.1, entry_point: Some(Cow::Owned(args.vertex.entry_point)), constants: Cow::Owned(args.vertex.constants), + // Required to be true for WebGPU + zero_initialize_workgroup_memory: true, }, buffers: Cow::Owned(vertex_buffers), }, diff --git a/deno_webgpu/render_pass.rs b/deno_webgpu/render_pass.rs index 11b2f22865..5a5ecdbadc 100644 --- a/deno_webgpu/render_pass.rs +++ b/deno_webgpu/render_pass.rs @@ -41,7 +41,7 @@ pub fn op_webgpu_render_pass_set_viewport( .resource_table .get::(args.render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_set_viewport( + wgpu_core::command::render_commands::wgpu_render_pass_set_viewport( &mut render_pass_resource.0.borrow_mut(), args.x, args.y, @@ -68,7 +68,7 @@ pub fn op_webgpu_render_pass_set_scissor_rect( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_set_scissor_rect( + wgpu_core::command::render_commands::wgpu_render_pass_set_scissor_rect( &mut render_pass_resource.0.borrow_mut(), x, y, @@ -90,7 +90,7 @@ pub fn op_webgpu_render_pass_set_blend_constant( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_set_blend_constant( + wgpu_core::command::render_commands::wgpu_render_pass_set_blend_constant( &mut render_pass_resource.0.borrow_mut(), &color, ); @@ -109,7 +109,7 @@ pub fn op_webgpu_render_pass_set_stencil_reference( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_set_stencil_reference( + wgpu_core::command::render_commands::wgpu_render_pass_set_stencil_reference( &mut render_pass_resource.0.borrow_mut(), reference, ); @@ -128,7 +128,7 @@ pub fn op_webgpu_render_pass_begin_occlusion_query( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_begin_occlusion_query( + wgpu_core::command::render_commands::wgpu_render_pass_begin_occlusion_query( &mut render_pass_resource.0.borrow_mut(), query_index, ); @@ -146,7 +146,7 @@ pub fn op_webgpu_render_pass_end_occlusion_query( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_end_occlusion_query( + wgpu_core::command::render_commands::wgpu_render_pass_end_occlusion_query( &mut render_pass_resource.0.borrow_mut(), ); @@ -174,15 +174,10 @@ pub fn op_webgpu_render_pass_execute_bundles( 
.resource_table .get::(render_pass_rid)?; - // SAFETY: the raw pointer and length are of the same slice, and that slice - // lives longer than the below function invocation. - unsafe { - wgpu_core::command::render_ffi::wgpu_render_pass_execute_bundles( - &mut render_pass_resource.0.borrow_mut(), - bundles.as_ptr(), - bundles.len(), - ); - } + wgpu_core::command::render_commands::wgpu_render_pass_execute_bundles( + &mut render_pass_resource.0.borrow_mut(), + &bundles, + ); Ok(WebGpuResult::empty()) } @@ -235,17 +230,12 @@ pub fn op_webgpu_render_pass_set_bind_group( let dynamic_offsets_data: &[u32] = &dynamic_offsets_data[start..start + len]; - // SAFETY: the raw pointer and length are of the same slice, and that slice - // lives longer than the below function invocation. - unsafe { - wgpu_core::command::render_ffi::wgpu_render_pass_set_bind_group( - &mut render_pass_resource.0.borrow_mut(), - index, - bind_group_resource.1, - dynamic_offsets_data.as_ptr(), - dynamic_offsets_data.len(), - ); - } + wgpu_core::command::render_commands::wgpu_render_pass_set_bind_group( + &mut render_pass_resource.0.borrow_mut(), + index, + bind_group_resource.1, + dynamic_offsets_data, + ); Ok(WebGpuResult::empty()) } @@ -261,16 +251,11 @@ pub fn op_webgpu_render_pass_push_debug_group( .resource_table .get::(render_pass_rid)?; - let label = std::ffi::CString::new(group_label).unwrap(); - // SAFETY: the string the raw pointer points to lives longer than the below - // function invocation. - unsafe { - wgpu_core::command::render_ffi::wgpu_render_pass_push_debug_group( - &mut render_pass_resource.0.borrow_mut(), - label.as_ptr(), - 0, // wgpu#975 - ); - } + wgpu_core::command::render_commands::wgpu_render_pass_push_debug_group( + &mut render_pass_resource.0.borrow_mut(), + group_label, + 0, // wgpu#975 + ); Ok(WebGpuResult::empty()) } @@ -285,7 +270,7 @@ pub fn op_webgpu_render_pass_pop_debug_group( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_pop_debug_group( + wgpu_core::command::render_commands::wgpu_render_pass_pop_debug_group( &mut render_pass_resource.0.borrow_mut(), ); @@ -303,16 +288,11 @@ pub fn op_webgpu_render_pass_insert_debug_marker( .resource_table .get::(render_pass_rid)?; - let label = std::ffi::CString::new(marker_label).unwrap(); - // SAFETY: the string the raw pointer points to lives longer than the below - // function invocation. 
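The `compute_pass.rs` and `render_pass.rs` hunks all follow one pattern: wgpu-core's `command::*_ffi` entry points (raw pointer plus length, `CString` labels, `unsafe` at every call site) are replaced by `command::*_commands` functions that take `&[u32]` slices and `&str` labels directly. The standalone sketch below shows why the slice form needs no `unsafe` on the caller's side; the signatures are simplified stand-ins, not wgpu's actual functions.

```rust
// Old shape: a raw pointer and a length the caller must guarantee match,
// so the function is `unsafe`.
unsafe fn set_bind_group_ffi(offsets_ptr: *const u32, offsets_len: usize) -> u64 {
    // SAFETY: the caller promises `offsets_ptr` points to `offsets_len` u32s.
    let offsets = std::slice::from_raw_parts(offsets_ptr, offsets_len);
    offsets.iter().map(|&o| u64::from(o)).sum()
}

// New shape: a slice carries its own length, so this is plain safe Rust.
fn set_bind_group_safe(offsets: &[u32]) -> u64 {
    offsets.iter().map(|&o| u64::from(o)).sum()
}

fn main() {
    let offsets = [256u32, 512];
    let a = unsafe { set_bind_group_ffi(offsets.as_ptr(), offsets.len()) };
    let b = set_bind_group_safe(&offsets);
    assert_eq!(a, b);
}
```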
- unsafe { - wgpu_core::command::render_ffi::wgpu_render_pass_insert_debug_marker( - &mut render_pass_resource.0.borrow_mut(), - label.as_ptr(), - 0, // wgpu#975 - ); - } + wgpu_core::command::render_commands::wgpu_render_pass_insert_debug_marker( + &mut render_pass_resource.0.borrow_mut(), + marker_label, + 0, // wgpu#975 + ); Ok(WebGpuResult::empty()) } @@ -331,7 +311,7 @@ pub fn op_webgpu_render_pass_set_pipeline( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_set_pipeline( + wgpu_core::command::render_commands::wgpu_render_pass_set_pipeline( &mut render_pass_resource.0.borrow_mut(), render_pipeline_resource.1, ); @@ -401,7 +381,7 @@ pub fn op_webgpu_render_pass_set_vertex_buffer( None }; - wgpu_core::command::render_ffi::wgpu_render_pass_set_vertex_buffer( + wgpu_core::command::render_commands::wgpu_render_pass_set_vertex_buffer( &mut render_pass_resource.0.borrow_mut(), slot, buffer_resource.1, @@ -426,7 +406,7 @@ pub fn op_webgpu_render_pass_draw( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_draw( + wgpu_core::command::render_commands::wgpu_render_pass_draw( &mut render_pass_resource.0.borrow_mut(), vertex_count, instance_count, @@ -452,7 +432,7 @@ pub fn op_webgpu_render_pass_draw_indexed( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_draw_indexed( + wgpu_core::command::render_commands::wgpu_render_pass_draw_indexed( &mut render_pass_resource.0.borrow_mut(), index_count, instance_count, @@ -479,7 +459,7 @@ pub fn op_webgpu_render_pass_draw_indirect( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_draw_indirect( + wgpu_core::command::render_commands::wgpu_render_pass_draw_indirect( &mut render_pass_resource.0.borrow_mut(), buffer_resource.1, indirect_offset, @@ -503,7 +483,7 @@ pub fn op_webgpu_render_pass_draw_indexed_indirect( .resource_table .get::(render_pass_rid)?; - wgpu_core::command::render_ffi::wgpu_render_pass_draw_indexed_indirect( + wgpu_core::command::render_commands::wgpu_render_pass_draw_indexed_indirect( &mut render_pass_resource.0.borrow_mut(), buffer_resource.1, indirect_offset, diff --git a/examples/src/boids/mod.rs b/examples/src/boids/mod.rs index 02846beeae..6c8bb6e76c 100644 --- a/examples/src/boids/mod.rs +++ b/examples/src/boids/mod.rs @@ -132,7 +132,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &draw_shader, entry_point: "main_vs", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[ wgpu::VertexBufferLayout { array_stride: 4 * 4, @@ -149,7 +149,7 @@ impl crate::framework::Example for Example { fragment: Some(wgpu::FragmentState { module: &draw_shader, entry_point: "main_fs", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState::default(), @@ -165,7 +165,7 @@ impl crate::framework::Example for Example { layout: Some(&compute_pipeline_layout), module: &compute_shader, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); // buffer for the three 2d triangle vertices of each instance diff --git a/examples/src/bunnymark/mod.rs b/examples/src/bunnymark/mod.rs index be09478071..679fc5014a 100644 --- a/examples/src/bunnymark/mod.rs +++ b/examples/src/bunnymark/mod.rs @@ -203,13 +203,13 @@ impl crate::framework::Example for Example { vertex: 
wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: config.view_formats[0], blend: Some(wgpu::BlendState::ALPHA_BLENDING), diff --git a/examples/src/conservative_raster/mod.rs b/examples/src/conservative_raster/mod.rs index 12cdaa399d..89500a798f 100644 --- a/examples/src/conservative_raster/mod.rs +++ b/examples/src/conservative_raster/mod.rs @@ -97,13 +97,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader_triangle_and_lines, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader_triangle_and_lines, entry_point: "fs_main_red", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(RENDER_TARGET_FORMAT.into())], }), primitive: wgpu::PrimitiveState { @@ -122,13 +122,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader_triangle_and_lines, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader_triangle_and_lines, entry_point: "fs_main_blue", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(RENDER_TARGET_FORMAT.into())], }), primitive: wgpu::PrimitiveState::default(), @@ -148,13 +148,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader_triangle_and_lines, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader_triangle_and_lines, entry_point: "fs_main_white", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { @@ -211,13 +211,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState::default(), diff --git a/examples/src/cube/mod.rs b/examples/src/cube/mod.rs index d87193fcfe..9347627812 100644 --- a/examples/src/cube/mod.rs +++ b/examples/src/cube/mod.rs @@ -244,13 +244,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &vertex_buffers, }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { @@ -272,13 +272,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: 
&vertex_buffers, }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_wire", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: config.view_formats[0], blend: Some(wgpu::BlendState { diff --git a/examples/src/hello_compute/mod.rs b/examples/src/hello_compute/mod.rs index 63169662e0..d04aaa4309 100644 --- a/examples/src/hello_compute/mod.rs +++ b/examples/src/hello_compute/mod.rs @@ -109,7 +109,7 @@ async fn execute_gpu_inner( layout: None, module: &cs_module, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); // Instantiates the bind group, once again specifying the binding of buffers. diff --git a/examples/src/hello_synchronization/mod.rs b/examples/src/hello_synchronization/mod.rs index 7dc2e6c9c0..0a222fbe54 100644 --- a/examples/src/hello_synchronization/mod.rs +++ b/examples/src/hello_synchronization/mod.rs @@ -103,14 +103,14 @@ async fn execute( layout: Some(&pipeline_layout), module: &shaders_module, entry_point: "patient_main", - constants: &Default::default(), + compilation_options: Default::default(), }); let hasty_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { label: None, layout: Some(&pipeline_layout), module: &shaders_module, entry_point: "hasty_main", - constants: &Default::default(), + compilation_options: Default::default(), }); //---------------------------------------------------------- diff --git a/examples/src/hello_triangle/mod.rs b/examples/src/hello_triangle/mod.rs index 76b7a5a73d..79162a6956 100644 --- a/examples/src/hello_triangle/mod.rs +++ b/examples/src/hello_triangle/mod.rs @@ -60,12 +60,12 @@ async fn run(event_loop: EventLoop<()>, window: Window) { module: &shader, entry_point: "vs_main", buffers: &[], - constants: &Default::default(), + compilation_options: Default::default(), }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(swapchain_format.into())], }), primitive: wgpu::PrimitiveState::default(), diff --git a/examples/src/hello_workgroups/mod.rs b/examples/src/hello_workgroups/mod.rs index 5fb0eff6b1..572de36d3e 100644 --- a/examples/src/hello_workgroups/mod.rs +++ b/examples/src/hello_workgroups/mod.rs @@ -110,7 +110,7 @@ async fn run() { layout: Some(&pipeline_layout), module: &shader, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); //---------------------------------------------------------- diff --git a/examples/src/mipmap/mod.rs b/examples/src/mipmap/mod.rs index fc40d5d884..0848e94e10 100644 --- a/examples/src/mipmap/mod.rs +++ b/examples/src/mipmap/mod.rs @@ -93,13 +93,13 @@ impl Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(TEXTURE_FORMAT.into())], }), primitive: wgpu::PrimitiveState { @@ -292,13 +292,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: 
&Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/examples/src/msaa_line/mod.rs b/examples/src/msaa_line/mod.rs index 178968f47b..cd22e75bc4 100644 --- a/examples/src/msaa_line/mod.rs +++ b/examples/src/msaa_line/mod.rs @@ -54,7 +54,7 @@ impl Example { vertex: wgpu::VertexState { module: shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[wgpu::VertexBufferLayout { array_stride: std::mem::size_of::() as wgpu::BufferAddress, step_mode: wgpu::VertexStepMode::Vertex, @@ -64,7 +64,7 @@ impl Example { fragment: Some(wgpu::FragmentState { module: shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/examples/src/render_to_texture/mod.rs b/examples/src/render_to_texture/mod.rs index 0cb2cdea74..5e571dc74e 100644 --- a/examples/src/render_to_texture/mod.rs +++ b/examples/src/render_to_texture/mod.rs @@ -59,13 +59,13 @@ async fn run(_path: Option) { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::TextureFormat::Rgba8UnormSrgb.into())], }), primitive: wgpu::PrimitiveState::default(), diff --git a/examples/src/repeated_compute/mod.rs b/examples/src/repeated_compute/mod.rs index 0c47055191..55e87eed9a 100644 --- a/examples/src/repeated_compute/mod.rs +++ b/examples/src/repeated_compute/mod.rs @@ -245,7 +245,7 @@ impl WgpuContext { layout: Some(&pipeline_layout), module: &shader, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); WgpuContext { diff --git a/examples/src/shadow/mod.rs b/examples/src/shadow/mod.rs index d0a29cc8b0..2cb6d6f3e2 100644 --- a/examples/src/shadow/mod.rs +++ b/examples/src/shadow/mod.rs @@ -500,7 +500,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_bake", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[vb_desc.clone()], }, fragment: None, @@ -633,7 +633,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[vb_desc], }, fragment: Some(wgpu::FragmentState { @@ -643,7 +643,7 @@ impl crate::framework::Example for Example { } else { "fs_main_without_storage" }, - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/examples/src/skybox/mod.rs b/examples/src/skybox/mod.rs index 443c9d41e0..35a4266d20 100644 --- a/examples/src/skybox/mod.rs +++ b/examples/src/skybox/mod.rs @@ -199,13 +199,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_sky", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_sky", - constants: &Default::default(), + compilation_options: 
Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { @@ -228,7 +228,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_entity", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[wgpu::VertexBufferLayout { array_stride: std::mem::size_of::() as wgpu::BufferAddress, step_mode: wgpu::VertexStepMode::Vertex, @@ -238,7 +238,7 @@ impl crate::framework::Example for Example { fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_entity", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/examples/src/srgb_blend/mod.rs b/examples/src/srgb_blend/mod.rs index fdff310c31..f701aff989 100644 --- a/examples/src/srgb_blend/mod.rs +++ b/examples/src/srgb_blend/mod.rs @@ -131,13 +131,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &vertex_buffers, }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: config.view_formats[0], blend: Some(wgpu::BlendState::ALPHA_BLENDING), diff --git a/examples/src/stencil_triangles/mod.rs b/examples/src/stencil_triangles/mod.rs index 07b8e3ec51..e0f495177f 100644 --- a/examples/src/stencil_triangles/mod.rs +++ b/examples/src/stencil_triangles/mod.rs @@ -74,13 +74,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &vertex_buffers, }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: config.view_formats[0], blend: None, @@ -114,13 +114,13 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &vertex_buffers, }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: Default::default(), diff --git a/examples/src/storage_texture/mod.rs b/examples/src/storage_texture/mod.rs index f83f61967d..02900c8918 100644 --- a/examples/src/storage_texture/mod.rs +++ b/examples/src/storage_texture/mod.rs @@ -100,7 +100,7 @@ async fn run(_path: Option) { layout: Some(&pipeline_layout), module: &shader, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); log::info!("Wgpu context set up."); diff --git a/examples/src/texture_arrays/mod.rs b/examples/src/texture_arrays/mod.rs index c786b0efee..dd7b4ec89a 100644 --- a/examples/src/texture_arrays/mod.rs +++ b/examples/src/texture_arrays/mod.rs @@ -321,7 +321,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &base_shader_module, entry_point: "vert_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: 
&[wgpu::VertexBufferLayout { array_stride: vertex_size as wgpu::BufferAddress, step_mode: wgpu::VertexStepMode::Vertex, @@ -331,7 +331,7 @@ impl crate::framework::Example for Example { fragment: Some(wgpu::FragmentState { module: fragment_shader_module, entry_point: fragment_entry_point, - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/examples/src/timestamp_queries/mod.rs b/examples/src/timestamp_queries/mod.rs index 58952c76c0..7042d60fe9 100644 --- a/examples/src/timestamp_queries/mod.rs +++ b/examples/src/timestamp_queries/mod.rs @@ -298,7 +298,7 @@ fn compute_pass( layout: None, module, entry_point: "main_cs", - constants: &Default::default(), + compilation_options: Default::default(), }); let bind_group_layout = compute_pipeline.get_bind_group_layout(0); let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { @@ -353,13 +353,13 @@ fn render_pass( vertex: wgpu::VertexState { module, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(format.into())], }), primitive: wgpu::PrimitiveState::default(), diff --git a/examples/src/uniform_values/mod.rs b/examples/src/uniform_values/mod.rs index 1ddee03e9f..932c7aaeec 100644 --- a/examples/src/uniform_values/mod.rs +++ b/examples/src/uniform_values/mod.rs @@ -179,13 +179,13 @@ impl WgpuContext { vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(swapchain_format.into())], }), primitive: wgpu::PrimitiveState::default(), diff --git a/examples/src/water/mod.rs b/examples/src/water/mod.rs index 6bc3824e73..94f12895a8 100644 --- a/examples/src/water/mod.rs +++ b/examples/src/water/mod.rs @@ -512,7 +512,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &water_module, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), // Layout of our vertices. This should match the structs // which are uploaded to the GPU. This should also be // ensured by tagging on either a `#[repr(C)]` onto a @@ -528,7 +528,7 @@ impl crate::framework::Example for Example { fragment: Some(wgpu::FragmentState { module: &water_module, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), // Describes how the colour will be interpolated // and assigned to the output attachment. 
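Every pipeline descriptor in the examples changes the same way: the old `constants: &Default::default()` field on `VertexState`, `FragmentState`, and `ComputePipelineDescriptor` becomes `compilation_options: Default::default()`, with per-constant overrides now living inside `PipelineCompilationOptions`. Below is a minimal runnable sketch of the migrated descriptor shape; it assumes wgpu 0.20 plus the `pollster` crate for blocking, and is not taken from the examples above.

```rust
// Minimal compute pipeline showing the renamed `compilation_options` field.
fn main() {
    pollster::block_on(async {
        let instance = wgpu::Instance::default();
        let adapter = instance
            .request_adapter(&wgpu::RequestAdapterOptions::default())
            .await
            .expect("no adapter");
        let (device, _queue) = adapter
            .request_device(
                &wgpu::DeviceDescriptor {
                    label: None,
                    required_features: wgpu::Features::empty(),
                    required_limits: wgpu::Limits::downlevel_defaults(),
                },
                None,
            )
            .await
            .expect("no device");
        let module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: None,
            source: wgpu::ShaderSource::Wgsl(
                "@compute @workgroup_size(1) fn main() {}".into(),
            ),
        });
        let _pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: None,
            layout: None,
            module: &module,
            entry_point: "main",
            // Replaces the old `constants: &Default::default()` field.
            compilation_options: Default::default(),
        });
    });
}
```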
targets: &[Some(wgpu::ColorTargetState { @@ -583,7 +583,7 @@ impl crate::framework::Example for Example { vertex: wgpu::VertexState { module: &terrain_module, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[wgpu::VertexBufferLayout { array_stride: terrain_vertex_size as wgpu::BufferAddress, step_mode: wgpu::VertexStepMode::Vertex, @@ -593,7 +593,7 @@ impl crate::framework::Example for Example { fragment: Some(wgpu::FragmentState { module: &terrain_module, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(config.view_formats[0].into())], }), primitive: wgpu::PrimitiveState { diff --git a/naga-cli/Cargo.toml b/naga-cli/Cargo.toml index 1f35499589..dc03fc96c4 100644 --- a/naga-cli/Cargo.toml +++ b/naga-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "naga-cli" -version = "0.19.0" +version = "0.20.0" authors = ["gfx-rs developers"] edition = "2021" description = "Shader translation command line tool" @@ -23,9 +23,10 @@ log = "0.4" codespan-reporting = "0.11" env_logger = "0.11" argh = "0.1.5" +anyhow.workspace = true [dependencies.naga] -version = "0.19" +version = "0.20.0" path = "../naga" features = [ "compact", diff --git a/naga-cli/src/bin/naga.rs b/naga-cli/src/bin/naga.rs index eaa37b8fc3..7ff086d3f7 100644 --- a/naga-cli/src/bin/naga.rs +++ b/naga-cli/src/bin/naga.rs @@ -1,4 +1,5 @@ #![allow(clippy::manual_strip)] +use anyhow::{anyhow, Context as _}; #[allow(unused_imports)] use std::fs; use std::{error::Error, fmt, io::Read, path::Path, str::FromStr}; @@ -62,6 +63,16 @@ struct Args { #[argh(option)] shader_model: Option, + /// the shader stage, for example 'frag', 'vert', or 'compute'. + /// if the shader stage is unspecified it will be derived from + /// the file extension. + #[argh(option)] + shader_stage: Option, + + /// the kind of input, e.g. 'glsl', 'wgsl', 'spv', or 'bin'. + #[argh(option)] + input_kind: Option, + /// the metal version to use, for example, 1.0, 1.1, 1.2, etc. #[argh(option)] metal_version: Option, @@ -170,6 +181,46 @@ impl FromStr for ShaderModelArg { } } +/// Newtype so we can implement [`FromStr`] for `ShaderStage`. +#[derive(Debug, Clone, Copy)] +struct ShaderStage(naga::ShaderStage); + +impl FromStr for ShaderStage { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + use naga::ShaderStage; + Ok(Self(match s.to_lowercase().as_str() { + "frag" | "fragment" => ShaderStage::Fragment, + "comp" | "compute" => ShaderStage::Compute, + "vert" | "vertex" => ShaderStage::Vertex, + _ => return Err(anyhow!("Invalid shader stage: {s}")), + })) + } +} + +/// Input kind/file extension mapping +#[derive(Debug, Clone, Copy)] +enum InputKind { + Bincode, + Glsl, + SpirV, + Wgsl, +} +impl FromStr for InputKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Ok(match s.to_lowercase().as_str() { + "bin" => InputKind::Bincode, + "glsl" => InputKind::Glsl, + "spv" => InputKind::SpirV, + "wgsl" => InputKind::Wgsl, + _ => return Err(anyhow!("Invalid value for --input-kind: {s}")), + }) + } +} + /// Newtype so we can implement [`FromStr`] for [`naga::back::glsl::Version`]. 
#[derive(Clone, Debug)] struct GlslProfileArg(naga::back::glsl::Version); @@ -247,6 +298,8 @@ struct Parameters<'a> { msl: naga::back::msl::Options, glsl: naga::back::glsl::Options, hlsl: naga::back::hlsl::Options, + input_kind: Option, + shader_stage: Option, } trait PrettyResult { @@ -300,7 +353,7 @@ impl fmt::Display for CliError { } impl std::error::Error for CliError {} -fn run() -> Result<(), Box> { +fn run() -> anyhow::Result<()> { env_logger::init(); // Parse commandline arguments @@ -381,6 +434,9 @@ fn run() -> Result<(), Box> { return Err(CliError("Input file path is not specified").into()); }; + params.input_kind = args.input_kind; + params.shader_stage = args.shader_stage; + let Parsed { mut module, input_text, @@ -424,6 +480,8 @@ fn run() -> Result<(), Box> { // Validate the IR before compaction. let info = match naga::valid::Validator::new(params.validation_flags, validation_caps) + .subgroup_stages(naga::valid::ShaderStages::all()) + .subgroup_operations(naga::valid::SubgroupOperationSet::all()) .validate(&module) { Ok(info) => Some(info), @@ -498,67 +556,70 @@ struct Parsed { input_text: Option, } -fn parse_input( - input_path: &Path, - input: Vec, - params: &Parameters, -) -> Result> { - let (module, input_text) = match Path::new(&input_path) - .extension() - .ok_or(CliError("Input filename has no extension"))? - .to_str() - .ok_or(CliError("Input filename not valid unicode"))? - { - "bin" => (bincode::deserialize(&input)?, None), - "spv" => naga::front::spv::parse_u8_slice(&input, ¶ms.spv_in).map(|m| (m, None))?, - "wgsl" => { +fn parse_input(input_path: &Path, input: Vec, params: &Parameters) -> anyhow::Result { + let input_kind = match params.input_kind { + Some(kind) => kind, + None => input_path + .extension() + .context("Input filename has no extension")? + .to_str() + .context("Input filename not valid unicode")? + .parse() + .context("Unable to determine --input-kind from filename")?, + }; + + let (module, input_text) = match input_kind { + InputKind::Bincode => (bincode::deserialize(&input)?, None), + InputKind::SpirV => { + naga::front::spv::parse_u8_slice(&input, ¶ms.spv_in).map(|m| (m, None))? + } + InputKind::Wgsl => { let input = String::from_utf8(input)?; let result = naga::front::wgsl::parse_str(&input); match result { Ok(v) => (v, Some(input)), Err(ref e) => { - let message = format!( + let message = anyhow!( "Could not parse WGSL:\n{}", e.emit_to_string_with_path(&input, input_path) ); - return Err(message.into()); + return Err(message); } } } - ext @ ("vert" | "frag" | "comp" | "glsl") => { + InputKind::Glsl => { + let shader_stage = match params.shader_stage { + Some(shader_stage) => shader_stage, + None => { + // filename.shader_stage.glsl -> filename.shader_stage + let file_stem = input_path + .file_stem() + .context("Unable to determine file stem from input filename.")?; + // filename.shader_stage -> shader_stage + let inner_ext = Path::new(file_stem) + .extension() + .context("Unable to determine inner extension from input filename.")? + .to_str() + .context("Input filename not valid unicode")?; + inner_ext.parse().context("from input filename")? 
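Two CLI-facing changes land in the `naga.rs` hunks: `--shader-stage` and `--input-kind` let the stage and input language be given explicitly rather than inferred from file extensions, and validation now opts into the new subgroup support via the chained `subgroup_stages`/`subgroup_operations` calls on the naga 0.20 validator. A hedged standalone sketch of that validator setup follows; it assumes naga 0.20 with the `wgsl-in` feature, and the trivial WGSL module stands in for the CLI's real input handling.

```rust
// Validate a parsed module with subgroup support enabled, as the CLI now does.
fn main() {
    let module = naga::front::wgsl::parse_str(
        "@compute @workgroup_size(64) fn main() {}",
    )
    .expect("parse");
    let info = naga::valid::Validator::new(
        naga::valid::ValidationFlags::all(),
        naga::valid::Capabilities::all(),
    )
    // New in 0.20: subgroup usage is validated per stage and operation set.
    .subgroup_stages(naga::valid::ShaderStages::all())
    .subgroup_operations(naga::valid::SubgroupOperationSet::all())
    .validate(&module)
    .expect("validate");
    let _ = info;
}
```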
+ } + }; let input = String::from_utf8(input)?; let mut parser = naga::front::glsl::Frontend::default(); - ( parser .parse( &naga::front::glsl::Options { - stage: match ext { - "vert" => naga::ShaderStage::Vertex, - "frag" => naga::ShaderStage::Fragment, - "comp" => naga::ShaderStage::Compute, - "glsl" => { - let internal_name = input_path.to_string_lossy(); - match Path::new(&internal_name[..internal_name.len()-5]) - .extension() - .ok_or(CliError("Input filename ending with .glsl has no internal extension"))? - .to_str() - .ok_or(CliError("Input filename not valid unicode"))? - { - "vert" => naga::ShaderStage::Vertex, - "frag" => naga::ShaderStage::Fragment, - "comp" => naga::ShaderStage::Compute, - _ => unreachable!(), - } - }, - _ => unreachable!(), - }, + stage: shader_stage.0, defines: Default::default(), }, &input, ) .unwrap_or_else(|error| { - let filename = input_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap_or("glsl"); + let filename = input_path + .file_name() + .and_then(std::ffi::OsStr::to_str) + .unwrap_or("glsl"); let mut writer = StandardStream::stderr(ColorChoice::Auto); error.emit_to_writer_with_path(&mut writer, &input, filename); std::process::exit(1); @@ -566,7 +627,6 @@ fn parse_input( Some(input), ) } - _ => return Err(CliError("Unknown input file extension").into()), }; Ok(Parsed { module, input_text }) @@ -577,7 +637,7 @@ fn write_output( info: &Option, params: &Parameters, output_path: &str, -) -> Result<(), Box> { +) -> anyhow::Result<()> { match Path::new(&output_path) .extension() .ok_or(CliError("Output filename has no extension"))? @@ -742,7 +802,7 @@ fn write_output( Ok(()) } -fn bulk_validate(args: Args, params: &Parameters) -> Result<(), Box> { +fn bulk_validate(args: Args, params: &Parameters) -> anyhow::Result<()> { let mut invalid = vec![]; for input_path in args.files { let path = Path::new(&input_path); @@ -760,6 +820,8 @@ fn bulk_validate(args: Args, params: &Parameters) -> Result<(), Box Result<(), Box "RayQueryTerminate", } } + S::SubgroupBallot { result, predicate } => { + if let Some(predicate) = predicate { + self.dependencies.push((id, predicate, "predicate")); + } + self.emits.push((id, result)); + "SubgroupBallot" + } + S::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + } => { + self.dependencies.push((id, argument, "arg")); + self.emits.push((id, result)); + match (collective_op, op) { + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => { + "SubgroupAll" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => { + "SubgroupAny" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => { + "SubgroupAdd" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => { + "SubgroupMul" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => { + "SubgroupMax" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => { + "SubgroupMin" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => { + "SubgroupAnd" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => { + "SubgroupOr" + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => { + "SubgroupXor" + } + ( + crate::CollectiveOperation::ExclusiveScan, + crate::SubgroupOperation::Add, + ) => "SubgroupExclusiveAdd", + ( + crate::CollectiveOperation::ExclusiveScan, + crate::SubgroupOperation::Mul, + ) => "SubgroupExclusiveMul", + ( + crate::CollectiveOperation::InclusiveScan, + 
crate::SubgroupOperation::Add, + ) => "SubgroupInclusiveAdd", + ( + crate::CollectiveOperation::InclusiveScan, + crate::SubgroupOperation::Mul, + ) => "SubgroupInclusiveMul", + _ => unimplemented!(), + } + } + S::SubgroupGather { + mode, + argument, + result, + } => { + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + self.dependencies.push((id, index, "index")) + } + } + self.dependencies.push((id, argument, "arg")); + self.emits.push((id, result)); + match mode { + crate::GatherMode::BroadcastFirst => "SubgroupBroadcastFirst", + crate::GatherMode::Broadcast(_) => "SubgroupBroadcast", + crate::GatherMode::Shuffle(_) => "SubgroupShuffle", + crate::GatherMode::ShuffleDown(_) => "SubgroupShuffleDown", + crate::GatherMode::ShuffleUp(_) => "SubgroupShuffleUp", + crate::GatherMode::ShuffleXor(_) => "SubgroupShuffleXor", + } + } }; // Set the last node to the merge node last_node = merge_id; @@ -587,6 +675,8 @@ fn write_function_expressions( let ty = if committed { "Committed" } else { "Candidate" }; (format!("rayQueryGet{}Intersection", ty).into(), 4) } + E::SubgroupBallotResult => ("SubgroupBallotResult".into(), 4), + E::SubgroupOperationResult { .. } => ("SubgroupOperationResult".into(), 4), }; // give uniform expressions an outline diff --git a/naga/src/back/glsl/features.rs b/naga/src/back/glsl/features.rs index 99c128c6d9..e5a43f3e02 100644 --- a/naga/src/back/glsl/features.rs +++ b/naga/src/back/glsl/features.rs @@ -50,6 +50,8 @@ bitflags::bitflags! { const INSTANCE_INDEX = 1 << 22; /// Sample specific LODs of cube / array shadow textures const TEXTURE_SHADOW_LOD = 1 << 23; + /// Subgroup operations + const SUBGROUP_OPERATIONS = 1 << 24; } } @@ -117,6 +119,7 @@ impl FeaturesManager { check_feature!(SAMPLE_VARIABLES, 400, 300); check_feature!(DYNAMIC_ARRAY_SIZE, 430, 310); check_feature!(DUAL_SOURCE_BLENDING, 330, 300 /* with extension */); + check_feature!(SUBGROUP_OPERATIONS, 430, 310); match version { Version::Embedded { is_webgl: true, .. } => check_feature!(MULTI_VIEW, 140, 300), _ => check_feature!(MULTI_VIEW, 140, 310), @@ -259,6 +262,22 @@ impl FeaturesManager { writeln!(out, "#extension GL_EXT_texture_shadow_lod : require")?; } + if self.0.contains(Features::SUBGROUP_OPERATIONS) { + // https://registry.khronos.org/OpenGL/extensions/KHR/KHR_shader_subgroup.txt + writeln!(out, "#extension GL_KHR_shader_subgroup_basic : require")?; + writeln!(out, "#extension GL_KHR_shader_subgroup_vote : require")?; + writeln!( + out, + "#extension GL_KHR_shader_subgroup_arithmetic : require" + )?; + writeln!(out, "#extension GL_KHR_shader_subgroup_ballot : require")?; + writeln!(out, "#extension GL_KHR_shader_subgroup_shuffle : require")?; + writeln!( + out, + "#extension GL_KHR_shader_subgroup_shuffle_relative : require" + )?; + } + Ok(()) } } @@ -518,6 +537,10 @@ impl<'a, W> Writer<'a, W> { } } } + Expression::SubgroupBallotResult | + Expression::SubgroupOperationResult { .. } => { + features.request(Features::SUBGROUP_OPERATIONS) + } _ => {} } } diff --git a/naga/src/back/glsl/mod.rs b/naga/src/back/glsl/mod.rs index bede79610a..c8c7ea557d 100644 --- a/naga/src/back/glsl/mod.rs +++ b/naga/src/back/glsl/mod.rs @@ -2390,6 +2390,125 @@ impl<'a, W: Write> Writer<'a, W> { writeln!(self.out, ");")?; } Statement::RayQuery { .. 
} => unreachable!(), + Statement::SubgroupBallot { result, predicate } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + let res_ty = ctx.info[result].ty.inner_with(&self.module.types); + self.write_value_type(res_ty)?; + write!(self.out, " {res_name} = ")?; + self.named_expressions.insert(result, res_name); + + write!(self.out, "subgroupBallot(")?; + match predicate { + Some(predicate) => self.write_expr(predicate, ctx)?, + None => write!(self.out, "true")?, + } + writeln!(self.out, ");")?; + } + Statement::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + let res_ty = ctx.info[result].ty.inner_with(&self.module.types); + self.write_value_type(res_ty)?; + write!(self.out, " {res_name} = ")?; + self.named_expressions.insert(result, res_name); + + match (collective_op, op) { + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => { + write!(self.out, "subgroupAll(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => { + write!(self.out, "subgroupAny(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupAdd(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupMul(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => { + write!(self.out, "subgroupMax(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => { + write!(self.out, "subgroupMin(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => { + write!(self.out, "subgroupAnd(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => { + write!(self.out, "subgroupOr(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => { + write!(self.out, "subgroupXor(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupExclusiveAdd(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupExclusiveMul(")? + } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupInclusiveAdd(")? + } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupInclusiveMul(")? 
+ } + _ => unimplemented!(), + } + self.write_expr(argument, ctx)?; + writeln!(self.out, ");")?; + } + Statement::SubgroupGather { + mode, + argument, + result, + } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + let res_ty = ctx.info[result].ty.inner_with(&self.module.types); + self.write_value_type(res_ty)?; + write!(self.out, " {res_name} = ")?; + self.named_expressions.insert(result, res_name); + + match mode { + crate::GatherMode::BroadcastFirst => { + write!(self.out, "subgroupBroadcastFirst(")?; + } + crate::GatherMode::Broadcast(_) => { + write!(self.out, "subgroupBroadcast(")?; + } + crate::GatherMode::Shuffle(_) => { + write!(self.out, "subgroupShuffle(")?; + } + crate::GatherMode::ShuffleDown(_) => { + write!(self.out, "subgroupShuffleDown(")?; + } + crate::GatherMode::ShuffleUp(_) => { + write!(self.out, "subgroupShuffleUp(")?; + } + crate::GatherMode::ShuffleXor(_) => { + write!(self.out, "subgroupShuffleXor(")?; + } + } + self.write_expr(argument, ctx)?; + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + write!(self.out, ", ")?; + self.write_expr(index, ctx)?; + } + } + writeln!(self.out, ");")?; + } } Ok(()) @@ -3418,7 +3537,8 @@ impl<'a, W: Write> Writer<'a, W> { let scalar_bits = ctx .resolve_type(arg, &self.module.types) .scalar_width() - .unwrap(); + .unwrap() + * 8; write!(self.out, "bitfieldExtract(")?; self.write_expr(arg, ctx)?; @@ -3437,7 +3557,8 @@ impl<'a, W: Write> Writer<'a, W> { let scalar_bits = ctx .resolve_type(arg, &self.module.types) .scalar_width() - .unwrap(); + .unwrap() + * 8; write!(self.out, "bitfieldInsert(")?; self.write_expr(arg, ctx)?; @@ -3656,7 +3777,9 @@ impl<'a, W: Write> Writer<'a, W> { Expression::CallResult(_) | Expression::AtomicResult { .. } | Expression::RayQueryProceedResult - | Expression::WorkGroupUniformLoadResult { .. } => unreachable!(), + | Expression::WorkGroupUniformLoadResult { .. } + | Expression::SubgroupOperationResult { .. } + | Expression::SubgroupBallotResult => unreachable!(), // `ArrayLength` is written as `expr.length()` and we convert it to a uint Expression::ArrayLength(expr) => { write!(self.out, "uint(")?; @@ -4225,6 +4348,9 @@ impl<'a, W: Write> Writer<'a, W> { if flags.contains(crate::Barrier::WORK_GROUP) { writeln!(self.out, "{level}memoryBarrierShared();")?; } + if flags.contains(crate::Barrier::SUB_GROUP) { + writeln!(self.out, "{level}subgroupMemoryBarrier();")?; + } writeln!(self.out, "{level}barrier();")?; Ok(()) } @@ -4494,6 +4620,11 @@ const fn glsl_built_in(built_in: crate::BuiltIn, options: VaryingOptions) -> &'s Bi::WorkGroupId => "gl_WorkGroupID", Bi::WorkGroupSize => "gl_WorkGroupSize", Bi::NumWorkGroups => "gl_NumWorkGroups", + // subgroup + Bi::NumSubgroups => "gl_NumSubgroups", + Bi::SubgroupId => "gl_SubgroupID", + Bi::SubgroupSize => "gl_SubgroupSize", + Bi::SubgroupInvocationId => "gl_SubgroupInvocationID", } } diff --git a/naga/src/back/hlsl/conv.rs b/naga/src/back/hlsl/conv.rs index 2a6db35db8..7d15f43f6c 100644 --- a/naga/src/back/hlsl/conv.rs +++ b/naga/src/back/hlsl/conv.rs @@ -179,6 +179,11 @@ impl crate::BuiltIn { // to this field will get replaced with references to `SPECIAL_CBUF_VAR` // in `Writer::write_expr`. 
Self::NumWorkGroups => "SV_GroupID", + // These builtins map to functions + Self::SubgroupSize + | Self::SubgroupInvocationId + | Self::NumSubgroups + | Self::SubgroupId => unreachable!(), Self::BaseInstance | Self::BaseVertex | Self::WorkGroupSize => { return Err(Error::Unimplemented(format!("builtin {self:?}"))) } diff --git a/naga/src/back/hlsl/help.rs b/naga/src/back/hlsl/help.rs index 4dd9ea5987..d3bb1ce7f5 100644 --- a/naga/src/back/hlsl/help.rs +++ b/naga/src/back/hlsl/help.rs @@ -70,6 +70,11 @@ pub(super) struct WrappedMath { pub(super) components: Option, } +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +pub(super) struct WrappedZeroValue { + pub(super) ty: Handle, +} + /// HLSL backend requires its own `ImageQuery` enum. /// /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function. @@ -359,7 +364,7 @@ impl<'a, W: Write> super::Writer<'a, W> { } /// Helper function that write wrapped function for `Expression::Compose` for structures. - pub(super) fn write_wrapped_constructor_function( + fn write_wrapped_constructor_function( &mut self, module: &crate::Module, constructor: WrappedConstructor, @@ -862,6 +867,25 @@ impl<'a, W: Write> super::Writer<'a, W> { Ok(()) } + // TODO: we could merge this with iteration in write_wrapped_compose_functions... + // + /// Helper function that writes zero value wrapped functions + pub(super) fn write_wrapped_zero_value_functions( + &mut self, + module: &crate::Module, + expressions: &crate::Arena, + ) -> BackendResult { + for (handle, _) in expressions.iter() { + if let crate::Expression::ZeroValue(ty) = expressions[handle] { + let zero_value = WrappedZeroValue { ty }; + if self.wrapped.zero_values.insert(zero_value) { + self.write_wrapped_zero_value_function(module, zero_value)?; + } + } + } + Ok(()) + } + pub(super) fn write_wrapped_math_functions( + &mut self, + module: &crate::Module, @@ -1006,6 +1030,7 @@ impl<'a, W: Write> super::Writer<'a, W> { ) -> BackendResult { self.write_wrapped_math_functions(module, func_ctx)?; self.write_wrapped_compose_functions(module, func_ctx.expressions)?; + self.write_wrapped_zero_value_functions(module, func_ctx.expressions)?; for (handle, _) in func_ctx.expressions.iter() { match func_ctx.expressions[handle] { @@ -1283,4 +1308,71 @@ impl<'a, W: Write> super::Writer<'a, W> { Ok(()) } + + pub(super) fn write_wrapped_zero_value_function_name( + &mut self, + module: &crate::Module, + zero_value: WrappedZeroValue, + ) -> BackendResult { + let name = crate::TypeInner::hlsl_type_id(zero_value.ty, module.to_ctx(), &self.names)?; + write!(self.out, "ZeroValue{name}")?; + Ok(()) + } + + /// Helper function that writes a wrapped function for `Expression::ZeroValue` + /// + /// This is necessary since we might have a member access after the zero value expression, e.g. + /// `.y` (in practice this can come up when consuming SPIRV that's been produced by glslc). + /// + /// So we can't just write `(float4)0` since `(float4)0.y` won't parse correctly. + /// + /// Parenthesizing the expression like `((float4)0).y` would work... 
except DXC can't handle + /// cases like: + /// + /// ```ignore + /// tests\out\hlsl\access.hlsl:183:41: error: cannot compile this l-value expression yet + /// t_1.am = (__mat4x2[2])((float4x2[2])0); + /// ^ + /// ``` + fn write_wrapped_zero_value_function( + &mut self, + module: &crate::Module, + zero_value: WrappedZeroValue, + ) -> BackendResult { + use crate::back::INDENT; + + const RETURN_VARIABLE_NAME: &str = "ret"; + + // Write function return type and name + if let crate::TypeInner::Array { base, size, .. } = module.types[zero_value.ty].inner { + write!(self.out, "typedef ")?; + self.write_type(module, zero_value.ty)?; + write!(self.out, " ret_")?; + self.write_wrapped_zero_value_function_name(module, zero_value)?; + self.write_array_size(module, base, size)?; + writeln!(self.out, ";")?; + + write!(self.out, "ret_")?; + self.write_wrapped_zero_value_function_name(module, zero_value)?; + } else { + self.write_type(module, zero_value.ty)?; + } + write!(self.out, " ")?; + self.write_wrapped_zero_value_function_name(module, zero_value)?; + + // Write function parameters (none) and start function body + writeln!(self.out, "() {{")?; + + // Write `ZeroValue` function. + write!(self.out, "{INDENT}return ")?; + self.write_default_init(module, zero_value.ty)?; + writeln!(self.out, ";")?; + + // End of function body + writeln!(self.out, "}}")?; + // Write extra new line + writeln!(self.out)?; + + Ok(()) + } } diff --git a/naga/src/back/hlsl/mod.rs b/naga/src/back/hlsl/mod.rs index fe9740a2f4..28edbf70e1 100644 --- a/naga/src/back/hlsl/mod.rs +++ b/naga/src/back/hlsl/mod.rs @@ -267,6 +267,7 @@ pub enum Error { #[derive(Default)] struct Wrapped { + zero_values: crate::FastHashSet, array_lengths: crate::FastHashSet, image_queries: crate::FastHashSet, constructors: crate::FastHashSet, diff --git a/naga/src/back/hlsl/writer.rs b/naga/src/back/hlsl/writer.rs index d4c6097eb3..86d8f89035 100644 --- a/naga/src/back/hlsl/writer.rs +++ b/naga/src/back/hlsl/writer.rs @@ -1,5 +1,8 @@ use super::{ - help::{WrappedArrayLength, WrappedConstructor, WrappedImageQuery, WrappedStructMatrixAccess}, + help::{ + WrappedArrayLength, WrappedConstructor, WrappedImageQuery, WrappedStructMatrixAccess, + WrappedZeroValue, + }, storage::StoreValue, BackendResult, Error, Options, }; @@ -77,6 +80,19 @@ enum Io { Output, } +const fn is_subgroup_builtin_binding(binding: &Option) -> bool { + let &Some(crate::Binding::BuiltIn(builtin)) = binding else { + return false; + }; + matches!( + builtin, + crate::BuiltIn::SubgroupSize + | crate::BuiltIn::SubgroupInvocationId + | crate::BuiltIn::NumSubgroups + | crate::BuiltIn::SubgroupId + ) +} + impl<'a, W: fmt::Write> super::Writer<'a, W> { pub fn new(out: W, options: &'a Options) -> Self { Self { @@ -161,6 +177,19 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { } } } + for statement in func.body.iter() { + match *statement { + crate::Statement::SubgroupCollectiveOperation { + op: _, + collective_op: crate::CollectiveOperation::InclusiveScan, + argument, + result: _, + } => { + self.need_bake_expressions.insert(argument); + } + _ => {} + } + } } pub fn write( @@ -238,6 +267,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { self.write_special_functions(module)?; self.write_wrapped_compose_functions(module, &module.global_expressions)?; + self.write_wrapped_zero_value_functions(module, &module.global_expressions)?; // Write all named constants let mut constants = module @@ -401,31 +431,32 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { // if they are struct, so that the `stage` 
argument here could be omitted. fn write_semantic( &mut self, - binding: &crate::Binding, + binding: &Option, stage: Option<(ShaderStage, Io)>, ) -> BackendResult { match *binding { - crate::Binding::BuiltIn(builtin) => { + Some(crate::Binding::BuiltIn(builtin)) if !is_subgroup_builtin_binding(binding) => { let builtin_str = builtin.to_hlsl_str()?; write!(self.out, " : {builtin_str}")?; } - crate::Binding::Location { + Some(crate::Binding::Location { second_blend_source: true, .. - } => { + }) => { write!(self.out, " : SV_Target1")?; } - crate::Binding::Location { + Some(crate::Binding::Location { location, second_blend_source: false, .. - } => { + }) => { if stage == Some((crate::ShaderStage::Fragment, Io::Output)) { write!(self.out, " : SV_Target{location}")?; } else { write!(self.out, " : {LOCATION_SEMANTIC}{location}")?; } } + _ => {} } Ok(()) @@ -446,17 +477,30 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { write!(self.out, "struct {struct_name}")?; writeln!(self.out, " {{")?; for m in members.iter() { + if is_subgroup_builtin_binding(&m.binding) { + continue; + } write!(self.out, "{}", back::INDENT)?; if let Some(ref binding) = m.binding { self.write_modifier(binding)?; } self.write_type(module, m.ty)?; write!(self.out, " {}", &m.name)?; - if let Some(ref binding) = m.binding { - self.write_semantic(binding, Some(shader_stage))?; - } + self.write_semantic(&m.binding, Some(shader_stage))?; writeln!(self.out, ";")?; } + if members.iter().any(|arg| { + matches!( + arg.binding, + Some(crate::Binding::BuiltIn(crate::BuiltIn::SubgroupId)) + ) + }) { + writeln!( + self.out, + "{}uint __local_invocation_index : SV_GroupIndex;", + back::INDENT + )?; + } writeln!(self.out, "}};")?; writeln!(self.out)?; @@ -557,8 +601,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { } /// Writes special interface structures for an entry point. The special structures have - /// all the fields flattened into them and sorted by binding. They are only needed for - /// VS outputs and FS inputs, so that these interfaces match. + /// all the fields flattened into them and sorted by binding. They are needed to emulate + /// subgroup built-ins and to make the interfaces between VS outputs and FS inputs match. fn write_ep_interface( &mut self, module: &Module, @@ -567,7 +611,13 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { ep_name: &str, ) -> Result { Ok(EntryPointInterface { - input: if !func.arguments.is_empty() && stage == ShaderStage::Fragment { + input: if !func.arguments.is_empty() + && (stage == ShaderStage::Fragment + || func + .arguments + .iter() + .any(|arg| is_subgroup_builtin_binding(&arg.binding))) + { Some(self.write_ep_input_struct(module, func, stage, ep_name)?) } else { None @@ -581,6 +631,38 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { }) } + fn write_ep_argument_initialization( + &mut self, + ep: &crate::EntryPoint, + ep_input: &EntryPointBinding, + fake_member: &EpStructMember, + ) -> BackendResult { + match fake_member.binding { + Some(crate::Binding::BuiltIn(crate::BuiltIn::SubgroupSize)) => { + write!(self.out, "WaveGetLaneCount()")? + } + Some(crate::Binding::BuiltIn(crate::BuiltIn::SubgroupInvocationId)) => { + write!(self.out, "WaveGetLaneIndex()")? 
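The `write_ep_argument_initialization` arms beginning here emulate subgroup built-ins that HLSL has no input semantics for: `SubgroupSize` and `SubgroupInvocationId` map directly to `WaveGetLaneCount()`/`WaveGetLaneIndex()`, while the arms just below derive `NumSubgroups` and `SubgroupId` arithmetically from the workgroup size and the flattened `__local_invocation_index` field added to the input struct. A minimal sketch of that arithmetic (free functions here are illustrative, not naga's API):

```rust
// Sketch of the arithmetic the HLSL backend emits for the emulated
// subgroup built-ins (illustrative names, not naga's API).
fn num_subgroups(workgroup_size: [u32; 3], wave_lane_count: u32) -> u32 {
    let invocations = workgroup_size[0] * workgroup_size[1] * workgroup_size[2];
    // Matches the emitted `(N + WaveGetLaneCount() - 1u) / WaveGetLaneCount()`:
    // a ceiling division, since the last subgroup may be partially filled.
    (invocations + wave_lane_count - 1) / wave_lane_count
}

fn subgroup_id(local_invocation_index: u32, wave_lane_count: u32) -> u32 {
    // Matches `__local_invocation_index / WaveGetLaneCount()` in the emitted HLSL.
    local_invocation_index / wave_lane_count
}

fn main() {
    // A 64-invocation workgroup on 32-wide waves splits into 2 subgroups.
    assert_eq!(num_subgroups([8, 8, 1], 32), 2);
    assert_eq!(subgroup_id(37, 32), 1);
}
```

The ceiling division accounts for a final, partially filled subgroup whenever the workgroup size is not a multiple of the wave width.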
+ } + Some(crate::Binding::BuiltIn(crate::BuiltIn::NumSubgroups)) => write!( + self.out, + "({}u + WaveGetLaneCount() - 1u) / WaveGetLaneCount()", + ep.workgroup_size[0] * ep.workgroup_size[1] * ep.workgroup_size[2] + )?, + Some(crate::Binding::BuiltIn(crate::BuiltIn::SubgroupId)) => { + write!( + self.out, + "{}.__local_invocation_index / WaveGetLaneCount()", + ep_input.arg_name + )?; + } + _ => { + write!(self.out, "{}.{}", ep_input.arg_name, fake_member.name)?; + } + } + Ok(()) + } + /// Write an entry point preface that initializes the arguments as specified in IR. fn write_ep_arguments_initialization( &mut self, @@ -588,6 +670,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { func: &crate::Function, ep_index: u16, ) -> BackendResult { + let ep = &module.entry_points[ep_index as usize]; let ep_input = match self.entry_point_io[ep_index as usize].input.take() { Some(ep_input) => ep_input, None => return Ok(()), @@ -601,8 +684,13 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { match module.types[arg.ty].inner { TypeInner::Array { base, size, .. } => { self.write_array_size(module, base, size)?; - let fake_member = fake_iter.next().unwrap(); - writeln!(self.out, " = {}.{};", ep_input.arg_name, fake_member.name)?; + write!(self.out, " = ")?; + self.write_ep_argument_initialization( + ep, + &ep_input, + fake_iter.next().unwrap(), + )?; + writeln!(self.out, ";")?; } TypeInner::Struct { ref members, .. } => { write!(self.out, " = {{ ")?; @@ -610,14 +698,22 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { if index != 0 { write!(self.out, ", ")?; } - let fake_member = fake_iter.next().unwrap(); - write!(self.out, "{}.{}", ep_input.arg_name, fake_member.name)?; + self.write_ep_argument_initialization( + ep, + &ep_input, + fake_iter.next().unwrap(), + )?; } writeln!(self.out, " }};")?; } _ => { - let fake_member = fake_iter.next().unwrap(); - writeln!(self.out, " = {}.{};", ep_input.arg_name, fake_member.name)?; + write!(self.out, " = ")?; + self.write_ep_argument_initialization( + ep, + &ep_input, + fake_iter.next().unwrap(), + )?; + writeln!(self.out, ";")?; } } } @@ -932,9 +1028,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { } } - if let Some(ref binding) = member.binding { - self.write_semantic(binding, shader_stage)?; - }; + self.write_semantic(&member.binding, shader_stage)?; writeln!(self.out, ";")?; } @@ -1147,7 +1241,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { } back::FunctionType::EntryPoint(ep_index) => { if let Some(ref ep_input) = self.entry_point_io[ep_index as usize].input { - write!(self.out, "{} {}", ep_input.ty_name, ep_input.arg_name,)?; + write!(self.out, "{} {}", ep_input.ty_name, ep_input.arg_name)?; } else { let stage = module.entry_points[ep_index as usize].stage; for (index, arg) in func.arguments.iter().enumerate() { @@ -1164,17 +1258,16 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { self.write_array_size(module, base, size)?; } - if let Some(ref binding) = arg.binding { - self.write_semantic(binding, Some((stage, Io::Input)))?; - } + self.write_semantic(&arg.binding, Some((stage, Io::Input)))?; } - - if need_workgroup_variables_initialization { - if !func.arguments.is_empty() { - write!(self.out, ", ")?; - } - write!(self.out, "uint3 __local_invocation_id : SV_GroupThreadID")?; + } + if need_workgroup_variables_initialization { + if self.entry_point_io[ep_index as usize].input.is_some() + || !func.arguments.is_empty() + { + write!(self.out, ", ")?; } + write!(self.out, "uint3 __local_invocation_id : SV_GroupThreadID")?; } } } @@ -1184,11 +1277,7 @@ 
impl<'a, W: fmt::Write> super::Writer<'a, W> { // Write semantic if it present if let back::FunctionType::EntryPoint(index) = func_ctx.ty { let stage = module.entry_points[index as usize].stage; - if let Some(crate::FunctionResult { - binding: Some(ref binding), - .. - }) = func.result - { + if let Some(crate::FunctionResult { ref binding, .. }) = func.result { self.write_semantic(binding, Some((stage, Io::Output)))?; } } @@ -1988,6 +2077,129 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { writeln!(self.out, "{level}}}")? } Statement::RayQuery { .. } => unreachable!(), + Statement::SubgroupBallot { result, predicate } => { + write!(self.out, "{level}")?; + let name = format!("{}{}", back::BAKE_PREFIX, result.index()); + write!(self.out, "const uint4 {name} = ")?; + self.named_expressions.insert(result, name); + + write!(self.out, "WaveActiveBallot(")?; + match predicate { + Some(predicate) => self.write_expr(module, predicate, func_ctx)?, + None => write!(self.out, "true")?, + } + writeln!(self.out, ");")?; + } + Statement::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + } => { + write!(self.out, "{level}")?; + write!(self.out, "const ")?; + let name = format!("{}{}", back::BAKE_PREFIX, result.index()); + match func_ctx.info[result].ty { + proc::TypeResolution::Handle(handle) => self.write_type(module, handle)?, + proc::TypeResolution::Value(ref value) => { + self.write_value_type(module, value)? + } + }; + write!(self.out, " {name} = ")?; + self.named_expressions.insert(result, name); + + match (collective_op, op) { + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => { + write!(self.out, "WaveActiveAllTrue(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => { + write!(self.out, "WaveActiveAnyTrue(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => { + write!(self.out, "WaveActiveSum(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => { + write!(self.out, "WaveActiveProduct(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => { + write!(self.out, "WaveActiveMax(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => { + write!(self.out, "WaveActiveMin(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => { + write!(self.out, "WaveActiveBitAnd(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => { + write!(self.out, "WaveActiveBitOr(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => { + write!(self.out, "WaveActiveBitXor(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => { + write!(self.out, "WavePrefixSum(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => { + write!(self.out, "WavePrefixProduct(")? 
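HLSL exposes only exclusive prefix intrinsics (`WavePrefixSum`, `WavePrefixProduct`), so the `InclusiveScan` arms that follow rebuild an inclusive scan as `arg + WavePrefixSum(arg)` (and analogously for products). This is also why `need_bake_expressions` registers the argument of every `InclusiveScan` earlier in this file: the argument expression is written twice. A minimal sketch of the identity on plain slices (illustrative code, not part of naga):

```rust
// Exclusive scan: lane i receives the sum of lanes 0..i.
fn exclusive_prefix_sum(lanes: &[u32]) -> Vec<u32> {
    let mut acc = 0u32;
    lanes
        .iter()
        .map(|&v| {
            let out = acc;
            acc += v;
            out
        })
        .collect()
}

// Inclusive scan rebuilt the way the HLSL backend emits it:
// `arg + WavePrefixSum(arg)`, lane by lane.
fn inclusive_prefix_sum(lanes: &[u32]) -> Vec<u32> {
    lanes
        .iter()
        .zip(exclusive_prefix_sum(lanes))
        .map(|(&v, e)| v + e)
        .collect()
}

fn main() {
    let lanes = [1, 2, 3, 4];
    assert_eq!(exclusive_prefix_sum(&lanes), vec![0, 1, 3, 6]);
    assert_eq!(inclusive_prefix_sum(&lanes), vec![1, 3, 6, 10]);
}
```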
+ } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => { + self.write_expr(module, argument, func_ctx)?; + write!(self.out, " + WavePrefixSum(")?; + } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => { + self.write_expr(module, argument, func_ctx)?; + write!(self.out, " * WavePrefixProduct(")?; + } + _ => unimplemented!(), + } + self.write_expr(module, argument, func_ctx)?; + writeln!(self.out, ");")?; + } + Statement::SubgroupGather { + mode, + argument, + result, + } => { + write!(self.out, "{level}")?; + write!(self.out, "const ")?; + let name = format!("{}{}", back::BAKE_PREFIX, result.index()); + match func_ctx.info[result].ty { + proc::TypeResolution::Handle(handle) => self.write_type(module, handle)?, + proc::TypeResolution::Value(ref value) => { + self.write_value_type(module, value)? + } + }; + write!(self.out, " {name} = ")?; + self.named_expressions.insert(result, name); + + if matches!(mode, crate::GatherMode::BroadcastFirst) { + write!(self.out, "WaveReadLaneFirst(")?; + self.write_expr(module, argument, func_ctx)?; + } else { + write!(self.out, "WaveReadLaneAt(")?; + self.write_expr(module, argument, func_ctx)?; + write!(self.out, ", ")?; + match mode { + crate::GatherMode::BroadcastFirst => unreachable!(), + crate::GatherMode::Broadcast(index) | crate::GatherMode::Shuffle(index) => { + self.write_expr(module, index, func_ctx)?; + } + crate::GatherMode::ShuffleDown(index) => { + write!(self.out, "WaveGetLaneIndex() + ")?; + self.write_expr(module, index, func_ctx)?; + } + crate::GatherMode::ShuffleUp(index) => { + write!(self.out, "WaveGetLaneIndex() - ")?; + self.write_expr(module, index, func_ctx)?; + } + crate::GatherMode::ShuffleXor(index) => { + write!(self.out, "WaveGetLaneIndex() ^ ")?; + self.write_expr(module, index, func_ctx)?; + } + } + } + writeln!(self.out, ");")?; + } } Ok(()) @@ -2043,7 +2255,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { self.write_const_expression(module, constant.init)?; } } - Expression::ZeroValue(ty) => self.write_default_init(module, ty)?, + Expression::ZeroValue(ty) => { + self.write_wrapped_zero_value_function_name(module, WrappedZeroValue { ty })?; + write!(self.out, "()")?; + } Expression::Compose { ty, ref components } => { match module.types[ty].inner { TypeInner::Struct { .. } | TypeInner::Array { .. } => { @@ -2593,7 +2808,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { true } None => { - if inner.scalar_width() == Some(64) { + if inner.scalar_width() == Some(8) { false } else { write!(self.out, "{}(", kind.to_hlsl_cast(),)?; @@ -3134,7 +3349,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { Expression::CallResult(_) | Expression::AtomicResult { .. } | Expression::WorkGroupUniformLoadResult { .. } - | Expression::RayQueryProceedResult => {} + | Expression::RayQueryProceedResult + | Expression::SubgroupBallotResult + | Expression::SubgroupOperationResult { .. } => {} } if !closing_bracket.is_empty() { @@ -3184,7 +3401,11 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { } /// Helper function that write default zero initialization - fn write_default_init(&mut self, module: &Module, ty: Handle) -> BackendResult { + pub(super) fn write_default_init( + &mut self, + module: &Module, + ty: Handle, + ) -> BackendResult { write!(self.out, "(")?; self.write_type(module, ty)?; if let TypeInner::Array { base, size, .. 
} = module.types[ty].inner { @@ -3201,6 +3422,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { if barrier.contains(crate::Barrier::WORK_GROUP) { writeln!(self.out, "{level}GroupMemoryBarrierWithGroupSync();")?; } + if barrier.contains(crate::Barrier::SUB_GROUP) { + // Does not exist in DirectX + } Ok(()) } } diff --git a/naga/src/back/msl/mod.rs b/naga/src/back/msl/mod.rs index 2c7cdea6af..8b03e20376 100644 --- a/naga/src/back/msl/mod.rs +++ b/naga/src/back/msl/mod.rs @@ -436,6 +436,11 @@ impl ResolvedBinding { Bi::WorkGroupId => "threadgroup_position_in_grid", Bi::WorkGroupSize => "dispatch_threads_per_threadgroup", Bi::NumWorkGroups => "threadgroups_per_grid", + // subgroup + Bi::NumSubgroups => "simdgroups_per_threadgroup", + Bi::SubgroupId => "simdgroup_index_in_threadgroup", + Bi::SubgroupSize => "threads_per_simdgroup", + Bi::SubgroupInvocationId => "thread_index_in_simdgroup", Bi::CullDistance | Bi::ViewIndex => { return Err(Error::UnsupportedBuiltIn(built_in)) } @@ -538,3 +543,21 @@ fn test_error_size() { use std::mem::size_of; assert_eq!(size_of::(), 32); } + +impl crate::AtomicFunction { + fn to_msl(self) -> Result<&'static str, Error> { + Ok(match self { + Self::Add => "fetch_add", + Self::Subtract => "fetch_sub", + Self::And => "fetch_and", + Self::InclusiveOr => "fetch_or", + Self::ExclusiveOr => "fetch_xor", + Self::Min => "fetch_min", + Self::Max => "fetch_max", + Self::Exchange { compare: None } => "exchange", + Self::Exchange { compare: Some(_) } => Err(Error::FeatureNotImplemented( + "atomic CompareExchange".to_string(), + ))?, + }) + } +} diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs index 0d0f651665..e250d0b72c 100644 --- a/naga/src/back/msl/writer.rs +++ b/naga/src/back/msl/writer.rs @@ -1131,21 +1131,10 @@ impl Writer { Ok(()) } - fn put_atomic_fetch( - &mut self, - pointer: Handle, - key: &str, - value: Handle, - context: &ExpressionContext, - ) -> BackendResult { - self.put_atomic_operation(pointer, "fetch_", key, value, context) - } - fn put_atomic_operation( &mut self, pointer: Handle, - key1: &str, - key2: &str, + key: &str, value: Handle, context: &ExpressionContext, ) -> BackendResult { @@ -1163,7 +1152,7 @@ impl Writer { write!( self.out, - "{NAMESPACE}::atomic_{key1}{key2}_explicit({ATOMIC_REFERENCE}" + "{NAMESPACE}::atomic_{key}_explicit({ATOMIC_REFERENCE}" )?; self.put_access_chain(pointer, policy, context)?; write!(self.out, ", ")?; @@ -1945,7 +1934,7 @@ impl Writer { // // extract_bits(e, min(offset, w), min(count, w - min(offset, w)))) - let scalar_bits = context.resolve_type(arg).scalar_width().unwrap(); + let scalar_bits = context.resolve_type(arg).scalar_width().unwrap() * 8; write!(self.out, "{NAMESPACE}::extract_bits(")?; self.put_expression(arg, context, true)?; @@ -1961,7 +1950,7 @@ impl Writer { // // insertBits(e, newBits, min(offset, w), min(count, w - min(offset, w)))) - let scalar_bits = context.resolve_type(arg).scalar_width().unwrap(); + let scalar_bits = context.resolve_type(arg).scalar_width().unwrap() * 8; write!(self.out, "{NAMESPACE}::insert_bits(")?; self.put_expression(arg, context, true)?; @@ -2042,6 +2031,8 @@ impl Writer { crate::Expression::CallResult(_) | crate::Expression::AtomicResult { .. } | crate::Expression::WorkGroupUniformLoadResult { .. } + | crate::Expression::SubgroupBallotResult + | crate::Expression::SubgroupOperationResult { .. 
} | crate::Expression::RayQueryProceedResult => { unreachable!() } @@ -2995,43 +2986,8 @@ impl Writer { let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); self.start_baking_expression(result, &context.expression, &res_name)?; self.named_expressions.insert(result, res_name); - match *fun { - crate::AtomicFunction::Add => { - self.put_atomic_fetch(pointer, "add", value, &context.expression)?; - } - crate::AtomicFunction::Subtract => { - self.put_atomic_fetch(pointer, "sub", value, &context.expression)?; - } - crate::AtomicFunction::And => { - self.put_atomic_fetch(pointer, "and", value, &context.expression)?; - } - crate::AtomicFunction::InclusiveOr => { - self.put_atomic_fetch(pointer, "or", value, &context.expression)?; - } - crate::AtomicFunction::ExclusiveOr => { - self.put_atomic_fetch(pointer, "xor", value, &context.expression)?; - } - crate::AtomicFunction::Min => { - self.put_atomic_fetch(pointer, "min", value, &context.expression)?; - } - crate::AtomicFunction::Max => { - self.put_atomic_fetch(pointer, "max", value, &context.expression)?; - } - crate::AtomicFunction::Exchange { compare: None } => { - self.put_atomic_operation( - pointer, - "exchange", - "", - value, - &context.expression, - )?; - } - crate::AtomicFunction::Exchange { .. } => { - return Err(Error::FeatureNotImplemented( - "atomic CompareExchange".to_string(), - )); - } - } + let fun_str = fun.to_msl()?; + self.put_atomic_operation(pointer, fun_str, value, &context.expression)?; // done writeln!(self.out, ";")?; } @@ -3145,6 +3101,121 @@ impl Writer { } } } + crate::Statement::SubgroupBallot { result, predicate } => { + write!(self.out, "{level}")?; + let name = self.namer.call(""); + self.start_baking_expression(result, &context.expression, &name)?; + self.named_expressions.insert(result, name); + write!(self.out, "uint4((uint64_t){NAMESPACE}::simd_ballot(")?; + if let Some(predicate) = predicate { + self.put_expression(predicate, &context.expression, true)?; + } else { + write!(self.out, "true")?; + } + writeln!(self.out, "), 0, 0, 0);")?; + } + crate::Statement::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + } => { + write!(self.out, "{level}")?; + let name = self.namer.call(""); + self.start_baking_expression(result, &context.expression, &name)?; + self.named_expressions.insert(result, name); + match (collective_op, op) { + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => { + write!(self.out, "{NAMESPACE}::simd_all(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => { + write!(self.out, "{NAMESPACE}::simd_any(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => { + write!(self.out, "{NAMESPACE}::simd_sum(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => { + write!(self.out, "{NAMESPACE}::simd_product(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => { + write!(self.out, "{NAMESPACE}::simd_max(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => { + write!(self.out, "{NAMESPACE}::simd_min(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => { + write!(self.out, "{NAMESPACE}::simd_and(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => { + write!(self.out, "{NAMESPACE}::simd_or(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => { + write!(self.out, "{NAMESPACE}::simd_xor(")? 
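Earlier in this hunk, `Statement::SubgroupBallot` lowers to Metal's `simd_ballot`, widened into the `uint4` mask that WGSL's `subgroupBallot` returns: one bit per subgroup invocation, with invocation `i` stored at bit `i % 32` of component `i / 32`. A minimal sketch of that layout (illustrative, not naga's API):

```rust
// Sketch of the vec4<u32> ballot layout: invocation `i` maps to
// bit `i % 32` of component `i / 32`, for up to 128 invocations.
fn ballot_from_lanes(active: &[bool]) -> [u32; 4] {
    let mut ballot = [0u32; 4];
    for (i, &on) in active.iter().enumerate().take(128) {
        if on {
            ballot[i / 32] |= 1u32 << (i % 32);
        }
    }
    ballot
}

fn main() {
    // Lanes 0 and 33 active: bit 0 of x, bit 1 of y.
    let mut lanes = [false; 64];
    lanes[0] = true;
    lanes[33] = true;
    assert_eq!(ballot_from_lanes(&lanes), [1, 2, 0, 0]);
}
```

Since Metal's ballot is at most 64 bits wide, the upper components of the emitted `uint4` are simply zero-filled.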
+ } + ( + crate::CollectiveOperation::ExclusiveScan, + crate::SubgroupOperation::Add, + ) => write!(self.out, "{NAMESPACE}::simd_prefix_exclusive_sum(")?, + ( + crate::CollectiveOperation::ExclusiveScan, + crate::SubgroupOperation::Mul, + ) => write!(self.out, "{NAMESPACE}::simd_prefix_exclusive_product(")?, + ( + crate::CollectiveOperation::InclusiveScan, + crate::SubgroupOperation::Add, + ) => write!(self.out, "{NAMESPACE}::simd_prefix_inclusive_sum(")?, + ( + crate::CollectiveOperation::InclusiveScan, + crate::SubgroupOperation::Mul, + ) => write!(self.out, "{NAMESPACE}::simd_prefix_inclusive_product(")?, + _ => unimplemented!(), + } + self.put_expression(argument, &context.expression, true)?; + writeln!(self.out, ");")?; + } + crate::Statement::SubgroupGather { + mode, + argument, + result, + } => { + write!(self.out, "{level}")?; + let name = self.namer.call(""); + self.start_baking_expression(result, &context.expression, &name)?; + self.named_expressions.insert(result, name); + match mode { + crate::GatherMode::BroadcastFirst => { + write!(self.out, "{NAMESPACE}::simd_broadcast_first(")?; + } + crate::GatherMode::Broadcast(_) => { + write!(self.out, "{NAMESPACE}::simd_broadcast(")?; + } + crate::GatherMode::Shuffle(_) => { + write!(self.out, "{NAMESPACE}::simd_shuffle(")?; + } + crate::GatherMode::ShuffleDown(_) => { + write!(self.out, "{NAMESPACE}::simd_shuffle_down(")?; + } + crate::GatherMode::ShuffleUp(_) => { + write!(self.out, "{NAMESPACE}::simd_shuffle_up(")?; + } + crate::GatherMode::ShuffleXor(_) => { + write!(self.out, "{NAMESPACE}::simd_shuffle_xor(")?; + } + } + self.put_expression(argument, &context.expression, true)?; + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + write!(self.out, ", ")?; + self.put_expression(index, &context.expression, true)?; + } + } + writeln!(self.out, ");")?; + } } } @@ -4492,6 +4563,12 @@ impl Writer { "{level}{NAMESPACE}::threadgroup_barrier({NAMESPACE}::mem_flags::mem_threadgroup);", )?; } + if flags.contains(crate::Barrier::SUB_GROUP) { + writeln!( + self.out, + "{level}{NAMESPACE}::simdgroup_barrier({NAMESPACE}::mem_flags::mem_threadgroup);", + )?; + } Ok(()) } } @@ -4762,8 +4839,8 @@ fn test_stack_size() { } let stack_size = addresses_end - addresses_start; // check the size (in debug only) - // last observed macOS value: 19152 (CI) - if !(9000..=20000).contains(&stack_size) { + // last observed macOS value: 22256 (CI) + if !(15000..=25000).contains(&stack_size) { panic!("`put_block` stack size {stack_size} has changed!"); } } diff --git a/naga/src/back/pipeline_constants.rs b/naga/src/back/pipeline_constants.rs index 50a6a3d57a..0dbe9cf4e8 100644 --- a/naga/src/back/pipeline_constants.rs +++ b/naga/src/back/pipeline_constants.rs @@ -129,8 +129,10 @@ pub fn process_overrides<'a>( Expression::Constant(c_h) } Expression::Constant(c_h) => { - adjusted_constant_initializers.insert(c_h); - module.constants[c_h].init = adjusted_global_expressions[c_h.index()]; + if adjusted_constant_initializers.insert(c_h) { + let init = &mut module.constants[c_h].init; + *init = adjusted_global_expressions[init.index()]; + } expr } expr => expr, @@ -522,7 +524,9 @@ fn adjust_expr(new_pos: &[Handle], expr: &mut Expression) { ty: _, comparison: _, } - | Expression::WorkGroupUniformLoadResult { ty: _ } => {} + | Expression::WorkGroupUniformLoadResult { 
ty: _ } + | Expression::SubgroupBallotResult + | Expression::SubgroupOperationResult { .. } => {} } } @@ -637,6 +641,41 @@ fn adjust_stmt(new_pos: &[Handle], stmt: &mut Statement) { adjust(pointer); adjust(result); } + Statement::SubgroupBallot { + ref mut result, + ref mut predicate, + } => { + if let Some(ref mut predicate) = *predicate { + adjust(predicate); + } + adjust(result); + } + Statement::SubgroupCollectiveOperation { + ref mut argument, + ref mut result, + .. + } => { + adjust(argument); + adjust(result); + } + Statement::SubgroupGather { + ref mut mode, + ref mut argument, + ref mut result, + } => { + match *mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(ref mut index) + | crate::GatherMode::Shuffle(ref mut index) + | crate::GatherMode::ShuffleDown(ref mut index) + | crate::GatherMode::ShuffleUp(ref mut index) + | crate::GatherMode::ShuffleXor(ref mut index) => { + adjust(index); + } + } + adjust(argument); + adjust(result) + } Statement::Call { ref mut arguments, ref mut result, diff --git a/naga/src/back/spv/block.rs b/naga/src/back/spv/block.rs index 9b8430e861..120d60fc40 100644 --- a/naga/src/back/spv/block.rs +++ b/naga/src/back/spv/block.rs @@ -1073,7 +1073,7 @@ impl<'w> BlockContext<'w> { // // bitfieldExtract(x, o, c) - let bit_width = arg_ty.scalar_width().unwrap(); + let bit_width = arg_ty.scalar_width().unwrap() * 8; let width_constant = self .writer .get_constant_scalar(crate::Literal::U32(bit_width as u32)); @@ -1129,7 +1129,7 @@ impl<'w> BlockContext<'w> { Mf::InsertBits => { // The behavior of InsertBits has the same undefined behavior as ExtractBits. - let bit_width = arg_ty.scalar_width().unwrap(); + let bit_width = arg_ty.scalar_width().unwrap() * 8; let width_constant = self .writer .get_constant_scalar(crate::Literal::U32(bit_width as u32)); @@ -1185,7 +1185,7 @@ impl<'w> BlockContext<'w> { } Mf::FindLsb => MathOp::Ext(spirv::GLOp::FindILsb), Mf::FindMsb => { - if arg_ty.scalar_width() == Some(32) { + if arg_ty.scalar_width() == Some(4) { let thing = match arg_scalar_kind { Some(crate::ScalarKind::Uint) => spirv::GLOp::FindUMsb, Some(crate::ScalarKind::Sint) => spirv::GLOp::FindSMsb, @@ -1279,7 +1279,9 @@ impl<'w> BlockContext<'w> { crate::Expression::CallResult(_) | crate::Expression::AtomicResult { .. } | crate::Expression::WorkGroupUniformLoadResult { .. } - | crate::Expression::RayQueryProceedResult => self.cached[expr_handle], + | crate::Expression::RayQueryProceedResult + | crate::Expression::SubgroupBallotResult + | crate::Expression::SubgroupOperationResult { .. 
} => self.cached[expr_handle], crate::Expression::As { expr, kind, @@ -2490,6 +2492,27 @@ impl<'w> BlockContext<'w> { crate::Statement::RayQuery { query, ref fun } => { self.write_ray_query_function(query, fun, &mut block); } + crate::Statement::SubgroupBallot { + result, + ref predicate, + } => { + self.write_subgroup_ballot(predicate, result, &mut block)?; + } + crate::Statement::SubgroupCollectiveOperation { + ref op, + ref collective_op, + argument, + result, + } => { + self.write_subgroup_operation(op, collective_op, argument, result, &mut block)?; + } + crate::Statement::SubgroupGather { + ref mode, + argument, + result, + } => { + self.write_subgroup_gather(mode, argument, result, &mut block)?; + } } } diff --git a/naga/src/back/spv/instructions.rs b/naga/src/back/spv/instructions.rs index f3acf01d6c..df2774ab9c 100644 --- a/naga/src/back/spv/instructions.rs +++ b/naga/src/back/spv/instructions.rs @@ -1073,6 +1073,73 @@ impl super::Instruction { instruction.add_operand(semantics_id); instruction } + + // Group Instructions + + pub(super) fn group_non_uniform_ballot( + result_type_id: Word, + id: Word, + exec_scope_id: Word, + predicate: Word, + ) -> Self { + let mut instruction = Self::new(Op::GroupNonUniformBallot); + instruction.set_type(result_type_id); + instruction.set_result(id); + instruction.add_operand(exec_scope_id); + instruction.add_operand(predicate); + + instruction + } + pub(super) fn group_non_uniform_broadcast_first( + result_type_id: Word, + id: Word, + exec_scope_id: Word, + value: Word, + ) -> Self { + let mut instruction = Self::new(Op::GroupNonUniformBroadcastFirst); + instruction.set_type(result_type_id); + instruction.set_result(id); + instruction.add_operand(exec_scope_id); + instruction.add_operand(value); + + instruction + } + pub(super) fn group_non_uniform_gather( + op: Op, + result_type_id: Word, + id: Word, + exec_scope_id: Word, + value: Word, + index: Word, + ) -> Self { + let mut instruction = Self::new(op); + instruction.set_type(result_type_id); + instruction.set_result(id); + instruction.add_operand(exec_scope_id); + instruction.add_operand(value); + instruction.add_operand(index); + + instruction + } + pub(super) fn group_non_uniform_arithmetic( + op: Op, + result_type_id: Word, + id: Word, + exec_scope_id: Word, + group_op: Option, + value: Word, + ) -> Self { + let mut instruction = Self::new(op); + instruction.set_type(result_type_id); + instruction.set_result(id); + instruction.add_operand(exec_scope_id); + if let Some(group_op) = group_op { + instruction.add_operand(group_op as u32); + } + instruction.add_operand(value); + + instruction + } } impl From for spirv::ImageFormat { diff --git a/naga/src/back/spv/mod.rs b/naga/src/back/spv/mod.rs index 8626bb104d..38a87049e6 100644 --- a/naga/src/back/spv/mod.rs +++ b/naga/src/back/spv/mod.rs @@ -13,6 +13,7 @@ mod layout; mod ray; mod recyclable; mod selection; +mod subgroup; mod writer; pub use spirv::Capability; @@ -247,7 +248,7 @@ impl LocalImageType { /// this, by converting everything possible to a `LocalType` before inspecting /// it. /// -/// ## `Localtype` equality and SPIR-V `OpType` uniqueness +/// ## `LocalType` equality and SPIR-V `OpType` uniqueness /// /// The definition of `Eq` on `LocalType` is carefully chosen to help us follow /// certain SPIR-V rules. 
SPIR-V §2.8 requires some classes of `OpType...` @@ -456,7 +457,7 @@ impl recyclable::Recyclable for CachedExpressions { #[derive(Eq, Hash, PartialEq)] enum CachedConstant { - Literal(crate::Literal), + Literal(crate::proc::HashableLiteral), Composite { ty: LookupType, constituent_ids: Vec, diff --git a/naga/src/back/spv/subgroup.rs b/naga/src/back/spv/subgroup.rs new file mode 100644 index 0000000000..c952cb11a7 --- /dev/null +++ b/naga/src/back/spv/subgroup.rs @@ -0,0 +1,207 @@ +use super::{Block, BlockContext, Error, Instruction}; +use crate::{ + arena::Handle, + back::spv::{LocalType, LookupType}, + TypeInner, +}; + +impl<'w> BlockContext<'w> { + pub(super) fn write_subgroup_ballot( + &mut self, + predicate: &Option>, + result: Handle, + block: &mut Block, + ) -> Result<(), Error> { + self.writer.require_any( + "GroupNonUniformBallot", + &[spirv::Capability::GroupNonUniformBallot], + )?; + let vec4_u32_type_id = self.get_type_id(LookupType::Local(LocalType::Value { + vector_size: Some(crate::VectorSize::Quad), + scalar: crate::Scalar::U32, + pointer_space: None, + })); + let exec_scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32); + let predicate = if let Some(predicate) = *predicate { + self.cached[predicate] + } else { + self.writer.get_constant_scalar(crate::Literal::Bool(true)) + }; + let id = self.gen_id(); + block.body.push(Instruction::group_non_uniform_ballot( + vec4_u32_type_id, + id, + exec_scope_id, + predicate, + )); + self.cached[result] = id; + Ok(()) + } + pub(super) fn write_subgroup_operation( + &mut self, + op: &crate::SubgroupOperation, + collective_op: &crate::CollectiveOperation, + argument: Handle, + result: Handle, + block: &mut Block, + ) -> Result<(), Error> { + use crate::SubgroupOperation as sg; + match *op { + sg::All | sg::Any => { + self.writer.require_any( + "GroupNonUniformVote", + &[spirv::Capability::GroupNonUniformVote], + )?; + } + _ => { + self.writer.require_any( + "GroupNonUniformArithmetic", + &[spirv::Capability::GroupNonUniformArithmetic], + )?; + } + } + + let id = self.gen_id(); + let result_ty = &self.fun_info[result].ty; + let result_type_id = self.get_expression_type_id(result_ty); + let result_ty_inner = result_ty.inner_with(&self.ir_module.types); + + let (is_scalar, scalar) = match *result_ty_inner { + TypeInner::Scalar(kind) => (true, kind), + TypeInner::Vector { scalar: kind, .. 
} => (false, kind), + _ => unimplemented!(), + }; + + use crate::ScalarKind as sk; + let spirv_op = match (scalar.kind, *op) { + (sk::Bool, sg::All) if is_scalar => spirv::Op::GroupNonUniformAll, + (sk::Bool, sg::Any) if is_scalar => spirv::Op::GroupNonUniformAny, + (_, sg::All | sg::Any) => unimplemented!(), + + (sk::Sint | sk::Uint, sg::Add) => spirv::Op::GroupNonUniformIAdd, + (sk::Float, sg::Add) => spirv::Op::GroupNonUniformFAdd, + (sk::Sint | sk::Uint, sg::Mul) => spirv::Op::GroupNonUniformIMul, + (sk::Float, sg::Mul) => spirv::Op::GroupNonUniformFMul, + (sk::Sint, sg::Max) => spirv::Op::GroupNonUniformSMax, + (sk::Uint, sg::Max) => spirv::Op::GroupNonUniformUMax, + (sk::Float, sg::Max) => spirv::Op::GroupNonUniformFMax, + (sk::Sint, sg::Min) => spirv::Op::GroupNonUniformSMin, + (sk::Uint, sg::Min) => spirv::Op::GroupNonUniformUMin, + (sk::Float, sg::Min) => spirv::Op::GroupNonUniformFMin, + (_, sg::Add | sg::Mul | sg::Min | sg::Max) => unimplemented!(), + + (sk::Sint | sk::Uint, sg::And) => spirv::Op::GroupNonUniformBitwiseAnd, + (sk::Sint | sk::Uint, sg::Or) => spirv::Op::GroupNonUniformBitwiseOr, + (sk::Sint | sk::Uint, sg::Xor) => spirv::Op::GroupNonUniformBitwiseXor, + (sk::Bool, sg::And) => spirv::Op::GroupNonUniformLogicalAnd, + (sk::Bool, sg::Or) => spirv::Op::GroupNonUniformLogicalOr, + (sk::Bool, sg::Xor) => spirv::Op::GroupNonUniformLogicalXor, + (_, sg::And | sg::Or | sg::Xor) => unimplemented!(), + }; + + let exec_scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32); + + use crate::CollectiveOperation as c; + let group_op = match *op { + sg::All | sg::Any => None, + _ => Some(match *collective_op { + c::Reduce => spirv::GroupOperation::Reduce, + c::InclusiveScan => spirv::GroupOperation::InclusiveScan, + c::ExclusiveScan => spirv::GroupOperation::ExclusiveScan, + }), + }; + + let arg_id = self.cached[argument]; + block.body.push(Instruction::group_non_uniform_arithmetic( + spirv_op, + result_type_id, + id, + exec_scope_id, + group_op, + arg_id, + )); + self.cached[result] = id; + Ok(()) + } + pub(super) fn write_subgroup_gather( + &mut self, + mode: &crate::GatherMode, + argument: Handle, + result: Handle, + block: &mut Block, + ) -> Result<(), Error> { + self.writer.require_any( + "GroupNonUniformBallot", + &[spirv::Capability::GroupNonUniformBallot], + )?; + match *mode { + crate::GatherMode::BroadcastFirst | crate::GatherMode::Broadcast(_) => { + self.writer.require_any( + "GroupNonUniformBallot", + &[spirv::Capability::GroupNonUniformBallot], + )?; + } + crate::GatherMode::Shuffle(_) | crate::GatherMode::ShuffleXor(_) => { + self.writer.require_any( + "GroupNonUniformShuffle", + &[spirv::Capability::GroupNonUniformShuffle], + )?; + } + crate::GatherMode::ShuffleDown(_) | crate::GatherMode::ShuffleUp(_) => { + self.writer.require_any( + "GroupNonUniformShuffleRelative", + &[spirv::Capability::GroupNonUniformShuffleRelative], + )?; + } + } + + let id = self.gen_id(); + let result_ty = &self.fun_info[result].ty; + let result_type_id = self.get_expression_type_id(result_ty); + + let exec_scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32); + + let arg_id = self.cached[argument]; + match *mode { + crate::GatherMode::BroadcastFirst => { + block + .body + .push(Instruction::group_non_uniform_broadcast_first( + result_type_id, + id, + exec_scope_id, + arg_id, + )); + } + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | 
crate::GatherMode::ShuffleXor(index) => { + let index_id = self.cached[index]; + let op = match *mode { + crate::GatherMode::BroadcastFirst => unreachable!(), + // Use shuffle to emit broadcast to allow the index to + // be dynamically uniform on Vulkan 1.1. The argument to + // OpGroupNonUniformBroadcast must be a constant pre SPIR-V + // 1.5 (vulkan 1.2) + crate::GatherMode::Broadcast(_) => spirv::Op::GroupNonUniformShuffle, + crate::GatherMode::Shuffle(_) => spirv::Op::GroupNonUniformShuffle, + crate::GatherMode::ShuffleDown(_) => spirv::Op::GroupNonUniformShuffleDown, + crate::GatherMode::ShuffleUp(_) => spirv::Op::GroupNonUniformShuffleUp, + crate::GatherMode::ShuffleXor(_) => spirv::Op::GroupNonUniformShuffleXor, + }; + block.body.push(Instruction::group_non_uniform_gather( + op, + result_type_id, + id, + exec_scope_id, + arg_id, + index_id, + )); + } + } + self.cached[result] = id; + Ok(()) + } +} diff --git a/naga/src/back/spv/writer.rs b/naga/src/back/spv/writer.rs index cf96fa59b4..73a16c273e 100644 --- a/naga/src/back/spv/writer.rs +++ b/naga/src/back/spv/writer.rs @@ -970,6 +970,11 @@ impl Writer { handle: Handle, ) -> Result { let ty = &arena[handle]; + // If it's a type that needs SPIR-V capabilities, request them now. + // This needs to happen regardless of the LocalType lookup succeeding, + // because some types which map to the same LocalType have different + // capability requirements. See https://github.com/gfx-rs/wgpu/issues/5569 + self.request_type_capabilities(&ty.inner)?; let id = if let Some(local) = make_local(&ty.inner) { // This type can be represented as a `LocalType`, so check if we've // already written an instruction for it. If not, do so now, with @@ -985,10 +990,6 @@ impl Writer { self.write_type_declaration_local(id, local); - // If it's a type that needs SPIR-V capabilities, request them now, - // so write_type_declaration_local can stay infallible. 
- self.request_type_capabilities(&ty.inner)?; - id } } @@ -1150,7 +1151,7 @@ impl Writer { } pub(super) fn get_constant_scalar(&mut self, value: crate::Literal) -> Word { - let scalar = CachedConstant::Literal(value); + let scalar = CachedConstant::Literal(value.into()); if let Some(&id) = self.cached_constants.get(&scalar) { return id; } @@ -1310,7 +1311,11 @@ impl Writer { spirv::MemorySemantics::WORKGROUP_MEMORY, flags.contains(crate::Barrier::WORK_GROUP), ); - let exec_scope_id = self.get_index_constant(spirv::Scope::Workgroup as u32); + let exec_scope_id = if flags.contains(crate::Barrier::SUB_GROUP) { + self.get_index_constant(spirv::Scope::Subgroup as u32) + } else { + self.get_index_constant(spirv::Scope::Workgroup as u32) + }; let mem_scope_id = self.get_index_constant(memory_scope as u32); let semantics_id = self.get_index_constant(semantics.bits()); block.body.push(Instruction::control_barrier( @@ -1585,6 +1590,41 @@ impl Writer { Bi::WorkGroupId => BuiltIn::WorkgroupId, Bi::WorkGroupSize => BuiltIn::WorkgroupSize, Bi::NumWorkGroups => BuiltIn::NumWorkgroups, + // Subgroup + Bi::NumSubgroups => { + self.require_any( + "`num_subgroups` built-in", + &[spirv::Capability::GroupNonUniform], + )?; + BuiltIn::NumSubgroups + } + Bi::SubgroupId => { + self.require_any( + "`subgroup_id` built-in", + &[spirv::Capability::GroupNonUniform], + )?; + BuiltIn::SubgroupId + } + Bi::SubgroupSize => { + self.require_any( + "`subgroup_size` built-in", + &[ + spirv::Capability::GroupNonUniform, + spirv::Capability::SubgroupBallotKHR, + ], + )?; + BuiltIn::SubgroupSize + } + Bi::SubgroupInvocationId => { + self.require_any( + "`subgroup_invocation_id` built-in", + &[ + spirv::Capability::GroupNonUniform, + spirv::Capability::SubgroupBallotKHR, + ], + )?; + BuiltIn::SubgroupLocalInvocationId + } }; self.decorate(id, Decoration::BuiltIn, &[built_in as u32]); diff --git a/naga/src/back/wgsl/writer.rs b/naga/src/back/wgsl/writer.rs index b63e16da3b..789f6f62bf 100644 --- a/naga/src/back/wgsl/writer.rs +++ b/naga/src/back/wgsl/writer.rs @@ -924,8 +924,124 @@ impl Writer { if barrier.contains(crate::Barrier::WORK_GROUP) { writeln!(self.out, "{level}workgroupBarrier();")?; } + + if barrier.contains(crate::Barrier::SUB_GROUP) { + writeln!(self.out, "{level}subgroupBarrier();")?; + } } Statement::RayQuery { .. } => unreachable!(), + Statement::SubgroupBallot { result, predicate } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + self.start_named_expr(module, result, func_ctx, &res_name)?; + self.named_expressions.insert(result, res_name); + + write!(self.out, "subgroupBallot(")?; + if let Some(predicate) = predicate { + self.write_expr(module, predicate, func_ctx)?; + } + writeln!(self.out, ");")?; + } + Statement::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + self.start_named_expr(module, result, func_ctx, &res_name)?; + self.named_expressions.insert(result, res_name); + + match (collective_op, op) { + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => { + write!(self.out, "subgroupAll(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => { + write!(self.out, "subgroupAny(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupAdd(")? 
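Across the WGSL writer below (and the GLSL/HLSL/MSL writers earlier in this patch), every subgroup statement bakes its result into a named local built from `back::BAKE_PREFIX` plus the result expression's index, and the `named_expressions` map makes later reads reference that name instead of re-emitting the call. A minimal sketch of the naming scheme, assuming the prefix is `"_e"` as in naga's backend (the rest is illustrative):

```rust
// Assumed value of naga's `back::BAKE_PREFIX`; the helper is illustrative.
const BAKE_PREFIX: &str = "_e";

fn baked_name(expr_index: usize) -> String {
    format!("{BAKE_PREFIX}{expr_index}")
}

fn main() {
    // A ballot whose result is expression #7 would be emitted in WGSL as:
    //     let _e7 = subgroupBallot(pred);
    // and every later use of that expression prints `_e7`.
    assert_eq!(baked_name(7), "_e7");
}
```

Baking is what keeps side-effecting subgroup intrinsics single-use: the call appears exactly once in the output, no matter how many IR expressions consume its result.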
+ } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupMul(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => { + write!(self.out, "subgroupMax(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => { + write!(self.out, "subgroupMin(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => { + write!(self.out, "subgroupAnd(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => { + write!(self.out, "subgroupOr(")? + } + (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => { + write!(self.out, "subgroupXor(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupExclusiveAdd(")? + } + (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupExclusiveMul(")? + } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => { + write!(self.out, "subgroupInclusiveAdd(")? + } + (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => { + write!(self.out, "subgroupInclusiveMul(")? + } + _ => unimplemented!(), + } + self.write_expr(module, argument, func_ctx)?; + writeln!(self.out, ");")?; + } + Statement::SubgroupGather { + mode, + argument, + result, + } => { + write!(self.out, "{level}")?; + let res_name = format!("{}{}", back::BAKE_PREFIX, result.index()); + self.start_named_expr(module, result, func_ctx, &res_name)?; + self.named_expressions.insert(result, res_name); + + match mode { + crate::GatherMode::BroadcastFirst => { + write!(self.out, "subgroupBroadcastFirst(")?; + } + crate::GatherMode::Broadcast(_) => { + write!(self.out, "subgroupBroadcast(")?; + } + crate::GatherMode::Shuffle(_) => { + write!(self.out, "subgroupShuffle(")?; + } + crate::GatherMode::ShuffleDown(_) => { + write!(self.out, "subgroupShuffleDown(")?; + } + crate::GatherMode::ShuffleUp(_) => { + write!(self.out, "subgroupShuffleUp(")?; + } + crate::GatherMode::ShuffleXor(_) => { + write!(self.out, "subgroupShuffleXor(")?; + } + } + self.write_expr(module, argument, func_ctx)?; + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + write!(self.out, ", ")?; + self.write_expr(module, index, func_ctx)?; + } + } + writeln!(self.out, ");")?; + } } Ok(()) @@ -1698,6 +1814,8 @@ impl Writer { Expression::CallResult(_) | Expression::AtomicResult { .. } | Expression::RayQueryProceedResult + | Expression::SubgroupBallotResult + | Expression::SubgroupOperationResult { .. } | Expression::WorkGroupUniformLoadResult { .. 
} => {} } @@ -1799,6 +1917,10 @@ fn builtin_str(built_in: crate::BuiltIn) -> Result<&'static str, Error> { Bi::SampleMask => "sample_mask", Bi::PrimitiveIndex => "primitive_index", Bi::ViewIndex => "view_index", + Bi::NumSubgroups => "num_subgroups", + Bi::SubgroupId => "subgroup_id", + Bi::SubgroupSize => "subgroup_size", + Bi::SubgroupInvocationId => "subgroup_invocation_id", Bi::BaseInstance | Bi::BaseVertex | Bi::ClipDistance diff --git a/naga/src/compact/expressions.rs b/naga/src/compact/expressions.rs index 0f2d8b1a02..a418bde301 100644 --- a/naga/src/compact/expressions.rs +++ b/naga/src/compact/expressions.rs @@ -72,6 +72,7 @@ impl<'tracer> ExpressionTracer<'tracer> { | Ex::GlobalVariable(_) | Ex::LocalVariable(_) | Ex::CallResult(_) + | Ex::SubgroupBallotResult | Ex::RayQueryProceedResult => {} Ex::Constant(handle) => { @@ -192,6 +193,7 @@ impl<'tracer> ExpressionTracer<'tracer> { Ex::AtomicResult { ty, comparison: _ } => self.types_used.insert(ty), Ex::WorkGroupUniformLoadResult { ty } => self.types_used.insert(ty), Ex::ArrayLength(expr) => self.expressions_used.insert(expr), + Ex::SubgroupOperationResult { ty } => self.types_used.insert(ty), Ex::RayQueryGetIntersection { query, committed: _, @@ -223,6 +225,7 @@ impl ModuleMap { | Ex::GlobalVariable(_) | Ex::LocalVariable(_) | Ex::CallResult(_) + | Ex::SubgroupBallotResult | Ex::RayQueryProceedResult => {} // All overrides are retained, so their handles never change. @@ -353,6 +356,7 @@ impl ModuleMap { comparison: _, } => self.types.adjust(ty), Ex::WorkGroupUniformLoadResult { ref mut ty } => self.types.adjust(ty), + Ex::SubgroupOperationResult { ref mut ty } => self.types.adjust(ty), Ex::ArrayLength(ref mut expr) => adjust(expr), Ex::RayQueryGetIntersection { ref mut query, diff --git a/naga/src/compact/statements.rs b/naga/src/compact/statements.rs index 0698b57258..a124281bc1 100644 --- a/naga/src/compact/statements.rs +++ b/naga/src/compact/statements.rs @@ -97,6 +97,39 @@ impl FunctionTracer<'_> { self.expressions_used.insert(query); self.trace_ray_query_function(fun); } + St::SubgroupBallot { result, predicate } => { + if let Some(predicate) = predicate { + self.expressions_used.insert(predicate) + } + self.expressions_used.insert(result) + } + St::SubgroupCollectiveOperation { + op: _, + collective_op: _, + argument, + result, + } => { + self.expressions_used.insert(argument); + self.expressions_used.insert(result) + } + St::SubgroupGather { + mode, + argument, + result, + } => { + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + self.expressions_used.insert(index) + } + } + self.expressions_used.insert(argument); + self.expressions_used.insert(result) + } // Trivial statements. 
St::Break @@ -250,6 +283,40 @@ impl FunctionMap { adjust(query); self.adjust_ray_query_function(fun); } + St::SubgroupBallot { + ref mut result, + ref mut predicate, + } => { + if let Some(ref mut predicate) = *predicate { + adjust(predicate); + } + adjust(result); + } + St::SubgroupCollectiveOperation { + op: _, + collective_op: _, + ref mut argument, + ref mut result, + } => { + adjust(argument); + adjust(result); + } + St::SubgroupGather { + ref mut mode, + ref mut argument, + ref mut result, + } => { + match *mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(ref mut index) + | crate::GatherMode::Shuffle(ref mut index) + | crate::GatherMode::ShuffleDown(ref mut index) + | crate::GatherMode::ShuffleUp(ref mut index) + | crate::GatherMode::ShuffleXor(ref mut index) => adjust(index), + } + adjust(argument); + adjust(result); + } // Trivial statements. St::Break diff --git a/naga/src/error.rs b/naga/src/error.rs new file mode 100644 index 0000000000..5f2e28360b --- /dev/null +++ b/naga/src/error.rs @@ -0,0 +1,74 @@ +use std::{error::Error, fmt}; + +#[derive(Clone, Debug)] +pub struct ShaderError { + /// The source code of the shader. + pub source: String, + pub label: Option, + pub inner: Box, +} + +#[cfg(feature = "wgsl-in")] +impl fmt::Display for ShaderError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let label = self.label.as_deref().unwrap_or_default(); + let string = self.inner.emit_to_string(&self.source); + write!(f, "\nShader '{label}' parsing {string}") + } +} +#[cfg(feature = "glsl-in")] +impl fmt::Display for ShaderError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let label = self.label.as_deref().unwrap_or_default(); + let string = self.inner.emit_to_string(&self.source); + write!(f, "\nShader '{label}' parsing {string}") + } +} +#[cfg(feature = "spv-in")] +impl fmt::Display for ShaderError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let label = self.label.as_deref().unwrap_or_default(); + let string = self.inner.emit_to_string(&self.source); + write!(f, "\nShader '{label}' parsing {string}") + } +} +impl fmt::Display for ShaderError> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use codespan_reporting::{ + diagnostic::{Diagnostic, Label}, + files::SimpleFile, + term, + }; + + let label = self.label.as_deref().unwrap_or_default(); + let files = SimpleFile::new(label, &self.source); + let config = term::Config::default(); + let mut writer = term::termcolor::NoColor::new(Vec::new()); + + let diagnostic = Diagnostic::error().with_labels( + self.inner + .spans() + .map(|&(span, ref desc)| { + Label::primary((), span.to_range().unwrap()).with_message(desc.to_owned()) + }) + .collect(), + ); + + term::emit(&mut writer, &config, &files, &diagnostic).expect("cannot write error"); + + write!( + f, + "\nShader validation {}", + String::from_utf8_lossy(&writer.into_inner()) + ) + } +} +impl Error for ShaderError +where + ShaderError: fmt::Display, + E: Error + 'static, +{ + fn source(&self) -> Option<&(dyn Error + 'static)> { + Some(&self.inner) + } +} diff --git a/naga/src/front/glsl/error.rs b/naga/src/front/glsl/error.rs index bd16ee30bc..e0771437e6 100644 --- a/naga/src/front/glsl/error.rs +++ b/naga/src/front/glsl/error.rs @@ -1,4 +1,5 @@ use super::token::TokenValue; +use crate::SourceLocation; use crate::{proc::ConstantEvaluatorError, Span}; use codespan_reporting::diagnostic::{Diagnostic, Label}; use codespan_reporting::files::SimpleFile; @@ -137,14 +138,21 @@ pub struct 
Error { pub meta: Span, } +impl Error { + /// Returns a [`SourceLocation`] for the error message. + pub fn location(&self, source: &str) -> Option { + Some(self.meta.location(source)) + } +} + /// A collection of errors returned during shader parsing. #[derive(Clone, Debug)] #[cfg_attr(test, derive(PartialEq))] -pub struct ParseError { +pub struct ParseErrors { pub errors: Vec, } -impl ParseError { +impl ParseErrors { pub fn emit_to_writer(&self, writer: &mut impl WriteColor, source: &str) { self.emit_to_writer_with_path(writer, source, "glsl"); } @@ -172,19 +180,19 @@ impl ParseError { } } -impl std::fmt::Display for ParseError { +impl std::fmt::Display for ParseErrors { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { self.errors.iter().try_for_each(|e| write!(f, "{e:?}")) } } -impl std::error::Error for ParseError { +impl std::error::Error for ParseErrors { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { None } } -impl From> for ParseError { +impl From> for ParseErrors { fn from(errors: Vec) -> Self { Self { errors } } diff --git a/naga/src/front/glsl/mod.rs b/naga/src/front/glsl/mod.rs index 75f3929db4..ea202b2445 100644 --- a/naga/src/front/glsl/mod.rs +++ b/naga/src/front/glsl/mod.rs @@ -13,7 +13,7 @@ To begin, take a look at the documentation for the [`Frontend`]. */ pub use ast::{Precision, Profile}; -pub use error::{Error, ErrorKind, ExpectedToken, ParseError}; +pub use error::{Error, ErrorKind, ExpectedToken, ParseErrors}; pub use token::TokenValue; use crate::{proc::Layouter, FastHashMap, FastHashSet, Handle, Module, ShaderStage, Span, Type}; @@ -196,7 +196,7 @@ impl Frontend { &mut self, options: &Options, source: &str, - ) -> std::result::Result { + ) -> std::result::Result { self.reset(options.stage); let lexer = lex::Lexer::new(source, &options.defines); diff --git a/naga/src/front/glsl/parser_tests.rs b/naga/src/front/glsl/parser_tests.rs index c065dc15d6..135765ca58 100644 --- a/naga/src/front/glsl/parser_tests.rs +++ b/naga/src/front/glsl/parser_tests.rs @@ -1,7 +1,7 @@ use super::{ ast::Profile, error::ExpectedToken, - error::{Error, ErrorKind, ParseError}, + error::{Error, ErrorKind, ParseErrors}, token::TokenValue, Frontend, Options, Span, }; @@ -21,7 +21,7 @@ fn version() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::InvalidVersion(99000), meta: Span::new(9, 14) @@ -37,7 +37,7 @@ fn version() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::InvalidVersion(449), meta: Span::new(9, 12) @@ -53,7 +53,7 @@ fn version() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::InvalidProfile("smart".into()), meta: Span::new(13, 18), @@ -69,7 +69,7 @@ fn version() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![ Error { kind: ErrorKind::PreprocessorError(PreprocessorError::UnexpectedHash,), @@ -455,7 +455,7 @@ fn functions() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::SemanticError("Function already defined".into()), meta: Span::new(134, 152), @@ -634,7 +634,7 @@ fn implicit_conversions() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::SemanticError("Unknown function \'test\'".into()), meta: Span::new(156, 165), @@ -658,7 +658,7 @@ fn implicit_conversions() { ) .err() .unwrap(), - ParseError { + ParseErrors { errors: vec![Error { kind: ErrorKind::SemanticError("Ambiguous best function for \'test\'".into()), meta: 
Span::new(158, 165), diff --git a/naga/src/front/spv/convert.rs b/naga/src/front/spv/convert.rs index f0a714fbeb..a6bf0e0451 100644 --- a/naga/src/front/spv/convert.rs +++ b/naga/src/front/spv/convert.rs @@ -153,6 +153,11 @@ pub(super) fn map_builtin(word: spirv::Word, invariant: bool) -> Result crate::BuiltIn::WorkGroupId, Some(Bi::WorkgroupSize) => crate::BuiltIn::WorkGroupSize, Some(Bi::NumWorkgroups) => crate::BuiltIn::NumWorkGroups, + // subgroup + Some(Bi::NumSubgroups) => crate::BuiltIn::NumSubgroups, + Some(Bi::SubgroupId) => crate::BuiltIn::SubgroupId, + Some(Bi::SubgroupSize) => crate::BuiltIn::SubgroupSize, + Some(Bi::SubgroupLocalInvocationId) => crate::BuiltIn::SubgroupInvocationId, _ => return Err(Error::UnsupportedBuiltIn(word)), }) } diff --git a/naga/src/front/spv/error.rs b/naga/src/front/spv/error.rs index 2825a44a00..44beadce98 100644 --- a/naga/src/front/spv/error.rs +++ b/naga/src/front/spv/error.rs @@ -5,7 +5,7 @@ use codespan_reporting::files::SimpleFile; use codespan_reporting::term; use termcolor::{NoColor, WriteColor}; -#[derive(Debug, thiserror::Error)] +#[derive(Clone, Debug, thiserror::Error)] pub enum Error { #[error("invalid header")] InvalidHeader, @@ -58,6 +58,8 @@ pub enum Error { UnknownBinaryOperator(spirv::Op), #[error("unknown relational function {0:?}")] UnknownRelationalFunction(spirv::Op), + #[error("unsupported group operation %{0}")] + UnsupportedGroupOperation(spirv::Word), #[error("invalid parameter {0:?}")] InvalidParameter(spirv::Op), #[error("invalid operand count {1} for {0:?}")] diff --git a/naga/src/front/spv/mod.rs b/naga/src/front/spv/mod.rs index 2ad40677fb..7ac5a18cd6 100644 --- a/naga/src/front/spv/mod.rs +++ b/naga/src/front/spv/mod.rs @@ -3700,6 +3700,254 @@ impl> Frontend { }, ); } + Op::GroupNonUniformBallot => { + inst.expect(5)?; + block.extend(emitter.finish(ctx.expressions)); + let result_type_id = self.next()?; + let result_id = self.next()?; + let exec_scope_id = self.next()?; + let predicate_id = self.next()?; + + let exec_scope_const = self.lookup_constant.lookup(exec_scope_id)?; + let _exec_scope = resolve_constant(ctx.gctx(), &exec_scope_const.inner) + .filter(|exec_scope| *exec_scope == spirv::Scope::Subgroup as u32) + .ok_or(Error::InvalidBarrierScope(exec_scope_id))?; + + let predicate = if self + .lookup_constant + .lookup(predicate_id) + .ok() + .filter(|predicate_const| match predicate_const.inner { + Constant::Constant(constant) => matches!( + ctx.gctx().global_expressions[ctx.gctx().constants[constant].init], + crate::Expression::Literal(crate::Literal::Bool(true)), + ), + Constant::Override(_) => false, + }) + .is_some() + { + None + } else { + let predicate_lookup = self.lookup_expression.lookup(predicate_id)?; + let predicate_handle = get_expr_handle!(predicate_id, predicate_lookup); + Some(predicate_handle) + }; + + let result_handle = ctx + .expressions + .append(crate::Expression::SubgroupBallotResult, span); + self.lookup_expression.insert( + result_id, + LookupExpression { + handle: result_handle, + type_id: result_type_id, + block_id, + }, + ); + + block.push( + crate::Statement::SubgroupBallot { + result: result_handle, + predicate, + }, + span, + ); + emitter.start(ctx.expressions); + } + spirv::Op::GroupNonUniformAll + | spirv::Op::GroupNonUniformAny + | spirv::Op::GroupNonUniformIAdd + | spirv::Op::GroupNonUniformFAdd + | spirv::Op::GroupNonUniformIMul + | spirv::Op::GroupNonUniformFMul + | spirv::Op::GroupNonUniformSMax + | spirv::Op::GroupNonUniformUMax + | spirv::Op::GroupNonUniformFMax + | 
spirv::Op::GroupNonUniformSMin + | spirv::Op::GroupNonUniformUMin + | spirv::Op::GroupNonUniformFMin + | spirv::Op::GroupNonUniformBitwiseAnd + | spirv::Op::GroupNonUniformBitwiseOr + | spirv::Op::GroupNonUniformBitwiseXor + | spirv::Op::GroupNonUniformLogicalAnd + | spirv::Op::GroupNonUniformLogicalOr + | spirv::Op::GroupNonUniformLogicalXor => { + block.extend(emitter.finish(ctx.expressions)); + inst.expect( + if matches!( + inst.op, + spirv::Op::GroupNonUniformAll | spirv::Op::GroupNonUniformAny + ) { + 5 + } else { + 6 + }, + )?; + let result_type_id = self.next()?; + let result_id = self.next()?; + let exec_scope_id = self.next()?; + let collective_op_id = match inst.op { + spirv::Op::GroupNonUniformAll | spirv::Op::GroupNonUniformAny => { + crate::CollectiveOperation::Reduce + } + _ => { + let group_op_id = self.next()?; + match spirv::GroupOperation::from_u32(group_op_id) { + Some(spirv::GroupOperation::Reduce) => { + crate::CollectiveOperation::Reduce + } + Some(spirv::GroupOperation::InclusiveScan) => { + crate::CollectiveOperation::InclusiveScan + } + Some(spirv::GroupOperation::ExclusiveScan) => { + crate::CollectiveOperation::ExclusiveScan + } + _ => return Err(Error::UnsupportedGroupOperation(group_op_id)), + } + } + }; + let argument_id = self.next()?; + + let argument_lookup = self.lookup_expression.lookup(argument_id)?; + let argument_handle = get_expr_handle!(argument_id, argument_lookup); + + let exec_scope_const = self.lookup_constant.lookup(exec_scope_id)?; + let _exec_scope = resolve_constant(ctx.gctx(), &exec_scope_const.inner) + .filter(|exec_scope| *exec_scope == spirv::Scope::Subgroup as u32) + .ok_or(Error::InvalidBarrierScope(exec_scope_id))?; + + let op_id = match inst.op { + spirv::Op::GroupNonUniformAll => crate::SubgroupOperation::All, + spirv::Op::GroupNonUniformAny => crate::SubgroupOperation::Any, + spirv::Op::GroupNonUniformIAdd | spirv::Op::GroupNonUniformFAdd => { + crate::SubgroupOperation::Add + } + spirv::Op::GroupNonUniformIMul | spirv::Op::GroupNonUniformFMul => { + crate::SubgroupOperation::Mul + } + spirv::Op::GroupNonUniformSMax + | spirv::Op::GroupNonUniformUMax + | spirv::Op::GroupNonUniformFMax => crate::SubgroupOperation::Max, + spirv::Op::GroupNonUniformSMin + | spirv::Op::GroupNonUniformUMin + | spirv::Op::GroupNonUniformFMin => crate::SubgroupOperation::Min, + spirv::Op::GroupNonUniformBitwiseAnd + | spirv::Op::GroupNonUniformLogicalAnd => crate::SubgroupOperation::And, + spirv::Op::GroupNonUniformBitwiseOr + | spirv::Op::GroupNonUniformLogicalOr => crate::SubgroupOperation::Or, + spirv::Op::GroupNonUniformBitwiseXor + | spirv::Op::GroupNonUniformLogicalXor => crate::SubgroupOperation::Xor, + _ => unreachable!(), + }; + + let result_type = self.lookup_type.lookup(result_type_id)?; + + let result_handle = ctx.expressions.append( + crate::Expression::SubgroupOperationResult { + ty: result_type.handle, + }, + span, + ); + self.lookup_expression.insert( + result_id, + LookupExpression { + handle: result_handle, + type_id: result_type_id, + block_id, + }, + ); + + block.push( + crate::Statement::SubgroupCollectiveOperation { + result: result_handle, + op: op_id, + collective_op: collective_op_id, + argument: argument_handle, + }, + span, + ); + emitter.start(ctx.expressions); + } + Op::GroupNonUniformBroadcastFirst + | Op::GroupNonUniformBroadcast + | Op::GroupNonUniformShuffle + | Op::GroupNonUniformShuffleDown + | Op::GroupNonUniformShuffleUp + | Op::GroupNonUniformShuffleXor => { + inst.expect( + if matches!(inst.op, 
spirv::Op::GroupNonUniformBroadcastFirst) { + 5 + } else { + 6 + }, + )?; + block.extend(emitter.finish(ctx.expressions)); + let result_type_id = self.next()?; + let result_id = self.next()?; + let exec_scope_id = self.next()?; + let argument_id = self.next()?; + + let argument_lookup = self.lookup_expression.lookup(argument_id)?; + let argument_handle = get_expr_handle!(argument_id, argument_lookup); + + let exec_scope_const = self.lookup_constant.lookup(exec_scope_id)?; + let _exec_scope = resolve_constant(ctx.gctx(), &exec_scope_const.inner) + .filter(|exec_scope| *exec_scope == spirv::Scope::Subgroup as u32) + .ok_or(Error::InvalidBarrierScope(exec_scope_id))?; + + let mode = if matches!(inst.op, spirv::Op::GroupNonUniformBroadcastFirst) { + crate::GatherMode::BroadcastFirst + } else { + let index_id = self.next()?; + let index_lookup = self.lookup_expression.lookup(index_id)?; + let index_handle = get_expr_handle!(index_id, index_lookup); + match inst.op { + spirv::Op::GroupNonUniformBroadcast => { + crate::GatherMode::Broadcast(index_handle) + } + spirv::Op::GroupNonUniformShuffle => { + crate::GatherMode::Shuffle(index_handle) + } + spirv::Op::GroupNonUniformShuffleDown => { + crate::GatherMode::ShuffleDown(index_handle) + } + spirv::Op::GroupNonUniformShuffleUp => { + crate::GatherMode::ShuffleUp(index_handle) + } + spirv::Op::GroupNonUniformShuffleXor => { + crate::GatherMode::ShuffleXor(index_handle) + } + _ => unreachable!(), + } + }; + + let result_type = self.lookup_type.lookup(result_type_id)?; + + let result_handle = ctx.expressions.append( + crate::Expression::SubgroupOperationResult { + ty: result_type.handle, + }, + span, + ); + self.lookup_expression.insert( + result_id, + LookupExpression { + handle: result_handle, + type_id: result_type_id, + block_id, + }, + ); + + block.push( + crate::Statement::SubgroupGather { + result: result_handle, + mode, + argument: argument_handle, + }, + span, + ); + emitter.start(ctx.expressions); + } _ => return Err(Error::UnsupportedInstruction(self.state, inst.op)), } }; @@ -3824,7 +4072,10 @@ impl> Frontend { | S::Store { .. } | S::ImageStore { .. } | S::Atomic { .. } - | S::RayQuery { .. } => {} + | S::RayQuery { .. } + | S::SubgroupBallot { .. } + | S::SubgroupCollectiveOperation { .. } + | S::SubgroupGather { .. } => {} S::Call { function: ref mut callee, ref arguments, diff --git a/naga/src/front/wgsl/error.rs b/naga/src/front/wgsl/error.rs index 24e6c9f8c5..dc1339521c 100644 --- a/naga/src/front/wgsl/error.rs +++ b/naga/src/front/wgsl/error.rs @@ -13,6 +13,7 @@ use thiserror::Error; #[derive(Clone, Debug)] pub struct ParseError { message: String, + // The first span should be the primary span, and the other ones should be complementary. 
labels: Vec<(Span, Cow<'static, str>)>, notes: Vec, } diff --git a/naga/src/front/wgsl/lower/mod.rs b/naga/src/front/wgsl/lower/mod.rs index 77212f2086..e7cce17723 100644 --- a/naga/src/front/wgsl/lower/mod.rs +++ b/naga/src/front/wgsl/lower/mod.rs @@ -874,6 +874,29 @@ impl Texture { } } +enum SubgroupGather { + BroadcastFirst, + Broadcast, + Shuffle, + ShuffleDown, + ShuffleUp, + ShuffleXor, +} + +impl SubgroupGather { + pub fn map(word: &str) -> Option { + Some(match word { + "subgroupBroadcastFirst" => Self::BroadcastFirst, + "subgroupBroadcast" => Self::Broadcast, + "subgroupShuffle" => Self::Shuffle, + "subgroupShuffleDown" => Self::ShuffleDown, + "subgroupShuffleUp" => Self::ShuffleUp, + "subgroupShuffleXor" => Self::ShuffleXor, + _ => return None, + }) + } +} + pub struct Lowerer<'source, 'temp> { index: &'temp Index<'source>, layouter: Layouter, @@ -2054,6 +2077,16 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { } } else if let Some(fun) = Texture::map(function.name) { self.texture_sample_helper(fun, arguments, span, ctx)? + } else if let Some((op, cop)) = conv::map_subgroup_operation(function.name) { + return Ok(Some( + self.subgroup_operation_helper(span, op, cop, arguments, ctx)?, + )); + } else if let Some(mode) = SubgroupGather::map(function.name) { + return Ok(Some( + self.subgroup_gather_helper(span, mode, arguments, ctx)?, + )); + } else if let Some(fun) = crate::AtomicFunction::map(function.name) { + return Ok(Some(self.atomic_helper(span, fun, arguments, ctx)?)); } else { match function.name { "select" => { @@ -2099,70 +2132,6 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { .push(crate::Statement::Store { pointer, value }, span); return Ok(None); } - "atomicAdd" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::Add, - arguments, - ctx, - )?)) - } - "atomicSub" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::Subtract, - arguments, - ctx, - )?)) - } - "atomicAnd" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::And, - arguments, - ctx, - )?)) - } - "atomicOr" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::InclusiveOr, - arguments, - ctx, - )?)) - } - "atomicXor" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::ExclusiveOr, - arguments, - ctx, - )?)) - } - "atomicMin" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::Min, - arguments, - ctx, - )?)) - } - "atomicMax" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::Max, - arguments, - ctx, - )?)) - } - "atomicExchange" => { - return Ok(Some(self.atomic_helper( - span, - crate::AtomicFunction::Exchange { compare: None }, - arguments, - ctx, - )?)) - } "atomicCompareExchangeWeak" => { let mut args = ctx.prepare_args(arguments, 3, span); @@ -2221,6 +2190,14 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { .push(crate::Statement::Barrier(crate::Barrier::WORK_GROUP), span); return Ok(None); } + "subgroupBarrier" => { + ctx.prepare_args(arguments, 0, span).finish()?; + + let rctx = ctx.runtime_expression_ctx(span)?; + rctx.block + .push(crate::Statement::Barrier(crate::Barrier::SUB_GROUP), span); + return Ok(None); + } "workgroupUniformLoad" => { let mut args = ctx.prepare_args(arguments, 1, span); let expr = args.next()?; @@ -2428,6 +2405,22 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { )?; return Ok(Some(handle)); } + "subgroupBallot" => { + let mut args = ctx.prepare_args(arguments, 0, span); + let predicate = if arguments.len() == 1 { + 
Some(self.expression(args.next()?, ctx)?) + } else { + None + }; + args.finish()?; + + let result = ctx + .interrupt_emitter(crate::Expression::SubgroupBallotResult, span)?; + let rctx = ctx.runtime_expression_ctx(span)?; + rctx.block + .push(crate::Statement::SubgroupBallot { result, predicate }, span); + return Ok(Some(result)); + } _ => return Err(Error::UnknownIdent(function.span, function.name)), } }; @@ -2619,6 +2612,80 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { }) } + fn subgroup_operation_helper( + &mut self, + span: Span, + op: crate::SubgroupOperation, + collective_op: crate::CollectiveOperation, + arguments: &[Handle<ast::Expression<'source>>], + ctx: &mut ExpressionContext<'source, '_, '_>, + ) -> Result<Handle<crate::Expression>, Error<'source>> { + let mut args = ctx.prepare_args(arguments, 1, span); + + let argument = self.expression(args.next()?, ctx)?; + args.finish()?; + + let ty = ctx.register_type(argument)?; + + let result = + ctx.interrupt_emitter(crate::Expression::SubgroupOperationResult { ty }, span)?; + let rctx = ctx.runtime_expression_ctx(span)?; + rctx.block.push( + crate::Statement::SubgroupCollectiveOperation { + op, + collective_op, + argument, + result, + }, + span, + ); + Ok(result) + } + + fn subgroup_gather_helper( + &mut self, + span: Span, + mode: SubgroupGather, + arguments: &[Handle<ast::Expression<'source>>], + ctx: &mut ExpressionContext<'source, '_, '_>, + ) -> Result<Handle<crate::Expression>, Error<'source>> { + let mut args = ctx.prepare_args(arguments, 2, span); + + let argument = self.expression(args.next()?, ctx)?; + + use SubgroupGather as Sg; + let mode = if let Sg::BroadcastFirst = mode { + crate::GatherMode::BroadcastFirst + } else { + let index = self.expression(args.next()?, ctx)?; + match mode { + Sg::Broadcast => crate::GatherMode::Broadcast(index), + Sg::Shuffle => crate::GatherMode::Shuffle(index), + Sg::ShuffleDown => crate::GatherMode::ShuffleDown(index), + Sg::ShuffleUp => crate::GatherMode::ShuffleUp(index), + Sg::ShuffleXor => crate::GatherMode::ShuffleXor(index), + Sg::BroadcastFirst => unreachable!(), + } + }; + + args.finish()?; + + let ty = ctx.register_type(argument)?; + + let result = + ctx.interrupt_emitter(crate::Expression::SubgroupOperationResult { ty }, span)?; + let rctx = ctx.runtime_expression_ctx(span)?; + rctx.block.push( + crate::Statement::SubgroupGather { + mode, + argument, + result, + }, + span, + ); + Ok(result) + } + fn r#struct( &mut self, s: &ast::Struct<'source>, @@ -2877,3 +2944,19 @@ impl<'source, 'temp> Lowerer<'source, 'temp> { } } } + +impl crate::AtomicFunction { + pub fn map(word: &str) -> Option<Self> { + Some(match word { + "atomicAdd" => crate::AtomicFunction::Add, + "atomicSub" => crate::AtomicFunction::Subtract, + "atomicAnd" => crate::AtomicFunction::And, + "atomicOr" => crate::AtomicFunction::InclusiveOr, + "atomicXor" => crate::AtomicFunction::ExclusiveOr, + "atomicMin" => crate::AtomicFunction::Min, + "atomicMax" => crate::AtomicFunction::Max, + "atomicExchange" => crate::AtomicFunction::Exchange { compare: None }, + _ => return None, + }) + } +} diff --git a/naga/src/front/wgsl/parse/conv.rs b/naga/src/front/wgsl/parse/conv.rs index 1a4911a3bd..207f0eda41 100644 --- a/naga/src/front/wgsl/parse/conv.rs +++ b/naga/src/front/wgsl/parse/conv.rs @@ -35,6 +35,11 @@ pub fn map_built_in(word: &str, span: Span) -> Result<crate::BuiltIn, Error<'_>> "local_invocation_index" => crate::BuiltIn::LocalInvocationIndex, "workgroup_id" => crate::BuiltIn::WorkGroupId, "num_workgroups" => crate::BuiltIn::NumWorkGroups, + // subgroup + "num_subgroups" => crate::BuiltIn::NumSubgroups, + "subgroup_id" => crate::BuiltIn::SubgroupId, 
+ "subgroup_size" => crate::BuiltIn::SubgroupSize, + "subgroup_invocation_id" => crate::BuiltIn::SubgroupInvocationId, _ => return Err(Error::UnknownBuiltin(span)), }) } @@ -260,3 +265,26 @@ pub fn map_conservative_depth( _ => Err(Error::UnknownConservativeDepth(span)), } } + +pub fn map_subgroup_operation( + word: &str, +) -> Option<(crate::SubgroupOperation, crate::CollectiveOperation)> { + use crate::CollectiveOperation as co; + use crate::SubgroupOperation as sg; + Some(match word { + "subgroupAll" => (sg::All, co::Reduce), + "subgroupAny" => (sg::Any, co::Reduce), + "subgroupAdd" => (sg::Add, co::Reduce), + "subgroupMul" => (sg::Mul, co::Reduce), + "subgroupMin" => (sg::Min, co::Reduce), + "subgroupMax" => (sg::Max, co::Reduce), + "subgroupAnd" => (sg::And, co::Reduce), + "subgroupOr" => (sg::Or, co::Reduce), + "subgroupXor" => (sg::Xor, co::Reduce), + "subgroupExclusiveAdd" => (sg::Add, co::ExclusiveScan), + "subgroupExclusiveMul" => (sg::Mul, co::ExclusiveScan), + "subgroupInclusiveAdd" => (sg::Add, co::InclusiveScan), + "subgroupInclusiveMul" => (sg::Mul, co::InclusiveScan), + _ => return None, + }) +} diff --git a/naga/src/lib.rs b/naga/src/lib.rs index ceb7e55b7b..24e1b02c76 100644 --- a/naga/src/lib.rs +++ b/naga/src/lib.rs @@ -274,6 +274,7 @@ pub mod back; mod block; #[cfg(feature = "compact")] pub mod compact; +pub mod error; pub mod front; pub mod keywords; pub mod proc; @@ -431,6 +432,11 @@ pub enum BuiltIn { WorkGroupId, WorkGroupSize, NumWorkGroups, + // subgroup + NumSubgroups, + SubgroupId, + SubgroupSize, + SubgroupInvocationId, } /// Number of bytes per scalar. @@ -866,7 +872,7 @@ pub enum TypeInner { BindingArray { base: Handle, size: ArraySize }, } -#[derive(Debug, Clone, Copy, PartialOrd)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] #[cfg_attr(feature = "serialize", derive(Serialize))] #[cfg_attr(feature = "deserialize", derive(Deserialize))] #[cfg_attr(feature = "arbitrary", derive(Arbitrary))] @@ -1277,6 +1283,51 @@ pub enum SwizzleComponent { W = 3, } +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +#[cfg_attr(feature = "serialize", derive(Serialize))] +#[cfg_attr(feature = "deserialize", derive(Deserialize))] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] +pub enum GatherMode { + /// All gather from the active lane with the smallest index + BroadcastFirst, + /// All gather from the same lane at the index given by the expression + Broadcast(Handle), + /// Each gathers from a different lane at the index given by the expression + Shuffle(Handle), + /// Each gathers from their lane plus the shift given by the expression + ShuffleDown(Handle), + /// Each gathers from their lane minus the shift given by the expression + ShuffleUp(Handle), + /// Each gathers from their lane xored with the given by the expression + ShuffleXor(Handle), +} + +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +#[cfg_attr(feature = "serialize", derive(Serialize))] +#[cfg_attr(feature = "deserialize", derive(Deserialize))] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] +pub enum SubgroupOperation { + All = 0, + Any = 1, + Add = 2, + Mul = 3, + Min = 4, + Max = 5, + And = 6, + Or = 7, + Xor = 8, +} + +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +#[cfg_attr(feature = "serialize", derive(Serialize))] +#[cfg_attr(feature = "deserialize", derive(Deserialize))] +#[cfg_attr(feature = "arbitrary", derive(Arbitrary))] +pub enum CollectiveOperation { + Reduce = 0, + InclusiveScan = 1, + ExclusiveScan = 2, +} + 
bitflags::bitflags! { /// Memory barrier flags. #[cfg_attr(feature = "serialize", derive(Serialize))] @@ -1285,9 +1336,11 @@ bitflags::bitflags! { #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub struct Barrier: u32 { /// Barrier affects all `AddressSpace::Storage` accesses. - const STORAGE = 0x1; + const STORAGE = 1 << 0; /// Barrier affects all `AddressSpace::WorkGroup` accesses. - const WORK_GROUP = 0x2; + const WORK_GROUP = 1 << 1; + /// Barrier synchronizes execution across all invocations within a subgroup that execute this instruction. + const SUB_GROUP = 1 << 2; } } @@ -1588,6 +1641,15 @@ pub enum Expression { query: Handle<Expression>, committed: bool, }, + /// Result of a [`SubgroupBallot`] statement. + /// + /// [`SubgroupBallot`]: Statement::SubgroupBallot + SubgroupBallotResult, + /// Result of a [`SubgroupCollectiveOperation`] or [`SubgroupGather`] statement. + /// + /// [`SubgroupCollectiveOperation`]: Statement::SubgroupCollectiveOperation + /// [`SubgroupGather`]: Statement::SubgroupGather + SubgroupOperationResult { ty: Handle<Type> }, } pub use block::Block; @@ -1872,6 +1934,39 @@ pub enum Statement { /// The specific operation we're performing on `query`. fun: RayQueryFunction, }, + /// Calculate a bitmask using a boolean from each active thread in the subgroup + SubgroupBallot { + /// The [`SubgroupBallotResult`] expression representing this ballot's result. + /// + /// [`SubgroupBallotResult`]: Expression::SubgroupBallotResult + result: Handle<Expression>, + /// The value from this thread to store in the ballot + predicate: Option<Handle<Expression>>, + }, + /// Gather a value from another active thread in the subgroup + SubgroupGather { + /// Specifies which thread to gather from + mode: GatherMode, + /// The value to gather from the selected lane + argument: Handle<Expression>, + /// The [`SubgroupOperationResult`] expression representing this gather's result. + /// + /// [`SubgroupOperationResult`]: Expression::SubgroupOperationResult + result: Handle<Expression>, + }, + /// Compute a collective operation across all active threads in the subgroup + SubgroupCollectiveOperation { + /// What operation to compute + op: SubgroupOperation, + /// How to combine the results + collective_op: CollectiveOperation, + /// The value to compute over + argument: Handle<Expression>, + /// The [`SubgroupOperationResult`] expression representing this operation's result. + /// + /// [`SubgroupOperationResult`]: Expression::SubgroupOperationResult + result: Handle<Expression>, + }, } /// A function argument. diff --git a/naga/src/proc/constant_evaluator.rs b/naga/src/proc/constant_evaluator.rs index 547fbbc652..ead3d00980 100644 --- a/naga/src/proc/constant_evaluator.rs +++ b/naga/src/proc/constant_evaluator.rs @@ -476,6 +476,8 @@ pub enum ConstantEvaluatorError { ImageExpression, #[error("Constants don't support ray query expressions")] RayQueryExpression, + #[error("Constants don't support subgroup expressions")] + SubgroupExpression, #[error("Cannot access the type")] InvalidAccessBase, #[error("Cannot access at the index")] @@ -884,6 +886,12 @@ impl<'a> ConstantEvaluator<'a> { Expression::RayQueryProceedResult | Expression::RayQueryGetIntersection { .. } => { Err(ConstantEvaluatorError::RayQueryExpression) } + Expression::SubgroupBallotResult { .. } => { + Err(ConstantEvaluatorError::SubgroupExpression) + } + Expression::SubgroupOperationResult { .. 
} => { + Err(ConstantEvaluatorError::SubgroupExpression) + } } } @@ -942,10 +950,10 @@ impl<'a> ConstantEvaluator<'a> { pattern: [crate::SwizzleComponent; 4], ) -> Result, ConstantEvaluatorError> { let mut get_dst_ty = |ty| match self.types[ty].inner { - crate::TypeInner::Vector { size: _, scalar } => Ok(self.types.insert( + TypeInner::Vector { size: _, scalar } => Ok(self.types.insert( Type { name: None, - inner: crate::TypeInner::Vector { size, scalar }, + inner: TypeInner::Vector { size, scalar }, }, span, )), @@ -1236,13 +1244,11 @@ impl<'a> ConstantEvaluator<'a> { Expression::ZeroValue(ty) | Expression::Compose { ty, .. } => { match self.types[ty].inner { TypeInner::Array { size, .. } => match size { - crate::ArraySize::Constant(len) => { + ArraySize::Constant(len) => { let expr = Expression::Literal(Literal::U32(len.get())); self.register_evaluated_expr(expr, span) } - crate::ArraySize::Dynamic => { - Err(ConstantEvaluatorError::ArrayLengthDynamic) - } + ArraySize::Dynamic => Err(ConstantEvaluatorError::ArrayLengthDynamic), }, _ => Err(ConstantEvaluatorError::InvalidArrayLengthArg), } @@ -1305,7 +1311,7 @@ impl<'a> ConstantEvaluator<'a> { Expression::ZeroValue(ty) if matches!( self.types[ty].inner, - crate::TypeInner::Scalar(crate::Scalar { + TypeInner::Scalar(crate::Scalar { kind: ScalarKind::Uint, .. }) @@ -1620,7 +1626,7 @@ impl<'a> ConstantEvaluator<'a> { return self.cast(expr, target, span); }; - let crate::TypeInner::Array { + let TypeInner::Array { base: _, size, stride: _, diff --git a/naga/src/proc/index.rs b/naga/src/proc/index.rs index af3221c0fe..e2c3de8eb0 100644 --- a/naga/src/proc/index.rs +++ b/naga/src/proc/index.rs @@ -239,7 +239,7 @@ pub enum GuardedIndex { pub fn find_checked_indexes( module: &crate::Module, function: &crate::Function, - info: &crate::valid::FunctionInfo, + info: &valid::FunctionInfo, policies: BoundsCheckPolicies, ) -> BitSet { use crate::Expression as Ex; @@ -321,7 +321,7 @@ pub fn access_needs_check( mut index: GuardedIndex, module: &crate::Module, function: &crate::Function, - info: &crate::valid::FunctionInfo, + info: &valid::FunctionInfo, ) -> Option { let base_inner = info[base].ty.inner_with(&module.types); // Unwrap safety: `Err` here indicates unindexable base types and invalid diff --git a/naga/src/proc/mod.rs b/naga/src/proc/mod.rs index 0e89f29032..93aac5b3e5 100644 --- a/naga/src/proc/mod.rs +++ b/naga/src/proc/mod.rs @@ -153,56 +153,31 @@ impl super::Scalar { } } -impl PartialEq for crate::Literal { - fn eq(&self, other: &Self) -> bool { - match (*self, *other) { - (Self::F64(a), Self::F64(b)) => a.to_bits() == b.to_bits(), - (Self::F32(a), Self::F32(b)) => a.to_bits() == b.to_bits(), - (Self::U32(a), Self::U32(b)) => a == b, - (Self::I32(a), Self::I32(b)) => a == b, - (Self::U64(a), Self::U64(b)) => a == b, - (Self::I64(a), Self::I64(b)) => a == b, - (Self::Bool(a), Self::Bool(b)) => a == b, - _ => false, - } - } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum HashableLiteral { + F64(u64), + F32(u32), + U32(u32), + I32(i32), + U64(u64), + I64(i64), + Bool(bool), + AbstractInt(i64), + AbstractFloat(u64), } -impl Eq for crate::Literal {} -impl std::hash::Hash for crate::Literal { - fn hash(&self, hasher: &mut H) { - match *self { - Self::F64(v) | Self::AbstractFloat(v) => { - hasher.write_u8(0); - v.to_bits().hash(hasher); - } - Self::F32(v) => { - hasher.write_u8(1); - v.to_bits().hash(hasher); - } - Self::U32(v) => { - hasher.write_u8(2); - v.hash(hasher); - } - Self::I32(v) => { - hasher.write_u8(3); - 
v.hash(hasher); - } - Self::Bool(v) => { - hasher.write_u8(4); - v.hash(hasher); - } - Self::I64(v) => { - hasher.write_u8(5); - v.hash(hasher); - } - Self::U64(v) => { - hasher.write_u8(6); - v.hash(hasher); - } - Self::AbstractInt(v) => { - hasher.write_u8(7); - v.hash(hasher); - } + +impl From<crate::Literal> for HashableLiteral { + fn from(l: crate::Literal) -> Self { + match l { + crate::Literal::F64(v) => Self::F64(v.to_bits()), + crate::Literal::F32(v) => Self::F32(v.to_bits()), + crate::Literal::U32(v) => Self::U32(v), + crate::Literal::I32(v) => Self::I32(v), + crate::Literal::U64(v) => Self::U64(v), + crate::Literal::I64(v) => Self::I64(v), + crate::Literal::Bool(v) => Self::Bool(v), + crate::Literal::AbstractInt(v) => Self::AbstractInt(v), + crate::Literal::AbstractFloat(v) => Self::AbstractFloat(v.to_bits()), + } + } +}
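The `HashableLiteral` mirror type above exists because `f32`/`f64` are neither `Eq` nor `Hash` (`NaN != NaN`), so floats are routed through `to_bits` to get well-defined bitwise equality. A small sketch of the intended use (hypothetical helper; assumes `HashableLiteral` stays exported from `naga::proc`):

```rust
use std::collections::HashSet;

use naga::proc::HashableLiteral;

// Count distinct literals by bit pattern; two NaNs with the same bits
// now hash and compare equal, which the raw float values would not.
fn count_distinct(literals: &[naga::Literal]) -> usize {
    let set: HashSet<HashableLiteral> = literals.iter().map(|&l| l.into()).collect();
    set.len()
}
```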
@@ -279,8 +254,9 @@ impl super::TypeInner { self.scalar().map(|scalar| scalar.kind) } + /// Returns the scalar width in bytes pub fn scalar_width(&self) -> Option<u8> { - self.scalar().map(|scalar| scalar.width * 8) + self.scalar().map(|scalar| scalar.width) } pub const fn pointer_space(&self) -> Option<AddressSpace> { @@ -532,6 +508,7 @@ impl crate::Expression { match *self { Self::Literal(_) | Self::Constant(_) + | Self::Override(_) | Self::ZeroValue(_) | Self::FunctionArgument(_) | Self::GlobalVariable(_) diff --git a/naga/src/proc/terminator.rs b/naga/src/proc/terminator.rs index a5239d4eca..5edf55cb73 100644 --- a/naga/src/proc/terminator.rs +++ b/naga/src/proc/terminator.rs @@ -37,6 +37,9 @@ pub fn ensure_block_returns(block: &mut crate::Block) { | S::RayQuery { .. } | S::Atomic { .. } | S::WorkGroupUniformLoad { .. } + | S::SubgroupBallot { .. } + | S::SubgroupCollectiveOperation { .. } + | S::SubgroupGather { .. } | S::Barrier(_)), ) | None => block.push(S::Return { value: None }, Default::default()), diff --git a/naga/src/proc/typifier.rs b/naga/src/proc/typifier.rs index 845b35cb4d..3936e7efbe 100644 --- a/naga/src/proc/typifier.rs +++ b/naga/src/proc/typifier.rs @@ -598,6 +598,7 @@ impl<'a> ResolveContext<'a> { | crate::BinaryOperator::ShiftRight => past(left)?.clone(), }, crate::Expression::AtomicResult { ty, .. } => TypeResolution::Handle(ty), + crate::Expression::SubgroupOperationResult { ty } => TypeResolution::Handle(ty), crate::Expression::WorkGroupUniformLoadResult { ty } => TypeResolution::Handle(ty), crate::Expression::Select { accept, .. } => past(accept)?.clone(), crate::Expression::Derivative { expr, .. } => past(expr)?.clone(), @@ -885,6 +886,10 @@ impl<'a> ResolveContext<'a> { .ok_or(ResolveError::MissingSpecialType)?; TypeResolution::Handle(result) } + crate::Expression::SubgroupBallotResult => TypeResolution::Value(Ti::Vector { + scalar: crate::Scalar::U32, + size: crate::VectorSize::Quad, + }), }) } } diff --git a/naga/src/span.rs b/naga/src/span.rs index 10744647e9..82cfbe5a4b 100644 --- a/naga/src/span.rs +++ b/naga/src/span.rs @@ -72,8 +72,8 @@ impl Span { pub fn location(&self, source: &str) -> SourceLocation { let prefix = &source[..self.start as usize]; let line_number = prefix.matches('\n').count() as u32 + 1; - let line_start = prefix.rfind('\n').map(|pos| pos + 1).unwrap_or(0); - let line_position = source[line_start..self.start as usize].chars().count() as u32 + 1; + let line_start = prefix.rfind('\n').map(|pos| pos + 1).unwrap_or(0) as u32; + let line_position = self.start - line_start + 1; SourceLocation { line_number, @@ -107,14 +107,14 @@ impl std::ops::Index<Span> for str { /// Roughly corresponds to the positional members of [`GPUCompilationMessage`][gcm] from /// the WebGPU specification, except /// - `offset` and `length` are in bytes (UTF-8 code units), instead of UTF-16 code units. -/// - `line_position` counts entire Unicode code points, instead of UTF-16 code units. +/// - `line_position` is in bytes (UTF-8 code units), instead of UTF-16 code units. /// /// [gcm]: https://www.w3.org/TR/webgpu/#gpucompilationmessage #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub struct SourceLocation { /// 1-based line number. pub line_number: u32, - /// 1-based column of the start of this span, counted in Unicode code points. + /// 1-based column of the start of the span, counted in bytes (UTF-8 code units). pub line_position: u32, /// 0-based offset in code units (in bytes) of the start of the span. pub offset: u32, @@ -136,7 +136,7 @@ impl<E> fmt::Display for WithSpan<E> where E: fmt::Display, { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.inner.fmt(f) } } @@ -304,7 +304,7 @@ impl<E> WithSpan<E> { use term::termcolor::NoColor; let files = files::SimpleFile::new(path, source); - let config = codespan_reporting::term::Config::default(); + let config = term::Config::default(); let mut writer = NoColor::new(Vec::new()); term::emit(&mut writer, &config, &files, &self.diagnostic()).expect("cannot write error"); String::from_utf8(writer.into_inner()).unwrap() } diff --git a/naga/src/valid/analyzer.rs b/naga/src/valid/analyzer.rs index d45c25c62e..6799e5db27 100644 --- a/naga/src/valid/analyzer.rs +++ b/naga/src/valid/analyzer.rs @@ -787,6 +787,14 @@ impl FunctionInfo { non_uniform_result: self.add_ref(query), requirements: UniformityRequirements::empty(), }, + E::SubgroupBallotResult => Uniformity { + non_uniform_result: Some(handle), + requirements: UniformityRequirements::empty(), + }, + E::SubgroupOperationResult { .. 
} => Uniformity { + non_uniform_result: Some(handle), + requirements: UniformityRequirements::empty(), + }, }; let ty = resolve_context.resolve(expression, |h| Ok(&self[h].ty))?; @@ -827,7 +835,7 @@ impl FunctionInfo { let req = self.expressions[expr.index()].uniformity.requirements; if self .flags - .contains(super::ValidationFlags::CONTROL_FLOW_UNIFORMITY) + .contains(ValidationFlags::CONTROL_FLOW_UNIFORMITY) && !req.is_empty() { if let Some(cause) = disruptor { @@ -1029,6 +1037,42 @@ impl FunctionInfo { } FunctionUniformity::new() } + S::SubgroupBallot { + result: _, + predicate, + } => { + if let Some(predicate) = predicate { + let _ = self.add_ref(predicate); + } + FunctionUniformity::new() + } + S::SubgroupCollectiveOperation { + op: _, + collective_op: _, + argument, + result: _, + } => { + let _ = self.add_ref(argument); + FunctionUniformity::new() + } + S::SubgroupGather { + mode, + argument, + result: _, + } => { + let _ = self.add_ref(argument); + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + let _ = self.add_ref(index); + } + } + FunctionUniformity::new() + } }; disruptor = disruptor.or(uniformity.exit_disruptor()); diff --git a/naga/src/valid/expression.rs b/naga/src/valid/expression.rs index bf46fd3262..525bd28c17 100644 --- a/naga/src/valid/expression.rs +++ b/naga/src/valid/expression.rs @@ -194,7 +194,7 @@ impl super::Validator { use crate::Expression as E; if !global_expr_kind.is_const_or_override(handle) { - return Err(super::ConstExpressionError::NonConstOrOverride); + return Err(ConstExpressionError::NonConstOrOverride); } match gctx.global_expressions[handle] { @@ -211,10 +211,10 @@ impl super::Validator { } E::Splat { value, .. } => match *mod_info[value].inner_with(gctx.types) { crate::TypeInner::Scalar { .. } => {} - _ => return Err(super::ConstExpressionError::InvalidSplatType(value)), + _ => return Err(ConstExpressionError::InvalidSplatType(value)), }, _ if global_expr_kind.is_const(handle) || !self.allow_overrides => { - return Err(super::ConstExpressionError::NonFullyEvaluatedConst) + return Err(ConstExpressionError::NonFullyEvaluatedConst) } // the constant evaluator will report errors about override-expressions _ => {} @@ -1641,6 +1641,7 @@ impl super::Validator { return Err(ExpressionError::InvalidRayQueryType(query)); } }, + E::SubgroupBallotResult | E::SubgroupOperationResult { .. 
} => self.subgroup_stages, }; Ok(stages) } diff --git a/naga/src/valid/function.rs b/naga/src/valid/function.rs index fe5681449e..71128fc86d 100644 --- a/naga/src/valid/function.rs +++ b/naga/src/valid/function.rs @@ -47,6 +47,19 @@ pub enum AtomicError { ResultTypeMismatch(Handle<crate::Expression>), } +#[derive(Clone, Debug, thiserror::Error)] +#[cfg_attr(test, derive(PartialEq))] +pub enum SubgroupError { + #[error("Operand {0:?} has invalid type.")] + InvalidOperand(Handle<crate::Expression>), + #[error("Result type for {0:?} doesn't match the statement")] + ResultTypeMismatch(Handle<crate::Expression>), + #[error("Support for subgroup operation {0:?} is required")] + UnsupportedOperation(super::SubgroupOperationSet), + #[error("Unknown operation")] + UnknownOperation, +} + #[derive(Clone, Debug, thiserror::Error)] #[cfg_attr(test, derive(PartialEq))] pub enum LocalVariableError { @@ -135,6 +148,8 @@ pub enum FunctionError { InvalidRayDescriptor(Handle<crate::Expression>), #[error("Ray Query {0:?} does not have a matching type")] InvalidRayQueryType(Handle<crate::Type>), + #[error("Shader requires capability {0:?}")] + MissingCapability(super::Capabilities), #[error( "Required uniformity of control flow for {0:?} in {1:?} is not fulfilled because of {2:?}" )] @@ -155,6 +170,8 @@ pub enum FunctionError { WorkgroupUniformLoadExpressionMismatch(Handle<crate::Expression>), #[error("The expression {0:?} is not valid as a WorkGroupUniformLoad argument. It should be a Pointer in Workgroup address space")] WorkgroupUniformLoadInvalidPointer(Handle<crate::Expression>), + #[error("Subgroup operation is invalid")] + InvalidSubgroup(#[from] SubgroupError), } bitflags::bitflags! { @@ -399,6 +416,127 @@ impl super::Validator { } Ok(()) } + fn validate_subgroup_operation( + &mut self, + op: &crate::SubgroupOperation, + collective_op: &crate::CollectiveOperation, + argument: Handle<crate::Expression>, + result: Handle<crate::Expression>, + context: &BlockContext, + ) -> Result<(), WithSpan<FunctionError>> { + let argument_inner = context.resolve_type(argument, &self.valid_expression_set)?; + + let (is_scalar, scalar) = match *argument_inner { + crate::TypeInner::Scalar(scalar) => (true, scalar), + crate::TypeInner::Vector { scalar, .. 
} => (false, scalar), + _ => { + log::error!("Subgroup operand type {:?}", argument_inner); + return Err(SubgroupError::InvalidOperand(argument) + .with_span_handle(argument, context.expressions) + .into_other()); + } + }; + + use crate::ScalarKind as sk; + use crate::SubgroupOperation as sg; + match (scalar.kind, *op) { + (sk::Bool, sg::All | sg::Any) if is_scalar => {} + (sk::Sint | sk::Uint | sk::Float, sg::Add | sg::Mul | sg::Min | sg::Max) => {} + (sk::Sint | sk::Uint, sg::And | sg::Or | sg::Xor) => {} + + (_, _) => { + log::error!("Subgroup operand type {:?}", argument_inner); + return Err(SubgroupError::InvalidOperand(argument) + .with_span_handle(argument, context.expressions) + .into_other()); + } + }; + + use crate::CollectiveOperation as co; + match (*collective_op, *op) { + ( + co::Reduce, + sg::All + | sg::Any + | sg::Add + | sg::Mul + | sg::Min + | sg::Max + | sg::And + | sg::Or + | sg::Xor, + ) => {} + (co::InclusiveScan | co::ExclusiveScan, sg::Add | sg::Mul) => {} + + (_, _) => { + return Err(SubgroupError::UnknownOperation.with_span().into_other()); + } + }; + + self.emit_expression(result, context)?; + match context.expressions[result] { + crate::Expression::SubgroupOperationResult { ty } + if { &context.types[ty].inner == argument_inner } => {} + _ => { + return Err(SubgroupError::ResultTypeMismatch(result) + .with_span_handle(result, context.expressions) + .into_other()) + } + } + Ok(()) + } + fn validate_subgroup_gather( + &mut self, + mode: &crate::GatherMode, + argument: Handle<crate::Expression>, + result: Handle<crate::Expression>, + context: &BlockContext, + ) -> Result<(), WithSpan<FunctionError>> { + match *mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => { + let index_ty = context.resolve_type(index, &self.valid_expression_set)?; + match *index_ty { + crate::TypeInner::Scalar(crate::Scalar::U32) => {} + _ => { + log::error!( + "Subgroup gather index type {:?}, expected unsigned int", + index_ty + ); + return Err(SubgroupError::InvalidOperand(argument) + .with_span_handle(index, context.expressions) + .into_other()); + } + } + } + } + let argument_inner = context.resolve_type(argument, &self.valid_expression_set)?; + if !matches!(*argument_inner, + crate::TypeInner::Scalar(scalar, ..) | crate::TypeInner::Vector { scalar, .. 
} + if matches!(scalar.kind, crate::ScalarKind::Uint | crate::ScalarKind::Sint | crate::ScalarKind::Float) + ) { + log::error!("Subgroup gather operand type {:?}", argument_inner); + return Err(SubgroupError::InvalidOperand(argument) + .with_span_handle(argument, context.expressions) + .into_other()); + } + + self.emit_expression(result, context)?; + match context.expressions[result] { + crate::Expression::SubgroupOperationResult { ty } + if { &context.types[ty].inner == argument_inner } => {} + _ => { + return Err(SubgroupError::ResultTypeMismatch(result) + .with_span_handle(result, context.expressions) + .into_other()) + } + } + Ok(()) + } fn validate_block_impl( &mut self, @@ -613,8 +751,30 @@ impl super::Validator { stages &= super::ShaderStages::FRAGMENT; finished = true; } - S::Barrier(_) => { + S::Barrier(barrier) => { stages &= super::ShaderStages::COMPUTE; + if barrier.contains(crate::Barrier::SUB_GROUP) { + if !self.capabilities.contains( + super::Capabilities::SUBGROUP | super::Capabilities::SUBGROUP_BARRIER, + ) { + return Err(FunctionError::MissingCapability( + super::Capabilities::SUBGROUP + | super::Capabilities::SUBGROUP_BARRIER, + ) + .with_span_static(span, "missing capability for this operation")); + } + if !self + .subgroup_operations + .contains(super::SubgroupOperationSet::BASIC) + { + return Err(FunctionError::InvalidSubgroup( + SubgroupError::UnsupportedOperation( + super::SubgroupOperationSet::BASIC, + ), + ) + .with_span_static(span, "support for this operation is not present")); + } + } } S::Store { pointer, value } => { let mut current = pointer; @@ -904,6 +1064,86 @@ impl super::Validator { crate::RayQueryFunction::Terminate => {} } } + S::SubgroupBallot { result, predicate } => { + stages &= self.subgroup_stages; + if !self.capabilities.contains(super::Capabilities::SUBGROUP) { + return Err(FunctionError::MissingCapability( + super::Capabilities::SUBGROUP, + ) + .with_span_static(span, "missing capability for this operation")); + } + if !self + .subgroup_operations + .contains(super::SubgroupOperationSet::BALLOT) + { + return Err(FunctionError::InvalidSubgroup( + SubgroupError::UnsupportedOperation( + super::SubgroupOperationSet::BALLOT, + ), + ) + .with_span_static(span, "support for this operation is not present")); + } + if let Some(predicate) = predicate { + let predicate_inner = + context.resolve_type(predicate, &self.valid_expression_set)?; + if !matches!( + *predicate_inner, + crate::TypeInner::Scalar(crate::Scalar::BOOL,) + ) { + log::error!( + "Subgroup ballot predicate type {:?} expected bool", + predicate_inner + ); + return Err(SubgroupError::InvalidOperand(predicate) + .with_span_handle(predicate, context.expressions) + .into_other()); + } + } + self.emit_expression(result, context)?; + } + S::SubgroupCollectiveOperation { + ref op, + ref collective_op, + argument, + result, + } => { + stages &= self.subgroup_stages; + if !self.capabilities.contains(super::Capabilities::SUBGROUP) { + return Err(FunctionError::MissingCapability( + super::Capabilities::SUBGROUP, + ) + .with_span_static(span, "missing capability for this operation")); + } + let operation = op.required_operations(); + if !self.subgroup_operations.contains(operation) { + return Err(FunctionError::InvalidSubgroup( + SubgroupError::UnsupportedOperation(operation), + ) + .with_span_static(span, "support for this operation is not present")); + } + self.validate_subgroup_operation(op, collective_op, argument, result, context)?; + } + S::SubgroupGather { + ref mode, + argument, + result, + } 
=> { + stages &= self.subgroup_stages; + if !self.capabilities.contains(super::Capabilities::SUBGROUP) { + return Err(FunctionError::MissingCapability( + super::Capabilities::SUBGROUP, + ) + .with_span_static(span, "missing capability for this operation")); + } + let operation = mode.required_operations(); + if !self.subgroup_operations.contains(operation) { + return Err(FunctionError::InvalidSubgroup( + SubgroupError::UnsupportedOperation(operation), + ) + .with_span_static(span, "support for this operation is not present")); + } + self.validate_subgroup_gather(mode, argument, result, context)?; + } } } Ok(BlockInfo { stages, finished }) diff --git a/naga/src/valid/handles.rs b/naga/src/valid/handles.rs index 5d3087a28f..8f78204055 100644 --- a/naga/src/valid/handles.rs +++ b/naga/src/valid/handles.rs @@ -420,6 +420,8 @@ impl super::Validator { } crate::Expression::AtomicResult { .. } | crate::Expression::RayQueryProceedResult + | crate::Expression::SubgroupBallotResult + | crate::Expression::SubgroupOperationResult { .. } | crate::Expression::WorkGroupUniformLoadResult { .. } => (), crate::Expression::ArrayLength(array) => { handle.check_dep(array)?; @@ -565,6 +567,38 @@ impl super::Validator { } Ok(()) } + crate::Statement::SubgroupBallot { result, predicate } => { + validate_expr_opt(predicate)?; + validate_expr(result)?; + Ok(()) + } + crate::Statement::SubgroupCollectiveOperation { + op: _, + collective_op: _, + argument, + result, + } => { + validate_expr(argument)?; + validate_expr(result)?; + Ok(()) + } + crate::Statement::SubgroupGather { + mode, + argument, + result, + } => { + validate_expr(argument)?; + match mode { + crate::GatherMode::BroadcastFirst => {} + crate::GatherMode::Broadcast(index) + | crate::GatherMode::Shuffle(index) + | crate::GatherMode::ShuffleDown(index) + | crate::GatherMode::ShuffleUp(index) + | crate::GatherMode::ShuffleXor(index) => validate_expr(index)?, + } + validate_expr(result)?; + Ok(()) + } crate::Statement::Break | crate::Statement::Continue | crate::Statement::Kill diff --git a/naga/src/valid/interface.rs b/naga/src/valid/interface.rs index 2435b34c29..db890ddbac 100644 --- a/naga/src/valid/interface.rs +++ b/naga/src/valid/interface.rs @@ -77,6 +77,8 @@ pub enum VaryingError { location: u32, attribute: &'static str, }, + #[error("Workgroup size is multi dimensional, @builtin(subgroup_id) and @builtin(subgroup_invocation_id) are not supported.")] + InvalidMultiDimensionalSubgroupBuiltIn, } #[derive(Clone, Debug, thiserror::Error)] @@ -140,6 +142,7 @@ struct VaryingContext<'a> { impl VaryingContext<'_> { fn validate_impl( &mut self, + ep: &crate::EntryPoint, ty: Handle, binding: &crate::Binding, ) -> Result<(), VaryingError> { @@ -167,12 +170,24 @@ impl VaryingContext<'_> { Bi::PrimitiveIndex => Capabilities::PRIMITIVE_INDEX, Bi::ViewIndex => Capabilities::MULTIVIEW, Bi::SampleIndex => Capabilities::MULTISAMPLED_SHADING, + Bi::NumSubgroups + | Bi::SubgroupId + | Bi::SubgroupSize + | Bi::SubgroupInvocationId => Capabilities::SUBGROUP, _ => Capabilities::empty(), }; if !self.capabilities.contains(required) { return Err(VaryingError::UnsupportedCapability(required)); } + if matches!( + built_in, + crate::BuiltIn::SubgroupId | crate::BuiltIn::SubgroupInvocationId + ) && ep.workgroup_size[1..].iter().any(|&s| s > 1) + { + return Err(VaryingError::InvalidMultiDimensionalSubgroupBuiltIn); + } + let (visible, type_good) = match built_in { Bi::BaseInstance | Bi::BaseVertex | Bi::InstanceIndex | Bi::VertexIndex => ( self.stage == St::Vertex && 
!self.output, *ty_inner == Ti::Scalar(crate::Scalar::U32), ), + Bi::NumSubgroups | Bi::SubgroupId => ( + self.stage == St::Compute && !self.output, + *ty_inner == Ti::Scalar(crate::Scalar::U32), + ), + Bi::SubgroupSize | Bi::SubgroupInvocationId => ( + match self.stage { + St::Compute | St::Fragment => !self.output, + St::Vertex => false, + }, + *ty_inner == Ti::Scalar(crate::Scalar::U32), + ), }; if !visible { @@ -354,13 +380,14 @@ impl VaryingContext<'_> { fn validate( &mut self, + ep: &crate::EntryPoint, ty: Handle<crate::Type>, binding: Option<&crate::Binding>, ) -> Result<(), WithSpan<VaryingError>> { let span_context = self.types.get_span_context(ty); match binding { Some(binding) => self - .validate_impl(ty, binding) + .validate_impl(ep, ty, binding) .map_err(|e| e.with_span_context(span_context)), None => { match self.types[ty].inner { @@ -377,7 +404,7 @@ impl VaryingContext<'_> { } } Some(ref binding) => self - .validate_impl(member.ty, binding) + .validate_impl(ep, member.ty, binding) .map_err(|e| e.with_span_context(span_context))?, } } @@ -609,7 +636,7 @@ impl super::Validator { capabilities: self.capabilities, flags: self.flags, }; - ctx.validate(fa.ty, fa.binding.as_ref()) + ctx.validate(ep, fa.ty, fa.binding.as_ref()) .map_err_inner(|e| EntryPointError::Argument(index as u32, e).with_span())?; } @@ -627,7 +654,7 @@ impl super::Validator { capabilities: self.capabilities, flags: self.flags, }; - ctx.validate(fr.ty, fr.binding.as_ref()) + ctx.validate(ep, fr.ty, fr.binding.as_ref()) .map_err_inner(|e| EntryPointError::Result(e).with_span())?; if ctx.second_blend_source { // Only the first location may be used when dual source blending diff --git a/naga/src/valid/mod.rs b/naga/src/valid/mod.rs index f34c0f6f1a..a0057f39ac 100644 --- a/naga/src/valid/mod.rs +++ b/naga/src/valid/mod.rs @@ -77,7 +77,7 @@ bitflags::bitflags! { #[cfg_attr(feature = "serialize", derive(serde::Serialize))] #[cfg_attr(feature = "deserialize", derive(serde::Deserialize))] #[derive(Clone, Copy, Debug, Eq, PartialEq)] - pub struct Capabilities: u16 { + pub struct Capabilities: u32 { /// Support for [`AddressSpace::PushConstant`]. const PUSH_CONSTANT = 0x1; /// Float values with width = 8. @@ -110,6 +110,10 @@ bitflags::bitflags! { const CUBE_ARRAY_TEXTURES = 0x4000; /// Support for 64-bit signed and unsigned integers. const SHADER_INT64 = 0x8000; + /// Support for subgroup operations. + const SUBGROUP = 0x10000; + /// Support for subgroup barriers. + const SUBGROUP_BARRIER = 0x20000; } } @@ -119,6 +123,57 @@ impl Default for Capabilities { } } 
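The capability plumbing above is driven by the embedder: `Capabilities::SUBGROUP`/`SUBGROUP_BARRIER` gate the statements, while the new `subgroup_stages`/`subgroup_operations` builder methods (added just below) narrow which stages and which operation sets validate. A configuration sketch (illustrative only; which flags a given backend should advertise varies):

```rust
use naga::valid::{
    Capabilities, ShaderStages, SubgroupOperationSet, ValidationFlags, Validator,
};

fn subgroup_capable_validator() -> Validator {
    let mut validator = Validator::new(
        ValidationFlags::all(),
        Capabilities::SUBGROUP | Capabilities::SUBGROUP_BARRIER,
    );
    // Both builder methods return `&mut Self`, so the calls chain.
    validator
        .subgroup_stages(ShaderStages::COMPUTE | ShaderStages::FRAGMENT)
        .subgroup_operations(SubgroupOperationSet::all());
    validator
}
```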
+bitflags::bitflags! { + /// Supported subgroup operations + #[cfg_attr(feature = "serialize", derive(serde::Serialize))] + #[cfg_attr(feature = "deserialize", derive(serde::Deserialize))] + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] + pub struct SubgroupOperationSet: u8 { + /// Elect, Barrier + const BASIC = 1 << 0; + /// Any, All + const VOTE = 1 << 1; + /// reductions, scans + const ARITHMETIC = 1 << 2; + /// ballot, broadcast + const BALLOT = 1 << 3; + /// shuffle, shuffle xor + const SHUFFLE = 1 << 4; + /// shuffle up, down + const SHUFFLE_RELATIVE = 1 << 5; + // We don't support these operations yet + // /// Clustered + // const CLUSTERED = 1 << 6; + // /// Quad supported + // const QUAD_FRAGMENT_COMPUTE = 1 << 7; + // /// Quad supported in all stages + // const QUAD_ALL_STAGES = 1 << 8; + } +} + +impl super::SubgroupOperation { + const fn required_operations(&self) -> SubgroupOperationSet { + use SubgroupOperationSet as S; + match *self { + Self::All | Self::Any => S::VOTE, + Self::Add | Self::Mul | Self::Min | Self::Max | Self::And | Self::Or | Self::Xor => { + S::ARITHMETIC + } + } + } +} + +impl super::GatherMode { + const fn required_operations(&self) -> SubgroupOperationSet { + use SubgroupOperationSet as S; + match *self { + Self::BroadcastFirst | Self::Broadcast(_) => S::BALLOT, + Self::Shuffle(_) | Self::ShuffleXor(_) => S::SHUFFLE, + Self::ShuffleUp(_) | Self::ShuffleDown(_) => S::SHUFFLE_RELATIVE, + } + } +} + bitflags::bitflags! { /// Validation flags. #[cfg_attr(feature = "serialize", derive(serde::Serialize))] @@ -166,6 +221,8 @@ impl ops::Index<Handle<crate::Function>> for ModuleInfo { pub struct Validator { flags: ValidationFlags, capabilities: Capabilities, + subgroup_stages: ShaderStages, + subgroup_operations: SubgroupOperationSet, types: Vec<r#type::TypeInfo>, layouter: Layouter, location_mask: BitSet, @@ -317,6 +374,8 @@ impl Validator { Validator { flags, capabilities, + subgroup_stages: ShaderStages::empty(), + subgroup_operations: SubgroupOperationSet::empty(), types: Vec::new(), layouter: Layouter::default(), location_mask: BitSet::new(), @@ -329,6 +388,16 @@ impl Validator { } } + pub fn subgroup_stages(&mut self, stages: ShaderStages) -> &mut Self { + self.subgroup_stages = stages; + self + } + + pub fn subgroup_operations(&mut self, operations: SubgroupOperationSet) -> &mut Self { + self.subgroup_operations = operations; + self + } + /// Reset the validator internals pub fn reset(&mut self) { self.types.clear(); diff --git a/naga/src/valid/type.rs b/naga/src/valid/type.rs index 03e87fd99b..f5b9856074 100644 --- a/naga/src/valid/type.rs +++ b/naga/src/valid/type.rs @@ -328,7 +328,6 @@ impl super::Validator { TypeFlags::DATA | TypeFlags::SIZED | TypeFlags::COPY - | TypeFlags::HOST_SHAREABLE | TypeFlags::ARGUMENT | TypeFlags::CONSTRUCTIBLE | shareable, diff --git a/naga/tests/in/overrides.wgsl b/naga/tests/in/overrides.wgsl index 6173c3463f..a746ce1c76 100644 --- a/naga/tests/in/overrides.wgsl +++ b/naga/tests/in/overrides.wgsl @@ -14,6 +14,7 @@ override inferred_f32 = 2.718; var gain_x_10: f32 = gain * 10.; +var store_override: f32; @compute @workgroup_size(1) fn main() { @@ -22,4 +23,6 @@ fn main() { var x = a; var gain_x_100 = gain_x_10 * 10.; + + store_override = gain; } diff --git a/naga/tests/in/spv/spec-constants-issue-5598.spv b/naga/tests/in/spv/spec-constants-issue-5598.spv new file mode 100644 index 0000000000..2f32de970d Binary files /dev/null and b/naga/tests/in/spv/spec-constants-issue-5598.spv differ diff --git a/naga/tests/in/spv/spec-constants-issue-5598.spvasm 
b/naga/tests/in/spv/spec-constants-issue-5598.spvasm new file mode 100644 index 0000000000..a1fdbcbdd8 --- /dev/null +++ b/naga/tests/in/spv/spec-constants-issue-5598.spvasm @@ -0,0 +1,96 @@ +; SPIR-V +; Version: 1.5 +; Generator: Google rspirv; 0 +; Bound: 68 +; Schema: 0 + OpCapability Shader + OpCapability VulkanMemoryModel + OpMemoryModel Logical Vulkan + OpEntryPoint Fragment %1 "fragment" %gl_FragCoord %3 + OpEntryPoint Vertex %4 "vertex" %gl_VertexIndex %gl_Position + OpExecutionMode %1 OriginUpperLeft + OpDecorate %gl_FragCoord BuiltIn FragCoord + OpDecorate %10 SpecId 100 + OpDecorate %3 Location 0 + OpDecorate %_arr_v4float_uint_6 ArrayStride 16 + OpDecorate %gl_VertexIndex BuiltIn VertexIndex + OpDecorate %gl_Position BuiltIn Position + OpDecorate %gl_Position Invariant + %float = OpTypeFloat 32 + %v4float = OpTypeVector %float 4 +%_ptr_Input_v4float = OpTypePointer Input %v4float +%_ptr_Output_v4float = OpTypePointer Output %v4float + %void = OpTypeVoid + %17 = OpTypeFunction %void +%gl_FragCoord = OpVariable %_ptr_Input_v4float Input + %bool = OpTypeBool + %uint = OpTypeInt 32 0 + %10 = OpSpecConstant %uint 2 + %uint_1 = OpConstant %uint 1 + %v2float = OpTypeVector %float 2 +%_ptr_Output_float = OpTypePointer Output %float + %3 = OpVariable %_ptr_Output_v4float Output + %uint_0 = OpConstant %uint 0 +%_ptr_Input_uint = OpTypePointer Input %uint + %uint_6 = OpConstant %uint 6 +%_arr_v4float_uint_6 = OpTypeArray %v4float %uint_6 +%_ptr_Function__arr_v4float_uint_6 = OpTypePointer Function %_arr_v4float_uint_6 +%gl_VertexIndex = OpVariable %_ptr_Input_uint Input + %float_n1 = OpConstant %float -1 + %float_0 = OpConstant %float 0 + %float_1 = OpConstant %float 1 + %32 = OpConstantComposite %v4float %float_n1 %float_n1 %float_0 %float_1 + %33 = OpConstantComposite %v4float %float_1 %float_n1 %float_0 %float_1 + %34 = OpConstantComposite %v4float %float_1 %float_1 %float_0 %float_1 + %35 = OpConstantComposite %v4float %float_n1 %float_1 %float_0 %float_1 + %36 = OpConstantComposite %_arr_v4float_uint_6 %32 %33 %34 %34 %35 %32 +%_ptr_Function_v4float = OpTypePointer Function %v4float +%gl_Position = OpVariable %_ptr_Output_v4float Output + %float_0_25 = OpConstant %float 0.25 + %float_0_5 = OpConstant %float 0.5 + %1 = OpFunction %void None %17 + %38 = OpLabel + %39 = OpLoad %v4float %gl_FragCoord + %40 = OpCompositeExtract %float %39 0 + %41 = OpCompositeExtract %float %39 1 + %42 = OpIEqual %bool %10 %uint_1 + OpSelectionMerge %43 None + OpBranchConditional %42 %44 %45 + %44 = OpLabel + %46 = OpFMul %float %40 %float_0_5 + %47 = OpFMul %float %41 %float_0_5 + %48 = OpCompositeConstruct %v2float %46 %47 + OpBranch %43 + %45 = OpLabel + %49 = OpFMul %float %40 %float_0_25 + %50 = OpFMul %float %41 %float_0_25 + %51 = OpCompositeConstruct %v2float %49 %50 + OpBranch %43 + %43 = OpLabel + %52 = OpPhi %v2float %48 %44 %51 %45 + %53 = OpCompositeExtract %float %52 0 + %54 = OpAccessChain %_ptr_Output_float %3 %uint_0 + OpStore %54 %53 + %55 = OpCompositeExtract %float %52 1 + %56 = OpAccessChain %_ptr_Output_float %3 %uint_1 + OpStore %56 %55 + OpReturn + OpFunctionEnd + %4 = OpFunction %void None %17 + %57 = OpLabel + %58 = OpVariable %_ptr_Function__arr_v4float_uint_6 Function + %59 = OpLoad %uint %gl_VertexIndex + OpStore %58 %36 + %60 = OpULessThan %bool %59 %uint_6 + OpSelectionMerge %61 None + OpBranchConditional %60 %62 %63 + %62 = OpLabel + %64 = OpInBoundsAccessChain %_ptr_Function_v4float %58 %59 + %65 = OpLoad %v4float %64 + OpStore %gl_Position %65 + OpBranch %61 + %63 = 
OpLabel + OpBranch %61 + %61 = OpLabel + OpReturn + OpFunctionEnd diff --git a/naga/tests/in/spv/subgroup-operations-s.param.ron b/naga/tests/in/spv/subgroup-operations-s.param.ron new file mode 100644 index 0000000000..122542d1f6 --- /dev/null +++ b/naga/tests/in/spv/subgroup-operations-s.param.ron @@ -0,0 +1,27 @@ +( + god_mode: true, + spv: ( + version: (1, 3), + ), + msl: ( + lang_version: (2, 4), + per_entry_point_map: {}, + inline_samplers: [], + spirv_cross_compatibility: false, + fake_missing_bindings: false, + zero_initialize_workgroup_memory: true, + ), + glsl: ( + version: Desktop(430), + writer_flags: (""), + binding_map: { }, + zero_initialize_workgroup_memory: true, + ), + hlsl: ( + shader_model: V6_0, + binding_map: {}, + fake_missing_bindings: true, + special_constants_binding: None, + zero_initialize_workgroup_memory: true, + ), +) diff --git a/naga/tests/in/spv/subgroup-operations-s.spv b/naga/tests/in/spv/subgroup-operations-s.spv new file mode 100644 index 0000000000..d4bf0191db Binary files /dev/null and b/naga/tests/in/spv/subgroup-operations-s.spv differ diff --git a/naga/tests/in/spv/subgroup-operations-s.spvasm b/naga/tests/in/spv/subgroup-operations-s.spvasm new file mode 100644 index 0000000000..72c68aa46c --- /dev/null +++ b/naga/tests/in/spv/subgroup-operations-s.spvasm @@ -0,0 +1,75 @@ +; SPIR-V +; Version: 1.3 +; Generator: rspirv +; Bound: 54 +OpCapability Shader +OpCapability GroupNonUniform +OpCapability GroupNonUniformBallot +OpCapability GroupNonUniformVote +OpCapability GroupNonUniformArithmetic +OpCapability GroupNonUniformShuffle +OpCapability GroupNonUniformShuffleRelative +%1 = OpExtInstImport "GLSL.std.450" +OpMemoryModel Logical GLSL450 +OpEntryPoint GLCompute %15 "main" %6 %9 %11 %13 +OpExecutionMode %15 LocalSize 1 1 1 +OpDecorate %6 BuiltIn NumSubgroups +OpDecorate %9 BuiltIn SubgroupId +OpDecorate %11 BuiltIn SubgroupSize +OpDecorate %13 BuiltIn SubgroupLocalInvocationId +%2 = OpTypeVoid +%3 = OpTypeInt 32 0 +%4 = OpTypeBool +%7 = OpTypePointer Input %3 +%6 = OpVariable %7 Input +%9 = OpVariable %7 Input +%11 = OpVariable %7 Input +%13 = OpVariable %7 Input +%16 = OpTypeFunction %2 +%17 = OpConstant %3 1 +%18 = OpConstant %3 0 +%19 = OpConstant %3 4 +%21 = OpConstant %3 3 +%22 = OpConstant %3 2 +%23 = OpConstant %3 8 +%26 = OpTypeVector %3 4 +%28 = OpConstantTrue %4 +%15 = OpFunction %2 None %16 +%5 = OpLabel +%8 = OpLoad %3 %6 +%10 = OpLoad %3 %9 +%12 = OpLoad %3 %11 +%14 = OpLoad %3 %13 +OpBranch %20 +%20 = OpLabel +OpControlBarrier %21 %22 %23 +%24 = OpBitwiseAnd %3 %14 %17 +%25 = OpIEqual %4 %24 %17 +%27 = OpGroupNonUniformBallot %26 %21 %25 +%29 = OpGroupNonUniformBallot %26 %21 %28 +%30 = OpINotEqual %4 %14 %18 +%31 = OpGroupNonUniformAll %4 %21 %30 +%32 = OpIEqual %4 %14 %18 +%33 = OpGroupNonUniformAny %4 %21 %32 +%34 = OpGroupNonUniformIAdd %3 %21 Reduce %14 +%35 = OpGroupNonUniformIMul %3 %21 Reduce %14 +%36 = OpGroupNonUniformUMin %3 %21 Reduce %14 +%37 = OpGroupNonUniformUMax %3 %21 Reduce %14 +%38 = OpGroupNonUniformBitwiseAnd %3 %21 Reduce %14 +%39 = OpGroupNonUniformBitwiseOr %3 %21 Reduce %14 +%40 = OpGroupNonUniformBitwiseXor %3 %21 Reduce %14 +%41 = OpGroupNonUniformIAdd %3 %21 ExclusiveScan %14 +%42 = OpGroupNonUniformIMul %3 %21 ExclusiveScan %14 +%43 = OpGroupNonUniformIAdd %3 %21 InclusiveScan %14 +%44 = OpGroupNonUniformIMul %3 %21 InclusiveScan %14 +%45 = OpGroupNonUniformBroadcastFirst %3 %21 %14 +%46 = OpGroupNonUniformBroadcast %3 %21 %14 %19 +%47 = OpISub %3 %12 %17 +%48 = OpISub %3 %47 %14 +%49 = 
OpGroupNonUniformShuffle %3 %21 %14 %48 +%50 = OpGroupNonUniformShuffleDown %3 %21 %14 %17 +%51 = OpGroupNonUniformShuffleUp %3 %21 %14 %17 +%52 = OpISub %3 %12 %17 +%53 = OpGroupNonUniformShuffleXor %3 %21 %14 %52 +OpReturn +OpFunctionEnd \ No newline at end of file diff --git a/naga/tests/in/subgroup-operations.param.ron b/naga/tests/in/subgroup-operations.param.ron new file mode 100644 index 0000000000..122542d1f6 --- /dev/null +++ b/naga/tests/in/subgroup-operations.param.ron @@ -0,0 +1,27 @@ +( + god_mode: true, + spv: ( + version: (1, 3), + ), + msl: ( + lang_version: (2, 4), + per_entry_point_map: {}, + inline_samplers: [], + spirv_cross_compatibility: false, + fake_missing_bindings: false, + zero_initialize_workgroup_memory: true, + ), + glsl: ( + version: Desktop(430), + writer_flags: (""), + binding_map: { }, + zero_initialize_workgroup_memory: true, + ), + hlsl: ( + shader_model: V6_0, + binding_map: {}, + fake_missing_bindings: true, + special_constants_binding: None, + zero_initialize_workgroup_memory: true, + ), +) diff --git a/naga/tests/in/subgroup-operations.wgsl b/naga/tests/in/subgroup-operations.wgsl new file mode 100644 index 0000000000..bb6eb47fb5 --- /dev/null +++ b/naga/tests/in/subgroup-operations.wgsl @@ -0,0 +1,37 @@ +struct Structure { + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_size) subgroup_size: u32, +}; + +@compute @workgroup_size(1) +fn main( + sizes: Structure, + @builtin(subgroup_id) subgroup_id: u32, + @builtin(subgroup_invocation_id) subgroup_invocation_id: u32, +) { + subgroupBarrier(); + + subgroupBallot((subgroup_invocation_id & 1u) == 1u); + subgroupBallot(); + + subgroupAll(subgroup_invocation_id != 0u); + subgroupAny(subgroup_invocation_id == 0u); + subgroupAdd(subgroup_invocation_id); + subgroupMul(subgroup_invocation_id); + subgroupMin(subgroup_invocation_id); + subgroupMax(subgroup_invocation_id); + subgroupAnd(subgroup_invocation_id); + subgroupOr(subgroup_invocation_id); + subgroupXor(subgroup_invocation_id); + subgroupExclusiveAdd(subgroup_invocation_id); + subgroupExclusiveMul(subgroup_invocation_id); + subgroupInclusiveAdd(subgroup_invocation_id); + subgroupInclusiveMul(subgroup_invocation_id); + + subgroupBroadcastFirst(subgroup_invocation_id); + subgroupBroadcast(subgroup_invocation_id, 4u); + subgroupShuffle(subgroup_invocation_id, sizes.subgroup_size - 1u - subgroup_invocation_id); + subgroupShuffleDown(subgroup_invocation_id, 1u); + subgroupShuffleUp(subgroup_invocation_id, 1u); + subgroupShuffleXor(subgroup_invocation_id, sizes.subgroup_size - 1u); +} diff --git a/naga/tests/out/analysis/overrides.info.ron b/naga/tests/out/analysis/overrides.info.ron index 00d8ce1ea8..12fa4b339f 100644 --- a/naga/tests/out/analysis/overrides.info.ron +++ b/naga/tests/out/analysis/overrides.info.ron @@ -16,6 +16,7 @@ sampling_set: [], global_uses: [ ("READ"), + ("WRITE"), ], expressions: [ ( @@ -138,6 +139,27 @@ space: Function, )), ), + ( + uniformity: ( + non_uniform_result: Some(12), + requirements: (""), + ), + ref_count: 1, + assignable_global: Some(2), + ty: Value(Pointer( + base: 2, + space: Private, + )), + ), + ( + uniformity: ( + non_uniform_result: None, + requirements: (""), + ), + ref_count: 1, + assignable_global: None, + ty: Handle(2), + ), ], sampling: [], dual_source_blending: false, diff --git a/naga/tests/out/glsl/overrides.main.Compute.glsl b/naga/tests/out/glsl/overrides.main.Compute.glsl index b6d86d50ba..d1170df962 100644 --- a/naga/tests/out/glsl/overrides.main.Compute.glsl +++ 
b/naga/tests/out/glsl/overrides.main.Compute.glsl @@ -15,6 +15,8 @@ const float inferred_f32_ = 2.718; float gain_x_10_ = 11.0; +float store_override = 0.0; + void main() { float t = 23.0; @@ -23,6 +25,7 @@ void main() { x = true; float _e9 = gain_x_10_; gain_x_100_ = (_e9 * 10.0); + store_override = gain; return; } diff --git a/naga/tests/out/glsl/spec-constants-issue-5598.fragment.Fragment.glsl b/naga/tests/out/glsl/spec-constants-issue-5598.fragment.Fragment.glsl new file mode 100644 index 0000000000..e81d8fa1b1 --- /dev/null +++ b/naga/tests/out/glsl/spec-constants-issue-5598.fragment.Fragment.glsl @@ -0,0 +1,34 @@ +#version 310 es + +precision highp float; +precision highp int; + +vec4 global = vec4(0.0); + +vec4 global_1 = vec4(0.0); + +layout(location = 0) out vec4 _fs2p_location0; + +void function() { + vec2 phi_52_ = vec2(0.0); + vec4 _e7 = global; + if (false) { + phi_52_ = vec2((_e7.x * 0.5), (_e7.y * 0.5)); + } else { + phi_52_ = vec2((_e7.x * 0.25), (_e7.y * 0.25)); + } + vec2 _e20 = phi_52_; + global_1[0u] = _e20.x; + global_1[1u] = _e20.y; + return; +} + +void main() { + vec4 param = gl_FragCoord; + global = param; + function(); + vec4 _e3 = global_1; + _fs2p_location0 = _e3; + return; +} + diff --git a/naga/tests/out/glsl/spec-constants-issue-5598.vertex.Vertex.glsl b/naga/tests/out/glsl/spec-constants-issue-5598.vertex.Vertex.glsl new file mode 100644 index 0000000000..256e9380ac --- /dev/null +++ b/naga/tests/out/glsl/spec-constants-issue-5598.vertex.Vertex.glsl @@ -0,0 +1,34 @@ +#version 310 es + +precision highp float; +precision highp int; + +uint global_2 = 0u; + +vec4 global_3 = vec4(0.0, 0.0, 0.0, 1.0); + +invariant gl_Position; + +void function_1() { + vec4 local[6] = vec4[6](vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0)); + uint _e5 = global_2; + local = vec4[6](vec4(-1.0, -1.0, 0.0, 1.0), vec4(1.0, -1.0, 0.0, 1.0), vec4(1.0, 1.0, 0.0, 1.0), vec4(1.0, 1.0, 0.0, 1.0), vec4(-1.0, 1.0, 0.0, 1.0), vec4(-1.0, -1.0, 0.0, 1.0)); + if ((_e5 < 6u)) { + vec4 _e8 = local[_e5]; + global_3 = _e8; + } + return; +} + +void main() { + uint param_1 = uint(gl_VertexID); + global_2 = param_1; + function_1(); + float _e4 = global_3.y; + global_3.y = -(_e4); + vec4 _e6 = global_3; + gl_Position = _e6; + gl_Position.yz = vec2(-gl_Position.y, gl_Position.z * 2.0 - gl_Position.w); + return; +} + diff --git a/naga/tests/out/glsl/subgroup-operations-s.main.Compute.glsl b/naga/tests/out/glsl/subgroup-operations-s.main.Compute.glsl new file mode 100644 index 0000000000..cc1aac5417 --- /dev/null +++ b/naga/tests/out/glsl/subgroup-operations-s.main.Compute.glsl @@ -0,0 +1,58 @@ +#version 430 core +#extension GL_ARB_compute_shader : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +uint num_subgroups_1 = 0u; + +uint subgroup_id_1 = 0u; + +uint subgroup_size_1 = 0u; + +uint subgroup_invocation_id_1 = 0u; + + +void main_1() { + uint _e5 = subgroup_size_1; + uint _e6 = subgroup_invocation_id_1; + uvec4 _e9 = subgroupBallot(((_e6 & 1u) == 1u)); + uvec4 _e10 = subgroupBallot(true); + bool _e12 = subgroupAll((_e6 != 0u)); + bool _e14 = subgroupAny((_e6 == 0u)); + uint _e15 = subgroupAdd(_e6); + uint _e16 = subgroupMul(_e6); + uint _e17 = 
subgroupMin(_e6); + uint _e18 = subgroupMax(_e6); + uint _e19 = subgroupAnd(_e6); + uint _e20 = subgroupOr(_e6); + uint _e21 = subgroupXor(_e6); + uint _e22 = subgroupExclusiveAdd(_e6); + uint _e23 = subgroupExclusiveMul(_e6); + uint _e24 = subgroupInclusiveAdd(_e6); + uint _e25 = subgroupInclusiveMul(_e6); + uint _e26 = subgroupBroadcastFirst(_e6); + uint _e27 = subgroupBroadcast(_e6, 4u); + uint _e30 = subgroupShuffle(_e6, ((_e5 - 1u) - _e6)); + uint _e31 = subgroupShuffleDown(_e6, 1u); + uint _e32 = subgroupShuffleUp(_e6, 1u); + uint _e34 = subgroupShuffleXor(_e6, (_e5 - 1u)); + return; +} + +void main() { + uint num_subgroups = gl_NumSubgroups; + uint subgroup_id = gl_SubgroupID; + uint subgroup_size = gl_SubgroupSize; + uint subgroup_invocation_id = gl_SubgroupInvocationID; + num_subgroups_1 = num_subgroups; + subgroup_id_1 = subgroup_id; + subgroup_size_1 = subgroup_size; + subgroup_invocation_id_1 = subgroup_invocation_id; + main_1(); +} + diff --git a/naga/tests/out/glsl/subgroup-operations.main.Compute.glsl b/naga/tests/out/glsl/subgroup-operations.main.Compute.glsl new file mode 100644 index 0000000000..05ab403565 --- /dev/null +++ b/naga/tests/out/glsl/subgroup-operations.main.Compute.glsl @@ -0,0 +1,45 @@ +#version 430 core +#extension GL_ARB_compute_shader : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +struct Structure { + uint num_subgroups; + uint subgroup_size; +}; + +void main() { + Structure sizes = Structure(gl_NumSubgroups, gl_SubgroupSize); + uint subgroup_id = gl_SubgroupID; + uint subgroup_invocation_id = gl_SubgroupInvocationID; + subgroupMemoryBarrier(); + barrier(); + uvec4 _e7 = subgroupBallot(((subgroup_invocation_id & 1u) == 1u)); + uvec4 _e8 = subgroupBallot(true); + bool _e11 = subgroupAll((subgroup_invocation_id != 0u)); + bool _e14 = subgroupAny((subgroup_invocation_id == 0u)); + uint _e15 = subgroupAdd(subgroup_invocation_id); + uint _e16 = subgroupMul(subgroup_invocation_id); + uint _e17 = subgroupMin(subgroup_invocation_id); + uint _e18 = subgroupMax(subgroup_invocation_id); + uint _e19 = subgroupAnd(subgroup_invocation_id); + uint _e20 = subgroupOr(subgroup_invocation_id); + uint _e21 = subgroupXor(subgroup_invocation_id); + uint _e22 = subgroupExclusiveAdd(subgroup_invocation_id); + uint _e23 = subgroupExclusiveMul(subgroup_invocation_id); + uint _e24 = subgroupInclusiveAdd(subgroup_invocation_id); + uint _e25 = subgroupInclusiveMul(subgroup_invocation_id); + uint _e26 = subgroupBroadcastFirst(subgroup_invocation_id); + uint _e28 = subgroupBroadcast(subgroup_invocation_id, 4u); + uint _e33 = subgroupShuffle(subgroup_invocation_id, ((sizes.subgroup_size - 1u) - subgroup_invocation_id)); + uint _e35 = subgroupShuffleDown(subgroup_invocation_id, 1u); + uint _e37 = subgroupShuffleUp(subgroup_invocation_id, 1u); + uint _e41 = subgroupShuffleXor(subgroup_invocation_id, (sizes.subgroup_size - 1u)); + return; +} + diff --git a/naga/tests/out/hlsl/access.hlsl b/naga/tests/out/hlsl/access.hlsl index 47d9cc24f7..4f0cb4b839 100644 --- a/naga/tests/out/hlsl/access.hlsl +++ b/naga/tests/out/hlsl/access.hlsl @@ -158,10 +158,15 @@ MatCx2InArray ConstructMatCx2InArray(float4x2 arg0[2]) { return ret; } +typedef float4x2 
ret_ZeroValuearray2_float4x2_[2]; +ret_ZeroValuearray2_float4x2_ ZeroValuearray2_float4x2_() { + return (float4x2[2])0; +} + void test_matrix_within_array_within_struct_accesses() { int idx_1 = 1; - MatCx2InArray t_1 = ConstructMatCx2InArray((float4x2[2])0); + MatCx2InArray t_1 = ConstructMatCx2InArray(ZeroValuearray2_float4x2_()); int _expr3 = idx_1; idx_1 = (_expr3 - 1); @@ -180,7 +185,7 @@ void test_matrix_within_array_within_struct_accesses() float l7_ = __get_col_of_mat4x2(nested_mat_cx2_.am[0], _expr46)[_expr48]; int _expr55 = idx_1; idx_1 = (_expr55 + 1); - t_1.am = (__mat4x2[2])(float4x2[2])0; + t_1.am = (__mat4x2[2])ZeroValuearray2_float4x2_(); t_1.am[0] = (__mat4x2)float4x2((8.0).xx, (7.0).xx, (6.0).xx, (5.0).xx); t_1.am[0]._0 = (9.0).xx; int _expr77 = idx_1; @@ -231,6 +236,11 @@ ret_Constructarray5_int_ Constructarray5_int_(int arg0, int arg1, int arg2, int return ret; } +typedef float ret_ZeroValuearray5_array10_float__[5][10]; +ret_ZeroValuearray5_array10_float__ ZeroValuearray5_array10_float__() { + return (float[5][10])0; +} + typedef uint2 ret_Constructarray2_uint2_[2]; ret_Constructarray2_uint2_ Constructarray2_uint2_(uint2 arg0, uint2 arg1) { uint2 ret[2] = { arg0, arg1 }; @@ -262,10 +272,14 @@ float4 foo_vert(uint vi : SV_VertexID) : SV_Position c2_ = Constructarray5_int_(a_1, int(b), 3, 4, 5); c2_[(vi + 1u)] = 42; int value = c2_[vi]; - const float _e47 = test_arr_as_arg((float[5][10])0); + const float _e47 = test_arr_as_arg(ZeroValuearray5_array10_float__()); return float4(mul(float4((value).xxxx), _matrix), 2.0); } +int2 ZeroValueint2() { + return (int2)0; +} + float4 foo_frag() : SV_Target0 { bar.Store(8+16+0, asuint(1.0)); @@ -282,7 +296,7 @@ float4 foo_frag() : SV_Target0 bar.Store2(144+8, asuint(_value2[1])); } bar.Store(0+8+160, asuint(1)); - qux.Store2(0, asuint((int2)0)); + qux.Store2(0, asuint(ZeroValueint2())); return (0.0).xxxx; } diff --git a/naga/tests/out/hlsl/constructors.hlsl b/naga/tests/out/hlsl/constructors.hlsl index 39f3137605..90d8db9a33 100644 --- a/naga/tests/out/hlsl/constructors.hlsl +++ b/naga/tests/out/hlsl/constructors.hlsl @@ -18,17 +18,50 @@ ret_Constructarray4_int_ Constructarray4_int_(int arg0, int arg1, int arg2, int return ret; } +bool ZeroValuebool() { + return (bool)0; +} + +int ZeroValueint() { + return (int)0; +} + +uint ZeroValueuint() { + return (uint)0; +} + +float ZeroValuefloat() { + return (float)0; +} + +uint2 ZeroValueuint2() { + return (uint2)0; +} + +float2x2 ZeroValuefloat2x2() { + return (float2x2)0; +} + +typedef Foo ret_ZeroValuearray3_Foo_[3]; +ret_ZeroValuearray3_Foo_ ZeroValuearray3_Foo_() { + return (Foo[3])0; +} + +Foo ZeroValueFoo() { + return (Foo)0; +} + static const float3 const2_ = float3(0.0, 1.0, 2.0); static const float2x2 const3_ = float2x2(float2(0.0, 1.0), float2(2.0, 3.0)); static const float2x2 const4_[1] = Constructarray1_float2x2_(float2x2(float2(0.0, 1.0), float2(2.0, 3.0))); -static const bool cz0_ = (bool)0; -static const int cz1_ = (int)0; -static const uint cz2_ = (uint)0; -static const float cz3_ = (float)0; -static const uint2 cz4_ = (uint2)0; -static const float2x2 cz5_ = (float2x2)0; -static const Foo cz6_[3] = (Foo[3])0; -static const Foo cz7_ = (Foo)0; +static const bool cz0_ = ZeroValuebool(); +static const int cz1_ = ZeroValueint(); +static const uint cz2_ = ZeroValueuint(); +static const float cz3_ = ZeroValuefloat(); +static const uint2 cz4_ = ZeroValueuint2(); +static const float2x2 cz5_ = ZeroValuefloat2x2(); +static const Foo cz6_[3] = ZeroValuearray3_Foo_(); +static const Foo 
cz7_ = ZeroValueFoo(); static const int cp3_[4] = Constructarray4_int_(0, 1, 2, 3); Foo ConstructFoo(float4 arg0, int arg1) { @@ -38,6 +71,10 @@ Foo ConstructFoo(float4 arg0, int arg1) { return ret; } +float2x3 ZeroValuefloat2x3() { + return (float2x3)0; +} + [numthreads(1, 1, 1)] void main() { diff --git a/naga/tests/out/hlsl/globals.hlsl b/naga/tests/out/hlsl/globals.hlsl index 55faf060d0..adf0b28b89 100644 --- a/naga/tests/out/hlsl/globals.hlsl +++ b/naga/tests/out/hlsl/globals.hlsl @@ -71,6 +71,10 @@ void test_msl_packed_vec3_as_arg(float3 arg) return; } +float3x3 ZeroValuefloat3x3() { + return (float3x3)0; +} + FooStruct ConstructFooStruct(float3 arg0, float arg1) { FooStruct ret = (FooStruct)0; ret.v3_ = arg0; @@ -91,8 +95,8 @@ void test_msl_packed_vec3_() float3 l0_ = data.v3_; float2 l1_ = data.v3_.zx; test_msl_packed_vec3_as_arg(data.v3_); - float3 mvm0_ = mul((float3x3)0, data.v3_); - float3 mvm1_ = mul(data.v3_, (float3x3)0); + float3 mvm0_ = mul(ZeroValuefloat3x3(), data.v3_); + float3 mvm1_ = mul(data.v3_, ZeroValuefloat3x3()); float3 svm0_ = (data.v3_ * 2.0); float3 svm1_ = (2.0 * data.v3_); } diff --git a/naga/tests/out/hlsl/math-functions.hlsl b/naga/tests/out/hlsl/math-functions.hlsl index 61c59f00c1..c1a771c25d 100644 --- a/naga/tests/out/hlsl/math-functions.hlsl +++ b/naga/tests/out/hlsl/math-functions.hlsl @@ -63,6 +63,10 @@ _frexp_result_vec4_f32_ naga_frexp(float4 arg) { return result; } +int2 ZeroValueint2() { + return (int2)0; +} + void main() { float4 v = (0.0).xxxx; @@ -74,7 +78,7 @@ void main() float4 g = refract(v, v, 1.0); int4 sign_b = int4(-1, -1, -1, -1); float4 sign_d = float4(-1.0, -1.0, -1.0, -1.0); - int const_dot = dot((int2)0, (int2)0); + int const_dot = dot(ZeroValueint2(), ZeroValueint2()); uint first_leading_bit_abs = firstbithigh(0u); int flb_a = asint(firstbithigh(-1)); int2 flb_b = asint(firstbithigh((-1).xx)); diff --git a/naga/tests/out/hlsl/operators.hlsl b/naga/tests/out/hlsl/operators.hlsl index 58ec5a170d..eab1a8d9fa 100644 --- a/naga/tests/out/hlsl/operators.hlsl +++ b/naga/tests/out/hlsl/operators.hlsl @@ -55,6 +55,18 @@ void logical() bool4 bitwise_and1_ = ((true).xxxx & (false).xxxx); } +float3x3 ZeroValuefloat3x3() { + return (float3x3)0; +} + +float4x3 ZeroValuefloat4x3() { + return (float4x3)0; +} + +float3x4 ZeroValuefloat3x4() { + return (float3x4)0; +} + void arithmetic() { float neg0_1 = -(1.0); @@ -122,13 +134,13 @@ void arithmetic() float2 rem4_1 = fmod((2.0).xx, (1.0).xx); float2 rem5_1 = fmod((2.0).xx, (1.0).xx); } - float3x3 add = ((float3x3)0 + (float3x3)0); - float3x3 sub = ((float3x3)0 - (float3x3)0); - float3x3 mul_scalar0_ = mul(1.0, (float3x3)0); - float3x3 mul_scalar1_ = mul((float3x3)0, 2.0); - float3 mul_vector0_ = mul((1.0).xxxx, (float4x3)0); - float4 mul_vector1_ = mul((float4x3)0, (2.0).xxx); - float3x3 mul_ = mul((float3x4)0, (float4x3)0); + float3x3 add = (ZeroValuefloat3x3() + ZeroValuefloat3x3()); + float3x3 sub = (ZeroValuefloat3x3() - ZeroValuefloat3x3()); + float3x3 mul_scalar0_ = mul(1.0, ZeroValuefloat3x3()); + float3x3 mul_scalar1_ = mul(ZeroValuefloat3x3(), 2.0); + float3 mul_vector0_ = mul((1.0).xxxx, ZeroValuefloat4x3()); + float4 mul_vector1_ = mul(ZeroValuefloat4x3(), (2.0).xxx); + float3x3 mul_ = mul(ZeroValuefloat3x4(), ZeroValuefloat4x3()); } void bit() @@ -199,10 +211,14 @@ void comparison() bool4 gte5_ = ((2.0).xxxx >= (1.0).xxxx); } +int3 ZeroValueint3() { + return (int3)0; +} + void assignment() { int a_1 = (int)0; - int3 vec0_ = (int3)0; + int3 vec0_ = ZeroValueint3(); a_1 = 1; int _expr5 
= a_1; diff --git a/naga/tests/out/hlsl/overrides.hlsl b/naga/tests/out/hlsl/overrides.hlsl index b0582d544e..a7c49f9ba1 100644 --- a/naga/tests/out/hlsl/overrides.hlsl +++ b/naga/tests/out/hlsl/overrides.hlsl @@ -7,6 +7,7 @@ static const float height = 4.6; static const float inferred_f32_ = 2.718; static float gain_x_10_ = 11.0; +static float store_override = (float)0; [numthreads(1, 1, 1)] void main() @@ -18,5 +19,6 @@ void main() x = true; float _expr9 = gain_x_10_; gain_x_100_ = (_expr9 * 10.0); + store_override = gain; return; } diff --git a/naga/tests/out/hlsl/quad-vert.hlsl b/naga/tests/out/hlsl/quad-vert.hlsl index 4505858a4f..5c4eeb7ecc 100644 --- a/naga/tests/out/hlsl/quad-vert.hlsl +++ b/naga/tests/out/hlsl/quad-vert.hlsl @@ -20,9 +20,14 @@ gl_PerVertex Constructgl_PerVertex(float4 arg0, float arg1, float arg2[1], float return ret; } +typedef float ret_ZeroValuearray1_float_[1]; +ret_ZeroValuearray1_float_ ZeroValuearray1_float_() { + return (float[1])0; +} + static float2 v_uv = (float2)0; static float2 a_uv_1 = (float2)0; -static gl_PerVertex unnamed = Constructgl_PerVertex(float4(0.0, 0.0, 0.0, 1.0), 1.0, (float[1])0, (float[1])0); +static gl_PerVertex unnamed = Constructgl_PerVertex(float4(0.0, 0.0, 0.0, 1.0), 1.0, ZeroValuearray1_float_(), ZeroValuearray1_float_()); static float2 a_pos_1 = (float2)0; struct VertexOutput_main { diff --git a/naga/tests/out/hlsl/subgroup-operations-s.hlsl b/naga/tests/out/hlsl/subgroup-operations-s.hlsl new file mode 100644 index 0000000000..d963e91503 --- /dev/null +++ b/naga/tests/out/hlsl/subgroup-operations-s.hlsl @@ -0,0 +1,50 @@ +static uint num_subgroups_1 = (uint)0; +static uint subgroup_id_1 = (uint)0; +static uint subgroup_size_1 = (uint)0; +static uint subgroup_invocation_id_1 = (uint)0; + +struct ComputeInput_main { + uint __local_invocation_index : SV_GroupIndex; +}; + +void main_1() +{ + uint _expr5 = subgroup_size_1; + uint _expr6 = subgroup_invocation_id_1; + const uint4 _e9 = WaveActiveBallot(((_expr6 & 1u) == 1u)); + const uint4 _e10 = WaveActiveBallot(true); + const bool _e12 = WaveActiveAllTrue((_expr6 != 0u)); + const bool _e14 = WaveActiveAnyTrue((_expr6 == 0u)); + const uint _e15 = WaveActiveSum(_expr6); + const uint _e16 = WaveActiveProduct(_expr6); + const uint _e17 = WaveActiveMin(_expr6); + const uint _e18 = WaveActiveMax(_expr6); + const uint _e19 = WaveActiveBitAnd(_expr6); + const uint _e20 = WaveActiveBitOr(_expr6); + const uint _e21 = WaveActiveBitXor(_expr6); + const uint _e22 = WavePrefixSum(_expr6); + const uint _e23 = WavePrefixProduct(_expr6); + const uint _e24 = _expr6 + WavePrefixSum(_expr6); + const uint _e25 = _expr6 * WavePrefixProduct(_expr6); + const uint _e26 = WaveReadLaneFirst(_expr6); + const uint _e27 = WaveReadLaneAt(_expr6, 4u); + const uint _e30 = WaveReadLaneAt(_expr6, ((_expr5 - 1u) - _expr6)); + const uint _e31 = WaveReadLaneAt(_expr6, WaveGetLaneIndex() + 1u); + const uint _e32 = WaveReadLaneAt(_expr6, WaveGetLaneIndex() - 1u); + const uint _e34 = WaveReadLaneAt(_expr6, WaveGetLaneIndex() ^ (_expr5 - 1u)); + return; +} + +[numthreads(1, 1, 1)] +void main(ComputeInput_main computeinput_main) +{ + uint num_subgroups = (1u + WaveGetLaneCount() - 1u) / WaveGetLaneCount(); + uint subgroup_id = computeinput_main.__local_invocation_index / WaveGetLaneCount(); + uint subgroup_size = WaveGetLaneCount(); + uint subgroup_invocation_id = WaveGetLaneIndex(); + num_subgroups_1 = num_subgroups; + subgroup_id_1 = subgroup_id; + subgroup_size_1 = subgroup_size; + subgroup_invocation_id_1 = 
subgroup_invocation_id; + main_1(); +} diff --git a/naga/tests/out/hlsl/subgroup-operations-s.ron b/naga/tests/out/hlsl/subgroup-operations-s.ron new file mode 100644 index 0000000000..b973fe3da1 --- /dev/null +++ b/naga/tests/out/hlsl/subgroup-operations-s.ron @@ -0,0 +1,12 @@ +( + vertex:[ + ], + fragment:[ + ], + compute:[ + ( + entry_point:"main", + target_profile:"cs_6_0", + ), + ], +) diff --git a/naga/tests/out/hlsl/subgroup-operations.hlsl b/naga/tests/out/hlsl/subgroup-operations.hlsl new file mode 100644 index 0000000000..839b1fa6b2 --- /dev/null +++ b/naga/tests/out/hlsl/subgroup-operations.hlsl @@ -0,0 +1,38 @@ +struct Structure { + uint num_subgroups; + uint subgroup_size; +}; + +struct ComputeInput_main { + uint __local_invocation_index : SV_GroupIndex; +}; + +[numthreads(1, 1, 1)] +void main(ComputeInput_main computeinput_main) +{ + Structure sizes = { (1u + WaveGetLaneCount() - 1u) / WaveGetLaneCount(), WaveGetLaneCount() }; + uint subgroup_id = computeinput_main.__local_invocation_index / WaveGetLaneCount(); + uint subgroup_invocation_id = WaveGetLaneIndex(); + const uint4 _e7 = WaveActiveBallot(((subgroup_invocation_id & 1u) == 1u)); + const uint4 _e8 = WaveActiveBallot(true); + const bool _e11 = WaveActiveAllTrue((subgroup_invocation_id != 0u)); + const bool _e14 = WaveActiveAnyTrue((subgroup_invocation_id == 0u)); + const uint _e15 = WaveActiveSum(subgroup_invocation_id); + const uint _e16 = WaveActiveProduct(subgroup_invocation_id); + const uint _e17 = WaveActiveMin(subgroup_invocation_id); + const uint _e18 = WaveActiveMax(subgroup_invocation_id); + const uint _e19 = WaveActiveBitAnd(subgroup_invocation_id); + const uint _e20 = WaveActiveBitOr(subgroup_invocation_id); + const uint _e21 = WaveActiveBitXor(subgroup_invocation_id); + const uint _e22 = WavePrefixSum(subgroup_invocation_id); + const uint _e23 = WavePrefixProduct(subgroup_invocation_id); + const uint _e24 = subgroup_invocation_id + WavePrefixSum(subgroup_invocation_id); + const uint _e25 = subgroup_invocation_id * WavePrefixProduct(subgroup_invocation_id); + const uint _e26 = WaveReadLaneFirst(subgroup_invocation_id); + const uint _e28 = WaveReadLaneAt(subgroup_invocation_id, 4u); + const uint _e33 = WaveReadLaneAt(subgroup_invocation_id, ((sizes.subgroup_size - 1u) - subgroup_invocation_id)); + const uint _e35 = WaveReadLaneAt(subgroup_invocation_id, WaveGetLaneIndex() + 1u); + const uint _e37 = WaveReadLaneAt(subgroup_invocation_id, WaveGetLaneIndex() - 1u); + const uint _e41 = WaveReadLaneAt(subgroup_invocation_id, WaveGetLaneIndex() ^ (sizes.subgroup_size - 1u)); + return; +} diff --git a/naga/tests/out/hlsl/subgroup-operations.ron b/naga/tests/out/hlsl/subgroup-operations.ron new file mode 100644 index 0000000000..b973fe3da1 --- /dev/null +++ b/naga/tests/out/hlsl/subgroup-operations.ron @@ -0,0 +1,12 @@ +( + vertex:[ + ], + fragment:[ + ], + compute:[ + ( + entry_point:"main", + target_profile:"cs_6_0", + ), + ], +) diff --git a/naga/tests/out/hlsl/unnamed-gl-per-vertex.hlsl b/naga/tests/out/hlsl/unnamed-gl-per-vertex.hlsl index 8270ad4e5d..f0f330e7cc 100644 --- a/naga/tests/out/hlsl/unnamed-gl-per-vertex.hlsl +++ b/naga/tests/out/hlsl/unnamed-gl-per-vertex.hlsl @@ -15,7 +15,12 @@ type_4 Constructtype_4(float4 arg0, float arg1, float arg2[1], float arg3[1]) { return ret; } -static type_4 global = Constructtype_4(float4(0.0, 0.0, 0.0, 1.0), 1.0, (float[1])0, (float[1])0); +typedef float ret_ZeroValuearray1_float_[1]; +ret_ZeroValuearray1_float_ ZeroValuearray1_float_() { + return (float[1])0; +} + 
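+// [editor's note: not part of the upstream patch] Across these HLSL snapshots, inline zero-value casts such as (float[1])0 are now wrapped in named ZeroValue* helper functions like the one above. The produced value is unchanged; wrapping the cast in a function appears to be a workaround for expression positions where the raw cast syntax is not accepted, though the snapshots themselves do not state the motivation.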
+static type_4 global = Constructtype_4(float4(0.0, 0.0, 0.0, 1.0), 1.0, ZeroValuearray1_float_(), ZeroValuearray1_float_()); static int global_1 = (int)0; void function() diff --git a/naga/tests/out/ir/overrides.compact.ron b/naga/tests/out/ir/overrides.compact.ron index bc25af3bce..111a134890 100644 --- a/naga/tests/out/ir/overrides.compact.ron +++ b/naga/tests/out/ir/overrides.compact.ron @@ -73,6 +73,13 @@ ty: 2, init: Some(10), ), + ( + name: Some("store_override"), + space: Private, + binding: None, + ty: 2, + init: None, + ), ], global_expressions: [ Literal(Bool(true)), @@ -147,6 +154,8 @@ right: 9, ), LocalVariable(3), + GlobalVariable(2), + Override(3), ], named_expressions: { 5: "a", @@ -176,6 +185,10 @@ pointer: 11, value: 10, ), + Store( + pointer: 12, + value: 13, + ), Return( value: None, ), diff --git a/naga/tests/out/ir/overrides.ron b/naga/tests/out/ir/overrides.ron index bc25af3bce..111a134890 100644 --- a/naga/tests/out/ir/overrides.ron +++ b/naga/tests/out/ir/overrides.ron @@ -73,6 +73,13 @@ ty: 2, init: Some(10), ), + ( + name: Some("store_override"), + space: Private, + binding: None, + ty: 2, + init: None, + ), ], global_expressions: [ Literal(Bool(true)), @@ -147,6 +154,8 @@ right: 9, ), LocalVariable(3), + GlobalVariable(2), + Override(3), ], named_expressions: { 5: "a", @@ -176,6 +185,10 @@ pointer: 11, value: 10, ), + Store( + pointer: 12, + value: 13, + ), Return( value: None, ), diff --git a/naga/tests/out/msl/overrides.msl b/naga/tests/out/msl/overrides.msl index d9e95d0704..d3638dd4cd 100644 --- a/naga/tests/out/msl/overrides.msl +++ b/naga/tests/out/msl/overrides.msl @@ -15,11 +15,13 @@ constant float inferred_f32_ = 2.718; kernel void main_( ) { float gain_x_10_ = 11.0; + float store_override = {}; float t = 23.0; bool x = {}; float gain_x_100_ = {}; x = true; float _e9 = gain_x_10_; gain_x_100_ = _e9 * 10.0; + store_override = gain; return; } diff --git a/naga/tests/out/msl/subgroup-operations-s.msl b/naga/tests/out/msl/subgroup-operations-s.msl new file mode 100644 index 0000000000..3a6f30231c --- /dev/null +++ b/naga/tests/out/msl/subgroup-operations-s.msl @@ -0,0 +1,55 @@ +// language: metal2.4 +#include <metal_stdlib> +#include <simd/simd.h> + +using metal::uint; + + +void main_1( + thread uint& subgroup_size_1, + thread uint& subgroup_invocation_id_1 +) { + uint _e5 = subgroup_size_1; + uint _e6 = subgroup_invocation_id_1; + metal::uint4 unnamed = uint4((uint64_t)metal::simd_ballot((_e6 & 1u) == 1u), 0, 0, 0); + metal::uint4 unnamed_1 = uint4((uint64_t)metal::simd_ballot(true), 0, 0, 0); + bool unnamed_2 = metal::simd_all(_e6 != 0u); + bool unnamed_3 = metal::simd_any(_e6 == 0u); + uint unnamed_4 = metal::simd_sum(_e6); + uint unnamed_5 = metal::simd_product(_e6); + uint unnamed_6 = metal::simd_min(_e6); + uint unnamed_7 = metal::simd_max(_e6); + uint unnamed_8 = metal::simd_and(_e6); + uint unnamed_9 = metal::simd_or(_e6); + uint unnamed_10 = metal::simd_xor(_e6); + uint unnamed_11 = metal::simd_prefix_exclusive_sum(_e6); + uint unnamed_12 = metal::simd_prefix_exclusive_product(_e6); + uint unnamed_13 = metal::simd_prefix_inclusive_sum(_e6); + uint unnamed_14 = metal::simd_prefix_inclusive_product(_e6); + uint unnamed_15 = metal::simd_broadcast_first(_e6); + uint unnamed_16 = metal::simd_broadcast(_e6, 4u); + uint unnamed_17 = metal::simd_shuffle(_e6, (_e5 - 1u) - _e6); + uint unnamed_18 = metal::simd_shuffle_down(_e6, 1u); + uint unnamed_19 = metal::simd_shuffle_up(_e6, 1u); + uint unnamed_20 = metal::simd_shuffle_xor(_e6, _e5 - 1u); + return; +} + +struct main_Input { +};
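+// [editor's note: not part of the upstream patch] In the kernel below, the WGSL builtins num_subgroups, subgroup_id, subgroup_size and subgroup_invocation_id arrive through Metal's [[simdgroups_per_threadgroup]], [[simdgroup_index_in_threadgroup]], [[threads_per_simdgroup]] and [[thread_index_in_simdgroup]] attributes and are copied into the private globals that main_1 reads.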
+kernel void main_( + uint num_subgroups [[simdgroups_per_threadgroup]] +, uint subgroup_id [[simdgroup_index_in_threadgroup]] +, uint subgroup_size [[threads_per_simdgroup]] +, uint subgroup_invocation_id [[thread_index_in_simdgroup]] +) { + uint num_subgroups_1 = {}; + uint subgroup_id_1 = {}; + uint subgroup_size_1 = {}; + uint subgroup_invocation_id_1 = {}; + num_subgroups_1 = num_subgroups; + subgroup_id_1 = subgroup_id; + subgroup_size_1 = subgroup_size; + subgroup_invocation_id_1 = subgroup_invocation_id; + main_1(subgroup_size_1, subgroup_invocation_id_1); +} diff --git a/naga/tests/out/msl/subgroup-operations.msl b/naga/tests/out/msl/subgroup-operations.msl new file mode 100644 index 0000000000..980dea47f8 --- /dev/null +++ b/naga/tests/out/msl/subgroup-operations.msl @@ -0,0 +1,44 @@ +// language: metal2.4 +#include <metal_stdlib> +#include <simd/simd.h> + +using metal::uint; + +struct Structure { + uint num_subgroups; + uint subgroup_size; +}; + +struct main_Input { +}; +kernel void main_( + uint num_subgroups [[simdgroups_per_threadgroup]] +, uint subgroup_size [[threads_per_simdgroup]] +, uint subgroup_id [[simdgroup_index_in_threadgroup]] +, uint subgroup_invocation_id [[thread_index_in_simdgroup]] +) { + const Structure sizes = { num_subgroups, subgroup_size }; + metal::simdgroup_barrier(metal::mem_flags::mem_threadgroup); + metal::uint4 unnamed = uint4((uint64_t)metal::simd_ballot((subgroup_invocation_id & 1u) == 1u), 0, 0, 0); + metal::uint4 unnamed_1 = uint4((uint64_t)metal::simd_ballot(true), 0, 0, 0); + bool unnamed_2 = metal::simd_all(subgroup_invocation_id != 0u); + bool unnamed_3 = metal::simd_any(subgroup_invocation_id == 0u); + uint unnamed_4 = metal::simd_sum(subgroup_invocation_id); + uint unnamed_5 = metal::simd_product(subgroup_invocation_id); + uint unnamed_6 = metal::simd_min(subgroup_invocation_id); + uint unnamed_7 = metal::simd_max(subgroup_invocation_id); + uint unnamed_8 = metal::simd_and(subgroup_invocation_id); + uint unnamed_9 = metal::simd_or(subgroup_invocation_id); + uint unnamed_10 = metal::simd_xor(subgroup_invocation_id); + uint unnamed_11 = metal::simd_prefix_exclusive_sum(subgroup_invocation_id); + uint unnamed_12 = metal::simd_prefix_exclusive_product(subgroup_invocation_id); + uint unnamed_13 = metal::simd_prefix_inclusive_sum(subgroup_invocation_id); + uint unnamed_14 = metal::simd_prefix_inclusive_product(subgroup_invocation_id); + uint unnamed_15 = metal::simd_broadcast_first(subgroup_invocation_id); + uint unnamed_16 = metal::simd_broadcast(subgroup_invocation_id, 4u); + uint unnamed_17 = metal::simd_shuffle(subgroup_invocation_id, (sizes.subgroup_size - 1u) - subgroup_invocation_id); + uint unnamed_18 = metal::simd_shuffle_down(subgroup_invocation_id, 1u); + uint unnamed_19 = metal::simd_shuffle_up(subgroup_invocation_id, 1u); + uint unnamed_20 = metal::simd_shuffle_xor(subgroup_invocation_id, sizes.subgroup_size - 1u); + return; +} diff --git a/naga/tests/out/spv/overrides.main.spvasm b/naga/tests/out/spv/overrides.main.spvasm index d21eb7c674..5c748a01b2 100644 --- a/naga/tests/out/spv/overrides.main.spvasm +++ b/naga/tests/out/spv/overrides.main.spvasm @@ -1,12 +1,12 @@ ; SPIR-V ; Version: 1.0 ; Generator: rspirv -; Bound: 31 +; Bound: 33 OpCapability Shader %1 = OpExtInstImport "GLSL.std.450" OpMemoryModel Logical GLSL450 -OpEntryPoint GLCompute %18 "main" -OpExecutionMode %18 LocalSize 1 1 1 +OpEntryPoint GLCompute %20 "main" +OpExecutionMode %20 LocalSize 1 1 1 %2 = OpTypeVoid %3 = OpTypeBool %4 = OpTypeFloat 32 @@ -22,22 +22,25 @@ OpExecutionMode %18
LocalSize 1 1 1 %14 = OpConstant %4 11.0 %16 = OpTypePointer Private %4 %15 = OpVariable %16 Private %14 -%19 = OpTypeFunction %2 -%20 = OpConstant %4 23.0 -%22 = OpTypePointer Function %4 -%24 = OpTypePointer Function %3 -%25 = OpConstantNull %3 -%27 = OpConstantNull %4 -%18 = OpFunction %2 None %19 -%17 = OpLabel -%21 = OpVariable %22 Function %20 -%23 = OpVariable %24 Function %25 -%26 = OpVariable %22 Function %27 -OpBranch %28 -%28 = OpLabel -OpStore %23 %5 -%29 = OpLoad %4 %15 -%30 = OpFMul %4 %29 %13 -OpStore %26 %30 +%18 = OpConstantNull %4 +%17 = OpVariable %16 Private %18 +%21 = OpTypeFunction %2 +%22 = OpConstant %4 23.0 +%24 = OpTypePointer Function %4 +%26 = OpTypePointer Function %3 +%27 = OpConstantNull %3 +%29 = OpConstantNull %4 +%20 = OpFunction %2 None %21 +%19 = OpLabel +%23 = OpVariable %24 Function %22 +%25 = OpVariable %26 Function %27 +%28 = OpVariable %24 Function %29 +OpBranch %30 +%30 = OpLabel +OpStore %25 %5 +%31 = OpLoad %4 %15 +%32 = OpFMul %4 %31 %13 +OpStore %28 %32 +OpStore %17 %9 OpReturn OpFunctionEnd \ No newline at end of file diff --git a/naga/tests/out/spv/subgroup-operations.spvasm b/naga/tests/out/spv/subgroup-operations.spvasm new file mode 100644 index 0000000000..fb60aae5bc --- /dev/null +++ b/naga/tests/out/spv/subgroup-operations.spvasm @@ -0,0 +1,81 @@ +; SPIR-V +; Version: 1.3 +; Generator: rspirv +; Bound: 58 +OpCapability Shader +OpCapability GroupNonUniform +OpCapability GroupNonUniformBallot +OpCapability GroupNonUniformVote +OpCapability GroupNonUniformArithmetic +OpCapability GroupNonUniformShuffle +OpCapability GroupNonUniformShuffleRelative +%1 = OpExtInstImport "GLSL.std.450" +OpMemoryModel Logical GLSL450 +OpEntryPoint GLCompute %17 "main" %8 %11 %13 %15 +OpExecutionMode %17 LocalSize 1 1 1 +OpMemberDecorate %4 0 Offset 0 +OpMemberDecorate %4 1 Offset 4 +OpDecorate %8 BuiltIn NumSubgroups +OpDecorate %11 BuiltIn SubgroupSize +OpDecorate %13 BuiltIn SubgroupId +OpDecorate %15 BuiltIn SubgroupLocalInvocationId +%2 = OpTypeVoid +%3 = OpTypeInt 32 0 +%4 = OpTypeStruct %3 %3 +%5 = OpTypeBool +%9 = OpTypePointer Input %3 +%8 = OpVariable %9 Input +%11 = OpVariable %9 Input +%13 = OpVariable %9 Input +%15 = OpVariable %9 Input +%18 = OpTypeFunction %2 +%19 = OpConstant %3 1 +%20 = OpConstant %3 0 +%21 = OpConstant %3 4 +%23 = OpConstant %3 3 +%24 = OpConstant %3 2 +%25 = OpConstant %3 8 +%28 = OpTypeVector %3 4 +%30 = OpConstantTrue %5 +%17 = OpFunction %2 None %18 +%6 = OpLabel +%10 = OpLoad %3 %8 +%12 = OpLoad %3 %11 +%7 = OpCompositeConstruct %4 %10 %12 +%14 = OpLoad %3 %13 +%16 = OpLoad %3 %15 +OpBranch %22 +%22 = OpLabel +OpControlBarrier %23 %24 %25 +%26 = OpBitwiseAnd %3 %16 %19 +%27 = OpIEqual %5 %26 %19 +%29 = OpGroupNonUniformBallot %28 %23 %27 +%31 = OpGroupNonUniformBallot %28 %23 %30 +%32 = OpINotEqual %5 %16 %20 +%33 = OpGroupNonUniformAll %5 %23 %32 +%34 = OpIEqual %5 %16 %20 +%35 = OpGroupNonUniformAny %5 %23 %34 +%36 = OpGroupNonUniformIAdd %3 %23 Reduce %16 +%37 = OpGroupNonUniformIMul %3 %23 Reduce %16 +%38 = OpGroupNonUniformUMin %3 %23 Reduce %16 +%39 = OpGroupNonUniformUMax %3 %23 Reduce %16 +%40 = OpGroupNonUniformBitwiseAnd %3 %23 Reduce %16 +%41 = OpGroupNonUniformBitwiseOr %3 %23 Reduce %16 +%42 = OpGroupNonUniformBitwiseXor %3 %23 Reduce %16 +%43 = OpGroupNonUniformIAdd %3 %23 ExclusiveScan %16 +%44 = OpGroupNonUniformIMul %3 %23 ExclusiveScan %16 +%45 = OpGroupNonUniformIAdd %3 %23 InclusiveScan %16 +%46 = OpGroupNonUniformIMul %3 %23 InclusiveScan %16 +%47 = OpGroupNonUniformBroadcastFirst %3 %23 %16 +%48 = 
OpGroupNonUniformShuffle %3 %23 %16 %21 +%49 = OpCompositeExtract %3 %7 1 +%50 = OpISub %3 %49 %19 +%51 = OpISub %3 %50 %16 +%52 = OpGroupNonUniformShuffle %3 %23 %16 %51 +%53 = OpGroupNonUniformShuffleDown %3 %23 %16 %19 +%54 = OpGroupNonUniformShuffleUp %3 %23 %16 %19 +%55 = OpCompositeExtract %3 %7 1 +%56 = OpISub %3 %55 %19 +%57 = OpGroupNonUniformShuffleXor %3 %23 %16 %56 +OpReturn +OpFunctionEnd \ No newline at end of file diff --git a/naga/tests/out/wgsl/subgroup-operations-s.wgsl b/naga/tests/out/wgsl/subgroup-operations-s.wgsl new file mode 100644 index 0000000000..c61e2dfc57 --- /dev/null +++ b/naga/tests/out/wgsl/subgroup-operations-s.wgsl @@ -0,0 +1,40 @@ +var<private> num_subgroups_1: u32; +var<private> subgroup_id_1: u32; +var<private> subgroup_size_1: u32; +var<private> subgroup_invocation_id_1: u32; + +fn main_1() { + let _e5 = subgroup_size_1; + let _e6 = subgroup_invocation_id_1; + let _e9 = subgroupBallot(((_e6 & 1u) == 1u)); + let _e10 = subgroupBallot(); + let _e12 = subgroupAll((_e6 != 0u)); + let _e14 = subgroupAny((_e6 == 0u)); + let _e15 = subgroupAdd(_e6); + let _e16 = subgroupMul(_e6); + let _e17 = subgroupMin(_e6); + let _e18 = subgroupMax(_e6); + let _e19 = subgroupAnd(_e6); + let _e20 = subgroupOr(_e6); + let _e21 = subgroupXor(_e6); + let _e22 = subgroupExclusiveAdd(_e6); + let _e23 = subgroupExclusiveMul(_e6); + let _e24 = subgroupInclusiveAdd(_e6); + let _e25 = subgroupInclusiveMul(_e6); + let _e26 = subgroupBroadcastFirst(_e6); + let _e27 = subgroupBroadcast(_e6, 4u); + let _e30 = subgroupShuffle(_e6, ((_e5 - 1u) - _e6)); + let _e31 = subgroupShuffleDown(_e6, 1u); + let _e32 = subgroupShuffleUp(_e6, 1u); + let _e34 = subgroupShuffleXor(_e6, (_e5 - 1u)); + return; +} + +@compute @workgroup_size(1, 1, 1) +fn main(@builtin(num_subgroups) num_subgroups: u32, @builtin(subgroup_id) subgroup_id: u32, @builtin(subgroup_size) subgroup_size: u32, @builtin(subgroup_invocation_id) subgroup_invocation_id: u32) { + num_subgroups_1 = num_subgroups; + subgroup_id_1 = subgroup_id; + subgroup_size_1 = subgroup_size; + subgroup_invocation_id_1 = subgroup_invocation_id; + main_1(); +} diff --git a/naga/tests/out/wgsl/subgroup-operations.wgsl b/naga/tests/out/wgsl/subgroup-operations.wgsl new file mode 100644 index 0000000000..25f713b357 --- /dev/null +++ b/naga/tests/out/wgsl/subgroup-operations.wgsl @@ -0,0 +1,31 @@ +struct Structure { + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_size) subgroup_size: u32, +} + +@compute @workgroup_size(1, 1, 1) +fn main(sizes: Structure, @builtin(subgroup_id) subgroup_id: u32, @builtin(subgroup_invocation_id) subgroup_invocation_id: u32) { + subgroupBarrier(); + let _e7 = subgroupBallot(((subgroup_invocation_id & 1u) == 1u)); + let _e8 = subgroupBallot(); + let _e11 = subgroupAll((subgroup_invocation_id != 0u)); + let _e14 = subgroupAny((subgroup_invocation_id == 0u)); + let _e15 = subgroupAdd(subgroup_invocation_id); + let _e16 = subgroupMul(subgroup_invocation_id); + let _e17 = subgroupMin(subgroup_invocation_id); + let _e18 = subgroupMax(subgroup_invocation_id); + let _e19 = subgroupAnd(subgroup_invocation_id); + let _e20 = subgroupOr(subgroup_invocation_id); + let _e21 = subgroupXor(subgroup_invocation_id); + let _e22 = subgroupExclusiveAdd(subgroup_invocation_id); + let _e23 = subgroupExclusiveMul(subgroup_invocation_id); + let _e24 = subgroupInclusiveAdd(subgroup_invocation_id); + let _e25 = subgroupInclusiveMul(subgroup_invocation_id); + let _e26 = subgroupBroadcastFirst(subgroup_invocation_id); + let _e28 = subgroupBroadcast(subgroup_invocation_id,
4u); + let _e33 = subgroupShuffle(subgroup_invocation_id, ((sizes.subgroup_size - 1u) - subgroup_invocation_id)); + let _e35 = subgroupShuffleDown(subgroup_invocation_id, 1u); + let _e37 = subgroupShuffleUp(subgroup_invocation_id, 1u); + let _e41 = subgroupShuffleXor(subgroup_invocation_id, (sizes.subgroup_size - 1u)); + return; +} diff --git a/naga/tests/snapshots.rs b/naga/tests/snapshots.rs index 3e45faeb16..ee775a3e63 100644 --- a/naga/tests/snapshots.rs +++ b/naga/tests/snapshots.rs @@ -269,10 +269,18 @@ fn check_targets( let params = input.read_parameters(); let name = &input.file_name; - let capabilities = if params.god_mode { - naga::valid::Capabilities::all() + let (capabilities, subgroup_stages, subgroup_operations) = if params.god_mode { + ( + naga::valid::Capabilities::all(), + naga::valid::ShaderStages::all(), + naga::valid::SubgroupOperationSet::all(), + ) } else { - naga::valid::Capabilities::default() + ( + naga::valid::Capabilities::default(), + naga::valid::ShaderStages::empty(), + naga::valid::SubgroupOperationSet::empty(), + ) }; #[cfg(feature = "serialize")] @@ -285,6 +293,8 @@ fn check_targets( } let info = naga::valid::Validator::new(naga::valid::ValidationFlags::all(), capabilities) + .subgroup_stages(subgroup_stages) + .subgroup_operations(subgroup_operations) .validate(module) .unwrap_or_else(|err| { panic!( @@ -308,6 +318,8 @@ fn check_targets( } naga::valid::Validator::new(naga::valid::ValidationFlags::all(), capabilities) + .subgroup_stages(subgroup_stages) + .subgroup_operations(subgroup_operations) .validate(module) .unwrap_or_else(|err| { panic!( @@ -850,6 +862,10 @@ fn convert_wgsl() { "int64", Targets::SPIRV | Targets::HLSL | Targets::WGSL | Targets::METAL, ), + ( + "subgroup-operations", + Targets::SPIRV | Targets::METAL | Targets::GLSL | Targets::HLSL | Targets::WGSL, + ), ( "overrides", Targets::IR @@ -957,6 +973,12 @@ fn convert_spv_all() { ); convert_spv("builtin-accessed-outside-entrypoint", true, Targets::WGSL); convert_spv("spec-constants", true, Targets::IR); + convert_spv("spec-constants-issue-5598", true, Targets::GLSL); + convert_spv( + "subgroup-operations-s", + false, + Targets::METAL | Targets::GLSL | Targets::HLSL | Targets::WGSL, + ); } #[cfg(feature = "glsl-in")] diff --git a/player/src/lib.rs b/player/src/lib.rs index 0ea491ea20..5777f4d7a6 100644 --- a/player/src/lib.rs +++ b/player/src/lib.rs @@ -99,7 +99,7 @@ impl GlobalPlay for wgc::global::Global { base, timestamp_writes, } => { - self.command_encoder_run_compute_pass_impl::<A>( + self.command_encoder_run_compute_pass_with_unresolved_commands::<A>( encoder, base.as_ref(), timestamp_writes.as_ref(), diff --git a/player/tests/data/bind-group.ron b/player/tests/data/bind-group.ron index 92415e4ff3..9da7abe097 100644 --- a/player/tests/data/bind-group.ron +++ b/player/tests/data/bind-group.ron @@ -58,6 +58,7 @@ module: Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), ), ), diff --git a/player/tests/data/pipeline-statistics-query.ron b/player/tests/data/pipeline-statistics-query.ron index 3c672f4e56..f0f96d42cb 100644 --- a/player/tests/data/pipeline-statistics-query.ron +++ b/player/tests/data/pipeline-statistics-query.ron @@ -31,6 +31,7 @@ module: Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), ), ), diff --git a/player/tests/data/quad.ron b/player/tests/data/quad.ron index 9d6b4a25f6..1a8b4028bb 100644 --- a/player/tests/data/quad.ron +++ b/player/tests/data/quad.ron @@ -59,6 +59,7 @@ module:
Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), buffers: [], ), @@ -67,6 +68,7 @@ module: Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), targets: [ Some(( diff --git a/player/tests/data/zero-init-buffer.ron b/player/tests/data/zero-init-buffer.ron index 5697a2555e..1ce7924ddd 100644 --- a/player/tests/data/zero-init-buffer.ron +++ b/player/tests/data/zero-init-buffer.ron @@ -135,6 +135,7 @@ module: Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), ), ), diff --git a/player/tests/data/zero-init-texture-binding.ron b/player/tests/data/zero-init-texture-binding.ron index 340cb0cfa2..2aeaf22c7d 100644 --- a/player/tests/data/zero-init-texture-binding.ron +++ b/player/tests/data/zero-init-texture-binding.ron @@ -136,6 +136,7 @@ module: Id(0, 1, Empty), entry_point: None, constants: {}, + zero_initialize_workgroup_memory: true, ), ), ), diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000000..6afa344709 --- /dev/null +++ b/shell.nix @@ -0,0 +1,69 @@ +# This file is only relevant for Nix and NixOS users. +# What's actually meant by "Nix" here is not UNIX, but the *package manager* Nix, see https://nixos.org/. +# If you are +# on macOS (and not using nix-darwin) +# or on Windows (and not using Nix in WSL), +# you can safely ignore this file. +# +# Otherwise, if you *do* use Nix the package manager, +# this file declares +# common dependencies +# and some nice tools +# which you'll most likely need when working with wgpu. +# Feel free to copy it into your own project if deemed useful. +# +# To use this file, just run `nix-shell` in this folder, +# which will drop you into a shell +# with all the deps needed for building wgpu available. +# +# Or if you're using direnv (https://direnv.net/), +# use `direnv allow` to automatically use this file +# if you're navigating into this or a subfolder. + +{ pkgs ?
import <nixpkgs> {} }: + +pkgs.mkShell rec { + buildInputs = with pkgs; [ + # necessary for building wgpu in 3rd party packages (in most cases) + libxkbcommon + wayland xorg.libX11 xorg.libXcursor xorg.libXrandr xorg.libXi + alsa-lib + fontconfig freetype + shaderc directx-shader-compiler + pkg-config cmake + mold # could use any linker, needed for rustix (but mold is fast) + + libGL + vulkan-headers vulkan-loader + vulkan-tools vulkan-tools-lunarg + vulkan-extension-layer + vulkan-validation-layers # don't need them *strictly* but immensely helpful + + # necessary for developing (all of) wgpu itself + cargo-nextest cargo-fuzz + + # nice for developing wgpu itself + typos + + # if you don't already have rust installed through other means, + # this shell.nix can do that for you with this below + yq # for tomlq below + rustup + + # nice tools + gdb rr + evcxr + valgrind + renderdoc + ]; + + shellHook = '' + export RUSTC_VERSION="$(tomlq -r .toolchain.channel rust-toolchain.toml)" + export PATH="$PATH:''${CARGO_HOME:-~/.cargo}/bin" + export PATH="$PATH:''${RUSTUP_HOME:-~/.rustup/toolchains/$RUSTC_VERSION-x86_64-unknown-linux/bin}" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${builtins.toString (pkgs.lib.makeLibraryPath buildInputs)}"; + + rustup default $RUSTC_VERSION + rustup component add rust-src rust-analyzer + ''; +} diff --git a/tests/src/config.rs b/tests/src/config.rs index fa96adbc1d..62d3e56091 100644 --- a/tests/src/config.rs +++ b/tests/src/config.rs @@ -1,4 +1,4 @@ -use std::{future::Future, pin::Pin, sync::Arc}; +use std::{future::Future, panic::Location, pin::Pin, sync::Arc}; use crate::{TestParameters, TestingContext}; @@ -26,14 +26,17 @@ cfg_if::cfg_if! { #[derive(Clone)] pub struct GpuTestConfiguration { pub(crate) name: String, + pub(crate) location: &'static Location<'static>, pub(crate) params: TestParameters, pub(crate) test: Option<RunTestAsync>, } impl GpuTestConfiguration { + #[track_caller] pub fn new() -> Self { Self { name: String::new(), + location: Location::caller(), params: TestParameters::default(), test: None, } diff --git a/tests/src/image.rs b/tests/src/image.rs index 98310233c9..8996f361cd 100644 --- a/tests/src/image.rs +++ b/tests/src/image.rs @@ -369,7 +369,7 @@ fn copy_via_compute( layout: Some(&pll), module: &sm, entry_point: "copy_texture_to_buffer", - constants: &Default::default(), + compilation_options: Default::default(), }); { diff --git a/tests/src/run.rs b/tests/src/run.rs index f56651b574..82ddb93399 100644 --- a/tests/src/run.rs +++ b/tests/src/run.rs @@ -116,7 +116,10 @@ pub async fn execute_test( // The call to matches_failure will log. if expectations_match_failures(&test_info.failures, failures) == ExpectationMatchResult::Panic { - panic!(); + panic!( + "{}: test {:?} did not behave as expected", + config.location, config.name + ); } // Print the name of the test.
log::info!("TEST FINISHED: {}", config.name); diff --git a/tests/tests/bgra8unorm_storage.rs b/tests/tests/bgra8unorm_storage.rs index c3913e5df8..17082a9ed4 100644 --- a/tests/tests/bgra8unorm_storage.rs +++ b/tests/tests/bgra8unorm_storage.rs @@ -96,7 +96,7 @@ static BGRA8_UNORM_STORAGE: GpuTestConfiguration = GpuTestConfiguration::new() label: None, layout: Some(&pl), entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), module: &module, }); diff --git a/tests/tests/bind_group_layout_dedup.rs b/tests/tests/bind_group_layout_dedup.rs index 519cfbda29..3466e1e244 100644 --- a/tests/tests/bind_group_layout_dedup.rs +++ b/tests/tests/bind_group_layout_dedup.rs @@ -90,7 +90,7 @@ async fn bgl_dedupe(ctx: TestingContext) { layout: Some(&pipeline_layout), module: &module, entry_point: "no_resources", - constants: &Default::default(), + compilation_options: Default::default(), }; let pipeline = ctx.device.create_compute_pipeline(&desc); @@ -219,7 +219,7 @@ fn bgl_dedupe_with_dropped_user_handle(ctx: TestingContext) { layout: Some(&pipeline_layout), module: &module, entry_point: "no_resources", - constants: &Default::default(), + compilation_options: Default::default(), }); let mut encoder = ctx.device.create_command_encoder(&Default::default()); @@ -265,7 +265,7 @@ fn bgl_dedupe_derived(ctx: TestingContext) { layout: None, module: &module, entry_point: "resources", - constants: &Default::default(), + compilation_options: Default::default(), }); // We create two bind groups, pulling the bind_group_layout from the pipeline each time. @@ -336,7 +336,7 @@ fn separate_programs_have_incompatible_derived_bgls(ctx: TestingContext) { layout: None, module: &module, entry_point: "resources", - constants: &Default::default(), + compilation_options: Default::default(), }; // Create two pipelines, creating a BG from the second. 
let pipeline1 = ctx.device.create_compute_pipeline(&desc); @@ -398,7 +398,7 @@ fn derived_bgls_incompatible_with_regular_bgls(ctx: TestingContext) { layout: None, module: &module, entry_point: "resources", - constants: &Default::default(), + compilation_options: Default::default(), }); // Create a matching BGL diff --git a/tests/tests/buffer.rs b/tests/tests/buffer.rs index 1622995c35..0693877d00 100644 --- a/tests/tests/buffer.rs +++ b/tests/tests/buffer.rs @@ -224,7 +224,7 @@ static MINIMUM_BUFFER_BINDING_SIZE_LAYOUT: GpuTestConfiguration = GpuTestConfigu layout: Some(&pipeline_layout), module: &shader_module, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); }); }); @@ -293,7 +293,7 @@ static MINIMUM_BUFFER_BINDING_SIZE_DISPATCH: GpuTestConfiguration = GpuTestConfi layout: Some(&pipeline_layout), module: &shader_module, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); let buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { diff --git a/tests/tests/device.rs b/tests/tests/device.rs index 82e3f71a1c..649a850fa9 100644 --- a/tests/tests/device.rs +++ b/tests/tests/device.rs @@ -480,7 +480,7 @@ static DEVICE_DESTROY_THEN_MORE: GpuTestConfiguration = GpuTestConfiguration::ne vertex: wgpu::VertexState { module: &shader_module, entry_point: "", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, primitive: wgpu::PrimitiveState::default(), @@ -499,7 +499,7 @@ static DEVICE_DESTROY_THEN_MORE: GpuTestConfiguration = GpuTestConfiguration::ne layout: None, module: &shader_module, entry_point: "", - constants: &Default::default(), + compilation_options: Default::default(), }); }); @@ -736,7 +736,7 @@ fn vs_main() -> @builtin(position) vec4<f32> { fragment: Some(wgpu::FragmentState { module: &trivial_shaders_with_some_reversed_bindings, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgt::ColorTargetState { format: wgt::TextureFormat::Bgra8Unorm, blend: None, @@ -750,7 +750,7 @@ fn vs_main() -> @builtin(position) vec4<f32> { vertex: wgpu::VertexState { module: &trivial_shaders_with_some_reversed_bindings, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, primitive: wgt::PrimitiveState::default(), diff --git a/tests/tests/mem_leaks.rs b/tests/tests/mem_leaks.rs index 949b4d96ce..7002ebabe0 100644 --- a/tests/tests/mem_leaks.rs +++ b/tests/tests/mem_leaks.rs @@ -97,7 +97,7 @@ async fn draw_test_with_reports( buffers: &[], module: &shader, entry_point: "vs_main_builtin", - constants: &Default::default(), + compilation_options: Default::default(), }, primitive: wgpu::PrimitiveState::default(), depth_stencil: None, @@ -105,7 +105,7 @@ async fn draw_test_with_reports( fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: wgpu::TextureFormat::Rgba8Unorm, blend: None, diff --git a/tests/tests/nv12_texture/mod.rs b/tests/tests/nv12_texture/mod.rs index 0f4ba16f25..70ee849831 100644 --- a/tests/tests/nv12_texture/mod.rs +++ b/tests/tests/nv12_texture/mod.rs @@ -24,13 +24,13 @@ static NV12_TEXTURE_CREATION_SAMPLING: GpuTestConfiguration = GpuTestConfigurati vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options:
Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(target_format.into())], }), primitive: wgpu::PrimitiveState { diff --git a/tests/tests/occlusion_query/mod.rs b/tests/tests/occlusion_query/mod.rs index 2db035bfb2..1a68ecf79d 100644 --- a/tests/tests/occlusion_query/mod.rs +++ b/tests/tests/occlusion_query/mod.rs @@ -37,7 +37,7 @@ static OCCLUSION_QUERY: GpuTestConfiguration = GpuTestConfiguration::new() vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: None, diff --git a/tests/tests/partially_bounded_arrays/mod.rs b/tests/tests/partially_bounded_arrays/mod.rs index b93e900a9c..11eee5b207 100644 --- a/tests/tests/partially_bounded_arrays/mod.rs +++ b/tests/tests/partially_bounded_arrays/mod.rs @@ -69,7 +69,7 @@ static PARTIALLY_BOUNDED_ARRAY: GpuTestConfiguration = GpuTestConfiguration::new layout: Some(&pipeline_layout), module: &cs_module, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { diff --git a/tests/tests/pipeline.rs b/tests/tests/pipeline.rs index c8814e25f7..a07e158a53 100644 --- a/tests/tests/pipeline.rs +++ b/tests/tests/pipeline.rs @@ -28,7 +28,7 @@ static PIPELINE_DEFAULT_LAYOUT_BAD_MODULE: GpuTestConfiguration = GpuTestConfigu layout: None, module: &module, entry_point: "doesn't exist", - constants: &Default::default(), + compilation_options: Default::default(), }); pipeline.get_bind_group_layout(0); diff --git a/tests/tests/push_constants.rs b/tests/tests/push_constants.rs index d1119476c3..04d9a00f7d 100644 --- a/tests/tests/push_constants.rs +++ b/tests/tests/push_constants.rs @@ -103,7 +103,7 @@ async fn partial_update_test(ctx: TestingContext) { layout: Some(&pipeline_layout), module: &sm, entry_point: "main", - constants: &Default::default(), + compilation_options: Default::default(), }); let mut encoder = ctx diff --git a/tests/tests/regression/issue_3349.rs b/tests/tests/regression/issue_3349.rs index 93b91b9d7b..74c466b45a 100644 --- a/tests/tests/regression/issue_3349.rs +++ b/tests/tests/regression/issue_3349.rs @@ -102,13 +102,13 @@ async fn multi_stage_data_binding_test(ctx: TestingContext) { vertex: wgpu::VertexState { module: &vs_sm, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &fs_sm, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: wgpu::TextureFormat::Rgba8Unorm, blend: None, diff --git a/tests/tests/regression/issue_3457.rs b/tests/tests/regression/issue_3457.rs index 0fca44b0c9..f18d681ae1 100644 --- a/tests/tests/regression/issue_3457.rs +++ b/tests/tests/regression/issue_3457.rs @@ -52,7 +52,7 @@ static PASS_RESET_VERTEX_BUFFER: GpuTestConfiguration = vertex: VertexState { module: &module, entry_point: "double_buffer_vert", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[ VertexBufferLayout { array_stride: 16, @@ -72,7 +72,7 @@ static PASS_RESET_VERTEX_BUFFER: GpuTestConfiguration = fragment: Some(FragmentState { module: &module, entry_point: "double_buffer_frag", - constants: &Default::default(), + 
compilation_options: Default::default(), targets: &[Some(ColorTargetState { format: TextureFormat::Rgba8Unorm, blend: None, @@ -90,7 +90,7 @@ static PASS_RESET_VERTEX_BUFFER: GpuTestConfiguration = vertex: VertexState { module: &module, entry_point: "single_buffer_vert", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[VertexBufferLayout { array_stride: 16, step_mode: VertexStepMode::Vertex, @@ -103,7 +103,7 @@ static PASS_RESET_VERTEX_BUFFER: GpuTestConfiguration = fragment: Some(FragmentState { module: &module, entry_point: "single_buffer_frag", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(ColorTargetState { format: TextureFormat::Rgba8Unorm, blend: None, diff --git a/tests/tests/root.rs b/tests/tests/root.rs index ec58927d16..82b74717eb 100644 --- a/tests/tests/root.rs +++ b/tests/tests/root.rs @@ -34,6 +34,7 @@ mod scissor_tests; mod shader; mod shader_primitive_index; mod shader_view_format; +mod subgroup_operations; mod texture_bounds; mod texture_view_creation; mod transfer; diff --git a/tests/tests/scissor_tests/mod.rs b/tests/tests/scissor_tests/mod.rs index efc658501d..15c35644e5 100644 --- a/tests/tests/scissor_tests/mod.rs +++ b/tests/tests/scissor_tests/mod.rs @@ -44,7 +44,7 @@ async fn scissor_test_impl( vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, primitive: wgpu::PrimitiveState::default(), @@ -53,7 +53,7 @@ async fn scissor_test_impl( fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: wgpu::TextureFormat::Rgba8Unorm, blend: None, diff --git a/tests/tests/shader/compilation_messages/error_shader.wgsl b/tests/tests/shader/compilation_messages/error_shader.wgsl new file mode 100644 index 0000000000..c57bdbe8f0 --- /dev/null +++ b/tests/tests/shader/compilation_messages/error_shader.wgsl @@ -0,0 +1,2 @@ +/*🐈🐈🐈🐈🐈🐈🐈*/? 
+// Expected Error: invalid character found \ No newline at end of file diff --git a/tests/tests/shader/compilation_messages/mod.rs b/tests/tests/shader/compilation_messages/mod.rs new file mode 100644 index 0000000000..09000205a2 --- /dev/null +++ b/tests/tests/shader/compilation_messages/mod.rs @@ -0,0 +1,49 @@ +use wgpu::include_wgsl; + +use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters}; + +#[gpu_test] +static SHADER_COMPILE_SUCCESS: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(TestParameters::default()) + .run_async(|ctx| async move { + let sm = ctx + .device + .create_shader_module(include_wgsl!("successful_shader.wgsl")); + + let compilation_info = sm.get_compilation_info().await; + for message in compilation_info.messages.iter() { + assert!(message.message_type != wgpu::CompilationMessageType::Error); + } + }); + +#[gpu_test] +static SHADER_COMPILE_ERROR: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters(TestParameters::default()) + .run_async(|ctx| async move { + ctx.device.push_error_scope(wgpu::ErrorFilter::Validation); + let sm = ctx + .device + .create_shader_module(include_wgsl!("error_shader.wgsl")); + assert!(pollster::block_on(ctx.device.pop_error_scope()).is_some()); + + let compilation_info = sm.get_compilation_info().await; + let error_message = compilation_info + .messages + .iter() + .find(|message| message.message_type == wgpu::CompilationMessageType::Error) + .expect("Expected error message not found"); + let span = error_message.location.expect("Expected span not found"); + assert_eq!( + span.offset, 32, + "Expected the offset to be 32, because we're counting UTF-8 bytes" + ); + assert_eq!(span.length, 1, "Expected length to roughly be 1"); // Could be relaxed, depending on the parser requirements. 
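+ // For reference: "/*" (2 bytes) + seven 4-byte cat emoji (28 bytes) + "*/" (2 bytes) put the invalid `?` at byte offset 32, i.e. column 33 of line 1.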
+ assert_eq!( + span.line_number, 1, + "Expected the line number to be 1, because we're counting lines from 1" + ); + assert_eq!( + span.line_position, 33, + "Expected the column number to be 33, because we're counting columns from 1" + ); + }); diff --git a/tests/tests/shader/compilation_messages/successful_shader.wgsl b/tests/tests/shader/compilation_messages/successful_shader.wgsl new file mode 100644 index 0000000000..638b89edab --- /dev/null +++ b/tests/tests/shader/compilation_messages/successful_shader.wgsl @@ -0,0 +1,31 @@ +const array_size = 512u; + +struct WStruct { + arr: array<u32, array_size>, + atom: atomic<u32> +} + +var<workgroup> w_mem: WStruct; + +@group(0) @binding(0) +var<storage, read_write> output: array<u32>; + +@compute @workgroup_size(1) +fn read(@builtin(workgroup_id) wgid: vec3<u32>, @builtin(num_workgroups) num_workgroups: vec3<u32>) { + var is_zero = true; + for(var i = 0u; i < array_size; i++) { + is_zero &= w_mem.arr[i] == 0u; + } + is_zero &= atomicLoad(&w_mem.atom) == 0u; + + let idx = wgid.x + (wgid.y * num_workgroups.x) + (wgid.z * num_workgroups.x * num_workgroups.y); + output[idx] = u32(!is_zero); +} + +@compute @workgroup_size(1) +fn write() { + for(var i = 0u; i < array_size; i++) { + w_mem.arr[i] = i; + } + atomicStore(&w_mem.atom, 3u); +} diff --git a/tests/tests/shader/mod.rs b/tests/tests/shader/mod.rs index bb93c690e8..6ece08652f 100644 --- a/tests/tests/shader/mod.rs +++ b/tests/tests/shader/mod.rs @@ -15,6 +15,7 @@ use wgpu::{ use wgpu_test::TestingContext; +pub mod compilation_messages; pub mod numeric_builtins; pub mod struct_layout; pub mod zero_init_workgroup_mem; @@ -307,7 +308,7 @@ async fn shader_input_output_test( layout: Some(&pll), module: &sm, entry_point: "cs_main", - constants: &Default::default(), + compilation_options: Default::default(), }); // -- Initializing data -- diff --git a/tests/tests/shader/zero_init_workgroup_mem.rs b/tests/tests/shader/zero_init_workgroup_mem.rs index 2bbcd87d90..cb9f341ee5 100644 --- a/tests/tests/shader/zero_init_workgroup_mem.rs +++ b/tests/tests/shader/zero_init_workgroup_mem.rs @@ -87,7 +87,7 @@ static ZERO_INIT_WORKGROUP_MEMORY: GpuTestConfiguration = GpuTestConfiguration:: layout: Some(&pll), module: &sm, entry_point: "read", - constants: &Default::default(), + compilation_options: Default::default(), }); let pipeline_write = ctx @@ -97,7 +97,7 @@ static ZERO_INIT_WORKGROUP_MEMORY: GpuTestConfiguration = GpuTestConfiguration:: layout: None, module: &sm, entry_point: "write", - constants: &Default::default(), + compilation_options: Default::default(), }); // -- Initializing data -- diff --git a/tests/tests/shader_primitive_index/mod.rs b/tests/tests/shader_primitive_index/mod.rs index fa6bbcfb53..fb43397830 100644 --- a/tests/tests/shader_primitive_index/mod.rs +++ b/tests/tests/shader_primitive_index/mod.rs @@ -122,7 +122,7 @@ async fn pulling_common( vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[wgpu::VertexBufferLayout { array_stride: 8, step_mode: wgpu::VertexStepMode::Vertex, @@ -139,7 +139,7 @@ async fn pulling_common( fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: wgpu::TextureFormat::Rgba8Unorm, blend: None, diff --git a/tests/tests/shader_view_format/mod.rs b/tests/tests/shader_view_format/mod.rs index 60efa0130f..53c642bf7a 100644 --- a/tests/tests/shader_view_format/mod.rs +++
b/tests/tests/shader_view_format/mod.rs @@ -93,13 +93,13 @@ async fn reinterpret( vertex: wgpu::VertexState { module: shader, entry_point: "vs_main", - constants: &Default::default(), + compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: shader, entry_point: "fs_main", - constants: &Default::default(), + compilation_options: Default::default(), targets: &[Some(src_format.into())], }), primitive: wgpu::PrimitiveState { diff --git a/tests/tests/subgroup_operations/mod.rs b/tests/tests/subgroup_operations/mod.rs new file mode 100644 index 0000000000..2c518a9d93 --- /dev/null +++ b/tests/tests/subgroup_operations/mod.rs @@ -0,0 +1,138 @@ +use std::{borrow::Cow, num::NonZeroU64}; + +use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters}; + +const THREAD_COUNT: u64 = 128; +const TEST_COUNT: u32 = 32; + +#[gpu_test] +static SUBGROUP_OPERATIONS: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .features(wgpu::Features::SUBGROUP) + .limits(wgpu::Limits::downlevel_defaults()) + // Expect metal to fail on tests involving operations in divergent control flow + // + // Newlines are included in the panic message to ensure that _additional_ failures + // are not matched against. + .expect_fail( + wgpu_test::FailureCase::molten_vk() + // 14.3 doesn't fail test 29 + .panic("thread 0 failed tests: 27,\nthread 1 failed tests: 27, 28,\n") + // Prior versions do. + .panic("thread 0 failed tests: 27, 29,\nthread 1 failed tests: 27, 28, 29,\n"), + ) + .expect_fail( + wgpu_test::FailureCase::backend(wgpu::Backends::METAL) + // 14.3 doesn't fail test 29 + .panic("thread 0 failed tests: 27,\nthread 1 failed tests: 27, 28,\n") + // Prior versions do. + .panic("thread 0 failed tests: 27, 29,\nthread 1 failed tests: 27, 28, 29,\n"), + ), + ) + .run_sync(|ctx| { + let device = &ctx.device; + + let storage_buffer = device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: THREAD_COUNT * std::mem::size_of::<u32>() as u64, + usage: wgpu::BufferUsages::STORAGE + | wgpu::BufferUsages::COPY_DST + | wgpu::BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: Some("bind group layout"), + entries: &[wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: NonZeroU64::new( + THREAD_COUNT * std::mem::size_of::<u32>() as u64, + ), + }, + count: None, + }], + }); + + let cs_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: None, + source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(include_str!("shader.wgsl"))), + }); + + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: Some("main"), + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + + let compute_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: None, + layout: Some(&pipeline_layout), + module: &cs_module, + entry_point: "main", + compilation_options: Default::default(), + }); + + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + entries: &[wgpu::BindGroupEntry { + binding: 0, + resource: storage_buffer.as_entire_binding(), + }], + layout: &bind_group_layout, + label: Some("bind group"), + }); + + let mut encoder =
device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + { + let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: None, + timestamp_writes: None, + }); + cpass.set_pipeline(&compute_pipeline); + cpass.set_bind_group(0, &bind_group, &[]); + cpass.dispatch_workgroups(1, 1, 1); + } + ctx.queue.submit(Some(encoder.finish())); + + wgpu::util::DownloadBuffer::read_buffer( + device, + &ctx.queue, + &storage_buffer.slice(..), + |mapping_buffer_view| { + let mapping_buffer_view = mapping_buffer_view.unwrap(); + let result: &[u32; THREAD_COUNT as usize] = + bytemuck::from_bytes(&mapping_buffer_view); + let expected_mask = (1u64 << (TEST_COUNT)) - 1; // generate full mask + let expected_array = [expected_mask as u32; THREAD_COUNT as usize]; + if result != &expected_array { + use std::fmt::Write; + let mut msg = String::new(); + writeln!( + &mut msg, + "Got from GPU:\n{:x?}\n expected:\n{:x?}", + result, &expected_array, + ) + .unwrap(); + for (thread, (result, expected)) in result + .iter() + .zip(expected_array) + .enumerate() + .filter(|(_, (r, e))| *r != e) + { + write!(&mut msg, "thread {} failed tests:", thread).unwrap(); + let difference = result ^ expected; + for i in (0..u32::BITS).filter(|i| (difference & (1 << i)) != 0) { + write!(&mut msg, " {},", i).unwrap(); + } + writeln!(&mut msg).unwrap(); + } + panic!("{}", msg); + } + }, + ); + }); diff --git a/tests/tests/subgroup_operations/shader.wgsl b/tests/tests/subgroup_operations/shader.wgsl new file mode 100644 index 0000000000..77cb81ce75 --- /dev/null +++ b/tests/tests/subgroup_operations/shader.wgsl @@ -0,0 +1,161 @@ +@group(0) +@binding(0) +var<storage, read_write> storage_buffer: array<u32>; + +var<workgroup> workgroup_buffer: u32; + +fn add_result_to_mask(mask: ptr<function, u32>, index: u32, value: bool) { + (*mask) |= u32(value) << index; +} + +@compute +@workgroup_size(128) +fn main( + @builtin(global_invocation_id) global_id: vec3<u32>, + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_id) subgroup_id: u32, + @builtin(subgroup_size) subgroup_size: u32, + @builtin(subgroup_invocation_id) subgroup_invocation_id: u32, +) { + var passed = 0u; + var expected: u32; + + add_result_to_mask(&passed, 0u, num_subgroups == 128u / subgroup_size); + add_result_to_mask(&passed, 1u, subgroup_id == global_id.x / subgroup_size); + add_result_to_mask(&passed, 2u, subgroup_invocation_id == global_id.x % subgroup_size); + + var expected_ballot = vec4<u32>(0u); + for(var i = 0u; i < subgroup_size; i += 1u) { + expected_ballot[i / 32u] |= ((global_id.x - subgroup_invocation_id + i) & 1u) << (i % 32u); + } + add_result_to_mask(&passed, 3u, dot(vec4<u32>(1u), vec4<u32>(subgroupBallot((subgroup_invocation_id & 1u) == 1u) == expected_ballot)) == 4u); + + add_result_to_mask(&passed, 4u, subgroupAll(true)); + add_result_to_mask(&passed, 5u, !subgroupAll(subgroup_invocation_id != 0u)); + + add_result_to_mask(&passed, 6u, subgroupAny(subgroup_invocation_id == 0u)); + add_result_to_mask(&passed, 7u, !subgroupAny(false)); + + expected = 0u; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected += global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 8u, subgroupAdd(global_id.x + 1u) == expected); + + expected = 1u; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected *= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 9u, subgroupMul(global_id.x + 1u) == expected); + + expected = 0u; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected = max(expected, global_id.x - subgroup_invocation_id +
i + 1u); + } + add_result_to_mask(&passed, 10u, subgroupMax(global_id.x + 1u) == expected); + + expected = 0xFFFFFFFFu; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected = min(expected, global_id.x - subgroup_invocation_id + i + 1u); + } + add_result_to_mask(&passed, 11u, subgroupMin(global_id.x + 1u) == expected); + + expected = 0xFFFFFFFFu; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected &= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 12u, subgroupAnd(global_id.x + 1u) == expected); + + expected = 0u; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected |= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 13u, subgroupOr(global_id.x + 1u) == expected); + + expected = 0u; + for(var i = 0u; i < subgroup_size; i += 1u) { + expected ^= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 14u, subgroupXor(global_id.x + 1u) == expected); + + expected = 0u; + for(var i = 0u; i < subgroup_invocation_id; i += 1u) { + expected += global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 15u, subgroupExclusiveAdd(global_id.x + 1u) == expected); + + expected = 1u; + for(var i = 0u; i < subgroup_invocation_id; i += 1u) { + expected *= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 16u, subgroupExclusiveMul(global_id.x + 1u) == expected); + + expected = 0u; + for(var i = 0u; i <= subgroup_invocation_id; i += 1u) { + expected += global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 17u, subgroupInclusiveAdd(global_id.x + 1u) == expected); + + expected = 1u; + for(var i = 0u; i <= subgroup_invocation_id; i += 1u) { + expected *= global_id.x - subgroup_invocation_id + i + 1u; + } + add_result_to_mask(&passed, 18u, subgroupInclusiveMul(global_id.x + 1u) == expected); + + add_result_to_mask(&passed, 19u, subgroupBroadcastFirst(u32(subgroup_invocation_id != 0u)) == 0u); + add_result_to_mask(&passed, 20u, subgroupBroadcastFirst(u32(subgroup_invocation_id == 0u)) == 1u); + add_result_to_mask(&passed, 21u, subgroupBroadcast(subgroup_invocation_id, 1u) == 1u); + add_result_to_mask(&passed, 22u, subgroupShuffle(subgroup_invocation_id, subgroup_invocation_id) == subgroup_invocation_id); + add_result_to_mask(&passed, 23u, subgroupShuffle(subgroup_invocation_id, subgroup_size - 1u - subgroup_invocation_id) == subgroup_size - 1u - subgroup_invocation_id); + add_result_to_mask(&passed, 24u, subgroup_invocation_id == subgroup_size - 1u || subgroupShuffleDown(subgroup_invocation_id, 1u) == subgroup_invocation_id + 1u); + add_result_to_mask(&passed, 25u, subgroup_invocation_id == 0u || subgroupShuffleUp(subgroup_invocation_id, 1u) == subgroup_invocation_id - 1u); + add_result_to_mask(&passed, 26u, subgroupShuffleXor(subgroup_invocation_id, subgroup_size - 1u) == (subgroup_invocation_id ^ (subgroup_size - 1u))); + + // Mac/Apple will fail this test. + var passed_27 = false; + if subgroup_invocation_id % 2u == 0u { + passed_27 |= subgroupAdd(1u) == (subgroup_size / 2u); + } else { + passed_27 |= subgroupAdd(1u) == (subgroup_size / 2u); + } + add_result_to_mask(&passed, 27u, passed_27); + + // Mac/Apple will fail this test. 
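+ // Test 28 exercises subgroupBroadcastFirst under divergent switch control flow; the Metal/MoltenVK failures expected for it are listed in the expect_fail cases in mod.rs.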
+ var passed_28 = false; + switch subgroup_invocation_id % 3u { + case 0u: { + passed_28 = subgroupBroadcastFirst(subgroup_invocation_id) == 0u; + } + case 1u: { + passed_28 = subgroupBroadcastFirst(subgroup_invocation_id) == 1u; + } + case 2u: { + passed_28 = subgroupBroadcastFirst(subgroup_invocation_id) == 2u; + } + default { } + } + add_result_to_mask(&passed, 28u, passed_28); + + // Mac/Apple will sometimes fail this test. MacOS 14.3 passes it, so the bug in the metal compiler seems to be fixed. + expected = 0u; + for (var i = subgroup_size; i >= 0u; i -= 1u) { + expected = subgroupAdd(1u); + if i == subgroup_invocation_id { + break; + } + } + add_result_to_mask(&passed, 29u, expected == (subgroup_invocation_id + 1u)); + + if global_id.x == 0u { + workgroup_buffer = subgroup_size; + } + workgroupBarrier(); + add_result_to_mask(&passed, 30u, workgroup_buffer == subgroup_size); + + // Keep this test last, verify we are still convergent after running other tests + add_result_to_mask(&passed, 31u, subgroupAdd(1u) == subgroup_size); + + // Increment TEST_COUNT in subgroup_operations/mod.rs if adding more tests + + storage_buffer[global_id.x] = passed; +} diff --git a/tests/tests/vertex_indices/mod.rs b/tests/tests/vertex_indices/mod.rs index 77e08489bf..cad7e731d1 100644 --- a/tests/tests/vertex_indices/mod.rs +++ b/tests/tests/vertex_indices/mod.rs @@ -272,7 +272,6 @@ async fn vertex_index_common(ctx: TestingContext) { push_constant_ranges: &[], }); - let constants = &Default::default(); let mut pipeline_desc = wgpu::RenderPipelineDescriptor { label: None, layout: Some(&ppl), @@ -280,7 +279,7 @@ async fn vertex_index_common(ctx: TestingContext) { buffers: &[], module: &shader, entry_point: "vs_main_builtin", - constants, + compilation_options: Default::default(), }, primitive: wgpu::PrimitiveState::default(), depth_stencil: None, @@ -288,7 +287,7 @@ async fn vertex_index_common(ctx: TestingContext) { fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", - constants, + compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: wgpu::TextureFormat::Rgba8Unorm, blend: None, diff --git a/wgpu-core/Cargo.toml b/wgpu-core/Cargo.toml index ef5f56d067..7f099da5ca 100644 --- a/wgpu-core/Cargo.toml +++ b/wgpu-core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wgpu-core" -version = "0.19.3" +version = "0.20.0" authors = ["gfx-rs developers"] edition = "2021" description = "WebGPU core logic on wgpu-hal" @@ -100,7 +100,6 @@ arrayvec = "0.7" bit-vec = "0.6" bitflags = "2" bytemuck = { version = "1.14", optional = true } -codespan-reporting = "0.11" document-features.workspace = true indexmap = "2" log = "0.4" @@ -117,17 +116,17 @@ thiserror = "1" [dependencies.naga] path = "../naga" -version = "0.19.2" +version = "0.20.0" [dependencies.wgt] package = "wgpu-types" path = "../wgpu-types" -version = "0.19.2" +version = "0.20.0" [dependencies.hal] package = "wgpu-hal" path = "../wgpu-hal" -version = "0.19.3" +version = "0.20.0" default_features = false [target.'cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))'.dependencies] diff --git a/wgpu-core/src/any_surface.rs b/wgpu-core/src/any_surface.rs deleted file mode 100644 index 94edfc4433..0000000000 --- a/wgpu-core/src/any_surface.rs +++ /dev/null @@ -1,95 +0,0 @@ -use wgt::Backend; - -/// The `AnySurface` type: a `Arc` of a `A::Surface` for any backend `A`. 
-use crate::hal_api::HalApi; - -use std::fmt; -use std::mem::ManuallyDrop; -use std::ptr::NonNull; - -struct AnySurfaceVtable { - // We opportunistically store the backend here, since we know it will be used - // with backend selection and it can be stored in static memory. - backend: Backend, - // Drop glue which knows how to drop the stored data. - drop: unsafe fn(*mut ()), -} - -/// An `A::Surface`, for any backend `A`. -/// -/// Any `AnySurface` is just like an `A::Surface`, except that the `A` type -/// parameter is erased. To access the `Surface`, you must downcast to a -/// particular backend with the [`downcast_ref`] or [`take`] methods. -pub struct AnySurface { - data: NonNull<()>, - vtable: &'static AnySurfaceVtable, -} - -impl AnySurface { - /// Construct an `AnySurface` that owns an `A::Surface`. - pub fn new<A: HalApi>(surface: A::Surface) -> AnySurface { - unsafe fn drop_glue<A: HalApi>(ptr: *mut ()) { - unsafe { - _ = Box::from_raw(ptr.cast::<A::Surface>()); - } - } - - let data = NonNull::from(Box::leak(Box::new(surface))); - - AnySurface { - data: data.cast(), - vtable: &AnySurfaceVtable { - backend: A::VARIANT, - drop: drop_glue::<A>, - }, - } - } - - /// Get the backend this surface was created through. - pub fn backend(&self) -> Backend { - self.vtable.backend - } - - /// If `self` refers to an `A::Surface`, returns a reference to it. - pub fn downcast_ref<A: HalApi>(&self) -> Option<&A::Surface> { - if A::VARIANT != self.vtable.backend { - return None; - } - - // SAFETY: We just checked the instance above implicitly by the backend - // that it was statically constructed through. - Some(unsafe { &*self.data.as_ptr().cast::<A::Surface>() }) - } - - /// If `self` is an `Arc<A::Surface>`, returns that. - pub fn take<A: HalApi>(self) -> Option<A::Surface> { - if A::VARIANT != self.vtable.backend { - return None; - } - - // Disable drop glue, since we're returning the owned surface. The - // caller will be responsible for dropping it. - let this = ManuallyDrop::new(self); - - // SAFETY: We just checked the instance above implicitly by the backend - // that it was statically constructed through. - Some(unsafe { *Box::from_raw(this.data.as_ptr().cast::<A::Surface>()) }) - } -} - -impl Drop for AnySurface { - fn drop(&mut self) { - unsafe { (self.vtable.drop)(self.data.as_ptr()) } - } -} - -impl fmt::Debug for AnySurface { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "AnySurface<{}>", self.vtable.backend) - } -} - -#[cfg(send_sync)] -unsafe impl Send for AnySurface {} -#[cfg(send_sync)] -unsafe impl Sync for AnySurface {} diff --git a/wgpu-core/src/command/allocator.rs b/wgpu-core/src/command/allocator.rs new file mode 100644 index 0000000000..e17fd08d76 --- /dev/null +++ b/wgpu-core/src/command/allocator.rs @@ -0,0 +1,67 @@ +use crate::hal_api::HalApi; +use crate::resource_log; +use hal::Device as _; + +use crate::lock::{rank, Mutex}; + +/// A pool of free [`wgpu_hal::CommandEncoder`]s, owned by a `Device`. +/// +/// Each encoder in this list is in the "closed" state. +/// +/// Since a raw [`CommandEncoder`][ce] is itself a pool for allocating +/// raw [`CommandBuffer`][cb]s, this is a pool of pools. +/// +/// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder +/// [ce]: hal::CommandEncoder +/// [cb]: hal::Api::CommandBuffer +pub(crate) struct CommandAllocator<A: HalApi> { + free_encoders: Mutex<Vec<A::CommandEncoder>>, +} + +impl<A: HalApi> CommandAllocator<A> { + pub(crate) fn new() -> Self { + Self { + free_encoders: Mutex::new(rank::COMMAND_ALLOCATOR_FREE_ENCODERS, Vec::new()), + } + } + + /// Return a fresh [`wgpu_hal::CommandEncoder`] in the "closed" state.
+ /// + /// If we have free encoders in the pool, take one of those. Otherwise, + /// create a new one on `device`. + /// + /// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder + pub(crate) fn acquire_encoder( + &self, + device: &A::Device, + queue: &A::Queue, + ) -> Result<A::CommandEncoder, DeviceError> { + let mut free_encoders = self.free_encoders.lock(); + match free_encoders.pop() { + Some(encoder) => Ok(encoder), + None => unsafe { + let hal_desc = hal::CommandEncoderDescriptor { label: None, queue }; + device.create_command_encoder(&hal_desc) + }, + } + } + + /// Add `encoder` back to the free pool. + pub(crate) fn release_encoder(&self, encoder: A::CommandEncoder) { + let mut free_encoders = self.free_encoders.lock(); + free_encoders.push(encoder); + } + + /// Free the pool of command encoders. + /// + /// This is only called when the `Device` is dropped. + pub(crate) fn dispose(&self, device: &A::Device) { + let mut free_encoders = self.free_encoders.lock(); + resource_log!("CommandAllocator::dispose encoders {}", free_encoders.len()); + for cmd_encoder in free_encoders.drain(..) { + unsafe { + device.destroy_command_encoder(cmd_encoder); + } + } + } +} diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs index 47beda8ec6..d9d821c533 100644 --- a/wgpu-core/src/command/bundle.rs +++ b/wgpu-core/src/command/bundle.rs @@ -73,7 +73,7 @@ index format changes. [Gdcrbe]: crate::global::Global::device_create_render_bundle_encoder [Grbef]: crate::global::Global::render_bundle_encoder_finish -[wrpeb]: crate::command::render_ffi::wgpu_render_pass_execute_bundles +[wrpeb]: crate::command::render::render_commands::wgpu_render_pass_execute_bundles !*/ #![allow(clippy::reversed_empty_ranges)] @@ -113,7 +113,7 @@ use hal::CommandEncoder as _; use super::ArcRenderCommand; -/// https://gpuweb.github.io/gpuweb/#dom-gpurendercommandsmixin-draw +/// <https://gpuweb.github.io/gpuweb/#dom-gpurendercommandsmixin-draw> fn validate_draw<A: HalApi>( vertex: &[Option<VertexState<A>>], step: &[VertexStep], @@ -1548,15 +1548,14 @@ pub mod bundle_ffi { offsets: *const DynamicOffset, offset_length: usize, ) { - let redundant = unsafe { - bundle.current_bind_groups.set_and_check_redundant( - bind_group_id, - index, - &mut bundle.base.dynamic_offsets, - offsets, - offset_length, - ) - }; + let offsets = unsafe { slice::from_raw_parts(offsets, offset_length) }; + + let redundant = bundle.current_bind_groups.set_and_check_redundant( + bind_group_id, + index, + &mut bundle.base.dynamic_offsets, + offsets, + ); if redundant { return; diff --git a/wgpu-core/src/command/clear.rs b/wgpu-core/src/command/clear.rs index 72c923f82e..faff177928 100644 --- a/wgpu-core/src/command/clear.rs +++ b/wgpu-core/src/command/clear.rs @@ -104,6 +104,11 @@ impl Global { let dst_buffer = buffer_guard .get(dst) .map_err(|_| ClearError::InvalidBuffer(dst))?; + + if dst_buffer.device.as_info().id() != cmd_buf.device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + cmd_buf_data .trackers .buffers @@ -200,6 +205,10 @@ impl Global { .get(dst) .map_err(|_| ClearError::InvalidTexture(dst))?; + if dst_texture.device.as_info().id() != cmd_buf.device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + // Check if subresource aspects are valid.
let clear_aspects = hal::FormatAspects::new(dst_texture.desc.format, subresource_range.aspect); diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index 67cec2d006..046d0df9ff 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -1,3 +1,4 @@ +use crate::command::compute_command::{ArcComputeCommand, ComputeCommand}; use crate::device::DeviceError; use crate::resource::Resource; use crate::snatch::SnatchGuard; @@ -20,7 +21,6 @@ use crate::{ hal_label, id, id::DeviceId, init_tracker::MemoryInitKind, - pipeline, resource::{self}, storage::Storage, track::{Tracker, UsageConflict, UsageScope}, @@ -39,59 +39,6 @@ use thiserror::Error; use std::sync::Arc; use std::{fmt, mem, str}; -#[doc(hidden)] -#[derive(Clone, Copy, Debug)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -pub enum ComputeCommand { - SetBindGroup { - index: u32, - num_dynamic_offsets: usize, - bind_group_id: id::BindGroupId, - }, - SetPipeline(id::ComputePipelineId), - - /// Set a range of push constants to values stored in [`BasePass::push_constant_data`]. - SetPushConstant { - /// The byte offset within the push constant storage to write to. This - /// must be a multiple of four. - offset: u32, - - /// The number of bytes to write. This must be a multiple of four. - size_bytes: u32, - - /// Index in [`BasePass::push_constant_data`] of the start of the data - /// to be written. - /// - /// Note: this is not a byte offset like `offset`. Rather, it is the - /// index of the first `u32` element in `push_constant_data` to read. - values_offset: u32, - }, - - Dispatch([u32; 3]), - DispatchIndirect { - buffer_id: id::BufferId, - offset: wgt::BufferAddress, - }, - PushDebugGroup { - color: u32, - len: usize, - }, - PopDebugGroup, - InsertDebugMarker { - color: u32, - len: usize, - }, - WriteTimestamp { - query_set_id: id::QuerySetId, - query_index: u32, - }, - BeginPipelineStatisticsQuery { - query_set_id: id::QuerySetId, - query_index: u32, - }, - EndPipelineStatisticsQuery, -} - #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] pub struct ComputePass { base: BasePass<ComputeCommand>, @@ -185,7 +132,7 @@ pub enum ComputePassErrorInner { #[error(transparent)] Encoder(#[from] CommandEncoderError), #[error("Bind group at index {0:?} is invalid")] - InvalidBindGroup(usize), + InvalidBindGroup(u32), #[error("Device {0:?} is invalid")] InvalidDevice(DeviceId), #[error("Bind group index {index} is greater than the device's requested `max_bind_group` limit {max}")] @@ -250,7 +197,7 @@ impl PrettyError for ComputePassErrorInner { pub struct ComputePassError { pub scope: PassErrorScope, #[source] - inner: ComputePassErrorInner, + pub(super) inner: ComputePassErrorInner, } impl PrettyError for ComputePassError { fn fmt_pretty(&self, fmt: &mut ErrorFormatter) { @@ -347,7 +294,8 @@ impl Global { encoder_id: id::CommandEncoderId, pass: &ComputePass, ) -> Result<(), ComputePassError> { - self.command_encoder_run_compute_pass_impl::<A>( + // TODO: This should go directly to `command_encoder_run_compute_pass_impl` by means of storing `ArcComputeCommand` internally.
+ self.command_encoder_run_compute_pass_with_unresolved_commands::<A>( encoder_id, pass.base.as_ref(), pass.timestamp_writes.as_ref(), ) } #[doc(hidden)] - pub fn command_encoder_run_compute_pass_impl<A: HalApi>( + pub fn command_encoder_run_compute_pass_with_unresolved_commands<A: HalApi>( &self, encoder_id: id::CommandEncoderId, base: BasePassRef<ComputeCommand>, timestamp_writes: Option<&ComputePassTimestampWrites>, + ) -> Result<(), ComputePassError> { + let resolved_commands = + ComputeCommand::resolve_compute_command_ids(A::hub(self), base.commands)?; + + self.command_encoder_run_compute_pass_impl::<A>( + encoder_id, + BasePassRef { + label: base.label, + commands: &resolved_commands, + dynamic_offsets: base.dynamic_offsets, + string_data: base.string_data, + push_constant_data: base.push_constant_data, + }, + timestamp_writes, + ) + } + + fn command_encoder_run_compute_pass_impl<A: HalApi>( + &self, + encoder_id: id::CommandEncoderId, + base: BasePassRef<ArcComputeCommand<A>>, + timestamp_writes: Option<&ComputePassTimestampWrites>, ) -> Result<(), ComputePassError> { profiling::scope!("CommandEncoder::run_compute_pass"); let pass_scope = PassErrorScope::Pass(encoder_id); @@ -382,7 +352,13 @@ impl Global { #[cfg(feature = "trace")] if let Some(ref mut list) = cmd_buf_data.commands { list.push(crate::device::trace::Command::RunComputePass { - base: BasePass::from_ref(base), + base: BasePass { + label: base.label.map(str::to_string), + commands: base.commands.iter().map(Into::into).collect(), + dynamic_offsets: base.dynamic_offsets.to_vec(), + string_data: base.string_data.to_vec(), + push_constant_data: base.push_constant_data.to_vec(), + }, timestamp_writes: timestamp_writes.cloned(), }); } @@ -402,7 +378,6 @@ impl Global { let raw = encoder.open().map_pass_err(pass_scope)?; let bind_group_guard = hub.bind_groups.read(); - let pipeline_guard = hub.compute_pipelines.read(); let query_set_guard = hub.query_sets.read(); let buffer_guard = hub.buffers.read(); let tlas_guard = hub.tlas_s.read(); @@ -484,19 +459,21 @@ impl Global { // be inserted before texture reads. let mut pending_discard_init_fixups = SurfacesInDiscardState::new(); + // TODO: We should be draining the commands here, avoiding extra copies in the process. + // (A command encoder can't be executed twice!)
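+ // Each ArcComputeCommand below already owns Arcs to the resources it touches, so the pass can validate and record against the resource directly instead of looking ids up in the hub per command.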
for command in base.commands { - match *command { - ComputeCommand::SetBindGroup { + match command { + ArcComputeCommand::SetBindGroup { index, num_dynamic_offsets, - bind_group_id, + bind_group, } => { - let scope = PassErrorScope::SetBindGroup(bind_group_id); + let scope = PassErrorScope::SetBindGroup(bind_group.as_info().id()); let max_bind_groups = cmd_buf.limits.max_bind_groups; - if index >= max_bind_groups { + if index >= &max_bind_groups { return Err(ComputePassErrorInner::BindGroupIndexOutOfRange { - index, + index: *index, max: max_bind_groups, }) .map_pass_err(scope); @@ -509,13 +486,9 @@ impl Global { ); dynamic_offset_count += num_dynamic_offsets; - let bind_group = tracker - .bind_groups - .add_single(&*bind_group_guard, bind_group_id) - .ok_or(ComputePassErrorInner::InvalidBindGroup(index as usize)) - .map_pass_err(scope)?; + let bind_group = tracker.bind_groups.insert_single(bind_group.clone()); bind_group - .validate_dynamic_bindings(index, &temp_offsets, &cmd_buf.limits) + .validate_dynamic_bindings(*index, &temp_offsets, &cmd_buf.limits) .map_pass_err(scope)?; buffer_memory_init_actions.extend( @@ -551,14 +524,14 @@ impl Global { let entries = state .binder - .assign_group(index as usize, bind_group, &temp_offsets); + .assign_group(*index as usize, bind_group, &temp_offsets); if !entries.is_empty() && pipeline_layout.is_some() { let pipeline_layout = pipeline_layout.as_ref().unwrap().raw(); for (i, e) in entries.iter().enumerate() { if let Some(group) = e.group.as_ref() { let raw_bg = group .raw(&snatch_guard) - .ok_or(ComputePassErrorInner::InvalidBindGroup(i)) + .ok_or(ComputePassErrorInner::InvalidBindGroup(i as u32)) .map_pass_err(scope)?; unsafe { raw.set_bind_group( @@ -572,16 +545,13 @@ impl Global { } } } - ComputeCommand::SetPipeline(pipeline_id) => { + ArcComputeCommand::SetPipeline(pipeline) => { + let pipeline_id = pipeline.as_info().id(); let scope = PassErrorScope::SetPipelineCompute(pipeline_id); state.pipeline = Some(pipeline_id); - let pipeline: &pipeline::ComputePipeline<A> = tracker - .compute_pipelines - .add_single(&*pipeline_guard, pipeline_id) - .ok_or(ComputePassErrorInner::InvalidPipeline(pipeline_id)) - .map_pass_err(scope)?; + tracker.compute_pipelines.insert_single(pipeline.clone()); unsafe { raw.set_compute_pipeline(pipeline.raw()); @@ -605,7 +575,7 @@ impl Global { if let Some(group) = e.group.as_ref() { let raw_bg = group .raw(&snatch_guard) - .ok_or(ComputePassErrorInner::InvalidBindGroup(i)) + .ok_or(ComputePassErrorInner::InvalidBindGroup(i as u32)) .map_pass_err(scope)?; unsafe { raw.set_bind_group( @@ -641,7 +611,7 @@ impl Global { } } } - ComputeCommand::SetPushConstant { + ArcComputeCommand::SetPushConstant { offset, size_bytes, values_offset, @@ -652,7 +622,7 @@ impl Global { let values_end_offset = (values_offset + size_bytes / wgt::PUSH_CONSTANT_ALIGNMENT) as usize; let data_slice = - &base.push_constant_data[(values_offset as usize)..values_end_offset]; + &base.push_constant_data[(*values_offset as usize)..values_end_offset]; let pipeline_layout = state .binder @@ -667,7 +637,7 @@ impl Global { pipeline_layout .validate_push_constant_ranges( wgt::ShaderStages::COMPUTE, - offset, + *offset, end_offset_bytes, ) .map_pass_err(scope)?; @@ -676,12 +646,12 @@ impl Global { raw.set_push_constants( pipeline_layout.raw(), wgt::ShaderStages::COMPUTE, - offset, + *offset, data_slice, ); } } - ComputeCommand::Dispatch(groups) => { + ArcComputeCommand::Dispatch(groups) => { let scope = PassErrorScope::Dispatch { indirect: false, pipeline:
state.pipeline, @@ -706,7 +676,7 @@ impl Global { { return Err(ComputePassErrorInner::Dispatch( DispatchError::InvalidGroupSize { - current: groups, + current: *groups, limit: groups_size_limit, }, )) @@ -714,10 +684,11 @@ impl Global { } unsafe { - raw.dispatch(groups); + raw.dispatch(*groups); } } - ComputeCommand::DispatchIndirect { buffer_id, offset } => { + ArcComputeCommand::DispatchIndirect { buffer, offset } => { + let buffer_id = buffer.as_info().id(); let scope = PassErrorScope::Dispatch { indirect: true, pipeline: state.pipeline, @@ -729,29 +700,25 @@ impl Global { .require_downlevel_flags(wgt::DownlevelFlags::INDIRECT_EXECUTION) .map_pass_err(scope)?; - let indirect_buffer = state + state .scope .buffers - .merge_single(&*buffer_guard, buffer_id, hal::BufferUses::INDIRECT) + .insert_merge_single(buffer.clone(), hal::BufferUses::INDIRECT) + .map_pass_err(scope)?; + check_buffer_usage(buffer_id, buffer.usage, wgt::BufferUsages::INDIRECT) .map_pass_err(scope)?; - check_buffer_usage( - buffer_id, - indirect_buffer.usage, - wgt::BufferUsages::INDIRECT, - ) - .map_pass_err(scope)?; let end_offset = offset + mem::size_of::<wgt::DispatchIndirectArgs>() as u64; - if end_offset > indirect_buffer.size { + if end_offset > buffer.size { return Err(ComputePassErrorInner::IndirectBufferOverrun { - offset, + offset: *offset, end_offset, - buffer_size: indirect_buffer.size, + buffer_size: buffer.size, }) .map_pass_err(scope); } - let buf_raw = indirect_buffer + let buf_raw = buffer .raw .get(&snatch_guard) .ok_or(ComputePassErrorInner::InvalidIndirectBuffer(buffer_id)) @@ -760,9 +727,9 @@ impl Global { let stride = 3 * 4; // 3 integers, x/y/z group size buffer_memory_init_actions.extend( - indirect_buffer.initialization_status.read().create_action( - indirect_buffer, - offset..(offset + stride), + buffer.initialization_status.read().create_action( + buffer, + *offset..(*offset + stride), MemoryInitKind::NeedsInitializedMemory, ), ); @@ -772,15 +739,15 @@ impl Global { raw, &mut intermediate_trackers, &*bind_group_guard, - Some(indirect_buffer.as_info().tracker_index()), + Some(buffer.as_info().tracker_index()), &snatch_guard, ) .map_pass_err(scope)?; unsafe { - raw.dispatch_indirect(buf_raw, offset); + raw.dispatch_indirect(buf_raw, *offset); } } - ComputeCommand::PushDebugGroup { color: _, len } => { + ArcComputeCommand::PushDebugGroup { color: _, len } => { state.debug_scope_depth += 1; if !discard_hal_labels { let label = @@ -792,7 +759,7 @@ impl Global { } string_offset += len; } - ComputeCommand::PopDebugGroup => { + ArcComputeCommand::PopDebugGroup => { let scope = PassErrorScope::PopDebugGroup; if state.debug_scope_depth == 0 { @@ -806,7 +773,7 @@ impl Global { } } } - ComputeCommand::InsertDebugMarker { color: _, len } => { + ArcComputeCommand::InsertDebugMarker { color: _, len } => { if !discard_hal_labels { let label = str::from_utf8(&base.string_data[string_offset..string_offset + len]) @@ -815,49 +782,43 @@ impl Global { } string_offset += len; } - ComputeCommand::WriteTimestamp { - query_set_id, + ArcComputeCommand::WriteTimestamp { + query_set, query_index, } => { + let query_set_id = query_set.as_info().id(); let scope = PassErrorScope::WriteTimestamp; device .require_features(wgt::Features::TIMESTAMP_QUERY_INSIDE_PASSES) .map_pass_err(scope)?; - let query_set: &resource::QuerySet<A> = tracker - .query_sets - .add_single(&*query_set_guard, query_set_id) - .ok_or(ComputePassErrorInner::InvalidQuerySet(query_set_id)) - .map_pass_err(scope)?; + let query_set = tracker.query_sets.insert_single(query_set.clone());
query_set - .validate_and_write_timestamp(raw, query_set_id, query_index, None) + .validate_and_write_timestamp(raw, query_set_id, *query_index, None) .map_pass_err(scope)?; } - ComputeCommand::BeginPipelineStatisticsQuery { - query_set_id, + ArcComputeCommand::BeginPipelineStatisticsQuery { + query_set, query_index, } => { + let query_set_id = query_set.as_info().id(); let scope = PassErrorScope::BeginPipelineStatisticsQuery; - let query_set: &resource::QuerySet<A> = tracker - .query_sets - .add_single(&*query_set_guard, query_set_id) - .ok_or(ComputePassErrorInner::InvalidQuerySet(query_set_id)) - .map_pass_err(scope)?; + let query_set = tracker.query_sets.insert_single(query_set.clone()); query_set .validate_and_begin_pipeline_statistics_query( raw, query_set_id, - query_index, + *query_index, None, &mut active_query, ) .map_pass_err(scope)?; } - ComputeCommand::EndPipelineStatisticsQuery => { + ArcComputeCommand::EndPipelineStatisticsQuery => { let scope = PassErrorScope::EndPipelineStatisticsQuery; end_pipeline_statistics_query(raw, &*query_set_guard, &mut active_query) @@ -901,33 +862,24 @@ impl Global { } } -pub mod compute_ffi { +pub mod compute_commands { use super::{ComputeCommand, ComputePass}; - use crate::{id, RawString}; - use std::{convert::TryInto, ffi, slice}; + use crate::id; + use std::convert::TryInto; use wgt::{BufferAddress, DynamicOffset}; - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is - /// valid for `offset_length` elements. - #[no_mangle] - pub unsafe extern "C" fn wgpu_compute_pass_set_bind_group( + pub fn wgpu_compute_pass_set_bind_group( pass: &mut ComputePass, index: u32, bind_group_id: id::BindGroupId, - offsets: *const DynamicOffset, - offset_length: usize, + offsets: &[DynamicOffset], ) { - let redundant = unsafe { - pass.current_bind_groups.set_and_check_redundant( - bind_group_id, - index, - &mut pass.base.dynamic_offsets, - offsets, - offset_length, - ) - }; + let redundant = pass.current_bind_groups.set_and_check_redundant( + bind_group_id, + index, + &mut pass.base.dynamic_offsets, + offsets, + ); if redundant { return; @@ -935,13 +887,12 @@ pub mod compute_ffi { pass.base.commands.push(ComputeCommand::SetBindGroup { index, - num_dynamic_offsets: offset_length, + num_dynamic_offsets: offsets.len(), bind_group_id, }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_set_pipeline( + pub fn wgpu_compute_pass_set_pipeline( pass: &mut ComputePass, pipeline_id: id::ComputePipelineId, ) { @@ -954,47 +905,34 @@ pub mod compute_ffi { .push(ComputeCommand::SetPipeline(pipeline_id)); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is - /// valid for `size_bytes` bytes. - #[no_mangle] - pub unsafe extern "C" fn wgpu_compute_pass_set_push_constant( - pass: &mut ComputePass, - offset: u32, - size_bytes: u32, - data: *const u8, - ) { + pub fn wgpu_compute_pass_set_push_constant(pass: &mut ComputePass, offset: u32, data: &[u8]) { assert_eq!( offset & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), 0, "Push constant offset must be aligned to 4 bytes." ); assert_eq!( - size_bytes & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), + data.len() as u32 & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), 0, "Push constant size must be aligned to 4 bytes." ); - let data_slice = unsafe { slice::from_raw_parts(data, size_bytes as usize) }; let value_offset = pass.base.push_constant_data.len().try_into().expect( "Ran out of push constant space.
Don't set 4gb of push constants per ComputePass.", ); pass.base.push_constant_data.extend( - data_slice - .chunks_exact(wgt::PUSH_CONSTANT_ALIGNMENT as usize) + data.chunks_exact(wgt::PUSH_CONSTANT_ALIGNMENT as usize) .map(|arr| u32::from_ne_bytes([arr[0], arr[1], arr[2], arr[3]])), ); pass.base.commands.push(ComputeCommand::SetPushConstant { offset, - size_bytes, + size_bytes: data.len() as u32, values_offset: value_offset, }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_dispatch_workgroups( + pub fn wgpu_compute_pass_dispatch_workgroups( pass: &mut ComputePass, groups_x: u32, groups_y: u32, @@ -1005,8 +943,7 @@ pub mod compute_ffi { .push(ComputeCommand::Dispatch([groups_x, groups_y, groups_z])); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_dispatch_workgroups_indirect( + pub fn wgpu_compute_pass_dispatch_workgroups_indirect( pass: &mut ComputePass, buffer_id: id::BufferId, offset: BufferAddress, @@ -1016,17 +953,8 @@ pub mod compute_ffi { .push(ComputeCommand::DispatchIndirect { buffer_id, offset }); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given `label` - /// is a valid null-terminated string. - #[no_mangle] - pub unsafe extern "C" fn wgpu_compute_pass_push_debug_group( - pass: &mut ComputePass, - label: RawString, - color: u32, - ) { - let bytes = unsafe { ffi::CStr::from_ptr(label) }.to_bytes(); + pub fn wgpu_compute_pass_push_debug_group(pass: &mut ComputePass, label: &str, color: u32) { + let bytes = label.as_bytes(); pass.base.string_data.extend_from_slice(bytes); pass.base.commands.push(ComputeCommand::PushDebugGroup { @@ -1035,22 +963,12 @@ pub mod compute_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_pop_debug_group(pass: &mut ComputePass) { + pub fn wgpu_compute_pass_pop_debug_group(pass: &mut ComputePass) { pass.base.commands.push(ComputeCommand::PopDebugGroup); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given `label` - /// is a valid null-terminated string. 
- #[no_mangle] - pub unsafe extern "C" fn wgpu_compute_pass_insert_debug_marker( - pass: &mut ComputePass, - label: RawString, - color: u32, - ) { - let bytes = unsafe { ffi::CStr::from_ptr(label) }.to_bytes(); + pub fn wgpu_compute_pass_insert_debug_marker(pass: &mut ComputePass, label: &str, color: u32) { + let bytes = label.as_bytes(); pass.base.string_data.extend_from_slice(bytes); pass.base.commands.push(ComputeCommand::InsertDebugMarker { @@ -1059,8 +977,7 @@ pub mod compute_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_write_timestamp( + pub fn wgpu_compute_pass_write_timestamp( pass: &mut ComputePass, query_set_id: id::QuerySetId, query_index: u32, @@ -1071,8 +988,7 @@ pub mod compute_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_begin_pipeline_statistics_query( + pub fn wgpu_compute_pass_begin_pipeline_statistics_query( pass: &mut ComputePass, query_set_id: id::QuerySetId, query_index: u32, @@ -1085,8 +1001,7 @@ pub mod compute_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_compute_pass_end_pipeline_statistics_query(pass: &mut ComputePass) { + pub fn wgpu_compute_pass_end_pipeline_statistics_query(pass: &mut ComputePass) { pass.base .commands .push(ComputeCommand::EndPipelineStatisticsQuery); diff --git a/wgpu-core/src/command/compute_command.rs b/wgpu-core/src/command/compute_command.rs new file mode 100644 index 0000000000..49fdbbec24 --- /dev/null +++ b/wgpu-core/src/command/compute_command.rs @@ -0,0 +1,322 @@ +use std::sync::Arc; + +use crate::{ + binding_model::BindGroup, + hal_api::HalApi, + id, + pipeline::ComputePipeline, + resource::{Buffer, QuerySet}, +}; + +use super::{ComputePassError, ComputePassErrorInner, PassErrorScope}; + +#[derive(Clone, Copy, Debug)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum ComputeCommand { + SetBindGroup { + index: u32, + num_dynamic_offsets: usize, + bind_group_id: id::BindGroupId, + }, + + SetPipeline(id::ComputePipelineId), + + /// Set a range of push constants to values stored in `push_constant_data`. + SetPushConstant { + /// The byte offset within the push constant storage to write to. This + /// must be a multiple of four. + offset: u32, + + /// The number of bytes to write. This must be a multiple of four. + size_bytes: u32, + + /// Index in `push_constant_data` of the start of the data + /// to be written. + /// + /// Note: this is not a byte offset like `offset`. Rather, it is the + /// index of the first `u32` element in `push_constant_data` to read. + values_offset: u32, + }, + + Dispatch([u32; 3]), + + DispatchIndirect { + buffer_id: id::BufferId, + offset: wgt::BufferAddress, + }, + + PushDebugGroup { + color: u32, + len: usize, + }, + + PopDebugGroup, + + InsertDebugMarker { + color: u32, + len: usize, + }, + + WriteTimestamp { + query_set_id: id::QuerySetId, + query_index: u32, + }, + + BeginPipelineStatisticsQuery { + query_set_id: id::QuerySetId, + query_index: u32, + }, + + EndPipelineStatisticsQuery, +} + +impl ComputeCommand { + /// Resolves all ids in a list of commands into the corresponding resource Arc. 
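+ /// + /// On failure, returns a `ComputePassError` whose scope identifies the offending command.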
+ /// + // TODO: Once resolving is done on-the-fly during recording, this function should only be needed with the replay feature: + // #[cfg(feature = "replay")] + pub fn resolve_compute_command_ids<A: HalApi>( + hub: &crate::hub::Hub<A>, + commands: &[ComputeCommand], + ) -> Result<Vec<ArcComputeCommand<A>>, ComputePassError> { + let buffers_guard = hub.buffers.read(); + let bind_group_guard = hub.bind_groups.read(); + let query_set_guard = hub.query_sets.read(); + let pipelines_guard = hub.compute_pipelines.read(); + + let resolved_commands: Vec<ArcComputeCommand<A>> = commands + .iter() + .map(|c| -> Result<ArcComputeCommand<A>, ComputePassError> { + Ok(match *c { + ComputeCommand::SetBindGroup { + index, + num_dynamic_offsets, + bind_group_id, + } => ArcComputeCommand::SetBindGroup { + index, + num_dynamic_offsets, + bind_group: bind_group_guard.get_owned(bind_group_id).map_err(|_| { + ComputePassError { + scope: PassErrorScope::SetBindGroup(bind_group_id), + inner: ComputePassErrorInner::InvalidBindGroup(index), + } + })?, + }, + + ComputeCommand::SetPipeline(pipeline_id) => ArcComputeCommand::SetPipeline( + pipelines_guard + .get_owned(pipeline_id) + .map_err(|_| ComputePassError { + scope: PassErrorScope::SetPipelineCompute(pipeline_id), + inner: ComputePassErrorInner::InvalidPipeline(pipeline_id), + })?, + ), + + ComputeCommand::SetPushConstant { + offset, + size_bytes, + values_offset, + } => ArcComputeCommand::SetPushConstant { + offset, + size_bytes, + values_offset, + }, + + ComputeCommand::Dispatch(dim) => ArcComputeCommand::Dispatch(dim), + + ComputeCommand::DispatchIndirect { buffer_id, offset } => { + ArcComputeCommand::DispatchIndirect { + buffer: buffers_guard.get_owned(buffer_id).map_err(|_| { + ComputePassError { + scope: PassErrorScope::Dispatch { + indirect: true, + pipeline: None, // TODO: not used right now, but once we do the resolve during recording we can use this again. + }, + inner: ComputePassErrorInner::InvalidBuffer(buffer_id), + } + })?, + offset, + } + } + + ComputeCommand::PushDebugGroup { color, len } => { + ArcComputeCommand::PushDebugGroup { color, len } + } + + ComputeCommand::PopDebugGroup => ArcComputeCommand::PopDebugGroup, + + ComputeCommand::InsertDebugMarker { color, len } => { + ArcComputeCommand::InsertDebugMarker { color, len } + } + + ComputeCommand::WriteTimestamp { + query_set_id, + query_index, + } => ArcComputeCommand::WriteTimestamp { + query_set: query_set_guard.get_owned(query_set_id).map_err(|_| { + ComputePassError { + scope: PassErrorScope::WriteTimestamp, + inner: ComputePassErrorInner::InvalidQuerySet(query_set_id), + } + })?, + query_index, + }, + + ComputeCommand::BeginPipelineStatisticsQuery { + query_set_id, + query_index, + } => ArcComputeCommand::BeginPipelineStatisticsQuery { + query_set: query_set_guard.get_owned(query_set_id).map_err(|_| { + ComputePassError { + scope: PassErrorScope::BeginPipelineStatisticsQuery, + inner: ComputePassErrorInner::InvalidQuerySet(query_set_id), + } + })?, + query_index, + }, + + ComputeCommand::EndPipelineStatisticsQuery => { + ArcComputeCommand::EndPipelineStatisticsQuery + } + }) + }) + .collect::<Result<Vec<ArcComputeCommand<A>>, ComputePassError>>()?; + Ok(resolved_commands) + } +} + +/// Equivalent to `ComputeCommand`, but with the Ids resolved into resource Arcs. +#[derive(Clone, Debug)] +pub enum ArcComputeCommand<A: HalApi> { + SetBindGroup { + index: u32, + num_dynamic_offsets: usize, + bind_group: Arc<BindGroup<A>>, + }, + + SetPipeline(Arc<ComputePipeline<A>>), + + /// Set a range of push constants to values stored in `push_constant_data`. + SetPushConstant { + /// The byte offset within the push constant storage to write to.
This + /// must be a multiple of four. + offset: u32, + + /// The number of bytes to write. This must be a multiple of four. + size_bytes: u32, + + /// Index in `push_constant_data` of the start of the data + /// to be written. + /// + /// Note: this is not a byte offset like `offset`. Rather, it is the + /// index of the first `u32` element in `push_constant_data` to read. + values_offset: u32, + }, + + Dispatch([u32; 3]), + + DispatchIndirect { + buffer: Arc<Buffer<A>>, + offset: wgt::BufferAddress, + }, + + PushDebugGroup { + color: u32, + len: usize, + }, + + PopDebugGroup, + + InsertDebugMarker { + color: u32, + len: usize, + }, + + WriteTimestamp { + query_set: Arc<QuerySet<A>>, + query_index: u32, + }, + + BeginPipelineStatisticsQuery { + query_set: Arc<QuerySet<A>>, + query_index: u32, + }, + + EndPipelineStatisticsQuery, +} + +#[cfg(feature = "trace")] +impl<A: HalApi> From<&ArcComputeCommand<A>> for ComputeCommand { + fn from(value: &ArcComputeCommand<A>) -> Self { + use crate::resource::Resource as _; + + match value { + ArcComputeCommand::SetBindGroup { + index, + num_dynamic_offsets, + bind_group, + } => ComputeCommand::SetBindGroup { + index: *index, + num_dynamic_offsets: *num_dynamic_offsets, + bind_group_id: bind_group.as_info().id(), + }, + + ArcComputeCommand::SetPipeline(pipeline) => { + ComputeCommand::SetPipeline(pipeline.as_info().id()) + } + + ArcComputeCommand::SetPushConstant { + offset, + size_bytes, + values_offset, + } => ComputeCommand::SetPushConstant { + offset: *offset, + size_bytes: *size_bytes, + values_offset: *values_offset, + }, + + ArcComputeCommand::Dispatch(dim) => ComputeCommand::Dispatch(*dim), + + ArcComputeCommand::DispatchIndirect { buffer, offset } => { + ComputeCommand::DispatchIndirect { + buffer_id: buffer.as_info().id(), + offset: *offset, + } + } + + ArcComputeCommand::PushDebugGroup { color, len } => ComputeCommand::PushDebugGroup { + color: *color, + len: *len, + }, + + ArcComputeCommand::PopDebugGroup => ComputeCommand::PopDebugGroup, + + ArcComputeCommand::InsertDebugMarker { color, len } => { + ComputeCommand::InsertDebugMarker { + color: *color, + len: *len, + } + } + + ArcComputeCommand::WriteTimestamp { + query_set, + query_index, + } => ComputeCommand::WriteTimestamp { + query_set_id: query_set.as_info().id(), + query_index: *query_index, + }, + + ArcComputeCommand::BeginPipelineStatisticsQuery { + query_set, + query_index, + } => ComputeCommand::BeginPipelineStatisticsQuery { + query_set_id: query_set.as_info().id(), + query_index: *query_index, + }, + + ArcComputeCommand::EndPipelineStatisticsQuery => { + ComputeCommand::EndPipelineStatisticsQuery + } + } + } +} diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index 6e6c1abacf..2a6298d91d 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -1,7 +1,9 @@ +mod allocator; mod bind; mod bundle; mod clear; mod compute; +mod compute_command; mod draw; mod memory_init; mod query; @@ -9,13 +11,14 @@ mod ray_tracing; mod render; mod transfer; -use std::slice; use std::sync::Arc; pub(crate) use self::clear::clear_texture; pub use self::{ - bundle::*, clear::ClearError, compute::*, draw::*, query::*, render::*, transfer::*, + bundle::*, clear::ClearError, compute::*, compute_command::ComputeCommand, draw::*, query::*, + render::*, transfer::*, }; +pub(crate) use allocator::CommandAllocator; use self::memory_init::CommandBufferTextureMemoryActions; @@ -23,6 +26,7 @@ use crate::device::{Device, DeviceError}; use crate::error::{ErrorFormatter, PrettyError}; use crate::hub::Hub; use
crate::id::CommandBufferId; +use crate::lock::{rank, Mutex}; use crate::snatch::SnatchGuard; use crate::init_tracker::BufferInitTrackerAction; @@ -32,7 +36,6 @@ use crate::track::{Tracker, UsageScope}; use crate::{api_log, global::Global, hal_api::HalApi, id, resource_log, Label}; use hal::CommandEncoder as _; -use parking_lot::Mutex; use thiserror::Error; #[cfg(feature = "trace")] @@ -40,23 +43,122 @@ use crate::device::trace::Command as TraceCommand; const PUSH_CONSTANT_CLEAR_ARRAY: &[u32] = &[0_u32; 64]; +/// The current state of a [`CommandBuffer`]. #[derive(Debug)] pub(crate) enum CommandEncoderStatus { + /// Ready to record commands. An encoder's initial state. + /// + /// Command building methods like [`command_encoder_clear_buffer`] and + /// [`command_encoder_run_compute_pass`] require the encoder to be in this + /// state. + /// + /// [`command_encoder_clear_buffer`]: Global::command_encoder_clear_buffer + /// [`command_encoder_run_compute_pass`]: Global::command_encoder_run_compute_pass Recording, + + /// Command recording is complete, and the buffer is ready for submission. + /// + /// [`Global::command_encoder_finish`] transitions a + /// `CommandBuffer` from the `Recording` state into this state. + /// + /// [`Global::queue_submit`] drops command buffers unless they are + /// in this state. Finished, + + /// An error occurred while recording a compute or render pass. + /// + /// When a `CommandEncoder` is left in this state, we have also + /// returned an error result from the function that encountered + /// the problem. Future attempts to use the encoder (that is, + /// calls to [`CommandBuffer::get_encoder`]) will also return + /// errors. + /// + /// Calling [`Global::command_encoder_finish`] in this state + /// discards the command buffer under construction. Error, } +/// A raw [`CommandEncoder`][rce], and the raw [`CommandBuffer`][rcb]s built from it. +/// +/// Each wgpu-core [`CommandBuffer`] owns an instance of this type, which is +/// where the commands are actually stored. +/// +/// This holds a `Vec` of raw [`CommandBuffer`][rcb]s, not just one. We are not +/// always able to record commands in the order in which they must ultimately be +/// submitted to the queue, but raw command buffers don't permit inserting new +/// commands into the middle of a recorded stream. However, hal queue submission +/// accepts a series of command buffers at once, so we can simply break the +/// stream up into multiple buffers, and then reorder the buffers. See +/// [`CommandEncoder::close_and_swap`] for a specific example of this. +/// +/// Note that a [`CommandEncoderId`] actually refers to a [`CommandBuffer`]. +/// Methods that take a command encoder id actually look up the command buffer, +/// and then use its encoder. +/// +/// [rce]: hal::Api::CommandEncoder +/// [rcb]: hal::Api::CommandBuffer +/// [`CommandEncoderId`]: crate::id::CommandEncoderId pub(crate) struct CommandEncoder<A: HalApi> { + /// The underlying `wgpu_hal` [`CommandEncoder`]. + /// + /// Successfully executed command buffers' encoders are saved in a + /// [`CommandAllocator`] for recycling. + /// + /// [`CommandEncoder`]: hal::Api::CommandEncoder + /// [`CommandAllocator`]: crate::command::CommandAllocator raw: A::CommandEncoder, + + /// All the raw command buffers for our owning [`CommandBuffer`], in + /// submission order. + /// + /// These command buffers were all constructed with `raw`.
The + /// [`wgpu_hal::CommandEncoder`] trait forbids these from outliving `raw`, + /// and requires that we provide all of these when we call + /// [`raw.reset_all()`][CE::ra], so the encoder and its buffers travel + /// together. + /// + /// [CE::ra]: hal::CommandEncoder::reset_all + /// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder list: Vec, + + /// True if `raw` is in the "recording" state. + /// + /// See the documentation for [`wgpu_hal::CommandEncoder`] for + /// details on the states `raw` can be in. + /// + /// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder is_open: bool, + label: Option, } //TODO: handle errors better impl CommandEncoder { - /// Closes the live encoder + /// Finish the current command buffer, if any, and place it + /// at the second-to-last position in our list. + /// + /// If we have opened this command encoder, finish its current + /// command buffer, and insert it just before the last element in + /// [`self.list`][l]. If this command buffer is closed, do nothing. + /// + /// On return, the underlying hal encoder is closed. + /// + /// What is this for? + /// + /// The `wgpu_hal` contract requires that each render or compute pass's + /// commands be preceded by calls to [`transition_buffers`] and + /// [`transition_textures`], to put the resources the pass operates on in + /// the appropriate state. Unfortunately, we don't know which transitions + /// are needed until we're done recording the pass itself. Rather than + /// iterating over the pass twice, we note the necessary transitions as we + /// record its commands, finish the raw command buffer for the actual pass, + /// record a new raw command buffer for the transitions, and jam that buffer + /// in just before the pass's. This is the function that jams in the + /// transitions' command buffer. + /// + /// [l]: CommandEncoder::list + /// [`transition_buffers`]: hal::CommandEncoder::transition_buffers + /// [`transition_textures`]: hal::CommandEncoder::transition_textures fn close_and_swap(&mut self) -> Result<(), DeviceError> { if self.is_open { self.is_open = false; @@ -67,6 +169,16 @@ impl CommandEncoder { Ok(()) } + /// Finish the current command buffer, if any, and add it to the + /// end of [`self.list`][l]. + /// + /// If we have opened this command encoder, finish its current + /// command buffer, and push it onto the end of [`self.list`][l]. + /// If this command buffer is closed, do nothing. + /// + /// On return, the underlying hal encoder is closed. + /// + /// [l]: CommandEncoder::list fn close(&mut self) -> Result<(), DeviceError> { if self.is_open { self.is_open = false; @@ -77,6 +189,9 @@ impl CommandEncoder { Ok(()) } + /// Discard the command buffer under construction, if any. + /// + /// The underlying hal encoder is closed, if it was recording. pub(crate) fn discard(&mut self) { if self.is_open { self.is_open = false; @@ -84,6 +199,9 @@ impl CommandEncoder { } } + /// Begin recording a new command buffer, if we haven't already. + /// + /// The underlying hal encoder is put in the "recording" state. pub(crate) fn open(&mut self) -> Result<&mut A::CommandEncoder, DeviceError> { if !self.is_open { self.is_open = true; @@ -94,6 +212,10 @@ impl CommandEncoder { Ok(&mut self.raw) } + /// Begin recording a new command buffer for a render pass, with + /// its own label. + /// + /// The underlying hal encoder is put in the "recording" state. fn open_pass(&mut self, label: Option<&str>) -> Result<(), DeviceError> { self.is_open = true; unsafe { self.raw.begin_encoding(label)? 
}; @@ -115,12 +237,29 @@ pub(crate) struct BakedCommands { pub(crate) struct DestroyedBufferError(pub id::BufferId); pub(crate) struct DestroyedTextureError(pub id::TextureId); +/// The mutable state of a [`CommandBuffer`]. pub struct CommandBufferMutable { + /// The [`wgpu_hal::Api::CommandBuffer`]s we've built so far, and the encoder + /// they belong to. + /// + /// [`wgpu_hal::Api::CommandBuffer`]: hal::Api::CommandBuffer pub(crate) encoder: CommandEncoder, + + /// The current state of this command buffer's encoder. status: CommandEncoderStatus, + + /// All the resources that the commands recorded so far have referred to. pub(crate) trackers: Tracker, + + /// The regions of buffers and textures these commands will read and write. + /// + /// This is used to determine which portions of which + /// buffers/textures we actually need to initialize. If we're + /// definitely going to write to something before we read from it, + /// we don't need to clear its contents. buffer_memory_init_actions: Vec>, texture_memory_actions: CommandBufferTextureMemoryActions, + pub(crate) pending_query_resets: QueryResetMap, blas_actions: Vec, tlas_actions: Vec, @@ -139,11 +278,36 @@ impl CommandBufferMutable { } } +/// A buffer of commands to be submitted to the GPU for execution. +/// +/// Whereas the WebGPU API uses two separate types for command buffers and +/// encoders, this type is a fusion of the two: +/// +/// - During command recording, this holds a [`CommandEncoder`] accepting this +/// buffer's commands. In this state, the [`CommandBuffer`] type behaves like +/// a WebGPU `GPUCommandEncoder`. +/// +/// - Once command recording is finished by calling +/// [`Global::command_encoder_finish`], no further recording is allowed. The +/// internal [`CommandEncoder`] is retained solely as a storage pool for the +/// raw command buffers. In this state, the value behaves like a WebGPU +/// `GPUCommandBuffer`. +/// +/// - Once a command buffer is submitted to the queue, it is removed from the id +/// registry, and its contents are taken to construct a [`BakedCommands`], +/// whose contents eventually become the property of the submission queue. pub struct CommandBuffer { pub(crate) device: Arc>, limits: wgt::Limits, support_clear_texture: bool, pub(crate) info: ResourceInfo>, + + /// The mutable state of this command buffer. + /// + /// This `Option` is populated when the command buffer is first created. + /// When this is submitted, dropped, or destroyed, its contents are + /// extracted into a [`BakedCommands`] by + /// [`CommandBuffer::extract_baked_commands`]. 
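The state machine these new comments lay out — recording is allowed only in the `Recording` state, `finish` seals the buffer for submission, and the `Error` state poisons every later use, with `finish` discarding the work instead — condenses into a standalone sketch. Everything below is illustrative only; `Status` and `ToyCommandBuffer` are invented names, not wgpu-core types:

```rust
#[derive(Debug)]
enum Status {
    Recording,
    Finished,
    Error,
}

struct ToyCommandBuffer {
    status: Status,
    commands: Vec<String>, // stands in for the recorded hal commands
}

impl ToyCommandBuffer {
    fn new() -> Self {
        Self {
            status: Status::Recording,
            commands: Vec::new(),
        }
    }

    /// Like `CommandBuffer::get_encoder`: recording is only allowed while
    /// the encoder is in the `Recording` state.
    fn record(&mut self, cmd: &str) -> Result<(), String> {
        match self.status {
            Status::Recording => {
                self.commands.push(cmd.to_string());
                Ok(())
            }
            // An earlier failure poisons all later recording attempts.
            _ => Err(format!("encoder is {:?}, not Recording", self.status)),
        }
    }

    /// Like `command_encoder_finish`: seal the buffer for submission, or
    /// discard everything recorded so far if a pass failed earlier.
    fn finish(&mut self) -> Result<usize, String> {
        match self.status {
            Status::Recording => {
                self.status = Status::Finished;
                Ok(self.commands.len())
            }
            Status::Error => {
                self.commands.clear(); // drop the buffer under construction
                Err("a recorded pass returned an error".to_string())
            }
            Status::Finished => Err("already finished".to_string()),
        }
    }
}

fn main() {
    let mut buf = ToyCommandBuffer::new();
    buf.record("clear_buffer").unwrap();
    assert_eq!(buf.finish().unwrap(), 1);
    // Further recording is rejected, mirroring queue_submit's requirement
    // that only finished buffers are submitted.
    assert!(buf.record("dispatch").is_err());
}
```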
     pub(crate) data: Mutex<Option<CommandBufferMutable<A>>>,
 }
@@ -183,27 +347,30 @@ impl<A: HalApi> CommandBuffer<A> {
                 None,
             ),
             //Todo come back
-            data: Mutex::new(Some(CommandBufferMutable {
-                encoder: CommandEncoder {
-                    raw: encoder,
-                    is_open: false,
-                    list: Vec::new(),
-                    label,
-                },
-                status: CommandEncoderStatus::Recording,
-                trackers: Tracker::new(),
-                buffer_memory_init_actions: Default::default(),
-                texture_memory_actions: Default::default(),
-                pending_query_resets: QueryResetMap::new(),
-                blas_actions: Default::default(),
-                tlas_actions: Default::default(),
-                #[cfg(feature = "trace")]
-                commands: if enable_tracing {
-                    Some(Vec::new())
-                } else {
-                    None
-                },
-            })),
+            data: Mutex::new(
+                rank::COMMAND_BUFFER_DATA,
+                Some(CommandBufferMutable {
+                    encoder: CommandEncoder {
+                        raw: encoder,
+                        is_open: false,
+                        list: Vec::new(),
+                        label,
+                    },
+                    status: CommandEncoderStatus::Recording,
+                    trackers: Tracker::new(),
+                    buffer_memory_init_actions: Default::default(),
+                    texture_memory_actions: Default::default(),
+                    pending_query_resets: QueryResetMap::new(),
+                    blas_actions: Default::default(),
+                    tlas_actions: Default::default(),
+                    #[cfg(feature = "trace")]
+                    commands: if enable_tracing {
+                        Some(Vec::new())
+                    } else {
+                        None
+                    },
+                }),
+            ),
         }
     }
@@ -257,6 +424,12 @@ impl<A: HalApi> CommandBuffer<A> {
 }
 
 impl<A: HalApi> CommandBuffer<A> {
+    /// Return the [`CommandBuffer`] for `id`, for recording new commands.
+    ///
+    /// In `wgpu_core`, the [`CommandBuffer`] type serves both as encoder and
+    /// buffer, which is why this function takes an [`id::CommandEncoderId`]
+    /// but returns a [`CommandBuffer`]. The returned command buffer must be
+    /// in the "recording" state. Otherwise, an error is returned.
     fn get_encoder(
         hub: &Hub<A>,
         id: id::CommandEncoderId,
@@ -608,16 +781,15 @@ impl BindGroupStateChange {
         }
     }
 
-    unsafe fn set_and_check_redundant(
+    fn set_and_check_redundant(
        &mut self,
        bind_group_id: id::BindGroupId,
        index: u32,
        dynamic_offsets: &mut Vec<wgt::DynamicOffset>,
-        offsets: *const wgt::DynamicOffset,
-        offset_length: usize,
+        offsets: &[wgt::DynamicOffset],
    ) -> bool {
        // For now never deduplicate bind groups with dynamic offsets.
- if offset_length == 0 { + if offsets.is_empty() { // If this get returns None, that means we're well over the limit, // so let the call through to get a proper error if let Some(current_bind_group) = self.last_states.get_mut(index as usize) { @@ -633,8 +805,7 @@ impl BindGroupStateChange { if let Some(current_bind_group) = self.last_states.get_mut(index as usize) { current_bind_group.reset(); } - dynamic_offsets - .extend_from_slice(unsafe { slice::from_raw_parts(offsets, offset_length) }); + dynamic_offsets.extend_from_slice(offsets); } false } diff --git a/wgpu-core/src/command/query.rs b/wgpu-core/src/command/query.rs index 89cba6fbf3..fd3360cc00 100644 --- a/wgpu-core/src/command/query.rs +++ b/wgpu-core/src/command/query.rs @@ -9,7 +9,7 @@ use crate::{ hal_api::HalApi, id::{self, Id}, init_tracker::MemoryInitKind, - resource::QuerySet, + resource::{QuerySet, Resource}, storage::Storage, Epoch, FastHashMap, Index, }; @@ -429,11 +429,20 @@ impl Global { .add_single(&*query_set_guard, query_set_id) .ok_or(QueryError::InvalidQuerySet(query_set_id))?; + if query_set.device.as_info().id() != cmd_buf.device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + let (dst_buffer, dst_pending) = { let buffer_guard = hub.buffers.read(); let dst_buffer = buffer_guard .get(destination) .map_err(|_| QueryError::InvalidBuffer(destination))?; + + if dst_buffer.device.as_info().id() != cmd_buf.device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + tracker .buffers .set_single(dst_buffer, hal::BufferUses::COPY_DST) diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index d128f38cf6..93fb7b5622 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -2476,36 +2476,27 @@ impl Global { } } -pub mod render_ffi { +pub mod render_commands { use super::{ super::{Rect, RenderCommand}, RenderPass, }; - use crate::{id, RawString}; - use std::{convert::TryInto, ffi, num::NonZeroU32, slice}; + use crate::id; + use std::{convert::TryInto, num::NonZeroU32}; use wgt::{BufferAddress, BufferSize, Color, DynamicOffset, IndexFormat}; - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is - /// valid for `offset_length` elements. 
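The theme of this hunk, and of the rest of `render_ffi` below as it becomes the safe `render_commands` module, is replacing each raw pointer-plus-length parameter and its `# Safety` contract with a slice. A minimal sketch of the pattern, with invented names rather than the actual wgpu-core signatures:

```rust
// Before: the callee must trust the caller-supplied pointer and length.
/// # Safety
///
/// `offsets` must be valid for reads of `offset_length` elements.
unsafe fn set_offsets_ffi(dst: &mut Vec<u32>, offsets: *const u32, offset_length: usize) {
    // SAFETY: upheld by the caller, per the contract above.
    let slice = std::slice::from_raw_parts(offsets, offset_length);
    dst.extend_from_slice(slice);
}

// After: the borrow checker enforces validity, so no contract is needed.
fn set_offsets(dst: &mut Vec<u32>, offsets: &[u32]) {
    dst.extend_from_slice(offsets);
}

fn main() {
    let data = [1_u32, 2, 3];
    let (mut a, mut b) = (Vec::new(), Vec::new());
    // Every call site of the FFI form carries a proof obligation...
    unsafe { set_offsets_ffi(&mut a, data.as_ptr(), data.len()) };
    // ...while the safe form cannot be misused from safe code.
    set_offsets(&mut b, &data);
    assert_eq!(a, b);
}
```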
- #[no_mangle] - pub unsafe extern "C" fn wgpu_render_pass_set_bind_group( + pub fn wgpu_render_pass_set_bind_group( pass: &mut RenderPass, index: u32, bind_group_id: id::BindGroupId, - offsets: *const DynamicOffset, - offset_length: usize, + offsets: &[DynamicOffset], ) { - let redundant = unsafe { - pass.current_bind_groups.set_and_check_redundant( - bind_group_id, - index, - &mut pass.base.dynamic_offsets, - offsets, - offset_length, - ) - }; + let redundant = pass.current_bind_groups.set_and_check_redundant( + bind_group_id, + index, + &mut pass.base.dynamic_offsets, + offsets, + ); if redundant { return; @@ -2513,16 +2504,12 @@ pub mod render_ffi { pass.base.commands.push(RenderCommand::SetBindGroup { index, - num_dynamic_offsets: offset_length, + num_dynamic_offsets: offsets.len(), bind_group_id, }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_pipeline( - pass: &mut RenderPass, - pipeline_id: id::RenderPipelineId, - ) { + pub fn wgpu_render_pass_set_pipeline(pass: &mut RenderPass, pipeline_id: id::RenderPipelineId) { if pass.current_pipeline.set_and_check_redundant(pipeline_id) { return; } @@ -2532,8 +2519,7 @@ pub mod render_ffi { .push(RenderCommand::SetPipeline(pipeline_id)); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_vertex_buffer( + pub fn wgpu_render_pass_set_vertex_buffer( pass: &mut RenderPass, slot: u32, buffer_id: id::BufferId, @@ -2548,8 +2534,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_index_buffer( + pub fn wgpu_render_pass_set_index_buffer( pass: &mut RenderPass, buffer: id::BufferId, index_format: IndexFormat, @@ -2559,22 +2544,19 @@ pub mod render_ffi { pass.set_index_buffer(buffer, index_format, offset, size); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_blend_constant(pass: &mut RenderPass, color: &Color) { + pub fn wgpu_render_pass_set_blend_constant(pass: &mut RenderPass, color: &Color) { pass.base .commands .push(RenderCommand::SetBlendConstant(*color)); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_stencil_reference(pass: &mut RenderPass, value: u32) { + pub fn wgpu_render_pass_set_stencil_reference(pass: &mut RenderPass, value: u32) { pass.base .commands .push(RenderCommand::SetStencilReference(value)); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_viewport( + pub fn wgpu_render_pass_set_viewport( pass: &mut RenderPass, x: f32, y: f32, @@ -2590,8 +2572,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_set_scissor_rect( + pub fn wgpu_render_pass_set_scissor_rect( pass: &mut RenderPass, x: u32, y: u32, @@ -2603,17 +2584,11 @@ pub mod render_ffi { .push(RenderCommand::SetScissor(Rect { x, y, w, h })); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is - /// valid for `size_bytes` bytes. - #[no_mangle] - pub unsafe extern "C" fn wgpu_render_pass_set_push_constants( + pub fn wgpu_render_pass_set_push_constants( pass: &mut RenderPass, stages: wgt::ShaderStages, offset: u32, - size_bytes: u32, - data: *const u8, + data: &[u8], ) { assert_eq!( offset & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), @@ -2621,31 +2596,28 @@ pub mod render_ffi { "Push constant offset must be aligned to 4 bytes." ); assert_eq!( - size_bytes & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), + data.len() as u32 & (wgt::PUSH_CONSTANT_ALIGNMENT - 1), 0, "Push constant size must be aligned to 4 bytes." 
); - let data_slice = unsafe { slice::from_raw_parts(data, size_bytes as usize) }; let value_offset = pass.base.push_constant_data.len().try_into().expect( "Ran out of push constant space. Don't set 4gb of push constants per RenderPass.", ); pass.base.push_constant_data.extend( - data_slice - .chunks_exact(wgt::PUSH_CONSTANT_ALIGNMENT as usize) + data.chunks_exact(wgt::PUSH_CONSTANT_ALIGNMENT as usize) .map(|arr| u32::from_ne_bytes([arr[0], arr[1], arr[2], arr[3]])), ); pass.base.commands.push(RenderCommand::SetPushConstant { stages, offset, - size_bytes, + size_bytes: data.len() as u32, values_offset: Some(value_offset), }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_draw( + pub fn wgpu_render_pass_draw( pass: &mut RenderPass, vertex_count: u32, instance_count: u32, @@ -2660,8 +2632,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_draw_indexed( + pub fn wgpu_render_pass_draw_indexed( pass: &mut RenderPass, index_count: u32, instance_count: u32, @@ -2678,8 +2649,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_draw_indirect( + pub fn wgpu_render_pass_draw_indirect( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2692,8 +2662,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_draw_indexed_indirect( + pub fn wgpu_render_pass_draw_indexed_indirect( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2706,8 +2675,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_multi_draw_indirect( + pub fn wgpu_render_pass_multi_draw_indirect( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2721,8 +2689,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_multi_draw_indexed_indirect( + pub fn wgpu_render_pass_multi_draw_indexed_indirect( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2736,8 +2703,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_multi_draw_indirect_count( + pub fn wgpu_render_pass_multi_draw_indirect_count( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2757,8 +2723,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_multi_draw_indexed_indirect_count( + pub fn wgpu_render_pass_multi_draw_indexed_indirect_count( pass: &mut RenderPass, buffer_id: id::BufferId, offset: BufferAddress, @@ -2778,17 +2743,8 @@ pub mod render_ffi { }); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given `label` - /// is a valid null-terminated string. - #[no_mangle] - pub unsafe extern "C" fn wgpu_render_pass_push_debug_group( - pass: &mut RenderPass, - label: RawString, - color: u32, - ) { - let bytes = unsafe { ffi::CStr::from_ptr(label) }.to_bytes(); + pub fn wgpu_render_pass_push_debug_group(pass: &mut RenderPass, label: &str, color: u32) { + let bytes = label.as_bytes(); pass.base.string_data.extend_from_slice(bytes); pass.base.commands.push(RenderCommand::PushDebugGroup { @@ -2797,22 +2753,12 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_pop_debug_group(pass: &mut RenderPass) { + pub fn wgpu_render_pass_pop_debug_group(pass: &mut RenderPass) { pass.base.commands.push(RenderCommand::PopDebugGroup); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given `label` - /// is a valid null-terminated string. 
- #[no_mangle] - pub unsafe extern "C" fn wgpu_render_pass_insert_debug_marker( - pass: &mut RenderPass, - label: RawString, - color: u32, - ) { - let bytes = unsafe { ffi::CStr::from_ptr(label) }.to_bytes(); + pub fn wgpu_render_pass_insert_debug_marker(pass: &mut RenderPass, label: &str, color: u32) { + let bytes = label.as_bytes(); pass.base.string_data.extend_from_slice(bytes); pass.base.commands.push(RenderCommand::InsertDebugMarker { @@ -2821,8 +2767,7 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_write_timestamp( + pub fn wgpu_render_pass_write_timestamp( pass: &mut RenderPass, query_set_id: id::QuerySetId, query_index: u32, @@ -2833,23 +2778,17 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_begin_occlusion_query( - pass: &mut RenderPass, - query_index: u32, - ) { + pub fn wgpu_render_pass_begin_occlusion_query(pass: &mut RenderPass, query_index: u32) { pass.base .commands .push(RenderCommand::BeginOcclusionQuery { query_index }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_end_occlusion_query(pass: &mut RenderPass) { + pub fn wgpu_render_pass_end_occlusion_query(pass: &mut RenderPass) { pass.base.commands.push(RenderCommand::EndOcclusionQuery); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_begin_pipeline_statistics_query( + pub fn wgpu_render_pass_begin_pipeline_statistics_query( pass: &mut RenderPass, query_set_id: id::QuerySetId, query_index: u32, @@ -2862,26 +2801,17 @@ pub mod render_ffi { }); } - #[no_mangle] - pub extern "C" fn wgpu_render_pass_end_pipeline_statistics_query(pass: &mut RenderPass) { + pub fn wgpu_render_pass_end_pipeline_statistics_query(pass: &mut RenderPass) { pass.base .commands .push(RenderCommand::EndPipelineStatisticsQuery); } - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is - /// valid for `render_bundle_ids_length` elements. 
- #[no_mangle] - pub unsafe extern "C" fn wgpu_render_pass_execute_bundles( + pub fn wgpu_render_pass_execute_bundles( pass: &mut RenderPass, - render_bundle_ids: *const id::RenderBundleId, - render_bundle_ids_length: usize, + render_bundle_ids: &[id::RenderBundleId], ) { - for &bundle_id in - unsafe { slice::from_raw_parts(render_bundle_ids, render_bundle_ids_length) } - { + for &bundle_id in render_bundle_ids { pass.base .commands .push(RenderCommand::ExecuteBundle(bundle_id)); diff --git a/wgpu-core/src/command/transfer.rs b/wgpu-core/src/command/transfer.rs index 8e98a4c9b9..84bc88e723 100644 --- a/wgpu-core/src/command/transfer.rs +++ b/wgpu-core/src/command/transfer.rs @@ -607,6 +607,11 @@ impl Global { let src_buffer = buffer_guard .get(source) .map_err(|_| TransferError::InvalidBuffer(source))?; + + if src_buffer.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + cmd_buf_data .trackers .buffers @@ -628,6 +633,11 @@ impl Global { let dst_buffer = buffer_guard .get(destination) .map_err(|_| TransferError::InvalidBuffer(destination))?; + + if dst_buffer.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + cmd_buf_data .trackers .buffers @@ -777,6 +787,10 @@ impl Global { .get(destination.texture) .map_err(|_| TransferError::InvalidTexture(destination.texture))?; + if dst_texture.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + let (hal_copy_size, array_layer_count) = validate_texture_copy_range( destination, &dst_texture.desc, @@ -807,6 +821,11 @@ impl Global { let src_buffer = buffer_guard .get(source.buffer) .map_err(|_| TransferError::InvalidBuffer(source.buffer))?; + + if src_buffer.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + tracker .buffers .set_single(src_buffer, hal::BufferUses::COPY_SRC) @@ -938,6 +957,10 @@ impl Global { .get(source.texture) .map_err(|_| TransferError::InvalidTexture(source.texture))?; + if src_texture.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + let (hal_copy_size, array_layer_count) = validate_texture_copy_range(source, &src_texture.desc, CopySide::Source, copy_size)?; @@ -989,6 +1012,11 @@ impl Global { let dst_buffer = buffer_guard .get(destination.buffer) .map_err(|_| TransferError::InvalidBuffer(destination.buffer))?; + + if dst_buffer.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + tracker .buffers .set_single(dst_buffer, hal::BufferUses::COPY_DST) @@ -1117,6 +1145,13 @@ impl Global { .get(destination.texture) .map_err(|_| TransferError::InvalidTexture(source.texture))?; + if src_texture.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + if dst_texture.device.as_info().id() != device.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + // src and dst texture format must be copy-compatible // https://gpuweb.github.io/gpuweb/#copy-compatible if src_texture.desc.format.remove_srgb_suffix() diff --git a/wgpu-core/src/device/any_device.rs b/wgpu-core/src/device/any_device.rs index 693155a753..9e459c1a94 100644 --- a/wgpu-core/src/device/any_device.rs +++ b/wgpu-core/src/device/any_device.rs @@ -34,7 +34,7 @@ impl AnyDevice { unsafe fn drop_glue(ptr: *mut ()) { // Drop the arc this instance is holding. 
unsafe { - _ = Arc::from_raw(ptr.cast::()); + _ = Arc::from_raw(ptr.cast::()); } } diff --git a/wgpu-core/src/device/bgl.rs b/wgpu-core/src/device/bgl.rs index d606f049a3..911ac8a435 100644 --- a/wgpu-core/src/device/bgl.rs +++ b/wgpu-core/src/device/bgl.rs @@ -58,7 +58,7 @@ impl EntryMap { assert!(self.sorted); } - /// Create a new [`BindGroupLayoutEntryMap`] from a slice of [`wgt::BindGroupLayoutEntry`]s. + /// Create a new [`EntryMap`] from a slice of [`wgt::BindGroupLayoutEntry`]s. /// /// Errors if there are duplicate bindings or if any binding index is greater than /// the device's limits. diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs index 9c54dfc193..9f78878cc7 100644 --- a/wgpu-core/src/device/global.rs +++ b/wgpu-core/src/device/global.rs @@ -11,6 +11,7 @@ use crate::{ id::{self, AdapterId, DeviceId, QueueId, SurfaceId}, init_tracker::TextureInitTracker, instance::{self, Adapter, Surface}, + lock::{rank, RwLock}, pipeline, present, resource::{self, BufferAccessResult}, resource::{BufferAccessError, BufferMapOperation, CreateBufferError, Resource}, @@ -20,7 +21,6 @@ use crate::{ use arrayvec::ArrayVec; use hal::Device as _; -use parking_lot::RwLock; use wgt::{BufferAddress, TextureFormat}; @@ -643,8 +643,10 @@ impl Global { texture.hal_usage |= hal::TextureUses::COPY_DST; } - texture.initialization_status = - RwLock::new(TextureInitTracker::new(desc.mip_level_count, 0)); + texture.initialization_status = RwLock::new( + rank::TEXTURE_INITIALIZATION_STATUS, + TextureInitTracker::new(desc.mip_level_count, 0), + ); let (id, resource) = fid.assign(Arc::new(texture)); api_log!("Device::create_texture({desc:?}) -> {id:?}"); @@ -1351,9 +1353,6 @@ impl Global { }; let encoder = match device .command_allocator - .lock() - .as_mut() - .unwrap() .acquire_encoder(device.raw(), queue.raw.as_ref().unwrap()) { Ok(raw) => raw, @@ -1972,7 +1971,7 @@ impl Global { }; let caps = unsafe { - let suf = A::get_surface(surface); + let suf = A::surface_as_hal(surface); let adapter = &device.adapter; match adapter.raw.adapter.surface_capabilities(suf.unwrap()) { Some(caps) => caps, @@ -2058,7 +2057,7 @@ impl Global { // https://github.com/gfx-rs/wgpu/issues/4105 match unsafe { - A::get_surface(surface) + A::surface_as_hal(surface) .unwrap() .configure(device.raw(), &hal_config) } { diff --git a/wgpu-core/src/device/life.rs b/wgpu-core/src/device/life.rs index 187e6099b9..24f41e7ed9 100644 --- a/wgpu-core/src/device/life.rs +++ b/wgpu-core/src/device/life.rs @@ -7,6 +7,7 @@ use crate::{ }, hal_api::HalApi, id, + lock::Mutex, pipeline::{ComputePipeline, RenderPipeline}, resource::{ self, Buffer, DestroyedBuffer, DestroyedTexture, QuerySet, Resource, Sampler, @@ -24,7 +25,6 @@ use std::sync::Arc; use thiserror::Error; /// A struct that keeps lists of resources that are no longer needed by the user. -#[derive(Default)] pub(crate) struct ResourceMaps { pub buffers: FastHashMap>>, pub staging_buffers: FastHashMap>>, @@ -141,7 +141,37 @@ impl ResourceMaps { } } -/// Resources used by a queue submission, and work to be done once it completes. +/// A command submitted to the GPU for execution. +/// +/// ## Keeping resources alive while the GPU is using them +/// +/// [`wgpu_hal`] requires that, when a command is submitted to a queue, all the +/// resources it uses must remain alive until it has finished executing. +/// +/// The natural way to satisfy this would be for `ActiveSubmission` to hold +/// strong references to all the resources used by its commands. 
However, that +/// would entail dropping those strong references every time a queue submission +/// finishes, adjusting the reference counts of all the resources it used. This +/// is usually needless work: it's rare for the active submission queue to be +/// the final reference to an object. Usually the user is still holding on to +/// it. +/// +/// To avoid this, an `ActiveSubmission` does not initially hold any strong +/// references to its commands' resources. Instead, each resource tracks the +/// most recent submission index at which it has been used in +/// [`ResourceInfo::submission_index`]. When the user drops a resource, if the +/// submission in which it was last used is still present in the device's queue, +/// we add the resource to [`ActiveSubmission::last_resources`]. Finally, when +/// this `ActiveSubmission` is dequeued and dropped in +/// [`LifetimeTracker::triage_submissions`], we drop `last_resources` along with +/// it. Thus, unless a resource is dropped by the user, it doesn't need to be +/// touched at all when processing completed work. +/// +/// However, it's not clear that this is effective. See [#5560]. +/// +/// [`wgpu_hal`]: hal +/// [`ResourceInfo::submission_index`]: crate::resource::ResourceInfo +/// [#5560]: https://github.com/gfx-rs/wgpu/issues/5560 struct ActiveSubmission { /// The index of the submission we track. /// @@ -163,6 +193,18 @@ struct ActiveSubmission { /// Buffers to be mapped once this submission has completed. mapped: Vec>>, + /// Command buffers used by this submission, and the encoder that owns them. + /// + /// [`wgpu_hal::Queue::submit`] requires the submitted command buffers to + /// remain alive until the submission has completed execution. Command + /// encoders double as allocation pools for command buffers, so holding them + /// here and cleaning them up in [`LifetimeTracker::triage_submissions`] + /// satisfies that requirement. + /// + /// Once this submission has completed, the command buffers are reset and + /// the command encoder is recycled. + /// + /// [`wgpu_hal::Queue::submit`]: hal::Queue::submit encoders: Vec>, /// List of queue "on_submitted_work_done" closures to be called once this @@ -353,28 +395,25 @@ impl LifetimeTracker { /// /// Assume that all submissions up through `last_done` have completed. /// - /// - Buffers used by those submissions are now ready to map, if - /// requested. Add any buffers in the submission's [`mapped`] list to - /// [`self.ready_to_map`], where [`LifetimeTracker::handle_mapping`] will find - /// them. + /// - Buffers used by those submissions are now ready to map, if requested. + /// Add any buffers in the submission's [`mapped`] list to + /// [`self.ready_to_map`], where [`LifetimeTracker::handle_mapping`] + /// will find them. /// /// - Resources whose final use was in those submissions are now ready to - /// free. Add any resources in the submission's [`last_resources`] table - /// to [`self.free_resources`], where [`LifetimeTracker::cleanup`] will find - /// them. + /// free. Dropping the submission's [`last_resources`] table does so. /// /// Return a list of [`SubmittedWorkDoneClosure`]s to run. 
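A toy model of the deferred-release scheme this comment describes may help; `Resource`, `Submission`, and `Tracker` below are invented stand-ins, not the wgpu-core types. A strong reference is parked on a submission only when the user drops a resource that submission still needs, and triage then releases everything in bulk:

```rust
use std::sync::Arc;

type SubmissionIndex = u64;

struct Resource {
    last_used: SubmissionIndex, // like ResourceInfo::submission_index
}

struct Submission {
    index: SubmissionIndex,
    last_resources: Vec<Arc<Resource>>, // strong refs parked by user drops
}

struct Tracker {
    active: Vec<Submission>, // submissions still executing, oldest first
}

impl Tracker {
    /// Called when the user drops a handle: keep the resource alive only if
    /// the submission that last used it is still in flight.
    fn user_dropped(&mut self, res: Arc<Resource>) {
        if let Some(sub) = self.active.iter_mut().find(|s| s.index == res.last_used) {
            sub.last_resources.push(res);
        }
        // Otherwise the GPU is done with it and the Arc drops right here.
    }

    /// Like `triage_submissions`: everything up to `last_done` has completed,
    /// so dropping each finished submission drops its parked resources too.
    fn triage(&mut self, last_done: SubmissionIndex) {
        self.active.retain(|s| s.index > last_done);
    }
}

fn main() {
    let res = Arc::new(Resource { last_used: 7 });
    let mut tracker = Tracker {
        active: vec![Submission {
            index: 7,
            last_resources: Vec::new(),
        }],
    };
    tracker.user_dropped(Arc::clone(&res));
    drop(res); // the parked reference keeps the resource alive...
    assert_eq!(Arc::strong_count(&tracker.active[0].last_resources[0]), 1);
    tracker.triage(7); // ...until the submission is retired.
    assert!(tracker.active.is_empty());
}
```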
     ///
     /// [`mapped`]: ActiveSubmission::mapped
     /// [`self.ready_to_map`]: LifetimeTracker::ready_to_map
     /// [`last_resources`]: ActiveSubmission::last_resources
-    /// [`self.free_resources`]: LifetimeTracker::free_resources
     /// [`SubmittedWorkDoneClosure`]: crate::device::queue::SubmittedWorkDoneClosure
     #[must_use]
     pub fn triage_submissions(
         &mut self,
         last_done: SubmissionIndex,
-        command_allocator: &mut super::CommandAllocator<A>,
+        command_allocator: &crate::command::CommandAllocator<A>,
     ) -> SmallVec<[SubmittedWorkDoneClosure; 1]> {
         profiling::scope!("triage_submissions");
@@ -751,13 +790,10 @@ impl<A: HalApi> LifetimeTracker<A> {
     /// Identify resources to free, according to `trackers` and `self.suspected_resources`.
     ///
-    /// Given `trackers`, the [`Tracker`] belonging to same [`Device`] as
-    /// `self`, and `hub`, the [`Hub`] to which that `Device` belongs:
-    ///
-    /// Remove from `trackers` each resource mentioned in
-    /// [`self.suspected_resources`]. If `trackers` held the final reference to
-    /// that resource, add it to the appropriate free list, to be destroyed by
-    /// the hal:
+    /// Remove from `trackers`, the [`Tracker`] belonging to the same [`Device`]
+    /// as `self`, each resource mentioned in [`self.suspected_resources`]. If
+    /// `trackers` held the final reference to that resource, add it to the
+    /// appropriate free list, to be destroyed by the hal:
     ///
     /// - Add resources used by queue submissions still in flight to the
     ///   [`last_resources`] table of the last such submission's entry in
@@ -859,29 +895,33 @@ impl<A: HalApi> LifetimeTracker<A> {
                     *buffer.map_state.lock() = resource::BufferMapState::Idle;
                     log::trace!("Buffer ready to map {tracker_index:?} is not tracked anymore");
                 } else {
-                    let mapping = match std::mem::replace(
+                    // This _cannot_ be inlined into the match. If it is, the lock will be held
+                    // open through the whole match, resulting in a deadlock when we try to re-lock
+                    // the buffer back to active.
+                    let mapping = std::mem::replace(
                         &mut *buffer.map_state.lock(),
                         resource::BufferMapState::Idle,
-                    ) {
+                    );
+                    let pending_mapping = match mapping {
                         resource::BufferMapState::Waiting(pending_mapping) => pending_mapping,
                         // Mapping cancelled
                         resource::BufferMapState::Idle => continue,
                         // Mapping queued at least twice by map -> unmap -> map
                         // and was already successfully mapped below
-                        active @ resource::BufferMapState::Active { ..
} => { + *buffer.map_state.lock() = mapping; continue; } _ => panic!("No pending mapping."), }; - let status = if mapping.range.start != mapping.range.end { + let status = if pending_mapping.range.start != pending_mapping.range.end { log::debug!("Buffer {tracker_index:?} map state -> Active"); - let host = mapping.op.host; - let size = mapping.range.end - mapping.range.start; + let host = pending_mapping.op.host; + let size = pending_mapping.range.end - pending_mapping.range.start; match super::map_buffer( raw, &buffer, - mapping.range.start, + pending_mapping.range.start, size, host, snatch_guard, @@ -889,7 +929,8 @@ impl LifetimeTracker { Ok(ptr) => { *buffer.map_state.lock() = resource::BufferMapState::Active { ptr, - range: mapping.range.start..mapping.range.start + size, + range: pending_mapping.range.start + ..pending_mapping.range.start + size, host, }; Ok(()) @@ -902,12 +943,12 @@ impl LifetimeTracker { } else { *buffer.map_state.lock() = resource::BufferMapState::Active { ptr: std::ptr::NonNull::dangling(), - range: mapping.range, - host: mapping.op.host, + range: pending_mapping.range, + host: pending_mapping.op.host, }; Ok(()) }; - pending_callbacks.push((mapping.op, status)); + pending_callbacks.push((pending_mapping.op, status)); } } pending_callbacks diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index 944e10bf12..3120537544 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -4,7 +4,6 @@ use crate::{ hub::Hub, id::{BindGroupLayoutId, PipelineLayoutId}, resource::{Buffer, BufferAccessError, BufferAccessResult, BufferMapOperation}, - resource_log, snatch::SnatchGuard, Label, DOWNLEVEL_ERROR_MESSAGE, }; @@ -378,42 +377,6 @@ fn map_buffer( Ok(mapping.ptr) } -pub(crate) struct CommandAllocator { - free_encoders: Vec, -} - -impl CommandAllocator { - fn acquire_encoder( - &mut self, - device: &A::Device, - queue: &A::Queue, - ) -> Result { - match self.free_encoders.pop() { - Some(encoder) => Ok(encoder), - None => unsafe { - let hal_desc = hal::CommandEncoderDescriptor { label: None, queue }; - device.create_command_encoder(&hal_desc) - }, - } - } - - fn release_encoder(&mut self, encoder: A::CommandEncoder) { - self.free_encoders.push(encoder); - } - - fn dispose(self, device: &A::Device) { - resource_log!( - "CommandAllocator::dispose encoders {}", - self.free_encoders.len() - ); - for cmd_encoder in self.free_encoders { - unsafe { - device.destroy_command_encoder(cmd_encoder); - } - } - } -} - #[derive(Clone, Debug, Error)] #[error("Device is invalid")] pub struct InvalidDevice; diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 8a9c52fa4f..a43414af1a 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -4,7 +4,7 @@ use crate::{ api_log, command::{ extract_texture_selector, validate_linear_texture_data, validate_texture_copy_range, - ClearError, CommandBuffer, CopySide, ImageCopyTexture, TransferError, + ClearError, CommandAllocator, CommandBuffer, CopySide, ImageCopyTexture, TransferError, }, conv, device::{life::ResourceMaps, DeviceError, WaitIdleError}, @@ -14,6 +14,7 @@ use crate::{ hal_label, id::{self, DeviceId, QueueId}, init_tracker::{has_copy_partial_init_tracker_coverage, TextureInitRange}, + lock::{rank, Mutex}, resource::{ Buffer, BufferAccessError, BufferMapState, DestroyedBuffer, DestroyedTexture, Resource, ResourceInfo, ResourceType, StagingBuffer, Texture, TextureInner, @@ -22,7 +23,6 @@ use crate::{ }; use hal::{CommandEncoder as _, Device as _, Queue as 
_}; -use parking_lot::Mutex; use smallvec::SmallVec; use crate::resource::{Blas, Tlas}; @@ -155,13 +155,21 @@ pub enum TempResource { Blas(Arc>), } -/// A queue execution for a particular command encoder. +/// A series of raw [`CommandBuffer`]s that have been submitted to a +/// queue, and the [`wgpu_hal::CommandEncoder`] that built them. +/// +/// [`CommandBuffer`]: hal::Api::CommandBuffer +/// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder pub(crate) struct EncoderInFlight { raw: A::CommandEncoder, cmd_buffers: Vec, } impl EncoderInFlight { + /// Free all of our command buffers. + /// + /// Return the command encoder, fully reset and ready to be + /// reused. pub(crate) unsafe fn land(mut self) -> A::CommandEncoder { unsafe { self.raw.reset_all(self.cmd_buffers.into_iter()) }; self.raw @@ -195,6 +203,8 @@ pub(crate) struct PendingWrites { /// True if `command_encoder` is in the "recording" state, as /// described in the docs for the [`wgpu_hal::CommandEncoder`] /// trait. + /// + /// [`wgpu_hal::CommandEncoder`]: hal::CommandEncoder pub is_recording: bool, pub temp_resources: Vec>, @@ -256,7 +266,7 @@ impl PendingWrites { #[must_use] fn post_submit( &mut self, - command_allocator: &mut super::CommandAllocator, + command_allocator: &CommandAllocator, device: &A::Device, queue: &A::Queue, ) -> Option> { @@ -310,7 +320,7 @@ fn prepare_staging_buffer( let mapping = unsafe { device.raw().map_buffer(&buffer, 0..size) }?; let staging_buffer = StagingBuffer { - raw: Mutex::new(Some(buffer)), + raw: Mutex::new(rank::STAGING_BUFFER_RAW, Some(buffer)), device: device.clone(), size, info: ResourceInfo::new( @@ -1558,7 +1568,7 @@ impl Global { profiling::scope!("cleanup"); if let Some(pending_execution) = pending_writes.post_submit( - device.command_allocator.lock().as_mut().unwrap(), + &device.command_allocator, device.raw(), queue.raw.as_ref().unwrap(), ) { diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index c735da6282..58947ef196 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -7,18 +7,20 @@ use crate::{ bgl, life::{LifetimeTracker, WaitIdleError}, queue::PendingWrites, - AttachmentData, CommandAllocator, DeviceLostInvocation, MissingDownlevelFlags, - MissingFeatures, RenderPassContext, CLEANUP_WAIT_MS, + AttachmentData, DeviceLostInvocation, MissingDownlevelFlags, MissingFeatures, + RenderPassContext, CLEANUP_WAIT_MS, }, hal_api::HalApi, hal_label, hub::Hub, + id, init_tracker::{ BufferInitTracker, BufferInitTrackerAction, MemoryInitKind, TextureInitRange, TextureInitTracker, TextureInitTrackerAction, }, instance::Adapter, - pipeline, + lock::{rank, Mutex, MutexGuard, RwLock}, + pipeline::{self}, pool::ResourcePool, registry::Registry, resource::{ @@ -41,7 +43,6 @@ use crate::{ use arrayvec::ArrayVec; use hal::{CommandEncoder as _, Device as _}; use once_cell::sync::OnceCell; -use parking_lot::{Mutex, MutexGuard, RwLock}; use smallvec::SmallVec; use thiserror::Error; @@ -97,7 +98,7 @@ pub struct Device { pub(crate) zero_buffer: Option, pub(crate) info: ResourceInfo>, - pub(crate) command_allocator: Mutex>>, + pub(crate) command_allocator: command::CommandAllocator, //Note: The submission index here corresponds to the last submission that is done. 
pub(crate) active_submission_index: AtomicU64, //SubmissionIndex, // NOTE: if both are needed, the `snatchable_lock` must be consistently acquired before the @@ -138,10 +139,10 @@ pub struct Device { pub(crate) instance_flags: wgt::InstanceFlags, pub(crate) pending_writes: Mutex>>, pub(crate) deferred_destroy: Mutex>>, - pub(crate) last_acceleration_structure_build_command_index: AtomicU64, #[cfg(feature = "trace")] pub(crate) trace: Mutex>, pub(crate) usage_scopes: UsageScopePool, + pub(crate) last_acceleration_structure_build_command_index: AtomicU64, } pub(crate) enum DeferredDestroy { @@ -166,7 +167,7 @@ impl Drop for Device { let raw = self.raw.take().unwrap(); let pending_writes = self.pending_writes.lock().take().unwrap(); pending_writes.dispose(&raw); - self.command_allocator.lock().take().unwrap().dispose(&raw); + self.command_allocator.dispose(&raw); unsafe { raw.destroy_buffer(self.zero_buffer.take().unwrap()); raw.destroy_fence(self.fence.write().take().unwrap()); @@ -224,10 +225,8 @@ impl Device { let fence = unsafe { raw_device.create_fence() }.map_err(|_| CreateDeviceError::OutOfMemory)?; - let mut com_alloc = CommandAllocator { - free_encoders: Vec::new(), - }; - let pending_encoder = com_alloc + let command_allocator = command::CommandAllocator::new(); + let pending_encoder = command_allocator .acquire_encoder(&raw_device, raw_queue) .map_err(|_| CreateDeviceError::OutOfMemory)?; let mut pending_writes = queue::PendingWrites::::new(pending_encoder); @@ -272,39 +271,45 @@ impl Device { queue_to_drop: OnceCell::new(), zero_buffer: Some(zero_buffer), info: ResourceInfo::new("", None), - command_allocator: Mutex::new(Some(com_alloc)), + command_allocator, active_submission_index: AtomicU64::new(0), - fence: RwLock::new(Some(fence)), - snatchable_lock: unsafe { SnatchLock::new() }, + fence: RwLock::new(rank::DEVICE_FENCE, Some(fence)), + snatchable_lock: unsafe { SnatchLock::new(rank::DEVICE_SNATCHABLE_LOCK) }, valid: AtomicBool::new(true), - trackers: Mutex::new(Tracker::new()), + trackers: Mutex::new(rank::DEVICE_TRACKERS, Tracker::new()), tracker_indices: TrackerIndexAllocators::new(), - life_tracker: Mutex::new(life::LifetimeTracker::new()), - temp_suspected: Mutex::new(Some(life::ResourceMaps::new())), + life_tracker: Mutex::new(rank::DEVICE_LIFE_TRACKER, life::LifetimeTracker::new()), + temp_suspected: Mutex::new( + rank::DEVICE_TEMP_SUSPECTED, + Some(life::ResourceMaps::new()), + ), bgl_pool: ResourcePool::new(), #[cfg(feature = "trace")] - trace: Mutex::new(trace_path.and_then(|path| match trace::Trace::new(path) { - Ok(mut trace) => { - trace.add(trace::Action::Init { - desc: desc.clone(), - backend: A::VARIANT, - }); - Some(trace) - } - Err(e) => { - log::error!("Unable to start a trace in '{path:?}': {e}"); - None - } - })), + trace: Mutex::new( + rank::DEVICE_TRACE, + trace_path.and_then(|path| match trace::Trace::new(path) { + Ok(mut trace) => { + trace.add(trace::Action::Init { + desc: desc.clone(), + backend: A::VARIANT, + }); + Some(trace) + } + Err(e) => { + log::error!("Unable to start a trace in '{path:?}': {e}"); + None + } + }), + ), alignments, limits: desc.required_limits.clone(), features: desc.required_features, downlevel, instance_flags, - pending_writes: Mutex::new(Some(pending_writes)), - deferred_destroy: Mutex::new(Vec::new()), + pending_writes: Mutex::new(rank::DEVICE_PENDING_WRITES, Some(pending_writes)), + deferred_destroy: Mutex::new(rank::DEVICE_DEFERRED_DESTROY, Vec::new()), + usage_scopes: Mutex::new(rank::DEVICE_USAGE_SCOPES, 
Default::default()), last_acceleration_structure_build_command_index: AtomicU64::new(0), - usage_scopes: Default::default(), }) } @@ -427,10 +432,8 @@ impl Device { }; let mut life_tracker = self.lock_life(); - let submission_closures = life_tracker.triage_submissions( - last_done_index, - self.command_allocator.lock().as_mut().unwrap(), - ); + let submission_closures = + life_tracker.triage_submissions(last_done_index, &self.command_allocator); { // Normally, `temp_suspected` exists only to save heap @@ -669,14 +672,17 @@ impl Device { device: self.clone(), usage: desc.usage, size: desc.size, - initialization_status: RwLock::new(BufferInitTracker::new(aligned_size)), - sync_mapped_writes: Mutex::new(None), - map_state: Mutex::new(resource::BufferMapState::Idle), + initialization_status: RwLock::new( + rank::BUFFER_INITIALIZATION_STATUS, + BufferInitTracker::new(aligned_size), + ), + sync_mapped_writes: Mutex::new(rank::BUFFER_SYNC_MAPPED_WRITES, None), + map_state: Mutex::new(rank::BUFFER_MAP_STATE, resource::BufferMapState::Idle), info: ResourceInfo::new( desc.label.borrow_or_default(), Some(self.tracker_indices.buffers.clone()), ), - bind_groups: Mutex::new(Vec::new()), + bind_groups: Mutex::new(rank::BUFFER_BIND_GROUPS, Vec::new()), }) } @@ -696,10 +702,10 @@ impl Device { desc: desc.map_label(|_| ()), hal_usage, format_features, - initialization_status: RwLock::new(TextureInitTracker::new( - desc.mip_level_count, - desc.array_layer_count(), - )), + initialization_status: RwLock::new( + rank::TEXTURE_INITIALIZATION_STATUS, + TextureInitTracker::new(desc.mip_level_count, desc.array_layer_count()), + ), full_range: TextureSelector { mips: 0..desc.mip_level_count, layers: 0..desc.array_layer_count(), @@ -708,9 +714,9 @@ impl Device { desc.label.borrow_or_default(), Some(self.tracker_indices.textures.clone()), ), - clear_mode: RwLock::new(clear_mode), - views: Mutex::new(Vec::new()), - bind_groups: Mutex::new(Vec::new()), + clear_mode: RwLock::new(rank::TEXTURE_CLEAR_MODE, clear_mode), + views: Mutex::new(rank::TEXTURE_VIEWS, Vec::new()), + bind_groups: Mutex::new(rank::TEXTURE_BIND_GROUPS, Vec::new()), } } @@ -726,14 +732,17 @@ impl Device { device: self.clone(), usage: desc.usage, size: desc.size, - initialization_status: RwLock::new(BufferInitTracker::new(0)), - sync_mapped_writes: Mutex::new(None), - map_state: Mutex::new(resource::BufferMapState::Idle), + initialization_status: RwLock::new( + rank::BUFFER_INITIALIZATION_STATUS, + BufferInitTracker::new(0), + ), + sync_mapped_writes: Mutex::new(rank::BUFFER_SYNC_MAPPED_WRITES, None), + map_state: Mutex::new(rank::BUFFER_MAP_STATE, resource::BufferMapState::Idle), info: ResourceInfo::new( desc.label.borrow_or_default(), Some(self.tracker_indices.buffers.clone()), ), - bind_groups: Mutex::new(Vec::new()), + bind_groups: Mutex::new(rank::BUFFER_BIND_GROUPS, Vec::new()), } } @@ -1437,7 +1446,7 @@ impl Device { pipeline::ShaderModuleSource::Wgsl(code) => { profiling::scope!("naga::front::wgsl::parse_str"); let module = naga::front::wgsl::parse_str(&code).map_err(|inner| { - pipeline::CreateShaderModuleError::Parsing(pipeline::ShaderError { + pipeline::CreateShaderModuleError::Parsing(naga::error::ShaderError { source: code.to_string(), label: desc.label.as_ref().map(|l| l.to_string()), inner: Box::new(inner), @@ -1450,7 +1459,7 @@ impl Device { let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options); profiling::scope!("naga::front::spv::Frontend"); let module = parser.parse().map_err(|inner| { - 
pipeline::CreateShaderModuleError::ParsingSpirV(pipeline::ShaderError { + pipeline::CreateShaderModuleError::ParsingSpirV(naga::error::ShaderError { source: String::new(), label: desc.label.as_ref().map(|l| l.to_string()), inner: Box::new(inner), @@ -1463,7 +1472,7 @@ impl Device { let mut parser = naga::front::glsl::Frontend::default(); profiling::scope!("naga::front::glsl::Frontend.parse"); let module = parser.parse(&options, &code).map_err(|inner| { - pipeline::CreateShaderModuleError::ParsingGlsl(pipeline::ShaderError { + pipeline::CreateShaderModuleError::ParsingGlsl(naga::error::ShaderError { source: code.to_string(), label: desc.label.as_ref().map(|l| l.to_string()), inner: Box::new(inner), @@ -1487,9 +1496,78 @@ impl Device { }; } - use naga::valid::Capabilities as Caps; profiling::scope!("naga::validate"); + let debug_source = + if self.instance_flags.contains(wgt::InstanceFlags::DEBUG) && !source.is_empty() { + Some(hal::DebugSource { + file_name: Cow::Owned( + desc.label + .as_ref() + .map_or("shader".to_string(), |l| l.to_string()), + ), + source_code: Cow::Owned(source.clone()), + }) + } else { + None + }; + + let info = self + .create_validator(naga::valid::ValidationFlags::all()) + .validate(&module) + .map_err(|inner| { + pipeline::CreateShaderModuleError::Validation(naga::error::ShaderError { + source, + label: desc.label.as_ref().map(|l| l.to_string()), + inner: Box::new(inner), + }) + })?; + + let interface = + validation::Interface::new(&module, &info, self.limits.clone(), self.features); + let hal_shader = hal::ShaderInput::Naga(hal::NagaShader { + module, + info, + debug_source, + }); + let hal_desc = hal::ShaderModuleDescriptor { + label: desc.label.to_hal(self.instance_flags), + runtime_checks: desc.shader_bound_checks.runtime_checks(), + }; + let raw = match unsafe { + self.raw + .as_ref() + .unwrap() + .create_shader_module(&hal_desc, hal_shader) + } { + Ok(raw) => raw, + Err(error) => { + return Err(match error { + hal::ShaderError::Device(error) => { + pipeline::CreateShaderModuleError::Device(error.into()) + } + hal::ShaderError::Compilation(ref msg) => { + log::error!("Shader error: {}", msg); + pipeline::CreateShaderModuleError::Generation + } + }) + } + }; + + Ok(pipeline::ShaderModule { + raw: Some(raw), + device: self.clone(), + interface: Some(interface), + info: ResourceInfo::new(desc.label.borrow_or_default(), None), + label: desc.label.borrow_or_default().to_string(), + }) + } + /// Create a validator with the given validation flags. 
+ pub fn create_validator( + self: &Arc, + flags: naga::valid::ValidationFlags, + ) -> naga::valid::Validator { + use naga::valid::Capabilities as Caps; let mut caps = Caps::empty(); caps.set( Caps::PUSH_CONSTANT, @@ -1561,69 +1639,36 @@ impl Device { .flags .contains(wgt::DownlevelFlags::CUBE_ARRAY_TEXTURES), ); + caps.set( + Caps::SUBGROUP, + self.features + .intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX), + ); + caps.set( + Caps::SUBGROUP_BARRIER, + self.features.intersects(wgt::Features::SUBGROUP_BARRIER), + ); - let debug_source = - if self.instance_flags.contains(wgt::InstanceFlags::DEBUG) && !source.is_empty() { - Some(hal::DebugSource { - file_name: Cow::Owned( - desc.label - .as_ref() - .map_or("shader".to_string(), |l| l.to_string()), - ), - source_code: Cow::Owned(source.clone()), - }) - } else { - None - }; - - let info = naga::valid::Validator::new(naga::valid::ValidationFlags::all(), caps) - .validate(&module) - .map_err(|inner| { - pipeline::CreateShaderModuleError::Validation(pipeline::ShaderError { - source, - label: desc.label.as_ref().map(|l| l.to_string()), - inner: Box::new(inner), - }) - })?; + let mut subgroup_stages = naga::valid::ShaderStages::empty(); + subgroup_stages.set( + naga::valid::ShaderStages::COMPUTE | naga::valid::ShaderStages::FRAGMENT, + self.features.contains(wgt::Features::SUBGROUP), + ); + subgroup_stages.set( + naga::valid::ShaderStages::VERTEX, + self.features.contains(wgt::Features::SUBGROUP_VERTEX), + ); - let interface = - validation::Interface::new(&module, &info, self.limits.clone(), self.features); - let hal_shader = hal::ShaderInput::Naga(hal::NagaShader { - module, - info, - debug_source, - }); - let hal_desc = hal::ShaderModuleDescriptor { - label: desc.label.to_hal(self.instance_flags), - runtime_checks: desc.shader_bound_checks.runtime_checks(), - }; - let raw = match unsafe { - self.raw - .as_ref() - .unwrap() - .create_shader_module(&hal_desc, hal_shader) - } { - Ok(raw) => raw, - Err(error) => { - return Err(match error { - hal::ShaderError::Device(error) => { - pipeline::CreateShaderModuleError::Device(error.into()) - } - hal::ShaderError::Compilation(ref msg) => { - log::error!("Shader error: {}", msg); - pipeline::CreateShaderModuleError::Generation - } - }) - } + let subgroup_operations = if caps.contains(Caps::SUBGROUP) { + use naga::valid::SubgroupOperationSet as S; + S::BASIC | S::VOTE | S::ARITHMETIC | S::BALLOT | S::SHUFFLE | S::SHUFFLE_RELATIVE + } else { + naga::valid::SubgroupOperationSet::empty() }; - - Ok(pipeline::ShaderModule { - raw: Some(raw), - device: self.clone(), - interface: Some(interface), - info: ResourceInfo::new(desc.label.borrow_or_default(), None), - label: desc.label.borrow_or_default().to_string(), - }) + let mut validator = naga::valid::Validator::new(flags, caps); + validator.subgroup_stages(subgroup_stages); + validator.subgroup_operations(subgroup_operations); + validator } #[allow(unused_unsafe)] @@ -1933,6 +1978,7 @@ impl Device { used: &mut BindGroupStates, storage: &'a Storage>, limits: &wgt::Limits, + device_id: id::Id, snatch_guard: &'a SnatchGuard<'a>, ) -> Result, binding_model::CreateBindGroupError> { use crate::binding_model::CreateBindGroupError as Error; @@ -1951,6 +1997,7 @@ impl Device { }) } }; + let (pub_usage, internal_use, range_limit) = match binding_ty { wgt::BufferBindingType::Uniform => ( wgt::BufferUsages::UNIFORM, @@ -1983,6 +2030,10 @@ impl Device { .add_single(storage, bb.buffer_id, internal_use) .ok_or(Error::InvalidBuffer(bb.buffer_id))?; + if 
buffer.device.as_info().id() != device_id { + return Err(DeviceError::WrongDevice.into()); + } + check_buffer_usage(bb.buffer_id, buffer.usage, pub_usage)?; let raw_buffer = buffer .raw @@ -2061,13 +2112,53 @@ impl Device { }) } - pub(crate) fn create_texture_binding( - view: &TextureView, - internal_use: hal::TextureUses, - pub_usage: wgt::TextureUsages, + fn create_sampler_binding<'a>( + used: &BindGroupStates, + storage: &'a Storage>, + id: id::Id, + device_id: id::Id, + ) -> Result<&'a Sampler, binding_model::CreateBindGroupError> { + use crate::binding_model::CreateBindGroupError as Error; + + let sampler = used + .samplers + .add_single(storage, id) + .ok_or(Error::InvalidSampler(id))?; + + if sampler.device.as_info().id() != device_id { + return Err(DeviceError::WrongDevice.into()); + } + + Ok(sampler) + } + + pub(crate) fn create_texture_binding<'a>( + self: &Arc, + binding: u32, + decl: &wgt::BindGroupLayoutEntry, + storage: &'a Storage>, + id: id::Id, used: &mut BindGroupStates, used_texture_ranges: &mut Vec>, - ) -> Result<(), binding_model::CreateBindGroupError> { + snatch_guard: &'a SnatchGuard<'a>, + ) -> Result, binding_model::CreateBindGroupError> { + use crate::binding_model::CreateBindGroupError as Error; + + let view = used + .views + .add_single(storage, id) + .ok_or(Error::InvalidTextureView(id))?; + + if view.device.as_info().id() != self.as_info().id() { + return Err(DeviceError::WrongDevice.into()); + } + + let (pub_usage, internal_use) = self.texture_use_parameters( + binding, + decl, + view, + "SampledTexture, ReadonlyStorageTexture or WriteonlyStorageTexture", + )?; let texture = &view.parent; let texture_id = texture.as_info().id(); // Careful here: the texture may no longer have its own ref count, @@ -2097,7 +2188,12 @@ impl Device { kind: MemoryInitKind::NeedsInitializedMemory, }); - Ok(()) + Ok(hal::TextureBinding { + view: view + .raw(snatch_guard) + .ok_or(Error::InvalidTextureView(id))?, + usage: internal_use, + }) } // This function expects the provided bind group layout to be resolved @@ -2161,6 +2257,7 @@ impl Device { &mut used, &*buffer_guard, &self.limits, + self.as_info().id(), &snatch_guard, )?; @@ -2184,105 +2281,86 @@ impl Device { &mut used, &*buffer_guard, &self.limits, + self.as_info().id(), &snatch_guard, )?; hal_buffers.push(bb); } (res_index, num_bindings) } - Br::Sampler(id) => { - match decl.ty { - wgt::BindingType::Sampler(ty) => { - let sampler = used - .samplers - .add_single(&*sampler_guard, id) - .ok_or(Error::InvalidSampler(id))?; - - if sampler.device.as_info().id() != self.as_info().id() { - return Err(DeviceError::WrongDevice.into()); - } - - // Allowed sampler values for filtering and comparison - let (allowed_filtering, allowed_comparison) = match ty { - wgt::SamplerBindingType::Filtering => (None, false), - wgt::SamplerBindingType::NonFiltering => (Some(false), false), - wgt::SamplerBindingType::Comparison => (None, true), - }; - - if let Some(allowed_filtering) = allowed_filtering { - if allowed_filtering != sampler.filtering { - return Err(Error::WrongSamplerFiltering { - binding, - layout_flt: allowed_filtering, - sampler_flt: sampler.filtering, - }); - } - } + Br::Sampler(id) => match decl.ty { + wgt::BindingType::Sampler(ty) => { + let sampler = Self::create_sampler_binding( + &used, + &sampler_guard, + id, + self.as_info().id(), + )?; - if allowed_comparison != sampler.comparison { - return Err(Error::WrongSamplerComparison { + let (allowed_filtering, allowed_comparison) = match ty { + wgt::SamplerBindingType::Filtering 
=> (None, false), + wgt::SamplerBindingType::NonFiltering => (Some(false), false), + wgt::SamplerBindingType::Comparison => (None, true), + }; + if let Some(allowed_filtering) = allowed_filtering { + if allowed_filtering != sampler.filtering { + return Err(Error::WrongSamplerFiltering { binding, - layout_cmp: allowed_comparison, - sampler_cmp: sampler.comparison, + layout_flt: allowed_filtering, + sampler_flt: sampler.filtering, }); } - - let res_index = hal_samplers.len(); - hal_samplers.push(sampler.raw()); - (res_index, 1) } - _ => { - return Err(Error::WrongBindingType { + if allowed_comparison != sampler.comparison { + return Err(Error::WrongSamplerComparison { binding, - actual: decl.ty, - expected: "Sampler", - }) + layout_cmp: allowed_comparison, + sampler_cmp: sampler.comparison, + }); } + + let res_index = hal_samplers.len(); + hal_samplers.push(sampler.raw()); + (res_index, 1) } - } + _ => { + return Err(Error::WrongBindingType { + binding, + actual: decl.ty, + expected: "Sampler", + }) + } + }, Br::SamplerArray(ref bindings_array) => { let num_bindings = bindings_array.len(); Self::check_array_binding(self.features, decl.count, num_bindings)?; let res_index = hal_samplers.len(); for &id in bindings_array.iter() { - let sampler = used - .samplers - .add_single(&*sampler_guard, id) - .ok_or(Error::InvalidSampler(id))?; - if sampler.device.as_info().id() != self.as_info().id() { - return Err(DeviceError::WrongDevice.into()); - } + let sampler = Self::create_sampler_binding( + &used, + &sampler_guard, + id, + self.as_info().id(), + )?; + hal_samplers.push(sampler.raw()); } (res_index, num_bindings) } Br::TextureView(id) => { - let view = used - .views - .add_single(&*texture_view_guard, id) - .ok_or(Error::InvalidTextureView(id))?; - let (pub_usage, internal_use) = self.texture_use_parameters( + let tb = self.create_texture_binding( binding, decl, - view, - "SampledTexture, ReadonlyStorageTexture or WriteonlyStorageTexture", - )?; - Self::create_texture_binding( - view, - internal_use, - pub_usage, + &texture_view_guard, + id, &mut used, &mut used_texture_ranges, + &snatch_guard, )?; let res_index = hal_textures.len(); - hal_textures.push(hal::TextureBinding { - view: view - .raw(&snatch_guard) - .ok_or(Error::InvalidTextureView(id))?, - usage: internal_use, - }); + hal_textures.push(tb); (res_index, 1) } Br::TextureViewArray(ref bindings_array) => { @@ -2291,26 +2369,17 @@ impl Device { let res_index = hal_textures.len(); for &id in bindings_array.iter() { - let view = used - .views - .add_single(&*texture_view_guard, id) - .ok_or(Error::InvalidTextureView(id))?; - let (pub_usage, internal_use) = - self.texture_use_parameters(binding, decl, view, - "SampledTextureArray, ReadonlyStorageTextureArray or WriteonlyStorageTextureArray")?; - Self::create_texture_binding( - view, - internal_use, - pub_usage, + let tb = self.create_texture_binding( + binding, + decl, + &texture_view_guard, + id, &mut used, &mut used_texture_ranges, + &snatch_guard, )?; - hal_textures.push(hal::TextureBinding { - view: view - .raw(&snatch_guard) - .ok_or(Error::InvalidTextureView(id))?, - usage: internal_use, - }); + + hal_textures.push(tb); } (res_index, num_bindings) @@ -2799,6 +2868,7 @@ impl Device { module: shader_module.raw(), entry_point: final_entry_point_name.as_ref(), constants: desc.stage.constants.as_ref(), + zero_initialize_workgroup_memory: desc.stage.zero_initialize_workgroup_memory, }, }; @@ -3214,6 +3284,7 @@ impl Device { module: vertex_shader_module.raw(), entry_point: 
&vertex_entry_point_name, constants: stage_desc.constants.as_ref(), + zero_initialize_workgroup_memory: stage_desc.zero_initialize_workgroup_memory, } }; @@ -3274,6 +3345,9 @@ impl Device { module: shader_module.raw(), entry_point: &fragment_entry_point_name, constants: fragment_state.stage.constants.as_ref(), + zero_initialize_workgroup_memory: fragment_state + .stage + .zero_initialize_workgroup_memory, }) } None => None, @@ -3519,10 +3593,9 @@ impl Device { .map_err(DeviceError::from)? }; drop(guard); - let closures = self.lock_life().triage_submissions( - submission_index, - self.command_allocator.lock().as_mut().unwrap(), - ); + let closures = self + .lock_life() + .triage_submissions(submission_index, &self.command_allocator); assert!( closures.is_empty(), "wait_for_submit is not expected to work with closures" @@ -3650,10 +3723,7 @@ impl Device { log::error!("failed to wait for the device: {error}"); } let mut life_tracker = self.lock_life(); - let _ = life_tracker.triage_submissions( - current_index, - self.command_allocator.lock().as_mut().unwrap(), - ); + let _ = life_tracker.triage_submissions(current_index, &self.command_allocator); if let Some(device_lost_closure) = life_tracker.device_lost_closure.take() { // It's important to not hold the lock while calling the closure. drop(life_tracker); diff --git a/wgpu-core/src/hal_api.rs b/wgpu-core/src/hal_api.rs index 179024baed..f1a40b1cff 100644 --- a/wgpu-core/src/hal_api.rs +++ b/wgpu-core/src/hal_api.rs @@ -11,7 +11,7 @@ pub trait HalApi: hal::Api + 'static + WasmNotSendSync { fn create_instance_from_hal(name: &str, hal_instance: Self::Instance) -> Instance; fn instance_as_hal(instance: &Instance) -> Option<&Self::Instance>; fn hub(global: &Global) -> &Hub; - fn get_surface(surface: &Surface) -> Option<&Self::Surface>; + fn surface_as_hal(surface: &Surface) -> Option<&Self::Surface>; } impl HalApi for hal::api::Empty { @@ -25,7 +25,7 @@ impl HalApi for hal::api::Empty { fn hub(_: &Global) -> &Hub { unimplemented!("called empty api") } - fn get_surface(_: &Surface) -> Option<&Self::Surface> { + fn surface_as_hal(_: &Surface) -> Option<&Self::Surface> { unimplemented!("called empty api") } } @@ -46,8 +46,8 @@ impl HalApi for hal::api::Vulkan { fn hub(global: &Global) -> &Hub { &global.hubs.vulkan } - fn get_surface(surface: &Surface) -> Option<&Self::Surface> { - surface.raw.downcast_ref::() + fn surface_as_hal(surface: &Surface) -> Option<&Self::Surface> { + surface.vulkan.as_ref() } } @@ -67,8 +67,8 @@ impl HalApi for hal::api::Metal { fn hub(global: &Global) -> &Hub { &global.hubs.metal } - fn get_surface(surface: &Surface) -> Option<&Self::Surface> { - surface.raw.downcast_ref::() + fn surface_as_hal(surface: &Surface) -> Option<&Self::Surface> { + surface.metal.as_ref() } } @@ -88,8 +88,8 @@ impl HalApi for hal::api::Dx12 { fn hub(global: &Global) -> &Hub { &global.hubs.dx12 } - fn get_surface(surface: &Surface) -> Option<&Self::Surface> { - surface.raw.downcast_ref::() + fn surface_as_hal(surface: &Surface) -> Option<&Self::Surface> { + surface.dx12.as_ref() } } @@ -110,7 +110,7 @@ impl HalApi for hal::api::Gles { fn hub(global: &Global) -> &Hub { &global.hubs.gl } - fn get_surface(surface: &Surface) -> Option<&Self::Surface> { - surface.raw.downcast_ref::() + fn surface_as_hal(surface: &Surface) -> Option<&Self::Surface> { + surface.gl.as_ref() } } diff --git a/wgpu-core/src/hub.rs b/wgpu-core/src/hub.rs index 794420cd15..4abba1c495 100644 --- a/wgpu-core/src/hub.rs +++ b/wgpu-core/src/hub.rs @@ -245,7 +245,7 @@ impl Hub { 
if let Element::Occupied(ref surface, _epoch) = *element { if let Some(ref mut present) = surface.presentation.lock().take() { if let Some(device) = present.device.downcast_ref::() { - let suf = A::get_surface(surface); + let suf = A::surface_as_hal(surface); unsafe { suf.unwrap().unconfigure(device.raw()); //TODO: we could destroy the surface here diff --git a/wgpu-core/src/identity.rs b/wgpu-core/src/identity.rs index d76d29341a..c89731f7af 100644 --- a/wgpu-core/src/identity.rs +++ b/wgpu-core/src/identity.rs @@ -1,8 +1,8 @@ -use parking_lot::Mutex; use wgt::Backend; use crate::{ id::{Id, Marker}, + lock::{rank, Mutex}, Epoch, Index, }; use std::{fmt::Debug, marker::PhantomData}; @@ -16,31 +16,26 @@ enum IdSource { /// A simple structure to allocate [`Id`] identifiers. /// -/// Calling [`alloc`] returns a fresh, never-before-seen id. Calling [`free`] +/// Calling [`alloc`] returns a fresh, never-before-seen id. Calling [`release`] /// marks an id as dead; it will never be returned again by `alloc`. /// -/// Use `IdentityManager::default` to construct new instances. +/// `IdentityValues` returns `Id`s whose index values are suitable for use as +/// indices into a `Vec` that holds those ids' referents: /// -/// `IdentityManager` returns `Id`s whose index values are suitable for use as -/// indices into a `Storage` that holds those ids' referents: +/// - Every live id has a distinct index value. Every live id's index +/// selects a distinct element in the vector. /// -/// - Every live id has a distinct index value. Each live id's index selects a -/// distinct element in the vector. -/// -/// - `IdentityManager` prefers low index numbers. If you size your vector to +/// - `IdentityValues` prefers low index numbers. If you size your vector to /// accommodate the indices produced here, the vector's length will reflect /// the highwater mark of actual occupancy. /// -/// - `IdentityManager` reuses the index values of freed ids before returning +/// - `IdentityValues` reuses the index values of freed ids before returning /// ids with new index values. Freed vector entries get reused. /// -/// See the module-level documentation for an overview of how this -/// fits together. 
-/// /// [`Id`]: crate::id::Id /// [`Backend`]: wgt::Backend; -/// [`alloc`]: IdentityManager::alloc -/// [`free`]: IdentityManager::free +/// [`alloc`]: IdentityValues::alloc +/// [`release`]: IdentityValues::release #[derive(Debug)] pub(super) struct IdentityValues { free: Vec<(Index, Epoch)>, @@ -122,12 +117,15 @@ impl IdentityManager { impl IdentityManager { pub fn new() -> Self { Self { - values: Mutex::new(IdentityValues { - free: Vec::new(), - next_index: 0, - count: 0, - id_source: IdSource::None, - }), + values: Mutex::new( + rank::IDENTITY_MANAGER_VALUES, + IdentityValues { + free: Vec::new(), + next_index: 0, + count: 0, + id_source: IdSource::None, + }, + ), _phantom: PhantomData, } } diff --git a/wgpu-core/src/instance.rs b/wgpu-core/src/instance.rs index 20e67d5f71..f0a3890c1e 100644 --- a/wgpu-core/src/instance.rs +++ b/wgpu-core/src/instance.rs @@ -1,19 +1,19 @@ +use std::collections::HashMap; use std::sync::Arc; use crate::{ - any_surface::AnySurface, api_log, device::{queue::Queue, resource::Device, DeviceDescriptor}, global::Global, hal_api::HalApi, id::markers, id::{AdapterId, DeviceId, Id, Marker, QueueId, SurfaceId}, + lock::{rank, Mutex}, present::Presentation, resource::{Resource, ResourceInfo, ResourceType}, resource_log, LabelHelpers, DOWNLEVEL_WARNING_MESSAGE, }; -use parking_lot::Mutex; use wgt::{Backend, Backends, PowerPreference}; use hal::{Adapter as _, Instance as _, OpenDevice}; @@ -21,6 +21,7 @@ use thiserror::Error; pub type RequestAdapterOptions = wgt::RequestAdapterOptions; type HalInstance = ::Instance; +type HalSurface = ::Surface; #[derive(Clone, Debug, Error)] #[error("Limit '{name}' value {requested} is better than allowed {allowed}")] @@ -113,31 +114,36 @@ impl Instance { } pub(crate) fn destroy_surface(&self, surface: Surface) { - fn destroy(instance: &Option, surface: AnySurface) { - unsafe { - if let Some(suf) = surface.take::() { - instance.as_ref().unwrap().destroy_surface(suf); + fn destroy(instance: &Option, mut surface: Option>) { + if let Some(surface) = surface.take() { + unsafe { + instance.as_ref().unwrap().destroy_surface(surface); } } } - match surface.raw.backend() { - #[cfg(vulkan)] - Backend::Vulkan => destroy::(&self.vulkan, surface.raw), - #[cfg(metal)] - Backend::Metal => destroy::(&self.metal, surface.raw), - #[cfg(dx12)] - Backend::Dx12 => destroy::(&self.dx12, surface.raw), - #[cfg(gles)] - Backend::Gl => destroy::(&self.gl, surface.raw), - _ => unreachable!(), - } + #[cfg(vulkan)] + destroy::(&self.vulkan, surface.vulkan); + #[cfg(metal)] + destroy::(&self.metal, surface.metal); + #[cfg(dx12)] + destroy::(&self.dx12, surface.dx12); + #[cfg(gles)] + destroy::(&self.gl, surface.gl); } } pub struct Surface { pub(crate) presentation: Mutex>, pub(crate) info: ResourceInfo, - pub(crate) raw: AnySurface, + + #[cfg(vulkan)] + pub vulkan: Option>, + #[cfg(metal)] + pub metal: Option>, + #[cfg(dx12)] + pub dx12: Option>, + #[cfg(gles)] + pub gl: Option>, } impl Resource for Surface { @@ -163,7 +169,7 @@ impl Surface { &self, adapter: &Adapter, ) -> Result { - let suf = A::get_surface(self).ok_or(GetSurfaceSupportError::Unsupported)?; + let suf = A::surface_as_hal(self).ok_or(GetSurfaceSupportError::Unsupported)?; profiling::scope!("surface_capabilities"); let caps = unsafe { adapter @@ -203,7 +209,7 @@ impl Adapter { } pub fn is_surface_supported(&self, surface: &Surface) -> bool { - let suf = A::get_surface(surface); + let suf = A::surface_as_hal(surface); // If get_surface returns None, then the API does not advertise support for 
the surface. // @@ -461,13 +467,25 @@ pub enum RequestAdapterError { #[derive(Clone, Debug, Error)] #[non_exhaustive] pub enum CreateSurfaceError { - #[error("No backend is available")] - NoSupportedBackend, - #[error(transparent)] - InstanceError(#[from] hal::InstanceError), + #[error("The backend {0} was not enabled on the instance.")] + BackendNotEnabled(Backend), + #[error("Failed to create surface for any enabled backend: {0:?}")] + FailedToCreateSurfaceForAnyBackend(HashMap), } impl Global { + /// Creates a new surface targeting the given display/window handles. + /// + /// Internally attempts to create hal surfaces for all enabled backends. + /// + /// Fails only if creation for surfaces for all enabled backends fails in which case + /// the error for each enabled backend is listed. + /// Vice versa, if creation for any backend succeeds, success is returned. + /// Surface creation errors are logged to the debug log in any case. + /// + /// id_in: + /// - If `Some`, the id to assign to the surface. A new one will be generated otherwise. + /// /// # Safety /// /// - `display_handle` must be a valid object to create a surface upon. @@ -483,51 +501,86 @@ impl Global { profiling::scope!("Instance::create_surface"); fn init( + errors: &mut HashMap, + any_created: &mut bool, + backend: Backend, inst: &Option, display_handle: raw_window_handle::RawDisplayHandle, window_handle: raw_window_handle::RawWindowHandle, - ) -> Option> { - inst.as_ref().map(|inst| unsafe { - match inst.create_surface(display_handle, window_handle) { - Ok(raw) => Ok(AnySurface::new::(raw)), - Err(e) => Err(e), + ) -> Option> { + inst.as_ref().and_then(|inst| { + match unsafe { inst.create_surface(display_handle, window_handle) } { + Ok(raw) => { + *any_created = true; + Some(raw) + } + Err(err) => { + log::debug!( + "Instance::create_surface: failed to create surface for {:?}: {:?}", + backend, + err + ); + errors.insert(backend, err); + None + } } }) } - let mut hal_surface: Option> = None; - - #[cfg(vulkan)] - if hal_surface.is_none() { - hal_surface = - init::(&self.instance.vulkan, display_handle, window_handle); - } - #[cfg(metal)] - if hal_surface.is_none() { - hal_surface = - init::(&self.instance.metal, display_handle, window_handle); - } - #[cfg(dx12)] - if hal_surface.is_none() { - hal_surface = - init::(&self.instance.dx12, display_handle, window_handle); - } - #[cfg(gles)] - if hal_surface.is_none() { - hal_surface = init::(&self.instance.gl, display_handle, window_handle); - } - - let hal_surface = hal_surface.ok_or(CreateSurfaceError::NoSupportedBackend)??; + let mut errors = HashMap::default(); + let mut any_created = false; let surface = Surface { - presentation: Mutex::new(None), + presentation: Mutex::new(rank::SURFACE_PRESENTATION, None), info: ResourceInfo::new("", None), - raw: hal_surface, + + #[cfg(vulkan)] + vulkan: init::( + &mut errors, + &mut any_created, + Backend::Vulkan, + &self.instance.vulkan, + display_handle, + window_handle, + ), + #[cfg(metal)] + metal: init::( + &mut errors, + &mut any_created, + Backend::Metal, + &self.instance.metal, + display_handle, + window_handle, + ), + #[cfg(dx12)] + dx12: init::( + &mut errors, + &mut any_created, + Backend::Dx12, + &self.instance.dx12, + display_handle, + window_handle, + ), + #[cfg(gles)] + gl: init::( + &mut errors, + &mut any_created, + Backend::Gl, + &self.instance.gl, + display_handle, + window_handle, + ), }; - #[allow(clippy::arc_with_non_send_sync)] - let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); - Ok(id) 
+ if any_created { + #[allow(clippy::arc_with_non_send_sync)] + let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); + Ok(id) + } else { + Err(CreateSurfaceError::FailedToCreateSurfaceForAnyBackend( + errors, + )) + } } /// # Safety @@ -538,58 +591,72 @@ impl Global { &self, layer: *mut std::ffi::c_void, id_in: Option, - ) -> SurfaceId { + ) -> Result { profiling::scope!("Instance::create_surface_metal"); let surface = Surface { - presentation: Mutex::new(None), + presentation: Mutex::new(rank::SURFACE_PRESENTATION, None), info: ResourceInfo::new("", None), - raw: { - let hal_surface = self - .instance - .metal - .as_ref() - .map(|inst| { - // we don't want to link to metal-rs for this - #[allow(clippy::transmute_ptr_to_ref)] - inst.create_surface_from_layer(unsafe { std::mem::transmute(layer) }) - }) - .unwrap(); - AnySurface::new::(hal_surface) - }, + metal: Some(self.instance.metal.as_ref().map_or( + Err(CreateSurfaceError::BackendNotEnabled(Backend::Metal)), + |inst| { + // we don't want to link to metal-rs for this + #[allow(clippy::transmute_ptr_to_ref)] + Ok(inst.create_surface_from_layer(unsafe { std::mem::transmute(layer) })) + }, + )?), + #[cfg(dx12)] + dx12: None, + #[cfg(vulkan)] + vulkan: None, + #[cfg(gles)] + gl: None, }; let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); - id + Ok(id) } #[cfg(dx12)] - /// # Safety - /// - /// The visual must be valid and able to be used to make a swapchain with. - pub unsafe fn instance_create_surface_from_visual( + fn instance_create_surface_dx12( &self, - visual: *mut std::ffi::c_void, id_in: Option, - ) -> SurfaceId { - profiling::scope!("Instance::instance_create_surface_from_visual"); - + create_surface_func: impl FnOnce(&HalInstance) -> HalSurface, + ) -> Result { let surface = Surface { - presentation: Mutex::new(None), + presentation: Mutex::new(rank::SURFACE_PRESENTATION, None), info: ResourceInfo::new("", None), - raw: { - let hal_surface = self - .instance + dx12: Some(create_surface_func( + self.instance .dx12 .as_ref() - .map(|inst| unsafe { inst.create_surface_from_visual(visual as _) }) - .unwrap(); - AnySurface::new::(hal_surface) - }, + .ok_or(CreateSurfaceError::BackendNotEnabled(Backend::Dx12))?, + )), + #[cfg(metal)] + metal: None, + #[cfg(vulkan)] + vulkan: None, + #[cfg(gles)] + gl: None, }; let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); - id + Ok(id) + } + + #[cfg(dx12)] + /// # Safety + /// + /// The visual must be valid and able to be used to make a swapchain with. 
+ pub unsafe fn instance_create_surface_from_visual( + &self, + visual: *mut std::ffi::c_void, + id_in: Option, + ) -> Result { + profiling::scope!("Instance::instance_create_surface_from_visual"); + self.instance_create_surface_dx12(id_in, |inst| unsafe { + inst.create_surface_from_visual(visual as _) + }) } #[cfg(dx12)] @@ -600,25 +667,11 @@ impl Global { &self, surface_handle: *mut std::ffi::c_void, id_in: Option, - ) -> SurfaceId { + ) -> Result { profiling::scope!("Instance::instance_create_surface_from_surface_handle"); - - let surface = Surface { - presentation: Mutex::new(None), - info: ResourceInfo::new("", None), - raw: { - let hal_surface = self - .instance - .dx12 - .as_ref() - .map(|inst| unsafe { inst.create_surface_from_surface_handle(surface_handle) }) - .unwrap(); - AnySurface::new::(hal_surface) - }, - }; - - let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); - id + self.instance_create_surface_dx12(id_in, |inst| unsafe { + inst.create_surface_from_surface_handle(surface_handle) + }) } #[cfg(dx12)] @@ -629,27 +682,11 @@ impl Global { &self, swap_chain_panel: *mut std::ffi::c_void, id_in: Option, - ) -> SurfaceId { + ) -> Result { profiling::scope!("Instance::instance_create_surface_from_swap_chain_panel"); - - let surface = Surface { - presentation: Mutex::new(None), - info: ResourceInfo::new("", None), - raw: { - let hal_surface = self - .instance - .dx12 - .as_ref() - .map(|inst| unsafe { - inst.create_surface_from_swap_chain_panel(swap_chain_panel as _) - }) - .unwrap(); - AnySurface::new::(hal_surface) - }, - }; - - let (id, _) = self.surfaces.prepare(id_in).assign(Arc::new(surface)); - id + self.instance_create_surface_dx12(id_in, |inst| unsafe { + inst.create_surface_from_swap_chain_panel(swap_chain_panel as _) + }) } pub fn surface_drop(&self, id: SurfaceId) { @@ -657,11 +694,15 @@ impl Global { api_log!("Surface::drop {id:?}"); - fn unconfigure(global: &Global, surface: &AnySurface, present: &Presentation) { - let hub = HalApi::hub(global); - if let Some(hal_surface) = surface.downcast_ref::() { + fn unconfigure( + global: &Global, + surface: &Option>, + present: &Presentation, + ) { + if let Some(surface) = surface { + let hub = HalApi::hub(global); if let Some(device) = present.device.downcast_ref::() { - hub.surface_unconfigure(device, hal_surface); + hub.surface_unconfigure(device, surface); } } } @@ -669,15 +710,16 @@ impl Global { let surface = self.surfaces.unregister(id); let surface = Arc::into_inner(surface.unwrap()) .expect("Surface cannot be destroyed because is still in use"); + if let Some(present) = surface.presentation.lock().take() { #[cfg(vulkan)] - unconfigure::(self, &surface.raw, &present); + unconfigure::(self, &surface.vulkan, &present); #[cfg(metal)] - unconfigure::(self, &surface.raw, &present); + unconfigure::(self, &surface.metal, &present); #[cfg(dx12)] - unconfigure::(self, &surface.raw, &present); + unconfigure::(self, &surface.dx12, &present); #[cfg(gles)] - unconfigure::(self, &surface.raw, &present); + unconfigure::(self, &surface.gl, &present); } self.instance.destroy_surface(surface); } @@ -785,7 +827,7 @@ impl Global { adapters.retain(|exposed| exposed.info.device_type == wgt::DeviceType::Cpu); } if let Some(surface) = compatible_surface { - let surface = &A::get_surface(surface); + let surface = &A::surface_as_hal(surface); adapters.retain(|exposed| unsafe { // If the surface does not exist for this backend, // then the surface is not supported. 
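The new `CreateSurfaceError::FailedToCreateSurfaceForAnyBackend` variant carries the per-backend `hal::InstanceError` map. A minimal caller-side sketch of handling it (not part of this diff; `global`, `display_handle`, `window_handle`, and `use_surface` are placeholders the embedder would supply, and the wrapper name `instance_create_surface` is assumed from wgpu-core's `Global` API):

```rust
// Hypothetical embedder code against wgpu-core's Global API.
match unsafe { global.instance_create_surface(display_handle, window_handle, None) } {
    Ok(surface_id) => {
        // At least one enabled backend produced a hal surface; failures on the
        // other backends were already logged at debug level.
        use_surface(surface_id);
    }
    Err(wgpu_core::instance::CreateSurfaceError::FailedToCreateSurfaceForAnyBackend(errors)) => {
        // Every enabled backend failed; report each backend's hal error.
        for (backend, error) in &errors {
            eprintln!("surface creation failed on {backend:?}: {error}");
        }
    }
    // `CreateSurfaceError` is `#[non_exhaustive]`, so a catch-all arm is required.
    Err(other) => eprintln!("surface creation failed: {other}"),
}
```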
diff --git a/wgpu-core/src/lib.rs b/wgpu-core/src/lib.rs index cf24f589fa..f62878797b 100644 --- a/wgpu-core/src/lib.rs +++ b/wgpu-core/src/lib.rs @@ -39,6 +39,8 @@ unused_braces, // It gets in the way a lot and does not prevent bugs in practice. clippy::pattern_type_mismatch, + // `wgpu-core` isn't entirely user-facing, so it's useful to document internal items. + rustdoc::private_intra_doc_links )] #![warn( trivial_casts, @@ -48,7 +50,6 @@ unused_qualifications )] -pub mod any_surface; pub mod binding_model; pub mod command; mod conv; @@ -62,6 +63,7 @@ pub mod id; pub mod identity; mod init_tracker; pub mod instance; +mod lock; pub mod pipeline; mod pool; pub mod present; diff --git a/wgpu-core/src/lock/mod.rs b/wgpu-core/src/lock/mod.rs new file mode 100644 index 0000000000..a6593a062d --- /dev/null +++ b/wgpu-core/src/lock/mod.rs @@ -0,0 +1,41 @@ +//! Instrumented lock types. +//! +//! This module defines a set of instrumented wrappers for the lock +//! types used in `wgpu-core` ([`Mutex`], [`RwLock`], and +//! [`SnatchLock`]) that help us understand and validate `wgpu-core` +//! synchronization. +//! +//! - The [`ranked`] module defines lock types that perform run-time +//! checks to ensure that each thread acquires locks only in a +//! specific order, to prevent deadlocks. +//! +//! - The [`vanilla`] module defines lock types that are +//! uninstrumented, no-overhead wrappers around the standard lock +//! types. +//! +//! (We plan to add more wrappers in the future.) +//! +//! If the `wgpu_validate_locks` config is set (for example, with +//! `RUSTFLAGS='--cfg wgpu_validate_locks'`), `wgpu-core` uses the +//! [`ranked`] module's locks. We hope to make this the default for +//! debug builds soon. +//! +//! Otherwise, `wgpu-core` uses the [`vanilla`] module's locks. +//! +//! [`Mutex`]: parking_lot::Mutex +//! [`RwLock`]: parking_lot::RwLock +//! [`SnatchLock`]: crate::snatch::SnatchLock + +pub mod rank; + +#[cfg_attr(not(wgpu_validate_locks), allow(dead_code))] +mod ranked; + +#[cfg_attr(wgpu_validate_locks, allow(dead_code))] +mod vanilla; + +#[cfg(wgpu_validate_locks)] +pub use ranked::{Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; + +#[cfg(not(wgpu_validate_locks))] +pub use vanilla::{Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; diff --git a/wgpu-core/src/lock/rank.rs b/wgpu-core/src/lock/rank.rs new file mode 100644 index 0000000000..4387b8d138 --- /dev/null +++ b/wgpu-core/src/lock/rank.rs @@ -0,0 +1,170 @@ +//! Ranks for `wgpu-core` locks, restricting acquisition order. +//! +//! See [`LockRank`]. + +/// The rank of a lock. +/// +/// Each [`Mutex`], [`RwLock`], and [`SnatchLock`] in `wgpu-core` has been +/// assigned a *rank*: a node in the DAG defined at the bottom of +/// `wgpu-core/src/lock/rank.rs`. The rank of the most recently +/// acquired lock you are still holding determines which locks you may +/// attempt to acquire next. +/// +/// When you create a lock in `wgpu-core`, you must specify its rank +/// by passing in a [`LockRank`] value. This module declares a +/// pre-defined set of ranks to cover everything in `wgpu-core`, named +/// after the type in which they occur, and the name of the type's +/// field that is a lock. For example, [`CommandBuffer::data`] is a +/// `Mutex`, and its rank here is the constant +/// [`COMMAND_BUFFER_DATA`]. 
+/// +/// [`Mutex`]: parking_lot::Mutex +/// [`RwLock`]: parking_lot::RwLock +/// [`SnatchLock`]: crate::snatch::SnatchLock +/// [`CommandBuffer::data`]: crate::command::CommandBuffer::data +#[derive(Debug, Copy, Clone)] +pub struct LockRank { + /// The bit representing this lock. + /// + /// There should only be a single bit set in this value. + pub(super) bit: LockRankSet, + + /// A bitmask of permitted successor ranks. + /// + /// If `rank` is the rank of the most recently acquired lock we + /// are still holding, then `rank.followers` is the mask of + /// locks we are allowed to acquire next. + /// + /// The `define_lock_ranks!` macro ensures that there are no + /// cycles in the graph of lock ranks and their followers. + pub(super) followers: LockRankSet, +} + +/// Define a set of lock ranks, and each rank's permitted successors. +macro_rules! define_lock_ranks { + { + $( + $( #[ $attr:meta ] )* + rank $name:ident $member:literal followed by { $( $follower:ident ),* $(,)? } + )* + } => { + // An enum that assigns a unique number to each rank. + #[allow(non_camel_case_types, clippy::upper_case_acronyms)] + enum LockRankNumber { $( $name, )* } + + bitflags::bitflags! { + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + /// A bitflags type representing a set of lock ranks. + pub struct LockRankSet: u64 { + $( + const $name = 1 << (LockRankNumber:: $name as u64); + )* + } + } + + impl LockRankSet { + pub fn name(self) -> &'static str { + match self { + $( + LockRankSet:: $name => $member, + )* + _ => "", + } + } + } + + $( + // If there is any cycle in the ranking, the initializers + // for `followers` will be cyclic, and rustc will give us + // an error message explaining the cycle. + $( #[ $attr ] )* + pub const $name: LockRank = LockRank { + bit: LockRankSet:: $name, + followers: LockRankSet::empty() $( .union($follower.bit) )*, + }; + )* + } +} + +define_lock_ranks! { + rank DEVICE_TEMP_SUSPECTED "Device::temp_suspected" followed by { + SHARED_TRACKER_INDEX_ALLOCATOR_INNER, + COMMAND_BUFFER_DATA, + DEVICE_TRACKERS, + } + rank COMMAND_BUFFER_DATA "CommandBuffer::data" followed by { + DEVICE_SNATCHABLE_LOCK, + DEVICE_USAGE_SCOPES, + SHARED_TRACKER_INDEX_ALLOCATOR_INNER, + BUFFER_BIND_GROUP_STATE_BUFFERS, + TEXTURE_BIND_GROUP_STATE_TEXTURES, + BUFFER_MAP_STATE, + STATELESS_BIND_GROUP_STATE_RESOURCES, + } + rank DEVICE_SNATCHABLE_LOCK "Device::snatchable_lock" followed by { + SHARED_TRACKER_INDEX_ALLOCATOR_INNER, + DEVICE_TRACE, + BUFFER_MAP_STATE, + BUFFER_BIND_GROUP_STATE_BUFFERS, + TEXTURE_BIND_GROUP_STATE_TEXTURES, + STATELESS_BIND_GROUP_STATE_RESOURCES, + // Uncomment this to see an interesting cycle. + // COMMAND_BUFFER_DATA, + } + rank BUFFER_MAP_STATE "Buffer::map_state" followed by { + DEVICE_PENDING_WRITES, + SHARED_TRACKER_INDEX_ALLOCATOR_INNER, + DEVICE_TRACE, + } + rank DEVICE_PENDING_WRITES "Device::pending_writes" followed by { + COMMAND_ALLOCATOR_FREE_ENCODERS, + SHARED_TRACKER_INDEX_ALLOCATOR_INNER, + DEVICE_LIFE_TRACKER, + } + rank DEVICE_LIFE_TRACKER "Device::life_tracker" followed by { + COMMAND_ALLOCATOR_FREE_ENCODERS, + // Uncomment this to see an interesting cycle. 
+        // DEVICE_TEMP_SUSPECTED,
+        DEVICE_TRACE,
+    }
+    rank COMMAND_ALLOCATOR_FREE_ENCODERS "CommandAllocator::free_encoders" followed by {
+        SHARED_TRACKER_INDEX_ALLOCATOR_INNER,
+    }
+
+    rank BUFFER_BIND_GROUPS "Buffer::bind_groups" followed by { }
+    rank BUFFER_BIND_GROUP_STATE_BUFFERS "BufferBindGroupState::buffers" followed by { }
+    rank BUFFER_INITIALIZATION_STATUS "Buffer::initialization_status" followed by { }
+    rank BUFFER_SYNC_MAPPED_WRITES "Buffer::sync_mapped_writes" followed by { }
+    rank DEVICE_DEFERRED_DESTROY "Device::deferred_destroy" followed by { }
+    rank DEVICE_FENCE "Device::fence" followed by { }
+    #[allow(dead_code)]
+    rank DEVICE_TRACE "Device::trace" followed by { }
+    rank DEVICE_TRACKERS "Device::trackers" followed by { }
+    rank DEVICE_USAGE_SCOPES "Device::usage_scopes" followed by { }
+    rank IDENTITY_MANAGER_VALUES "IdentityManager::values" followed by { }
+    rank REGISTRY_STORAGE "Registry::storage" followed by { }
+    rank RENDER_BUNDLE_SCOPE_BUFFERS "RenderBundleScope::buffers" followed by { }
+    rank RENDER_BUNDLE_SCOPE_TEXTURES "RenderBundleScope::textures" followed by { }
+    rank RENDER_BUNDLE_SCOPE_BIND_GROUPS "RenderBundleScope::bind_groups" followed by { }
+    rank RENDER_BUNDLE_SCOPE_RENDER_PIPELINES "RenderBundleScope::render_pipelines" followed by { }
+    rank RENDER_BUNDLE_SCOPE_QUERY_SETS "RenderBundleScope::query_sets" followed by { }
+    rank RESOURCE_POOL_INNER "ResourcePool::inner" followed by { }
+    rank SHARED_TRACKER_INDEX_ALLOCATOR_INNER "SharedTrackerIndexAllocator::inner" followed by { }
+    rank STAGING_BUFFER_RAW "StagingBuffer::raw" followed by { }
+    rank STATELESS_BIND_GROUP_STATE_RESOURCES "StatelessBindGroupState::resources" followed by { }
+    rank SURFACE_PRESENTATION "Surface::presentation" followed by { }
+    rank TEXTURE_BIND_GROUPS "Texture::bind_groups" followed by { }
+    rank TEXTURE_BIND_GROUP_STATE_TEXTURES "TextureBindGroupState::textures" followed by { }
+    rank TEXTURE_INITIALIZATION_STATUS "Texture::initialization_status" followed by { }
+    rank TEXTURE_CLEAR_MODE "Texture::clear_mode" followed by { }
+    rank TEXTURE_VIEWS "Texture::views" followed by { }
+
+    #[cfg(test)]
+    rank PAWN "pawn" followed by { ROOK, BISHOP }
+    #[cfg(test)]
+    rank ROOK "rook" followed by { KNIGHT }
+    #[cfg(test)]
+    rank KNIGHT "knight" followed by { }
+    #[cfg(test)]
+    rank BISHOP "bishop" followed by { }
+}
diff --git a/wgpu-core/src/lock/ranked.rs b/wgpu-core/src/lock/ranked.rs
new file mode 100644
index 0000000000..ecf37c1d77
--- /dev/null
+++ b/wgpu-core/src/lock/ranked.rs
@@ -0,0 +1,386 @@
+//! Lock types that enforce well-ranked lock acquisition order.
+//!
+//! This module's [`Mutex`] and [`RwLock`] types are instrumented to check that
+//! `wgpu-core` acquires locks according to their rank, to prevent deadlocks. To
+//! use it, put `--cfg wgpu_validate_locks` in `RUSTFLAGS`.
+//!
+//! The [`LockRank`] constants in the [`lock::rank`] module describe edges in a
+//! directed graph of lock acquisitions: each lock's rank says, if this is the most
+//! recently acquired lock that you are still holding, then these are the locks you
+//! are allowed to acquire next.
+//!
+//! As long as this graph doesn't have cycles, any number of threads can acquire
+//! locks along paths through the graph without deadlock:
+//!
+//! - Assume that if a thread is holding a lock, then it will either release it,
+//!   or block trying to acquire another one. No thread just sits on its locks
+//!   forever for unrelated reasons. If it did, then that would be a source of
+//!   deadlock "outside the system" that we can't do anything about.
+//!
+//! - This module asserts that threads acquire and release locks in a stack-like
+//!   order: a lock is dropped only when it is the *most recently acquired* lock
+//!   *still held* - call this the "youngest" lock. This stack-like ordering
+//!   isn't a Rust requirement; Rust lets you drop guards in any order you like.
+//!   This is a restriction we impose.
+//!
+//! - Consider the directed graph whose nodes are locks, and whose edges go from
+//!   each lock to its permitted followers, the locks in its [`LockRank::followers`]
+//!   set. The definition of the [`lock::rank`] module's [`LockRank`] constants
+//!   ensures that this graph has no cycles, including trivial cycles from a node to
+//!   itself.
+//!
+//! - This module then asserts that each thread attempts to acquire a lock only if
+//!   it is among its youngest lock's permitted followers. Thus, as a thread
+//!   acquires locks, it must be traversing a path through the graph along its
+//!   edges.
+//!
+//! - Because there are no cycles in the graph, whenever one thread is blocked
+//!   waiting to acquire a lock, that lock must be held by a different thread: if
+//!   you were allowed to acquire a lock you already hold, that would be a cycle in
+//!   the graph.
+//!
+//! - Furthermore, because the graph has no cycles, as we work our way from each
+//!   thread to the thread it is blocked waiting for, we must eventually reach an
+//!   end point: there must be some thread that is able to acquire its next lock, or
+//!   that is about to release a lock.
+//!
+//! Thus, the system as a whole is always able to make progress: it is free of
+//! deadlocks.
+//!
+//! Note that this validation only monitors each thread's behavior in isolation:
+//! there's only thread-local state, nothing communicated between threads. So we
+//! don't detect deadlocks, per se, only the potential to cause deadlocks. This
+//! means that the validation is conservative, but more reproducible, since it's not
+//! dependent on any particular interleaving of execution.
+//!
+//! [`lock::rank`]: crate::lock::rank
+
+use super::rank::LockRank;
+use std::{cell::Cell, panic::Location};
+
+/// A `Mutex` instrumented for deadlock prevention.
+///
+/// This is just a wrapper around a [`parking_lot::Mutex`], along with
+/// its rank in the `wgpu_core` lock ordering.
+///
+/// For details, see [the module documentation][mod].
+///
+/// [mod]: crate::lock::ranked
+pub struct Mutex<T> {
+    inner: parking_lot::Mutex<T>,
+    rank: LockRank,
+}
+
+/// A guard produced by locking [`Mutex`].
+///
+/// This is just a wrapper around a [`parking_lot::MutexGuard`], along
+/// with the state needed to track lock acquisition.
+///
+/// For details, see [the module documentation][mod].
+///
+/// [mod]: crate::lock::ranked
+pub struct MutexGuard<'a, T> {
+    inner: parking_lot::MutexGuard<'a, T>,
+    saved: LockState,
+}
+
+thread_local! {
+    static LOCK_STATE: Cell<LockState> = const { Cell::new(LockState::INITIAL) };
+}
+
+/// Per-thread state for the deadlock checker.
+#[derive(Debug, Copy, Clone)]
+struct LockState {
+    /// The last lock we acquired, and where.
+    last_acquired: Option<(LockRank, &'static Location<'static>)>,
+
+    /// The number of locks currently held.
+    ///
+    /// This is used to enforce stack-like lock acquisition and release.
+    depth: u32,
+}
+
+impl LockState {
+    const INITIAL: LockState = LockState {
+        last_acquired: None,
+        depth: 0,
+    };
+}
+
+/// Check and record the acquisition of a lock with `new_rank`.
+///
+/// Check that acquiring a lock with `new_rank` is permitted at this point, and
+/// update the per-thread state accordingly.
+///
+/// Return the `LockState` that must be restored when the lock is released.
+fn acquire(new_rank: LockRank, location: &'static Location<'static>) -> LockState {
+    let state = LOCK_STATE.get();
+    // Initially, it's fine to acquire any lock. So we only
+    // need to check when `last_acquired` is `Some`.
+    if let Some((ref last_rank, ref last_location)) = state.last_acquired {
+        assert!(
+            last_rank.followers.contains(new_rank.bit),
+            "Attempt to acquire nested mutexes in wrong order:\n\
+             last locked {:<35} at {}\n\
+             now locking {:<35} at {}\n\
+             Locking {} after locking {} is not permitted.",
+            last_rank.bit.name(),
+            last_location,
+            new_rank.bit.name(),
+            location,
+            new_rank.bit.name(),
+            last_rank.bit.name(),
+        );
+    }
+    LOCK_STATE.set(LockState {
+        last_acquired: Some((new_rank, location)),
+        depth: state.depth + 1,
+    });
+    state
+}
+
+/// Record the release of a lock whose saved state was `saved`.
+///
+/// Check that locks are being released in stacking order, and update the
+/// per-thread state accordingly.
+fn release(saved: LockState) {
+    let prior = LOCK_STATE.replace(saved);
+
+    // Although Rust allows mutex guards to be dropped in any
+    // order, this analysis requires that locks be acquired and
+    // released in stack order: the next lock to be released must be
+    // the most recently acquired lock still held.
+    assert_eq!(
+        prior.depth,
+        saved.depth + 1,
+        "Lock not released in stacking order"
+    );
+}
+
+impl<T> Mutex<T> {
+    pub fn new(rank: LockRank, value: T) -> Mutex<T> {
+        Mutex {
+            inner: parking_lot::Mutex::new(value),
+            rank,
+        }
+    }
+
+    #[track_caller]
+    pub fn lock(&self) -> MutexGuard<T> {
+        let saved = acquire(self.rank, Location::caller());
+        MutexGuard {
+            inner: self.inner.lock(),
+            saved,
+        }
+    }
+}
+
+impl<'a, T> Drop for MutexGuard<'a, T> {
+    fn drop(&mut self) {
+        release(self.saved);
+    }
+}
+
+impl<'a, T> std::ops::Deref for MutexGuard<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.inner.deref()
+    }
+}
+
+impl<'a, T> std::ops::DerefMut for MutexGuard<'a, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.inner.deref_mut()
+    }
+}
+
+impl<T: std::fmt::Debug> std::fmt::Debug for Mutex<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.inner.fmt(f)
+    }
+}
+
+/// An `RwLock` instrumented for deadlock prevention.
+///
+/// This is just a wrapper around a [`parking_lot::RwLock`], along with
+/// its rank in the `wgpu_core` lock ordering.
+///
+/// For details, see [the module documentation][mod].
+///
+/// [mod]: crate::lock::ranked
+pub struct RwLock<T> {
+    inner: parking_lot::RwLock<T>,
+    rank: LockRank,
+}
+
+/// A read guard produced by locking [`RwLock`] for reading.
+///
+/// This is just a wrapper around a [`parking_lot::RwLockReadGuard`], along with
+/// the state needed to track lock acquisition.
+///
+/// For details, see [the module documentation][mod].
+///
+/// [mod]: crate::lock::ranked
+pub struct RwLockReadGuard<'a, T> {
+    inner: parking_lot::RwLockReadGuard<'a, T>,
+    saved: LockState,
+}
+
+/// A write guard produced by locking [`RwLock`] for writing.
+///
+/// This is just a wrapper around a [`parking_lot::RwLockWriteGuard`], along
+/// with the state needed to track lock acquisition.
+///
+/// For details, see [the module documentation][mod].
+/// +/// [mod]: crate::lock::ranked +pub struct RwLockWriteGuard<'a, T> { + inner: parking_lot::RwLockWriteGuard<'a, T>, + saved: LockState, +} + +impl RwLock { + pub fn new(rank: LockRank, value: T) -> RwLock { + RwLock { + inner: parking_lot::RwLock::new(value), + rank, + } + } + + #[track_caller] + pub fn read(&self) -> RwLockReadGuard { + let saved = acquire(self.rank, Location::caller()); + RwLockReadGuard { + inner: self.inner.read(), + saved, + } + } + + #[track_caller] + pub fn write(&self) -> RwLockWriteGuard { + let saved = acquire(self.rank, Location::caller()); + RwLockWriteGuard { + inner: self.inner.write(), + saved, + } + } +} + +impl std::fmt::Debug for RwLock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.inner.fmt(f) + } +} + +impl<'a, T> Drop for RwLockReadGuard<'a, T> { + fn drop(&mut self) { + release(self.saved); + } +} + +impl<'a, T> Drop for RwLockWriteGuard<'a, T> { + fn drop(&mut self) { + release(self.saved); + } +} + +impl<'a, T> std::ops::Deref for RwLockReadGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +impl<'a, T> std::ops::Deref for RwLockWriteGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +impl<'a, T> std::ops::DerefMut for RwLockWriteGuard<'a, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.inner.deref_mut() + } +} + +/// Locks can be acquired in the order indicated by their ranks. +#[test] +fn permitted() { + use super::rank; + + let lock1 = Mutex::new(rank::PAWN, ()); + let lock2 = Mutex::new(rank::ROOK, ()); + + let _guard1 = lock1.lock(); + let _guard2 = lock2.lock(); +} + +/// Locks can only be acquired in the order indicated by their ranks. +#[test] +#[should_panic(expected = "Locking pawn after locking rook")] +fn forbidden_unrelated() { + use super::rank; + + let lock1 = Mutex::new(rank::ROOK, ()); + let lock2 = Mutex::new(rank::PAWN, ()); + + let _guard1 = lock1.lock(); + let _guard2 = lock2.lock(); +} + +/// Lock acquisitions can't skip ranks. +/// +/// These two locks *could* be acquired in this order, but only if other locks +/// are acquired in between them. Skipping ranks isn't allowed. +#[test] +#[should_panic(expected = "Locking knight after locking pawn")] +fn forbidden_skip() { + use super::rank; + + let lock1 = Mutex::new(rank::PAWN, ()); + let lock2 = Mutex::new(rank::KNIGHT, ()); + + let _guard1 = lock1.lock(); + let _guard2 = lock2.lock(); +} + +/// Locks can be acquired and released in a stack-like order. +#[test] +fn stack_like() { + use super::rank; + + let lock1 = Mutex::new(rank::PAWN, ()); + let lock2 = Mutex::new(rank::ROOK, ()); + let lock3 = Mutex::new(rank::BISHOP, ()); + + let guard1 = lock1.lock(); + let guard2 = lock2.lock(); + drop(guard2); + + let guard3 = lock3.lock(); + drop(guard3); + drop(guard1); +} + +/// Locks can only be acquired and released in a stack-like order. +#[test] +#[should_panic(expected = "Lock not released in stacking order")] +fn non_stack_like() { + use super::rank; + + let lock1 = Mutex::new(rank::PAWN, ()); + let lock2 = Mutex::new(rank::ROOK, ()); + + let guard1 = lock1.lock(); + let guard2 = lock2.lock(); + + // Avoid a double panic from dropping this while unwinding due to the panic + // we're testing for. 
+ std::mem::forget(guard2); + + drop(guard1); +} diff --git a/wgpu-core/src/lock/vanilla.rs b/wgpu-core/src/lock/vanilla.rs new file mode 100644 index 0000000000..4fc419f12e --- /dev/null +++ b/wgpu-core/src/lock/vanilla.rs @@ -0,0 +1,115 @@ +//! Plain, uninstrumented wrappers around [`parking_lot`] lock types. +//! +//! These definitions are used when no particular lock instrumentation +//! Cargo feature is selected. + +/// A plain wrapper around [`parking_lot::Mutex`]. +/// +/// This is just like [`parking_lot::Mutex`], except that our [`new`] +/// method takes a rank, indicating where the new mutex should sit in +/// `wgpu-core`'s lock ordering. The rank is ignored. +/// +/// See the [`lock`] module documentation for other wrappers. +/// +/// [`new`]: Mutex::new +/// [`lock`]: crate::lock +pub struct Mutex(parking_lot::Mutex); + +/// A guard produced by locking [`Mutex`]. +/// +/// This is just a wrapper around a [`parking_lot::MutexGuard`]. +pub struct MutexGuard<'a, T>(parking_lot::MutexGuard<'a, T>); + +impl Mutex { + pub fn new(_rank: super::rank::LockRank, value: T) -> Mutex { + Mutex(parking_lot::Mutex::new(value)) + } + + pub fn lock(&self) -> MutexGuard { + MutexGuard(self.0.lock()) + } +} + +impl<'a, T> std::ops::Deref for MutexGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +impl<'a, T> std::ops::DerefMut for MutexGuard<'a, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.0.deref_mut() + } +} + +impl std::fmt::Debug for Mutex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +/// A plain wrapper around [`parking_lot::RwLock`]. +/// +/// This is just like [`parking_lot::RwLock`], except that our [`new`] +/// method takes a rank, indicating where the new mutex should sit in +/// `wgpu-core`'s lock ordering. The rank is ignored. +/// +/// See the [`lock`] module documentation for other wrappers. +/// +/// [`new`]: RwLock::new +/// [`lock`]: crate::lock +pub struct RwLock(parking_lot::RwLock); + +/// A read guard produced by locking [`RwLock`] as a reader. +/// +/// This is just a wrapper around a [`parking_lot::RwLockReadGuard`]. +pub struct RwLockReadGuard<'a, T>(parking_lot::RwLockReadGuard<'a, T>); + +/// A write guard produced by locking [`RwLock`] as a writer. +/// +/// This is just a wrapper around a [`parking_lot::RwLockWriteGuard`]. 
+pub struct RwLockWriteGuard<'a, T>(parking_lot::RwLockWriteGuard<'a, T>); + +impl RwLock { + pub fn new(_rank: super::rank::LockRank, value: T) -> RwLock { + RwLock(parking_lot::RwLock::new(value)) + } + + pub fn read(&self) -> RwLockReadGuard { + RwLockReadGuard(self.0.read()) + } + + pub fn write(&self) -> RwLockWriteGuard { + RwLockWriteGuard(self.0.write()) + } +} + +impl std::fmt::Debug for RwLock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl<'a, T> std::ops::Deref for RwLockReadGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +impl<'a, T> std::ops::Deref for RwLockWriteGuard<'a, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +impl<'a, T> std::ops::DerefMut for RwLockWriteGuard<'a, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.0.deref_mut() + } +} diff --git a/wgpu-core/src/pipeline.rs b/wgpu-core/src/pipeline.rs index b1689bd691..d70b118d7e 100644 --- a/wgpu-core/src/pipeline.rs +++ b/wgpu-core/src/pipeline.rs @@ -10,7 +10,8 @@ use crate::{ resource_log, validation, Label, }; use arrayvec::ArrayVec; -use std::{borrow::Cow, error::Error, fmt, marker::PhantomData, num::NonZeroU32, sync::Arc}; +use naga::error::ShaderError; +use std::{borrow::Cow, marker::PhantomData, num::NonZeroU32, sync::Arc}; use thiserror::Error; /// Information about buffer bindings, which @@ -107,79 +108,8 @@ impl ShaderModule { } } -#[derive(Clone, Debug)] -pub struct ShaderError { - pub source: String, - pub label: Option, - pub inner: Box, -} -#[cfg(feature = "wgsl")] -impl fmt::Display for ShaderError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let label = self.label.as_deref().unwrap_or_default(); - let string = self.inner.emit_to_string(&self.source); - write!(f, "\nShader '{label}' parsing {string}") - } -} -#[cfg(feature = "glsl")] -impl fmt::Display for ShaderError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let label = self.label.as_deref().unwrap_or_default(); - let string = self.inner.emit_to_string(&self.source); - write!(f, "\nShader '{label}' parsing {string}") - } -} -#[cfg(feature = "spirv")] -impl fmt::Display for ShaderError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let label = self.label.as_deref().unwrap_or_default(); - let string = self.inner.emit_to_string(&self.source); - write!(f, "\nShader '{label}' parsing {string}") - } -} -impl fmt::Display for ShaderError> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use codespan_reporting::{ - diagnostic::{Diagnostic, Label}, - files::SimpleFile, - term, - }; - - let label = self.label.as_deref().unwrap_or_default(); - let files = SimpleFile::new(label, &self.source); - let config = term::Config::default(); - let mut writer = term::termcolor::NoColor::new(Vec::new()); - - let diagnostic = Diagnostic::error().with_labels( - self.inner - .spans() - .map(|&(span, ref desc)| { - Label::primary((), span.to_range().unwrap()).with_message(desc.to_owned()) - }) - .collect(), - ); - - term::emit(&mut writer, &config, &files, &diagnostic).expect("cannot write error"); - - write!( - f, - "\nShader validation {}", - String::from_utf8_lossy(&writer.into_inner()) - ) - } -} -impl Error for ShaderError -where - ShaderError: fmt::Display, - E: Error + 'static, -{ - fn source(&self) -> Option<&(dyn Error + 'static)> { - Some(&self.inner) - } -} - //Note: `Clone` would require `WithSpan: Clone`. 
-#[derive(Debug, Error)] +#[derive(Clone, Debug, Error)] #[non_exhaustive] pub enum CreateShaderModuleError { #[cfg(feature = "wgsl")] @@ -187,7 +117,7 @@ pub enum CreateShaderModuleError { Parsing(#[from] ShaderError), #[cfg(feature = "glsl")] #[error(transparent)] - ParsingGlsl(#[from] ShaderError), + ParsingGlsl(#[from] ShaderError), #[cfg(feature = "spirv")] #[error(transparent)] ParsingSpirV(#[from] ShaderError), @@ -209,17 +139,6 @@ pub enum CreateShaderModuleError { }, } -impl CreateShaderModuleError { - pub fn location(&self, source: &str) -> Option { - match *self { - #[cfg(feature = "wgsl")] - CreateShaderModuleError::Parsing(ref err) => err.inner.location(source), - CreateShaderModuleError::Validation(ref err) => err.inner.location(source), - _ => None, - } - } -} - /// Describes a programmable pipeline stage. #[derive(Clone, Debug)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] @@ -241,6 +160,11 @@ pub struct ProgrammableStageDescriptor<'a> { /// /// The value may represent any of WGSL's concrete scalar types. pub constants: Cow<'a, naga::back::PipelineConstants>, + /// Whether workgroup scoped memory will be initialized with zero values for this stage. + /// + /// This is required by the WebGPU spec, but may have overhead which can be avoided + /// for cross-platform applications + pub zero_initialize_workgroup_memory: bool, } /// Number of implicit bind groups derived at pipeline creation. diff --git a/wgpu-core/src/pool.rs b/wgpu-core/src/pool.rs index 47de6d5feb..7d17f3a7a3 100644 --- a/wgpu-core/src/pool.rs +++ b/wgpu-core/src/pool.rs @@ -5,8 +5,8 @@ use std::{ }; use once_cell::sync::OnceCell; -use parking_lot::Mutex; +use crate::lock::{rank, Mutex}; use crate::{PreHashedKey, PreHashedMap}; type SlotInner = Weak; @@ -22,13 +22,15 @@ pub struct ResourcePool { impl ResourcePool { pub fn new() -> Self { Self { - inner: Mutex::new(HashMap::default()), + inner: Mutex::new(rank::RESOURCE_POOL_INNER, HashMap::default()), } } - /// Get a resource from the pool with the given entry map, or create a new one if it doesn't exist using the given constructor. + /// Get a resource from the pool with the given entry map, or create a new + /// one if it doesn't exist using the given constructor. /// - /// Behaves such that only one resource will be created for each unique entry map at any one time. + /// Behaves such that only one resource will be created for each unique + /// entry map at any one time. pub fn get_or_init(&self, key: K, constructor: F) -> Result, E> where F: FnOnce(K) -> Result, E>, @@ -96,6 +98,8 @@ impl ResourcePool { /// Remove the given entry map from the pool. /// /// Must *only* be called in the Drop impl of [`BindGroupLayout`]. 
+ /// + /// [`BindGroupLayout`]: crate::binding_model::BindGroupLayout pub fn remove(&self, key: &K) { let hashed_key = PreHashedKey::from_key(key); diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs index 2f274cd554..053f7fdb24 100644 --- a/wgpu-core/src/present.rs +++ b/wgpu-core/src/present.rs @@ -21,13 +21,13 @@ use crate::{ hal_api::HalApi, hal_label, id, init_tracker::TextureInitTracker, + lock::{rank, Mutex, RwLock}, resource::{self, ResourceInfo}, snatch::Snatchable, track, }; use hal::{Queue as _, Surface as _}; -use parking_lot::{Mutex, RwLock}; use thiserror::Error; use wgt::SurfaceStatus as Status; @@ -157,7 +157,7 @@ impl Global { #[cfg(not(feature = "trace"))] let _ = device; - let suf = A::get_surface(surface.as_ref()); + let suf = A::surface_as_hal(surface.as_ref()); let (texture_id, status) = match unsafe { suf.unwrap() .acquire_texture(Some(std::time::Duration::from_millis( @@ -215,7 +215,10 @@ impl Global { desc: texture_desc, hal_usage, format_features, - initialization_status: RwLock::new(TextureInitTracker::new(1, 1)), + initialization_status: RwLock::new( + rank::TEXTURE_INITIALIZATION_STATUS, + TextureInitTracker::new(1, 1), + ), full_range: track::TextureSelector { layers: 0..1, mips: 0..1, @@ -224,11 +227,14 @@ impl Global { "", Some(device.tracker_indices.textures.clone()), ), - clear_mode: RwLock::new(resource::TextureClearMode::Surface { - clear_view: Some(clear_view), - }), - views: Mutex::new(Vec::new()), - bind_groups: Mutex::new(Vec::new()), + clear_mode: RwLock::new( + rank::TEXTURE_CLEAR_MODE, + resource::TextureClearMode::Surface { + clear_view: Some(clear_view), + }, + ), + views: Mutex::new(rank::TEXTURE_VIEWS, Vec::new()), + bind_groups: Mutex::new(rank::TEXTURE_BIND_GROUPS, Vec::new()), }; let (id, resource) = fid.assign(Arc::new(texture)); @@ -324,7 +330,7 @@ impl Global { .textures .remove(texture.info.tracker_index()); let mut exclusive_snatch_guard = device.snatchable_lock.write(); - let suf = A::get_surface(&surface); + let suf = A::surface_as_hal(&surface); let mut inner = texture.inner_mut(&mut exclusive_snatch_guard); let inner = inner.as_mut().unwrap(); @@ -418,7 +424,7 @@ impl Global { .lock() .textures .remove(texture.info.tracker_index()); - let suf = A::get_surface(&surface); + let suf = A::surface_as_hal(&surface); let exclusive_snatch_guard = device.snatchable_lock.write(); match texture.inner.snatch(exclusive_snatch_guard).unwrap() { resource::TextureInner::Surface { mut raw, parent_id } => { diff --git a/wgpu-core/src/registry.rs b/wgpu-core/src/registry.rs index f78abcaa6a..f0f5674dae 100644 --- a/wgpu-core/src/registry.rs +++ b/wgpu-core/src/registry.rs @@ -1,11 +1,11 @@ use std::sync::Arc; -use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use wgt::Backend; use crate::{ id::Id, identity::IdentityManager, + lock::{rank, RwLock, RwLockReadGuard, RwLockWriteGuard}, resource::Resource, storage::{Element, InvalidId, Storage}, }; @@ -38,6 +38,7 @@ impl RegistryReport { /// #[derive(Debug)] pub(crate) struct Registry { + // Must only contain an id which has either never been used or has been released from `storage` identity: Arc>, storage: RwLock>, backend: Backend, @@ -47,7 +48,7 @@ impl Registry { pub(crate) fn new(backend: Backend) -> Self { Self { identity: Arc::new(IdentityManager::new()), - storage: RwLock::new(Storage::new()), + storage: RwLock::new(rank::REGISTRY_STORAGE, Storage::new()), backend, } } @@ -98,9 +99,6 @@ impl FutureId<'_, T> { /// Assign an existing resource to a new ID. 
/// /// Registers it with the registry. - /// - /// This _will_ leak the ID, and it will not be recycled again. - /// See https://github.com/gfx-rs/wgpu/issues/4912. pub fn assign_existing(self, value: &Arc) -> Id { let mut data = self.data.write(); debug_assert!(!data.contains(self.id)); @@ -165,8 +163,11 @@ impl Registry { storage.insert_error(id, label); } pub(crate) fn unregister(&self, id: Id) -> Option> { - self.identity.free(id); let value = self.storage.write().remove(id); + // This needs to happen *after* removing it from the storage, to maintain the + // invariant that `self.identity` only contains ids which are actually available + // See https://github.com/gfx-rs/wgpu/issues/5372 + self.identity.free(id); //Returning None is legal if it's an error ID value } @@ -209,3 +210,53 @@ impl Registry { report } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::{ + id::Marker, + resource::{Resource, ResourceInfo, ResourceType}, + }; + + use super::Registry; + struct TestData { + info: ResourceInfo, + } + struct TestDataId; + impl Marker for TestDataId {} + + impl Resource for TestData { + type Marker = TestDataId; + + const TYPE: ResourceType = "Test data"; + + fn as_info(&self) -> &ResourceInfo { + &self.info + } + + fn as_info_mut(&mut self) -> &mut ResourceInfo { + &mut self.info + } + } + + #[test] + fn simultaneous_registration() { + let registry = Registry::without_backend(); + std::thread::scope(|s| { + for _ in 0..5 { + s.spawn(|| { + for _ in 0..1000 { + let value = Arc::new(TestData { + info: ResourceInfo::new("Test data", None), + }); + let new_id = registry.prepare(None); + let (id, _) = new_id.assign(value); + registry.unregister(id); + } + }); + } + }) + } +} diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs index 11109e27f9..a6b945f417 100644 --- a/wgpu-core/src/resource.rs +++ b/wgpu-core/src/resource.rs @@ -13,6 +13,7 @@ use crate::{ TextureViewId, }, init_tracker::{BufferInitTracker, TextureInitTracker}, + lock::{Mutex, RwLock}, resource, resource_log, snatch::{ExclusiveSnatchGuard, SnatchGuard, Snatchable}, track::{SharedTrackerIndexAllocator, TextureSelector, TrackerIndex}, @@ -21,7 +22,6 @@ use crate::{ }; use hal::CommandEncoder; -use parking_lot::{Mutex, RwLock}; use smallvec::SmallVec; use thiserror::Error; use wgt::WasmNotSendSync; @@ -1026,7 +1026,9 @@ impl Global { profiling::scope!("Surface::as_hal"); let surface = self.surfaces.get(id).ok(); - let hal_surface = surface.as_ref().and_then(|surface| A::get_surface(surface)); + let hal_surface = surface + .as_ref() + .and_then(|surface| A::surface_as_hal(surface)); hal_surface_callback(hal_surface) } diff --git a/wgpu-core/src/snatch.rs b/wgpu-core/src/snatch.rs index d5cd1a3d37..08a1eba11d 100644 --- a/wgpu-core/src/snatch.rs +++ b/wgpu-core/src/snatch.rs @@ -1,6 +1,6 @@ #![allow(unused)] -use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use crate::lock::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{ backtrace::Backtrace, cell::{Cell, RefCell, UnsafeCell}, @@ -8,6 +8,8 @@ use std::{ thread, }; +use crate::lock::rank; + /// A guard that provides read access to snatchable data. pub struct SnatchGuard<'a>(RwLockReadGuard<'a, ()>); /// A guard that allows snatching the snatchable data. 
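The `LockTrace` guard added below generalizes the old read-only recursion check to both read and write acquisitions of the snatch lock. The underlying pattern, stripped of the wgpu specifics, is a thread-local cell recording where the lock was last acquired; here is a standalone sketch (names are illustrative, not wgpu-core's):

```rust
use std::cell::Cell;
use std::panic::Location;

thread_local! {
    // Where this thread last acquired the (conceptual) lock, if it still holds it.
    static HELD_AT: Cell<Option<&'static Location<'static>>> = const { Cell::new(None) };
}

#[track_caller]
fn enter() {
    let caller = Location::caller();
    // A second acquisition on the same thread would self-deadlock, so panic
    // with both the previous and the current acquisition sites.
    if let Some(prev) = HELD_AT.replace(Some(caller)) {
        panic!("recursive acquisition at {caller}; previously acquired at {prev}");
    }
}

fn exit() {
    HELD_AT.take();
}

fn main() {
    enter();
    exit(); // balanced acquire/release: fine
    enter();
    enter(); // second acquisition on this thread: panics with both locations
}
```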
@@ -64,8 +66,58 @@ impl<T> std::fmt::Debug for Snatchable<T> {
 
 unsafe impl<T> Sync for Snatchable<T> {}
 
+struct LockTrace {
+    purpose: &'static str,
+    caller: &'static Location<'static>,
+    backtrace: Backtrace,
+}
+
+impl std::fmt::Display for LockTrace {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "a {} lock at {}\n{}",
+            self.purpose, self.caller, self.backtrace
+        )
+    }
+}
+
+#[cfg(debug_assertions)]
+impl LockTrace {
+    #[track_caller]
+    fn enter(purpose: &'static str) {
+        let new = LockTrace {
+            purpose,
+            caller: Location::caller(),
+            backtrace: Backtrace::capture(),
+        };
+
+        if let Some(prev) = SNATCH_LOCK_TRACE.take() {
+            let current = thread::current();
+            let name = current.name().unwrap_or("<unnamed>");
+            panic!(
+                "thread '{name}' attempted to acquire a snatch lock recursively.\n\
+                 - Currently trying to acquire {new}\n\
+                 - Previously acquired {prev}",
+            );
+        } else {
+            SNATCH_LOCK_TRACE.set(Some(new));
+        }
+    }
+
+    fn exit() {
+        SNATCH_LOCK_TRACE.take();
+    }
+}
+
+#[cfg(not(debug_assertions))]
+impl LockTrace {
+    fn enter(_purpose: &'static str) {}
+    fn exit() {}
+}
+
 thread_local! {
-    static READ_LOCK_LOCATION: Cell<Option<(&'static Location<'static>, Backtrace)>> = const { Cell::new(None) };
+    static SNATCH_LOCK_TRACE: Cell<Option<LockTrace>> = const { Cell::new(None) };
 }
 
 /// A Device-global lock for all snatchable data.
@@ -78,31 +130,16 @@ impl SnatchLock {
     /// right SnatchLock (the one associated with the same device). This method is unsafe
     /// to force users to think twice about creating a SnatchLock. The only place this
     /// method should be called is when creating the device.
-    pub unsafe fn new() -> Self {
+    pub unsafe fn new(rank: rank::LockRank) -> Self {
         SnatchLock {
-            lock: RwLock::new(()),
+            lock: RwLock::new(rank, ()),
         }
     }
 
     /// Request read access to snatchable resources.
     #[track_caller]
     pub fn read(&self) -> SnatchGuard {
-        if cfg!(debug_assertions) {
-            let caller = Location::caller();
-            let backtrace = Backtrace::capture();
-            if let Some((prev, bt)) = READ_LOCK_LOCATION.take() {
-                let current = thread::current();
-                let name = current.name().unwrap_or("<unnamed>");
-                panic!(
-                    "thread '{name}' attempted to acquire a snatch read lock recursively.\n
-                    - {prev}\n{bt}\n
-                    - {caller}\n{backtrace}"
-                );
-            } else {
-                READ_LOCK_LOCATION.set(Some((caller, backtrace)));
-            }
-        }
-
+        LockTrace::enter("read");
         SnatchGuard(self.lock.read())
     }
 
@@ -111,14 +148,21 @@ impl SnatchLock {
     /// This should only be called when a resource needs to be snatched. This has
     /// a high risk of causing lock contention if called concurrently with other
     /// wgpu work.
+ #[track_caller] pub fn write(&self) -> ExclusiveSnatchGuard { + LockTrace::enter("write"); ExclusiveSnatchGuard(self.lock.write()) } } impl Drop for SnatchGuard<'_> { fn drop(&mut self) { - #[cfg(debug_assertions)] - READ_LOCK_LOCATION.take(); + LockTrace::exit(); + } +} + +impl Drop for ExclusiveSnatchGuard<'_> { + fn drop(&mut self) { + LockTrace::exit(); } } diff --git a/wgpu-core/src/track/buffer.rs b/wgpu-core/src/track/buffer.rs index 6cf1fdda6f..9a52a53253 100644 --- a/wgpu-core/src/track/buffer.rs +++ b/wgpu-core/src/track/buffer.rs @@ -11,6 +11,7 @@ use super::{PendingTransition, ResourceTracker, TrackerIndex}; use crate::{ hal_api::HalApi, id::BufferId, + lock::{rank, Mutex}, resource::{Buffer, Resource}, snatch::SnatchGuard, storage::Storage, @@ -20,7 +21,6 @@ use crate::{ }, }; use hal::{BufferBarrier, BufferUses}; -use parking_lot::Mutex; use wgt::{strict_assert, strict_assert_eq}; impl ResourceUses for BufferUses { @@ -51,7 +51,7 @@ pub(crate) struct BufferBindGroupState { impl BufferBindGroupState { pub fn new() -> Self { Self { - buffers: Mutex::new(Vec::new()), + buffers: Mutex::new(rank::BUFFER_BIND_GROUP_STATE_BUFFERS, Vec::new()), _phantom: PhantomData, } @@ -245,6 +245,22 @@ impl BufferUsageScope { .get(id) .map_err(|_| UsageConflict::BufferInvalid { id })?; + self.insert_merge_single(buffer.clone(), new_state) + .map(|_| buffer) + } + + /// Merge a single state into the UsageScope, using an already resolved buffer. + /// + /// If the resulting state is invalid, returns a usage + /// conflict with the details of the invalid state. + /// + /// If the ID is higher than the length of internal vectors, + /// the vectors will be extended. A call to set_size is not needed. + pub fn insert_merge_single( + &mut self, + buffer: Arc>, + new_state: BufferUses, + ) -> Result<(), UsageConflict> { let index = buffer.info.tracker_index().as_usize(); self.allow_index(index); @@ -260,12 +276,12 @@ impl BufferUsageScope { index, BufferStateProvider::Direct { state: new_state }, ResourceMetadataProvider::Direct { - resource: Cow::Owned(buffer.clone()), + resource: Cow::Owned(buffer), }, )?; } - Ok(buffer) + Ok(()) } } diff --git a/wgpu-core/src/track/metadata.rs b/wgpu-core/src/track/metadata.rs index 3e71e0e084..d6e8d6f906 100644 --- a/wgpu-core/src/track/metadata.rs +++ b/wgpu-core/src/track/metadata.rs @@ -87,16 +87,18 @@ impl ResourceMetadata { /// Add the resource with the given index, epoch, and reference count to the /// set. /// + /// Returns a reference to the newly inserted resource. + /// (This allows avoiding a clone/reference count increase in many cases.) + /// /// # Safety /// /// The given `index` must be in bounds for this `ResourceMetadata`'s /// existing tables. See `tracker_assert_in_bounds`. #[inline(always)] - pub(super) unsafe fn insert(&mut self, index: usize, resource: Arc) { + pub(super) unsafe fn insert(&mut self, index: usize, resource: Arc) -> &Arc { self.owned.set(index, true); - unsafe { - *self.resources.get_unchecked_mut(index) = Some(resource); - } + let resource_dst = unsafe { self.resources.get_unchecked_mut(index) }; + resource_dst.insert(resource) } /// Get the resource with the given index. 
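Worth spelling out, since the new return value above is easy to miss: `ResourceMetadata::insert` now stores the `Arc` via `Option::insert`, which hands back a reference to the freshly stored value, so callers can keep using the resource without an extra clone and refcount bump. A minimal, self-contained sketch of that standard-library pattern (illustration only, not wgpu-core code):

```rust
use std::sync::Arc;

// `Option::insert` stores the value and returns a reference to it,
// which coerces to `&Arc<T>`; the caller borrows instead of cloning.
fn store_and_borrow<T>(slot: &mut Option<Arc<T>>, value: Arc<T>) -> &Arc<T> {
    slot.insert(value)
}

fn main() {
    let mut slot = None;
    let stored = store_and_borrow(&mut slot, Arc::new(String::from("resource")));
    // Still exactly one strong reference: the slot owns it, we only borrow it.
    assert_eq!(Arc::strong_count(stored), 1);
}
```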
diff --git a/wgpu-core/src/track/mod.rs b/wgpu-core/src/track/mod.rs
index f5b37f3756..7df13d039b 100644
--- a/wgpu-core/src/track/mod.rs
+++ b/wgpu-core/src/track/mod.rs
@@ -102,10 +102,14 @@ mod stateless;
 mod texture;
 
 use crate::{
-    binding_model, command, conv, hal_api::HalApi, id, pipeline, resource, snatch::SnatchGuard,
+    binding_model, command, conv,
+    hal_api::HalApi,
+    id,
+    lock::{rank, Mutex, RwLock},
+    pipeline, resource,
+    snatch::SnatchGuard,
 };
 
-use parking_lot::{Mutex, RwLock};
 use std::{fmt, ops, sync::Arc};
 use thiserror::Error;
@@ -136,7 +140,8 @@ impl TrackerIndex {
 /// of a certain type. This index is separate from the resource ID for various reasons:
 /// - There can be multiple resource IDs pointing to the same resource.
 /// - IDs of dead handles can be recycled while resources are internally held alive (and tracked).
-/// - The plan is to remove IDs in the long run (https://github.com/gfx-rs/wgpu/issues/5121).
+/// - The plan is to remove IDs in the long run
+///   ([#5121](https://github.com/gfx-rs/wgpu/issues/5121)).
 /// In order to produce these tracker indices, there is a shared TrackerIndexAllocator
 /// per resource type. Indices have the same lifetime as the internal resource they
 /// are associated with (alloc happens when creating the resource and free is called when
@@ -190,7 +195,10 @@ pub(crate) struct SharedTrackerIndexAllocator {
 impl SharedTrackerIndexAllocator {
     pub fn new() -> Self {
         SharedTrackerIndexAllocator {
-            inner: Mutex::new(TrackerIndexAllocator::new()),
+            inner: Mutex::new(
+                rank::SHARED_TRACKER_INDEX_ALLOCATOR_INNER,
+                TrackerIndexAllocator::new(),
+            ),
         }
     }
 
@@ -487,11 +495,26 @@ impl<A: HalApi> RenderBundleScope<A> {
     /// Create the render bundle scope and pull the maximum IDs from the hubs.
     pub fn new() -> Self {
         Self {
-            buffers: RwLock::new(BufferUsageScope::default()),
-            textures: RwLock::new(TextureUsageScope::default()),
-            bind_groups: RwLock::new(StatelessTracker::new()),
-            render_pipelines: RwLock::new(StatelessTracker::new()),
-            query_sets: RwLock::new(StatelessTracker::new()),
+            buffers: RwLock::new(
+                rank::RENDER_BUNDLE_SCOPE_BUFFERS,
+                BufferUsageScope::default(),
+            ),
+            textures: RwLock::new(
+                rank::RENDER_BUNDLE_SCOPE_TEXTURES,
+                TextureUsageScope::default(),
+            ),
+            bind_groups: RwLock::new(
+                rank::RENDER_BUNDLE_SCOPE_BIND_GROUPS,
+                StatelessTracker::new(),
+            ),
+            render_pipelines: RwLock::new(
+                rank::RENDER_BUNDLE_SCOPE_RENDER_PIPELINES,
+                StatelessTracker::new(),
+            ),
+            query_sets: RwLock::new(
+                rank::RENDER_BUNDLE_SCOPE_QUERY_SETS,
+                StatelessTracker::new(),
+            ),
         }
     }
 
@@ -650,8 +673,8 @@ impl<A: HalApi> Tracker<A> {
     ///
     /// If a transition is needed to get the resources into the needed
     /// state, those transitions are stored within the tracker. A
-    /// subsequent call to [`BufferTracker::drain`] or
-    /// [`TextureTracker::drain`] is needed to get those transitions.
+    /// subsequent call to [`BufferTracker::drain_transitions`] or
+    /// [`TextureTracker::drain_transitions`] is needed to get those transitions.
/// /// This is a really funky method used by Compute Passes to generate /// barriers after a call to dispatch without needing to iterate diff --git a/wgpu-core/src/track/stateless.rs b/wgpu-core/src/track/stateless.rs index 00225f2305..25ffc027ee 100644 --- a/wgpu-core/src/track/stateless.rs +++ b/wgpu-core/src/track/stateless.rs @@ -6,9 +6,14 @@ use std::sync::Arc; -use parking_lot::Mutex; - -use crate::{id::Id, resource::Resource, resource_log, storage::Storage, track::ResourceMetadata}; +use crate::{ + id::Id, + lock::{rank, Mutex}, + resource::Resource, + resource_log, + storage::Storage, + track::ResourceMetadata, +}; use super::{ResourceTracker, TrackerIndex}; @@ -24,7 +29,7 @@ pub(crate) struct StatelessBindGroupSate { impl StatelessBindGroupSate { pub fn new() -> Self { Self { - resources: Mutex::new(Vec::new()), + resources: Mutex::new(rank::STATELESS_BIND_GROUP_STATE_RESOURCES, Vec::new()), } } @@ -153,16 +158,17 @@ impl StatelessTracker { /// /// If the ID is higher than the length of internal vectors, /// the vectors will be extended. A call to set_size is not needed. - pub fn insert_single(&mut self, resource: Arc) { + /// + /// Returns a reference to the newly inserted resource. + /// (This allows avoiding a clone/reference count increase in many cases.) + pub fn insert_single(&mut self, resource: Arc) -> &Arc { let index = resource.as_info().tracker_index().as_usize(); self.allow_index(index); self.tracker_assert_in_bounds(index); - unsafe { - self.metadata.insert(index, resource); - } + unsafe { self.metadata.insert(index, resource) } } /// Adds the given resource to the tracker. diff --git a/wgpu-core/src/track/texture.rs b/wgpu-core/src/track/texture.rs index 3cf95ff38a..51ed72a18d 100644 --- a/wgpu-core/src/track/texture.rs +++ b/wgpu-core/src/track/texture.rs @@ -24,6 +24,7 @@ use super::{ }; use crate::{ hal_api::HalApi, + lock::{rank, Mutex}, resource::{Resource, Texture, TextureInner}, snatch::SnatchGuard, track::{ @@ -36,7 +37,6 @@ use hal::TextureUses; use arrayvec::ArrayVec; use naga::FastHashMap; -use parking_lot::Mutex; use wgt::{strict_assert, strict_assert_eq}; use std::{borrow::Cow, iter, marker::PhantomData, ops::Range, sync::Arc, vec::Drain}; @@ -164,7 +164,7 @@ pub(crate) struct TextureBindGroupState { impl TextureBindGroupState { pub fn new() -> Self { Self { - textures: Mutex::new(Vec::new()), + textures: Mutex::new(rank::TEXTURE_BIND_GROUP_STATE_TEXTURES, Vec::new()), } } diff --git a/wgpu-hal/Cargo.toml b/wgpu-hal/Cargo.toml index ab21c6dfe3..dafcb3a1ab 100644 --- a/wgpu-hal/Cargo.toml +++ b/wgpu-hal/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wgpu-hal" -version = "0.19.3" +version = "0.20.0" authors = ["gfx-rs developers"] edition = "2021" description = "WebGPU hardware abstraction layer" @@ -110,13 +110,13 @@ glow = { version = "0.13.1", optional = true } [dependencies.wgt] package = "wgpu-types" path = "../wgpu-types" -version = "0.19.2" +version = "0.20.0" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] # backend: Vulkan ash = { version = "0.37.3", optional = true } gpu-alloc = { version = "0.6", optional = true } -gpu-descriptor = { version = "0.2", optional = true } +gpu-descriptor = { version = "0.3", optional = true } smallvec = { version = "1", optional = true, features = ["union"] } khronos-egl = { version = "6", features = ["dynamic"], optional = true } @@ -147,7 +147,7 @@ winapi = { version = "0.3", features = [ "winuser", "dcomp", ] } -d3d12 = { path = "../d3d12/", version = "0.19.0", optional = true, features = [ +d3d12 = { 
path = "../d3d12/", version = "0.20.0", optional = true, features = [ "libloading", ] } @@ -155,7 +155,7 @@ d3d12 = { path = "../d3d12/", version = "0.19.0", optional = true, features = [ # backend: Metal block = { version = "0.1", optional = true } -metal = { version = "0.27.0", git = "https://github.com/gfx-rs/metal-rs", rev = "ff8fd3d6dc7792852f8a015458d7e6d42d7fb352" } +metal = { version = "0.28.0" } objc = "0.2.5" core-graphics-types = "0.1" @@ -178,7 +178,7 @@ ndk-sys = { version = "0.5.0", optional = true } [dependencies.naga] path = "../naga" -version = "0.19.2" +version = "0.20.0" [build-dependencies] cfg_aliases.workspace = true @@ -186,13 +186,13 @@ cfg_aliases.workspace = true # DEV dependencies [dev-dependencies.naga] path = "../naga" -version = "0.19.2" +version = "0.20.0" features = ["wgsl-in"] [dev-dependencies] cfg-if = "1" env_logger = "0.11" -glam = "0.25.0" # for ray-traced-triangle example +glam = "0.27.0" # for ray-traced-triangle example winit = { version = "0.29.14", features = [ "android-native-activity", ] } # for "halmark" example diff --git a/wgpu-hal/examples/halmark/main.rs b/wgpu-hal/examples/halmark/main.rs index 29dfd49d28..aef6919c8f 100644 --- a/wgpu-hal/examples/halmark/main.rs +++ b/wgpu-hal/examples/halmark/main.rs @@ -253,12 +253,14 @@ impl Example { module: &shader, entry_point: "vs_main", constants: &constants, + zero_initialize_workgroup_memory: true, }, vertex_buffers: &[], fragment_stage: Some(hal::ProgrammableStage { module: &shader, entry_point: "fs_main", constants: &constants, + zero_initialize_workgroup_memory: true, }), primitive: wgt::PrimitiveState { topology: wgt::PrimitiveTopology::TriangleStrip, @@ -843,6 +845,7 @@ fn main() { } } ex.render(); + window.request_redraw(); } _ => { example.as_mut().unwrap().update(event); diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index 2ed2d64627..3985cd60af 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -372,6 +372,7 @@ impl Example { module: &shader_module, entry_point: "main", constants: &Default::default(), + zero_initialize_workgroup_memory: true, }, }) } diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs index 2b7040720e..faf25cc852 100644 --- a/wgpu-hal/src/dx12/adapter.rs +++ b/wgpu-hal/src/dx12/adapter.rs @@ -115,18 +115,6 @@ impl super::Adapter { ) }); - let mut shader_model_support: d3d12_ty::D3D12_FEATURE_DATA_SHADER_MODEL = - d3d12_ty::D3D12_FEATURE_DATA_SHADER_MODEL { - HighestShaderModel: d3d12_ty::D3D_SHADER_MODEL_6_0, - }; - assert_eq!(0, unsafe { - device.CheckFeatureSupport( - d3d12_ty::D3D12_FEATURE_SHADER_MODEL, - &mut shader_model_support as *mut _ as *mut _, - mem::size_of::() as _, - ) - }); - let mut workarounds = super::Workarounds::default(); let info = wgt::AdapterInfo { @@ -321,7 +309,7 @@ impl super::Adapter { wgt::Features::TEXTURE_BINDING_ARRAY | wgt::Features::UNIFORM_BUFFER_AND_STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING | wgt::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING, - shader_model_support.HighestShaderModel >= d3d12_ty::D3D_SHADER_MODEL_5_1, + shader_model >= naga::back::hlsl::ShaderModel::V5_1, ); let bgra8unorm_storage_supported = { @@ -343,21 +331,28 @@ impl super::Adapter { bgra8unorm_storage_supported, ); - // we must be using DXC because uint64_t was added with Shader Model 6 - // and FXC only supports up to 5.1 - let int64_shader_ops_supported = dxc_container.is_some() && { - let 
mut features1: d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1 =
-                unsafe { mem::zeroed() };
-            let hr = unsafe {
-                device.CheckFeatureSupport(
-                    d3d12_ty::D3D12_FEATURE_D3D12_OPTIONS1,
-                    &mut features1 as *mut _ as *mut _,
-                    mem::size_of::<d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1>() as _,
-                )
-            };
-            hr == 0 && features1.Int64ShaderOps != 0
+        let mut features1: d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1 = unsafe { mem::zeroed() };
+        let hr = unsafe {
+            device.CheckFeatureSupport(
+                d3d12_ty::D3D12_FEATURE_D3D12_OPTIONS1,
+                &mut features1 as *mut _ as *mut _,
+                mem::size_of::<d3d12_ty::D3D12_FEATURE_DATA_D3D12_OPTIONS1>() as _,
+            )
         };
-        features.set(wgt::Features::SHADER_INT64, int64_shader_ops_supported);
+
+        features.set(
+            wgt::Features::SHADER_INT64,
+            shader_model >= naga::back::hlsl::ShaderModel::V6_0
+                && hr == 0
+                && features1.Int64ShaderOps != 0,
+        );
+
+        features.set(
+            wgt::Features::SUBGROUP,
+            shader_model >= naga::back::hlsl::ShaderModel::V6_0
+                && hr == 0
+                && features1.WaveOps != 0,
+        );
 
         // float32-filterable should always be available on d3d12
         features.set(wgt::Features::FLOAT32_FILTERABLE, true);
@@ -425,6 +420,8 @@ impl super::Adapter {
                     .min(crate::MAX_VERTEX_BUFFERS as u32),
                 max_vertex_attributes: d3d12_ty::D3D12_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT,
                 max_vertex_buffer_array_stride: d3d12_ty::D3D12_SO_BUFFER_MAX_STRIDE_IN_BYTES,
+                min_subgroup_size: 4, // Not using `features1.WaveLaneCountMin` as it is unreliable
+                max_subgroup_size: 128,
                 // The push constants are part of the root signature which
                 // has a limit of 64 DWORDS (256 bytes), but other resources
                 // also share the root signature:
diff --git a/wgpu-hal/src/dx12/conv.rs b/wgpu-hal/src/dx12/conv.rs
index 2b6c1d959e..b09ea76080 100644
--- a/wgpu-hal/src/dx12/conv.rs
+++ b/wgpu-hal/src/dx12/conv.rs
@@ -224,7 +224,7 @@ pub fn map_polygon_mode(mode: wgt::PolygonMode) -> d3d12_ty::D3D12_FILL_MODE {
 }
 
 /// D3D12 doesn't support passing factors ending in `_COLOR` for alpha blending
-/// (see https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_render_target_blend_desc).
+/// (see <https://learn.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_render_target_blend_desc>).
 /// Therefore this function takes an additional `is_alpha` argument
 /// which if set will return an equivalent `_ALPHA` factor.
fn map_blend_factor(factor: wgt::BlendFactor, is_alpha: bool) -> d3d12_ty::D3D12_BLEND {
diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs
index f4539817d3..82075294ee 100644
--- a/wgpu-hal/src/dx12/device.rs
+++ b/wgpu-hal/src/dx12/device.rs
@@ -226,9 +226,20 @@ impl super::Device {
             )
             .map_err(|e| crate::PipelineError::Linkage(stage_bit, format!("HLSL: {e:?}")))?;
 
+        let needs_temp_options = stage.zero_initialize_workgroup_memory
+            != layout.naga_options.zero_initialize_workgroup_memory;
+        let mut temp_options;
+        let naga_options = if needs_temp_options {
+            temp_options = layout.naga_options.clone();
+            temp_options.zero_initialize_workgroup_memory = stage.zero_initialize_workgroup_memory;
+            &temp_options
+        } else {
+            &layout.naga_options
+        };
+
         //TODO: reuse the writer
         let mut source = String::new();
-        let mut writer = hlsl::Writer::new(&mut source, &layout.naga_options);
+        let mut writer = hlsl::Writer::new(&mut source, naga_options);
         let reflection_info = {
             profiling::scope!("naga::back::hlsl::write");
             writer
@@ -239,7 +250,7 @@ impl super::Device {
         let full_stage = format!(
             "{}_{}\0",
             naga_stage.to_hlsl_str(),
-            layout.naga_options.shader_model.to_str()
+            naga_options.shader_model.to_str()
         );
 
         let ep_index = module
diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 735732ef29..9f021bc241 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -440,7 +440,7 @@ impl Texture {
         }
     }
 
-    /// see https://learn.microsoft.com/en-us/windows/win32/direct3d12/subresources#plane-slice
+    /// see <https://learn.microsoft.com/en-us/windows/win32/direct3d12/subresources#plane-slice>
     fn calc_subresource(&self, mip_level: u32, array_layer: u32, plane: u32) -> u32 {
         mip_level + (array_layer + plane * self.array_layer_count()) * self.mip_level_count
     }
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index b9d044337c..052c77006b 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -104,7 +104,7 @@ impl super::Adapter {
         }
     }
 
-    fn make_info(vendor_orig: String, renderer_orig: String) -> wgt::AdapterInfo {
+    fn make_info(vendor_orig: String, renderer_orig: String, version: String) -> wgt::AdapterInfo {
         let vendor = vendor_orig.to_lowercase();
         let renderer = renderer_orig.to_lowercase();
 
@@ -179,13 +179,33 @@ impl super::Adapter {
             0
         };
 
+        let driver;
+        let driver_info;
+        if version.starts_with("WebGL ") || version.starts_with("OpenGL ") {
+            let es_sig = " ES";
+            match version.find(es_sig) {
+                Some(pos) => {
+                    driver = version[..pos + es_sig.len()].to_owned();
+                    driver_info = version[pos + es_sig.len() + 1..].to_owned();
+                }
+                None => {
+                    let pos = version.find(' ').unwrap();
+                    driver = version[..pos].to_owned();
+                    driver_info = version[pos + 1..].to_owned();
+                }
+            }
+        } else {
+            driver = "OpenGL".to_owned();
+            driver_info = version;
+        }
+
         wgt::AdapterInfo {
             name: renderer_orig,
             vendor: vendor_id,
             device: 0,
             device_type: inferred_device_type,
-            driver: String::new(),
-            driver_info: String::new(),
+            driver,
+            driver_info,
             backend: wgt::Backend::Gl,
         }
     }
@@ -507,8 +527,7 @@ impl super::Adapter {
         let has_etc = if cfg!(any(webgl, Emscripten)) {
             extensions.contains("WEBGL_compressed_texture_etc")
         } else {
-            // This is a required part of GLES3, but not part of Desktop GL at all.
- es_ver.is_some() + es_ver.is_some() || extensions.contains("GL_ARB_ES3_compatibility") }; features.set(wgt::Features::TEXTURE_COMPRESSION_ETC2, has_etc); @@ -728,6 +747,8 @@ impl super::Adapter { } else { !0 }, + min_subgroup_size: 0, + max_subgroup_size: 0, max_push_constant_size: super::MAX_PUSH_CONSTANTS as u32 * 4, min_uniform_buffer_offset_alignment, min_storage_buffer_offset_alignment, @@ -825,7 +846,7 @@ impl super::Adapter { max_msaa_samples: max_samples, }), }, - info: Self::make_info(vendor, renderer), + info: Self::make_info(vendor, renderer, version), features, capabilities: crate::Capabilities { limits, diff --git a/wgpu-hal/src/gles/device.rs b/wgpu-hal/src/gles/device.rs index 921941735c..a1e2736aa6 100644 --- a/wgpu-hal/src/gles/device.rs +++ b/wgpu-hal/src/gles/device.rs @@ -255,11 +255,23 @@ impl super::Device { }; let mut output = String::new(); + let needs_temp_options = stage.zero_initialize_workgroup_memory + != context.layout.naga_options.zero_initialize_workgroup_memory; + let mut temp_options; + let naga_options = if needs_temp_options { + // We use a conditional here, as cloning the naga_options could be expensive + // That is, we want to avoid doing that unless we cannot avoid it + temp_options = context.layout.naga_options.clone(); + temp_options.zero_initialize_workgroup_memory = stage.zero_initialize_workgroup_memory; + &temp_options + } else { + &context.layout.naga_options + }; let mut writer = glsl::Writer::new( &mut output, &module, &info, - &context.layout.naga_options, + naga_options, &pipeline_options, policies, ) @@ -305,6 +317,7 @@ impl super::Device { naga_stage: naga_stage.to_owned(), shader_id: stage.module.id, entry_point: stage.entry_point.to_owned(), + zero_initialize_workgroup_memory: stage.zero_initialize_workgroup_memory, }); } let mut guard = self diff --git a/wgpu-hal/src/gles/egl.rs b/wgpu-hal/src/gles/egl.rs index b166f4f102..7494dcad76 100644 --- a/wgpu-hal/src/gles/egl.rs +++ b/wgpu-hal/src/gles/egl.rs @@ -526,7 +526,24 @@ impl Inner { } let (config, supports_native_window) = choose_config(&egl, display, srgb_kind)?; - egl.bind_api(khronos_egl::OPENGL_ES_API).unwrap(); + + let supports_opengl = if version >= (1, 4) { + let client_apis = egl + .query_string(Some(display), khronos_egl::CLIENT_APIS) + .unwrap() + .to_string_lossy(); + client_apis + .split(' ') + .any(|client_api| client_api == "OpenGL") + } else { + false + }; + egl.bind_api(if supports_opengl { + khronos_egl::OPENGL_API + } else { + khronos_egl::OPENGL_ES_API + }) + .unwrap(); let needs_robustness = true; let mut khr_context_flags = 0; @@ -977,6 +994,7 @@ impl crate::Instance for Instance { srgb_kind: inner.srgb_kind, }) } + unsafe fn destroy_surface(&self, _surface: Surface) {} unsafe fn enumerate_adapters(&self) -> Vec> { @@ -993,6 +1011,12 @@ impl crate::Instance for Instance { }) }; + // In contrast to OpenGL ES, OpenGL requires explicitly enabling sRGB conversions, + // as otherwise the user has to do the sRGB conversion. 
+ if !matches!(inner.srgb_kind, SrgbFrameBufferKind::None) { + unsafe { gl.enable(glow::FRAMEBUFFER_SRGB) }; + } + if self.flags.contains(wgt::InstanceFlags::DEBUG) && gl.supports_debug() { log::debug!("Max label length: {}", unsafe { gl.get_parameter_i32(glow::MAX_LABEL_LENGTH) diff --git a/wgpu-hal/src/gles/mod.rs b/wgpu-hal/src/gles/mod.rs index 6f41f7c000..0fcb09be46 100644 --- a/wgpu-hal/src/gles/mod.rs +++ b/wgpu-hal/src/gles/mod.rs @@ -602,6 +602,7 @@ struct ProgramStage { naga_stage: naga::ShaderStage, shader_id: ShaderId, entry_point: String, + zero_initialize_workgroup_memory: bool, } #[derive(PartialEq, Eq, Hash)] diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs index 29dfb79d04..7c728d3978 100644 --- a/wgpu-hal/src/gles/queue.rs +++ b/wgpu-hal/src/gles/queue.rs @@ -213,12 +213,27 @@ impl super::Queue { instance_count, ref first_instance_location, } => { - match base_vertex { - 0 => { - unsafe { - gl.uniform_1_u32(first_instance_location.as_ref(), first_instance) - }; + let supports_full_instancing = self + .shared + .private_caps + .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING); + if supports_full_instancing { + unsafe { + gl.draw_elements_instanced_base_vertex_base_instance( + topology, + index_count as i32, + index_type, + index_offset as i32, + instance_count as i32, + base_vertex, + first_instance, + ) + } + } else { + unsafe { gl.uniform_1_u32(first_instance_location.as_ref(), first_instance) }; + + if base_vertex == 0 { unsafe { // Don't use `gl.draw_elements`/`gl.draw_elements_base_vertex` for `instance_count == 1`. // Angle has a bug where it doesn't consider the instance divisor when `DYNAMIC_DRAW` is used in `gl.draw_elements`/`gl.draw_elements_base_vertex`. @@ -231,41 +246,17 @@ impl super::Queue { instance_count as i32, ) } - } - _ => { - let supports_full_instancing = self - .shared - .private_caps - .contains(PrivateCapabilities::FULLY_FEATURED_INSTANCING); - - if supports_full_instancing { - unsafe { - gl.draw_elements_instanced_base_vertex_base_instance( - topology, - index_count as i32, - index_type, - index_offset as i32, - instance_count as i32, - base_vertex, - first_instance, - ) - } - } else { - unsafe { - gl.uniform_1_u32(first_instance_location.as_ref(), first_instance) - }; - - // If we've gotten here, wgpu-core has already validated that this function exists via the DownlevelFlags::BASE_VERTEX feature. - unsafe { - gl.draw_elements_instanced_base_vertex( - topology, - index_count as _, - index_type, - index_offset as i32, - instance_count as i32, - base_vertex, - ) - } + } else { + // If we've gotten here, wgpu-core has already validated that this function exists via the DownlevelFlags::BASE_VERTEX feature. + unsafe { + gl.draw_elements_instanced_base_vertex( + topology, + index_count as _, + index_type, + index_offset as i32, + instance_count as i32, + base_vertex, + ) } } } diff --git a/wgpu-hal/src/gles/wgl.rs b/wgpu-hal/src/gles/wgl.rs index 2564892969..aae70478b4 100644 --- a/wgpu-hal/src/gles/wgl.rs +++ b/wgpu-hal/src/gles/wgl.rs @@ -507,6 +507,8 @@ impl crate::Instance for Instance { .supported_extensions() .contains("GL_ARB_framebuffer_sRGB"); + // In contrast to OpenGL ES, OpenGL requires explicitly enabling sRGB conversions, + // as otherwise the user has to do the sRGB conversion. 
if srgb_capable { unsafe { gl.enable(glow::FRAMEBUFFER_SRGB) }; } diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index ddcb0634fe..d300ca30cc 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -3,14 +3,14 @@ * This crate defines a set of traits abstracting over modern graphics APIs, * with implementations ("backends") for Vulkan, Metal, Direct3D, and GL. * - * `wgpu_hal` is a spiritual successor to + * `wgpu-hal` is a spiritual successor to * [gfx-hal](https://github.com/gfx-rs/gfx), but with reduced scope, and * oriented towards WebGPU implementation goals. It has no overhead for * validation or tracking, and the API translation overhead is kept to the bare * minimum by the design of WebGPU. This API can be used for resource-demanding * applications and engines. * - * The `wgpu_hal` crate's main design choices: + * The `wgpu-hal` crate's main design choices: * * - Our traits are meant to be *portable*: proper use * should get equivalent results regardless of the backend. @@ -19,7 +19,7 @@ * validation, if any, and incorrect use will often cause undefined behavior. * This allows us to minimize the overhead we impose over the underlying * graphics system. If you need safety, the [`wgpu-core`] crate provides a - * safe API for driving `wgpu_hal`, implementing all necessary validation, + * safe API for driving `wgpu-hal`, implementing all necessary validation, * resource state tracking, and so on. (Note that `wgpu-core` is designed for * use via FFI; the [`wgpu`] crate provides more idiomatic Rust bindings for * `wgpu-core`.) Or, you can do your own validation. @@ -27,7 +27,7 @@ * - In the same vein, returned errors *only cover cases the user can't * anticipate*, like running out of memory or losing the device. Any errors * that the user could reasonably anticipate are their responsibility to - * avoid. For example, `wgpu_hal` returns no error for mapping a buffer that's + * avoid. For example, `wgpu-hal` returns no error for mapping a buffer that's * not mappable: as the buffer creator, the user should already know if they * can map it. * @@ -43,7 +43,7 @@ * - We map buffer contents *persistently*. This means that the buffer * can remain mapped on the CPU while the GPU reads or writes to it. * You must explicitly indicate when data might need to be - * transferred between CPU and GPU, if `wgpu_hal` indicates that the + * transferred between CPU and GPU, if `wgpu-hal` indicates that the * mapping is not coherent (that is, automatically synchronized * between the two devices). * @@ -62,7 +62,7 @@ * function documentation. For this reason, we recommend that iterators don't * do any mutating work. * - * Unfortunately, `wgpu_hal`'s safety requirements are not fully documented. + * Unfortunately, `wgpu-hal`'s safety requirements are not fully documented. * Ideally, all trait methods would have doc comments setting out the * requirements users must meet to ensure correct and portable behavior. 
If you * are aware of a specific requirement that a backend imposes that is not @@ -76,7 +76,7 @@ * * ## Primary backends * - * The `wgpu_hal` crate has full-featured backends implemented on the following + * The `wgpu-hal` crate has full-featured backends implemented on the following * platform graphics APIs: * * - Vulkan, available on Linux, Android, and Windows, using the [`ash`] crate's @@ -93,7 +93,7 @@ * * ## Secondary backends * - * The `wgpu_hal` crate has a partial implementation based on the following + * The `wgpu-hal` crate has a partial implementation based on the following * platform graphics API: * * - The GL backend is available anywhere OpenGL, OpenGL ES, or WebGL are @@ -110,6 +110,92 @@ * * [tdc]: wgt::DownlevelCapabilities * + * ## Traits + * + * The `wgpu-hal` crate defines a handful of traits that together + * represent a cross-platform abstraction for modern GPU APIs. + * + * - The [`Api`] trait represents a `wgpu-hal` backend. It has no methods of its + * own, only a collection of associated types. + * + * - [`Api::Instance`] implements the [`Instance`] trait. [`Instance::init`] + * creates an instance value, which you can use to enumerate the adapters + * available on the system. For example, [`vulkan::Api::Instance::init`][Ii] + * returns an instance that can enumerate the Vulkan physical devices on your + * system. + * + * - [`Api::Adapter`] implements the [`Adapter`] trait, representing a + * particular device from a particular backend. For example, a Vulkan instance + * might have a Lavapipe software adapter and a GPU-based adapter. + * + * - [`Api::Device`] implements the [`Device`] trait, representing an active + * link to a device. You get a device value by calling [`Adapter::open`], and + * then use it to create buffers, textures, shader modules, and so on. + * + * - [`Api::Queue`] implements the [`Queue`] trait, which you use to submit + * command buffers to a given device. + * + * - [`Api::CommandEncoder`] implements the [`CommandEncoder`] trait, which you + * use to build buffers of commands to submit to a queue. This has all the + * methods for drawing and running compute shaders, which is presumably what + * you're here for. + * + * - [`Api::Surface`] implements the [`Surface`] trait, which represents a + * swapchain for presenting images on the screen, via interaction with the + * system's window manager. + * + * The [`Api`] trait has various other associated types like [`Api::Buffer`] and + * [`Api::Texture`] that represent resources the rest of the interface can + * operate on, but these generally do not have their own traits. + * + * [Ii]: Instance::init + * + * ## Validation is the calling code's responsibility, not `wgpu-hal`'s + * + * As much as possible, `wgpu-hal` traits place the burden of validation, + * resource tracking, and state tracking on the caller, not on the trait + * implementations themselves. Anything which can reasonably be handled in + * backend-independent code should be. A `wgpu_hal` backend's sole obligation is + * to provide portable behavior, and report conditions that the calling code + * can't reasonably anticipate, like device loss or running out of memory. + * + * The `wgpu` crate collection is intended for use in security-sensitive + * applications, like web browsers, where the API is available to untrusted + * code. This means that `wgpu-core`'s validation is not simply a service to + * developers, to be provided opportunistically when the performance costs are + * acceptable and the necessary data is ready at hand. 
Rather, `wgpu-core`'s + * validation must be exhaustive, to ensure that even malicious content cannot + * provoke and exploit undefined behavior in the platform's graphics API. + * + * Because graphics APIs' requirements are complex, the only practical way for + * `wgpu` to provide exhaustive validation is to comprehensively track the + * lifetime and state of all the resources in the system. Implementing this + * separately for each backend is infeasible; effort would be better spent + * making the cross-platform validation in `wgpu-core` legible and trustworthy. + * Fortunately, the requirements are largely similar across the various + * platforms, so cross-platform validation is practical. + * + * Some backends have specific requirements that aren't practical to foist off + * on the `wgpu-hal` user. For example, properly managing macOS Objective-C or + * Microsoft COM reference counts is best handled by using appropriate pointer + * types within the backend. + * + * A desire for "defense in depth" may suggest performing additional validation + * in `wgpu-hal` when the opportunity arises, but this must be done with + * caution. Even experienced contributors infer the expectations their changes + * must meet by considering not just requirements made explicit in types, tests, + * assertions, and comments, but also those implicit in the surrounding code. + * When one sees validation or state-tracking code in `wgpu-hal`, it is tempting + * to conclude, "Oh, `wgpu-hal` checks for this, so `wgpu-core` needn't worry + * about it - that would be redundant!" The responsibility for exhaustive + * validation always rests with `wgpu-core`, regardless of what may or may not + * be checked in `wgpu-hal`. + * + * To this end, any "defense in depth" validation that does appear in `wgpu-hal` + * for requirements that `wgpu-core` should have enforced should report failure + * via the `unreachable!` macro, because problems detected at this stage always + * indicate a bug in `wgpu-core`. + * * ## Debugging * * Most of the information on the wiki [Debugging wgpu Applications][wiki-debug] @@ -303,6 +389,15 @@ pub trait Api: Clone + fmt::Debug + Sized { type Queue: Queue; type CommandEncoder: CommandEncoder; + + /// This API's command buffer type. + /// + /// The only thing you can do with `CommandBuffer`s is build them + /// with a [`CommandEncoder`] and then pass them to + /// [`Queue::submit`] for execution, or destroy them by passing + /// them to [`CommandEncoder::reset_all`]. + /// + /// [`CommandEncoder`]: Api::CommandEncoder type CommandBuffer: WasmNotSendSync + fmt::Debug; type Buffer: fmt::Debug + WasmNotSendSync + 'static; @@ -311,6 +406,24 @@ pub trait Api: Clone + fmt::Debug + Sized { type TextureView: fmt::Debug + WasmNotSendSync; type Sampler: fmt::Debug + WasmNotSendSync; type QuerySet: fmt::Debug + WasmNotSendSync; + + /// A value you can block on to wait for something to finish. + /// + /// A `Fence` holds a monotonically increasing [`FenceValue`]. You can call + /// [`Device::wait`] to block until a fence reaches or passes a value you + /// choose. [`Queue::submit`] can take a `Fence` and a [`FenceValue`] to + /// store in it when the submitted work is complete. + /// + /// Attempting to set a fence to a value less than its current value has no + /// effect. + /// + /// Waiting on a fence returns as soon as the fence reaches *or passes* the + /// requested value. 
This implies that, in order to reliably determine when + /// an operation has completed, operations must finish in order of + /// increasing fence values: if a higher-valued operation were to finish + /// before a lower-valued operation, then waiting for the fence to reach the + /// lower value could return before the lower-valued operation has actually + /// finished. type Fence: fmt::Debug + WasmNotSendSync; type BindGroupLayout: fmt::Debug + WasmNotSendSync; @@ -510,7 +623,25 @@ pub trait Device: WasmNotSendSync { &self, fence: &::Fence, ) -> Result; - /// Calling wait with a lower value than the current fence value will immediately return. + + /// Wait for `fence` to reach `value`. + /// + /// Operations like [`Queue::submit`] can accept a [`Fence`] and a + /// [`FenceValue`] to store in it, so you can use this `wait` function + /// to wait for a given queue submission to finish execution. + /// + /// The `value` argument must be a value that some actual operation you have + /// already presented to the device is going to store in `fence`. You cannot + /// wait for values yet to be submitted. (This restriction accommodates + /// implementations like the `vulkan` backend's [`FencePool`] that must + /// allocate a distinct synchronization object for each fence value one is + /// able to wait for.) + /// + /// Calling `wait` with a lower [`FenceValue`] than `fence`'s current value + /// returns immediately. + /// + /// [`Fence`]: Api::Fence + /// [`FencePool`]: vulkan/enum.Fence.html#variant.FencePool unsafe fn wait( &self, fence: &::Fence, @@ -542,14 +673,48 @@ pub trait Device: WasmNotSendSync { pub trait Queue: WasmNotSendSync { type A: Api; - /// Submits the command buffers for execution on GPU. + /// Submit `command_buffers` for execution on GPU. + /// + /// If `signal_fence` is `Some(fence, value)`, update `fence` to `value` + /// when the operation is complete. See [`Fence`] for details. + /// + /// If two calls to `submit` on a single `Queue` occur in a particular order + /// (that is, they happen on the same thread, or on two threads that have + /// synchronized to establish an ordering), then the first submission's + /// commands all complete execution before any of the second submission's + /// commands begin. All results produced by one submission are visible to + /// the next. + /// + /// Within a submission, command buffers execute in the order in which they + /// appear in `command_buffers`. All results produced by one buffer are + /// visible to the next. + /// + /// If two calls to `submit` on a single `Queue` from different threads are + /// not synchronized to occur in a particular order, they must pass distinct + /// [`Fence`]s. As explained in the [`Fence`] documentation, waiting for + /// operations to complete is only trustworthy when operations finish in + /// order of increasing fence value, but submissions from different threads + /// cannot determine how to order the fence values if the submissions + /// themselves are unordered. If each thread uses a separate [`Fence`], this + /// problem does not arise. /// /// Valid usage: - /// - all of the command buffers were created from command pools - /// that are associated with this queue. - /// - all of the command buffers had `CommandBuffer::finish()` called. - /// - all surface textures that the command buffers write to must be - /// passed to the surface_textures argument. + /// + /// - All of the [`CommandBuffer`][cb]s were created from + /// [`CommandEncoder`][ce]s that are associated with this queue. 
+    ///
+    /// - All of those [`CommandBuffer`][cb]s must remain alive until
+    ///   the submitted commands have finished execution. (Since
+    ///   command buffers must not outlive their encoders, this
+    ///   implies that the encoders must remain alive as well.)
+    ///
+    /// - All of the [`SurfaceTexture`][st]s that the command buffers
+    ///   write to appear in the `surface_textures` argument.
+    ///
+    /// [`Fence`]: Api::Fence
+    /// [cb]: Api::CommandBuffer
+    /// [ce]: Api::CommandEncoder
+    /// [st]: Api::SurfaceTexture
     unsafe fn submit(
         &self,
         command_buffers: &[&<Self::A as Api>::CommandBuffer],
@@ -564,7 +729,12 @@ pub trait Queue: WasmNotSendSync {
     unsafe fn get_timestamp_period(&self) -> f32;
 }
 
-/// Encoder and allocation pool for `CommandBuffer`.
+/// Encoder and allocation pool for `CommandBuffer`s.
+///
+/// A `CommandEncoder` not only constructs `CommandBuffer`s but also
+/// acts as the allocation pool that owns the buffers' underlying
+/// storage. Thus, `CommandBuffer`s must not outlive the
+/// `CommandEncoder` that created them.
 ///
 /// The life cycle of a `CommandBuffer` is as follows:
 ///
@@ -577,14 +747,17 @@
 ///
 /// - Call methods like `copy_buffer_to_buffer`, `begin_render_pass`,
 ///   etc. on a "recording" `CommandEncoder` to add commands to the
-///   list.
+///   list. (If an error occurs, you must call `discard_encoding`; see
+///   below.)
 ///
 /// - Call `end_encoding` on a recording `CommandEncoder` to close the
 ///   encoder and construct a fresh `CommandBuffer` consisting of the
 ///   list of commands recorded up to that point.
 ///
 /// - Call `discard_encoding` on a recording `CommandEncoder` to drop
-///   the commands recorded thus far and close the encoder.
+///   the commands recorded thus far and close the encoder. This is
+///   the only safe thing to do on a `CommandEncoder` if an error has
+///   occurred while recording commands.
 ///
 /// - Call `reset_all` on a closed `CommandEncoder`, passing all the
 ///   live `CommandBuffers` built from it. All the `CommandBuffer`s
@@ -602,6 +775,10 @@
 ///   built it.
 ///
 /// - A `CommandEncoder` must not outlive its `Device`.
+///
+/// It is the user's responsibility to meet these requirements. This
+/// allows `CommandEncoder` implementations to keep their state
+/// tracking to a minimum.
 pub trait CommandEncoder: WasmNotSendSync + fmt::Debug {
     type A: Api;
 
@@ -614,13 +791,20 @@
     /// This `CommandEncoder` must be in the "closed" state.
     unsafe fn begin_encoding(&mut self, label: Label) -> Result<(), DeviceError>;
 
-    /// Discard the command list under construction, if any.
+    /// Discard the command list under construction.
+    ///
+    /// If an error has occurred while recording commands, this
+    /// is the only safe thing to do with the encoder.
     ///
     /// This puts this `CommandEncoder` in the "closed" state.
     ///
     /// # Safety
     ///
     /// This `CommandEncoder` must be in the "recording" state.
+    ///
+    /// Callers must not assume that implementations of this
+    /// function are idempotent, and thus should not call it
+    /// multiple times in a row.
     unsafe fn discard_encoding(&mut self);
 
     /// Return a fresh [`CommandBuffer`] holding the recorded commands.
@@ -1425,6 +1609,11 @@ pub struct ProgrammableStage<'a, A: Api> {
     pub entry_point: &'a str,
     /// Pipeline constants
     pub constants: &'a naga::back::PipelineConstants,
+    /// Whether workgroup scoped memory will be initialized with zero values for this stage.
+ /// + /// This is required by the WebGPU spec, but may have overhead which can be avoided + /// for cross-platform applications + pub zero_initialize_workgroup_memory: bool, } // Rust gets confused about the impl requirements for `A` @@ -1434,6 +1623,7 @@ impl Clone for ProgrammableStage<'_, A> { module: self.module, entry_point: self.entry_point, constants: self.constants, + zero_initialize_workgroup_memory: self.zero_initialize_workgroup_memory, } } } diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index b67d5c6f97..cddba472bd 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -813,6 +813,14 @@ impl super::PrivateCapabilities { None }, timestamp_query_support, + supports_simd_scoped_operations: family_check + && (device.supports_family(MTLGPUFamily::Metal3) + || device.supports_family(MTLGPUFamily::Mac2) + || device.supports_family(MTLGPUFamily::Apple7)), + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf#page=5 + int64: family_check + && (device.supports_family(MTLGPUFamily::Apple3) + || device.supports_family(MTLGPUFamily::Metal3)), } } @@ -886,7 +894,7 @@ impl super::PrivateCapabilities { } features.set( F::SHADER_INT64, - self.msl_version >= MTLLanguageVersion::V2_3, + self.int64 && self.msl_version >= MTLLanguageVersion::V2_3, ); features.set( @@ -898,6 +906,10 @@ impl super::PrivateCapabilities { features.set(F::RG11B10UFLOAT_RENDERABLE, self.format_rg11b10_all); features.set(F::SHADER_UNUSED_VERTEX_OUTPUT, true); + if self.supports_simd_scoped_operations { + features.insert(F::SUBGROUP | F::SUBGROUP_BARRIER); + } + features } @@ -952,6 +964,8 @@ impl super::PrivateCapabilities { max_vertex_buffers: self.max_vertex_buffers, max_vertex_attributes: 31, max_vertex_buffer_array_stride: base.max_vertex_buffer_array_stride, + min_subgroup_size: 4, + max_subgroup_size: 64, max_push_constant_size: 0x1000, min_uniform_buffer_offset_alignment: self.buffer_alignment as u32, min_storage_buffer_offset_alignment: self.buffer_alignment as u32, diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 0906d21510..2c8f5a2bfb 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -112,7 +112,7 @@ impl super::Device { // TODO: support bounds checks on binding arrays binding_array: naga::proc::BoundsCheckPolicy::Unchecked, }, - zero_initialize_workgroup_memory: true, + zero_initialize_workgroup_memory: stage.zero_initialize_workgroup_memory, }; let pipeline_options = naga::back::msl::PipelineOptions { diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs index 6aeafb0f86..7d547cfe3c 100644 --- a/wgpu-hal/src/metal/mod.rs +++ b/wgpu-hal/src/metal/mod.rs @@ -269,6 +269,8 @@ struct PrivateCapabilities { supports_shader_primitive_index: bool, has_unified_memory: Option, timestamp_query_support: TimestampQuerySupport, + supports_simd_scoped_operations: bool, + int64: bool, } #[derive(Clone, Debug)] @@ -649,7 +651,7 @@ struct BufferResource { /// Buffers with the [`wgt::BufferBindingType::Storage`] binding type can /// hold WGSL runtime-sized arrays. When one does, we must pass its size to /// shader entry points to implement bounds checks and WGSL's `arrayLength` - /// function. See [`device::CompiledShader::sized_bindings`] for details. + /// function. See `device::CompiledShader::sized_bindings` for details. 
     ///
     /// [`Storage`]: wgt::BufferBindingType::Storage
     binding_size: Option<wgt::BufferSize>,
@@ -680,12 +682,12 @@ struct PipelineStageInfo {
     /// The buffer argument table index at which we pass runtime-sized arrays' buffer sizes.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     sizes_slot: Option<naga::back::msl::Slot>,
 
     /// Bindings of all WGSL `storage` globals that contain runtime-sized arrays.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     sized_bindings: Vec<naga::ResourceBinding>,
 }
 
@@ -801,7 +803,7 @@ struct CommandState {
     ///
     /// Specifically:
     ///
-    /// - The keys are ['ResourceBinding`] values (that is, the WGSL `@group`
+    /// - The keys are [`ResourceBinding`] values (that is, the WGSL `@group`
     ///   and `@binding` attributes) for `var` global variables in the
     ///   current module that contain runtime-sized arrays.
     ///
@@ -813,7 +815,7 @@ struct CommandState {
     /// of the buffers listed in [`stage_infos.S.sized_bindings`], which we must
     /// pass to the entry point.
     ///
-    /// See [`device::CompiledShader::sized_bindings`] for more details.
+    /// See `device::CompiledShader::sized_bindings` for more details.
     ///
     /// [`ResourceBinding`]: naga::ResourceBinding
     storage_buffer_length_map: rustc_hash::FxHashMap<naga::ResourceBinding, wgt::BufferSize>,
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index 245c0f0933..f1700d7f55 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -35,6 +35,8 @@ fn indexing_features() -> wgt::Features {
 /// [`PhysicalDeviceFeatures::from_extensions_and_requested_features`]
 /// constructs a value of this type indicating which Vulkan features to
 /// enable, based on the `wgpu_types::Features` requested.
+///
+/// [`Instance::expose_adapter`]: super::Instance::expose_adapter
 #[derive(Debug, Default)]
 pub struct PhysicalDeviceFeatures {
     /// Basic Vulkan 1.0 features.
@@ -86,6 +88,9 @@ pub struct PhysicalDeviceFeatures {
     ///
     /// However, we do populate this when creating a device if
     /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`] is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
+    /// [`Features::RAY_TRACING_ACCELERATION_STRUCTURE`]: wgt::Features::RAY_TRACING_ACCELERATION_STRUCTURE
     buffer_device_address: Option<vk::PhysicalDeviceBufferDeviceAddressFeaturesKHR>,
 
     /// Features provided by `VK_KHR_ray_query`,
@@ -95,12 +100,17 @@ pub struct PhysicalDeviceFeatures {
     /// this from `vkGetPhysicalDeviceFeatures2`.
     ///
     /// However, we do populate this when creating a device if ray tracing is requested.
+    ///
+    /// [`Instance::expose_adapter`]: super::Instance::expose_adapter
     ray_query: Option<vk::PhysicalDeviceRayQueryFeaturesKHR>,
 
     /// Features provided by `VK_KHR_zero_initialize_workgroup_memory`, promoted
     /// to Vulkan 1.3.
     zero_initialize_workgroup_memory: Option<vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures>,
+
+    /// Features provided by `VK_EXT_subgroup_size_control`, promoted to Vulkan 1.3.
+    subgroup_size_control: Option<vk::PhysicalDeviceSubgroupSizeControlFeatures>,
 }
 
 // This is safe because the structs have `p_next: *mut c_void`, which we null out/never read.
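For context, the `Option<vk::PhysicalDevice…Features>` fields above all participate in Vulkan's `p_next` chaining idiom, which the hunks below rely on: each extension struct is linked into a `vk::PhysicalDeviceFeatures2` chain, and a single query fills them all in. A rough sketch of that idiom with ash 0.37-style builders (`supports_subgroup_size_control` is a hypothetical helper name; real code must also confirm Vulkan 1.1+ or the extension before trusting the result, as the surrounding diff does):

```rust
use ash::vk;

/// Hypothetical helper, sketching the p_next chain used in the diff below.
/// Assumes the instance was created with Vulkan API version >= 1.1.
unsafe fn supports_subgroup_size_control(
    instance: &ash::Instance,
    phd: vk::PhysicalDevice,
) -> bool {
    // Extension struct to be filled in by the driver.
    let mut subgroup_size_control = vk::PhysicalDeviceSubgroupSizeControlFeatures::default();
    // `push_next` links the struct into `features2.p_next`.
    let mut features2 = vk::PhysicalDeviceFeatures2::builder()
        .push_next(&mut subgroup_size_control)
        .build();
    // One call walks the whole chain and populates every linked struct.
    instance.get_physical_device_features2(phd, &mut features2);
    subgroup_size_control.subgroup_size_control == vk::TRUE
}
```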
@@ -148,6 +158,9 @@ impl PhysicalDeviceFeatures { if let Some(ref mut feature) = self.ray_query { info = info.push_next(feature); } + if let Some(ref mut feature) = self.subgroup_size_control { + info = info.push_next(feature); + } info } @@ -175,6 +188,7 @@ impl PhysicalDeviceFeatures { /// [`Features`]: wgt::Features /// [`DownlevelFlags`]: wgt::DownlevelFlags /// [`PrivateCapabilities`]: super::PrivateCapabilities + /// [`add_to_device_create_builder`]: PhysicalDeviceFeatures::add_to_device_create_builder /// [`DeviceCreateInfoBuilder`]: vk::DeviceCreateInfoBuilder /// [`Adapter::required_device_extensions`]: super::Adapter::required_device_extensions fn from_extensions_and_requested_features( @@ -434,6 +448,17 @@ impl PhysicalDeviceFeatures { } else { None }, + subgroup_size_control: if device_api_version >= vk::API_VERSION_1_3 + || enabled_extensions.contains(&vk::ExtSubgroupSizeControlFn::name()) + { + Some( + vk::PhysicalDeviceSubgroupSizeControlFeatures::builder() + .subgroup_size_control(true) + .build(), + ) + } else { + None + }, } } @@ -442,6 +467,9 @@ impl PhysicalDeviceFeatures { /// Given `self`, together with the instance and physical device it was /// built from, and a `caps` also built from those, determine which wgpu /// features and downlevel flags the device can support. + /// + /// [`Features`]: wgt::Features + /// [`DownlevelFlags`]: wgt::DownlevelFlags fn to_wgpu( &self, instance: &ash::Instance, @@ -638,6 +666,34 @@ impl PhysicalDeviceFeatures { ); } + if let Some(ref subgroup) = caps.subgroup { + if (caps.device_api_version >= vk::API_VERSION_1_3 + || caps.supports_extension(vk::ExtSubgroupSizeControlFn::name())) + && subgroup.supported_operations.contains( + vk::SubgroupFeatureFlags::BASIC + | vk::SubgroupFeatureFlags::VOTE + | vk::SubgroupFeatureFlags::ARITHMETIC + | vk::SubgroupFeatureFlags::BALLOT + | vk::SubgroupFeatureFlags::SHUFFLE + | vk::SubgroupFeatureFlags::SHUFFLE_RELATIVE, + ) + { + features.set( + F::SUBGROUP, + subgroup + .supported_stages + .contains(vk::ShaderStageFlags::COMPUTE | vk::ShaderStageFlags::FRAGMENT), + ); + features.set( + F::SUBGROUP_VERTEX, + subgroup + .supported_stages + .contains(vk::ShaderStageFlags::VERTEX), + ); + features.insert(F::SUBGROUP_BARRIER); + } + } + let supports_depth_format = |format| { supports_format( instance, @@ -773,6 +829,13 @@ pub struct PhysicalDeviceProperties { /// `VK_KHR_driver_properties` extension, promoted to Vulkan 1.2. driver: Option, + /// Additional `vk::PhysicalDevice` properties from Vulkan 1.1. + subgroup: Option, + + /// Additional `vk::PhysicalDevice` properties from the + /// `VK_EXT_subgroup_size_control` extension, promoted to Vulkan 1.3. + subgroup_size_control: Option, + /// The device API version. /// /// Which is the version of Vulkan supported for device-level functionality. 
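At the user-facing level, these properties surface as the new `min_subgroup_size`/`max_subgroup_size` limits and the `SUBGROUP` family of features. A sketch of how an application might gate subgroup use (`subgroups_usable` is a hypothetical helper; it assumes a `wgpu::Adapter` is in hand):

```rust
fn subgroups_usable(adapter: &wgpu::Adapter) -> bool {
    let limits = adapter.limits();
    // Backends that cannot report subgroup sizes (e.g. GL) expose 0/0,
    // so check the range in addition to the feature bit.
    adapter.features().contains(wgpu::Features::SUBGROUP)
        && limits.min_subgroup_size > 0
        && limits.min_subgroup_size <= limits.max_subgroup_size
}
```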
@@ -888,6 +951,11 @@ impl PhysicalDeviceProperties { if self.supports_extension(vk::ExtImageRobustnessFn::name()) { extensions.push(vk::ExtImageRobustnessFn::name()); } + + // Require `VK_EXT_subgroup_size_control` if the associated feature was requested + if requested_features.contains(wgt::Features::SUBGROUP) { + extensions.push(vk::ExtSubgroupSizeControlFn::name()); + } } // Optional `VK_KHR_swapchain_mutable_format` @@ -987,6 +1055,14 @@ impl PhysicalDeviceProperties { .min(crate::MAX_VERTEX_BUFFERS as u32), max_vertex_attributes: limits.max_vertex_input_attributes, max_vertex_buffer_array_stride: limits.max_vertex_input_binding_stride, + min_subgroup_size: self + .subgroup_size_control + .map(|subgroup_size| subgroup_size.min_subgroup_size) + .unwrap_or(0), + max_subgroup_size: self + .subgroup_size_control + .map(|subgroup_size| subgroup_size.max_subgroup_size) + .unwrap_or(0), max_push_constant_size: limits.max_push_constants_size, min_uniform_buffer_offset_alignment: limits.min_uniform_buffer_offset_alignment as u32, min_storage_buffer_offset_alignment: limits.min_storage_buffer_offset_alignment as u32, @@ -1042,6 +1118,9 @@ impl super::InstanceShared { let supports_driver_properties = capabilities.device_api_version >= vk::API_VERSION_1_2 || capabilities.supports_extension(vk::KhrDriverPropertiesFn::name()); + let supports_subgroup_size_control = capabilities.device_api_version + >= vk::API_VERSION_1_3 + || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name()); let supports_acceleration_structure = capabilities.supports_extension(vk::KhrAccelerationStructureFn::name()); @@ -1075,6 +1154,20 @@ impl super::InstanceShared { builder = builder.push_next(next); } + if capabilities.device_api_version >= vk::API_VERSION_1_1 { + let next = capabilities + .subgroup + .insert(vk::PhysicalDeviceSubgroupProperties::default()); + builder = builder.push_next(next); + } + + if supports_subgroup_size_control { + let next = capabilities + .subgroup_size_control + .insert(vk::PhysicalDeviceSubgroupSizeControlProperties::default()); + builder = builder.push_next(next); + } + let mut properties2 = builder.build(); unsafe { get_device_properties.get_physical_device_properties2(phd, &mut properties2); @@ -1190,6 +1283,16 @@ impl super::InstanceShared { builder = builder.push_next(next); } + // `VK_EXT_subgroup_size_control` is promoted to 1.3 + if capabilities.device_api_version >= vk::API_VERSION_1_3 + || capabilities.supports_extension(vk::ExtSubgroupSizeControlFn::name()) + { + let next = features + .subgroup_size_control + .insert(vk::PhysicalDeviceSubgroupSizeControlFeatures::default()); + builder = builder.push_next(next); + } + let mut features2 = builder.build(); unsafe { get_device_properties.get_physical_device_features2(phd, &mut features2); @@ -1382,6 +1485,9 @@ impl super::Instance { }), image_format_list: phd_capabilities.device_api_version >= vk::API_VERSION_1_2 || phd_capabilities.supports_extension(vk::KhrImageFormatListFn::name()), + subgroup_size_control: phd_features + .subgroup_size_control + .map_or(false, |ext| ext.subgroup_size_control == vk::TRUE), }; let capabilities = crate::Capabilities { limits: phd_capabilities.to_wgpu_limits(), @@ -1581,6 +1687,15 @@ impl super::Adapter { capabilities.push(spv::Capability::Geometry); } + if features.intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX) { + capabilities.push(spv::Capability::GroupNonUniform); + capabilities.push(spv::Capability::GroupNonUniformVote); + 
capabilities.push(spv::Capability::GroupNonUniformArithmetic); + capabilities.push(spv::Capability::GroupNonUniformBallot); + capabilities.push(spv::Capability::GroupNonUniformShuffle); + capabilities.push(spv::Capability::GroupNonUniformShuffleRelative); + } + if features.intersects( wgt::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING | wgt::Features::UNIFORM_BUFFER_AND_STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING, @@ -1619,7 +1734,13 @@ impl super::Adapter { capabilities.push(spv::Capability::RayQueryKHR); } spv::Options { - lang_version: (1, 0), + lang_version: if features + .intersects(wgt::Features::SUBGROUP | wgt::Features::SUBGROUP_VERTEX) + { + (1, 3) + } else { + (1, 0) + }, flags, capabilities: Some(capabilities.iter().cloned().collect()), bounds_check_policies: naga::proc::BoundsCheckPolicies { diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs index 43a2471954..ceb44dfbe6 100644 --- a/wgpu-hal/src/vulkan/command.rs +++ b/wgpu-hal/src/vulkan/command.rs @@ -104,6 +104,11 @@ impl crate::CommandEncoder for super::CommandEncoder { } unsafe fn discard_encoding(&mut self) { + // Safe use requires this is not called in the "closed" state, so the buffer + // shouldn't be null. Assert this to make sure we're not pushing null + // buffers to the discard pile. + assert_ne!(self.active, vk::CommandBuffer::null()); + self.discarded.push(self.active); self.active = vk::CommandBuffer::null(); } diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs index 52b899900f..ec392533a0 100644 --- a/wgpu-hal/src/vulkan/device.rs +++ b/wgpu-hal/src/vulkan/device.rs @@ -2,6 +2,7 @@ use super::conv; use arrayvec::ArrayVec; use ash::{extensions::khr, vk}; +use naga::back::spv::ZeroInitializeWorkgroupMemoryMode; use parking_lot::Mutex; use std::{ @@ -737,7 +738,8 @@ impl super::Device { }; let needs_temp_options = !runtime_checks || !binding_map.is_empty() - || naga_shader.debug_source.is_some(); + || naga_shader.debug_source.is_some() + || !stage.zero_initialize_workgroup_memory; let mut temp_options; let options = if needs_temp_options { temp_options = self.naga_options.clone(); @@ -760,6 +762,10 @@ impl super::Device { file_name: debug.file_name.as_ref().as_ref(), }) } + if !stage.zero_initialize_workgroup_memory { + temp_options.zero_initialize_workgroup_memory = + ZeroInitializeWorkgroupMemoryMode::None; + } &temp_options } else { @@ -782,8 +788,14 @@ impl super::Device { } }; + let mut flags = vk::PipelineShaderStageCreateFlags::empty(); + if self.shared.private_caps.subgroup_size_control { + flags |= vk::PipelineShaderStageCreateFlags::ALLOW_VARYING_SUBGROUP_SIZE + } + let entry_point = CString::new(stage.entry_point).unwrap(); let create_info = vk::PipelineShaderStageCreateInfo::builder() + .flags(flags) .stage(conv::map_shader_stage(stage_flags)) .module(vk_module) .name(&entry_point) diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index d969c887d5..d1ea82772e 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -238,6 +238,7 @@ struct PrivateCapabilities { robust_image_access2: bool, zero_initialize_workgroup_memory: bool, image_format_list: bool, + subgroup_size_control: bool, } bitflags::bitflags!( @@ -447,6 +448,7 @@ pub struct BindGroup { set: gpu_descriptor::DescriptorSet, } +/// Miscellaneous allocation recycling pool for `CommandAllocator`. 
#[derive(Default)] struct Temp { marker: Vec<u8>, @@ -476,11 +478,31 @@ impl Temp { pub struct CommandEncoder { raw: vk::CommandPool, device: Arc<DeviceShared>, + + /// The current command buffer, if `self` is in the ["recording"] + /// state. + /// + /// ["recording"]: crate::CommandEncoder + /// + /// If non-`null`, the buffer is in the Vulkan "recording" state. active: vk::CommandBuffer, + + /// What kind of pass we are currently within: compute or render. bind_point: vk::PipelineBindPoint, + + /// Allocation recycling pool for this encoder. temp: Temp, + + /// A pool of available command buffers. + /// + /// These are all in the Vulkan "initial" state. free: Vec<vk::CommandBuffer>, + + /// A pool of discarded command buffers. + /// + /// These could be in any Vulkan state except "pending". discarded: Vec<vk::CommandBuffer>, + /// If this is true, the active renderpass enabled a debug span, /// and needs to be disabled on renderpass close. rpass_debug_marker_active: bool, @@ -537,9 +559,47 @@ pub struct QuerySet { raw: vk::QueryPool, } +/// The [`Api::Fence`] type for [`vulkan::Api`]. +/// +/// This is an `enum` because there are two possible implementations of +/// `wgpu-hal` fences on Vulkan: Vulkan fences, which work on any version of +/// Vulkan, and Vulkan timeline semaphores, which are easier and cheaper but +/// require non-1.0 features. +/// +/// [`Device::create_fence`] returns a [`TimelineSemaphore`] if +/// [`VK_KHR_timeline_semaphore`] is available and enabled, and a [`FencePool`] +/// otherwise. +/// +/// [`Api::Fence`]: crate::Api::Fence +/// [`vulkan::Api`]: Api +/// [`Device::create_fence`]: crate::Device::create_fence +/// [`TimelineSemaphore`]: Fence::TimelineSemaphore +/// [`VK_KHR_timeline_semaphore`]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VK_KHR_timeline_semaphore +/// [`FencePool`]: Fence::FencePool #[derive(Debug)] pub enum Fence { + /// A Vulkan [timeline semaphore]. + /// + /// These are simpler to use than Vulkan fences, since timeline semaphores + /// work exactly the way [`wgpu_hal::Api::Fence`] is specified to work. + /// + /// [timeline semaphore]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-semaphores + /// [`wgpu_hal::Api::Fence`]: crate::Api::Fence TimelineSemaphore(vk::Semaphore), + + /// A collection of Vulkan [fence]s, each associated with a [`FenceValue`]. + /// + /// The effective [`FenceValue`] of this variant is the greater of + /// `last_completed` and the maximum value associated with a signalled fence + /// in `active`. + /// + /// Fences are available in all versions of Vulkan, but since they only have + /// two states, "signaled" and "unsignaled", we need to use a separate fence + /// for each queue submission we might want to wait for, and remember which + /// [`FenceValue`] each one represents. + /// + /// [fence]: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-fences + /// [`FenceValue`]: crate::FenceValue FencePool { last_completed: crate::FenceValue, /// The pending fence values have to be ascending. @@ -549,21 +609,32 @@ pub enum Fence { } impl Fence { + /// Return the highest [`FenceValue`] among the signalled fences in `active`. + /// + /// As an optimization, assume that we already know that the fence has + /// reached `last_completed`, and don't bother checking fences whose values + /// are less than that: those fences remain in the `active` array only + /// because we haven't called `maintain` yet to clean them up.
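+ /// For example (hypothetical values): with `last_completed = 2` and
+ /// `active = [(1, a), (3, b), (4, c)]`, only `b` and `c` are queried;
+ /// if `b` is signalled and `c` is not, the result is `3`.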
+ /// + /// [`FenceValue`]: crate::FenceValue fn check_active( device: &ash::Device, - mut max_value: crate::FenceValue, + mut last_completed: crate::FenceValue, active: &[(crate::FenceValue, vk::Fence)], ) -> Result<crate::FenceValue, crate::DeviceError> { for &(value, raw) in active.iter() { unsafe { - if value > max_value && device.get_fence_status(raw)? { - max_value = value; + if value > last_completed && device.get_fence_status(raw)? { + last_completed = value; } } } - Ok(max_value) + Ok(last_completed) } + /// Return the highest signalled [`FenceValue`] for `self`. + /// + /// [`FenceValue`]: crate::FenceValue fn get_latest( &self, device: &ash::Device, @@ -584,6 +655,18 @@ impl Fence { } } + /// Trim the internal state of this [`Fence`]. + /// + /// This function has no externally visible effect, but you should call it + /// periodically to keep this fence's resource consumption under control. + /// + /// For fences using the [`FencePool`] implementation, this function + /// recycles fences that have been signaled. If you don't call this, + /// [`Queue::submit`] will just keep allocating a new Vulkan fence every + /// time it's called. + /// + /// [`FencePool`]: Fence::FencePool + /// [`Queue::submit`]: crate::Queue::submit fn maintain(&mut self, device: &ash::Device) -> Result<(), crate::DeviceError> { match *self { Self::TimelineSemaphore(_) => {} diff --git a/wgpu-info/src/human.rs b/wgpu-info/src/human.rs index 9bb281352c..24eeec0008 100644 --- a/wgpu-info/src/human.rs +++ b/wgpu-info/src/human.rs @@ -143,6 +143,8 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize max_vertex_buffers, max_vertex_attributes, max_vertex_buffer_array_stride, + min_subgroup_size, + max_subgroup_size, max_push_constant_size, min_uniform_buffer_offset_alignment, min_storage_buffer_offset_alignment, @@ -176,6 +178,8 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize writeln!(output, "\t\t Max Vertex Buffers: {max_vertex_buffers}")?; writeln!(output, "\t\t Max Vertex Attributes: {max_vertex_attributes}")?; writeln!(output, "\t\t Max Vertex Buffer Array Stride: {max_vertex_buffer_array_stride}")?; + writeln!(output, "\t\t Min Subgroup Size: {min_subgroup_size}")?; + writeln!(output, "\t\t Max Subgroup Size: {max_subgroup_size}")?; writeln!(output, "\t\t Max Push Constant Size: {max_push_constant_size}")?; writeln!(output, "\t\t Min Uniform Buffer Offset Alignment: {min_uniform_buffer_offset_alignment}")?; writeln!(output, "\t\t Min Storage Buffer Offset Alignment: {min_storage_buffer_offset_alignment}")?; diff --git a/wgpu-macros/Cargo.toml b/wgpu-macros/Cargo.toml index b06df02cce..3c605e6554 100644 --- a/wgpu-macros/Cargo.toml +++ b/wgpu-macros/Cargo.toml @@ -15,6 +15,6 @@ publish = false proc-macro = true [dependencies] -heck = "0.4" +heck = "0.5" quote = "1" syn = { version = "2", features = ["full"] } diff --git a/wgpu-types/Cargo.toml b/wgpu-types/Cargo.toml index f8024f516e..ea18e6b335 100644 --- a/wgpu-types/Cargo.toml +++ b/wgpu-types/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "wgpu-types" -version = "0.19.2" +version = "0.20.0" authors = ["gfx-rs developers"] edition = "2021" description = "WebGPU types" @@ -46,4 +46,4 @@ web-sys = { version = "0.3.69", features = [ [dev-dependencies] serde = { version = "1", features = ["serde_derive"] } -serde_json = "1.0.115" +serde_json = "1.0.116" diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index 75d3947df9..8d5e367301 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -890,6 +890,30 @@
bitflags::bitflags! { /// /// This is a native only feature. const SHADER_INT64 = 1 << 55; + /// Allows compute and fragment shaders to use the subgroup operation built-ins + /// + /// Supported Platforms: + /// - Vulkan + /// - DX12 + /// - Metal + /// + /// This is a native only feature. + const SUBGROUP = 1 << 56; + /// Allows vertex shaders to use the subgroup operation built-ins + /// + /// Supported Platforms: + /// - Vulkan + /// + /// This is a native only feature. + const SUBGROUP_VERTEX = 1 << 57; + /// Allows shaders to use the subgroup barrier + /// + /// Supported Platforms: + /// - Vulkan + /// - Metal + /// + /// This is a native only feature. + const SUBGROUP_BARRIER = 1 << 58; } } @@ -1119,7 +1143,7 @@ pub struct Limits { /// pipeline output data, across all color attachments. pub max_color_attachment_bytes_per_sample: u32, /// Maximum number of bytes used for workgroup memory in a compute entry point. Defaults to - /// 16352. Higher is "better". + /// 16384. Higher is "better". pub max_compute_workgroup_storage_size: u32, /// Maximum value of the product of the `workgroup_size` dimensions for a compute entry-point. /// Defaults to 256. Higher is "better". @@ -1136,6 +1160,11 @@ pub struct Limits { /// The maximum value for each dimension of a `ComputePass::dispatch(x, y, z)` operation. /// Defaults to 65535. Higher is "better". pub max_compute_workgroups_per_dimension: u32, + + /// Minimal number of invocations in a subgroup. Higher is "better". + pub min_subgroup_size: u32, + /// Maximal number of invocations in a subgroup. Lower is "better". + pub max_subgroup_size: u32, /// Amount of storage available for push constants in bytes. Defaults to 0. Higher is "better". /// Requesting more than 0 during device creation requires [`Features::PUSH_CONSTANTS`] to be enabled. /// @@ -1146,7 +1175,6 @@ pub struct Limits { /// - OpenGL doesn't natively support push constants, and are emulated with uniforms, /// so this number is less useful but likely 256. pub max_push_constant_size: u32, - /// Maximum number of live non-sampler bindings. /// /// This limit only affects the d3d12 backend. 
Using a large number will allow the device @@ -1156,6 +1184,14 @@ pub struct Limits { impl Default for Limits { fn default() -> Self { + Self::defaults() + } +} + +impl Limits { + // Rust doesn't allow const in trait implementations, so we break this out + // to allow reusing these defaults in const contexts like `downlevel_defaults` + const fn defaults() -> Self { Self { max_texture_dimension_1d: 8192, max_texture_dimension_2d: 8192, @@ -1170,10 +1206,10 @@ impl Default for Limits { max_storage_buffers_per_shader_stage: 8, max_storage_textures_per_shader_stage: 4, max_uniform_buffers_per_shader_stage: 12, - max_uniform_buffer_binding_size: 64 << 10, - max_storage_buffer_binding_size: 128 << 20, + max_uniform_buffer_binding_size: 64 << 10, // (64 KiB) + max_storage_buffer_binding_size: 128 << 20, // (128 MiB) max_vertex_buffers: 8, - max_buffer_size: 256 << 20, + max_buffer_size: 256 << 20, // (256 MiB) max_vertex_attributes: 16, max_vertex_buffer_array_stride: 2048, min_uniform_buffer_offset_alignment: 256, @@ -1187,13 +1223,13 @@ impl Default for Limits { max_compute_workgroup_size_y: 256, max_compute_workgroup_size_z: 64, max_compute_workgroups_per_dimension: 65535, + min_subgroup_size: 0, + max_subgroup_size: 0, max_push_constant_size: 0, max_non_sampler_bindings: 1_000_000, } } -} -impl Limits { /// These default limits are guaranteed to be compatible with GLES-3.1, and D3D11 /// /// Those limits are as follows (different from default are marked with *): @@ -1218,13 +1254,15 @@ impl Limits { /// max_vertex_buffers: 8, /// max_vertex_attributes: 16, /// max_vertex_buffer_array_stride: 2048, + /// min_subgroup_size: 0, + /// max_subgroup_size: 0, /// max_push_constant_size: 0, /// min_uniform_buffer_offset_alignment: 256, /// min_storage_buffer_offset_alignment: 256, /// max_inter_stage_shader_components: 60, /// max_color_attachments: 8, /// max_color_attachment_bytes_per_sample: 32, - /// max_compute_workgroup_storage_size: 16352, + /// max_compute_workgroup_storage_size: 16352, // * /// max_compute_invocations_per_workgroup: 256, /// max_compute_workgroup_size_x: 256, /// max_compute_workgroup_size_y: 256, @@ -1239,35 +1277,11 @@ impl Limits { max_texture_dimension_1d: 2048, max_texture_dimension_2d: 2048, max_texture_dimension_3d: 256, - max_texture_array_layers: 256, - max_bind_groups: 4, - max_bindings_per_bind_group: 1000, - max_dynamic_uniform_buffers_per_pipeline_layout: 8, - max_dynamic_storage_buffers_per_pipeline_layout: 4, - max_sampled_textures_per_shader_stage: 16, - max_samplers_per_shader_stage: 16, max_storage_buffers_per_shader_stage: 4, - max_storage_textures_per_shader_stage: 4, - max_uniform_buffers_per_shader_stage: 12, - max_uniform_buffer_binding_size: 16 << 10, - max_storage_buffer_binding_size: 128 << 20, - max_vertex_buffers: 8, - max_vertex_attributes: 16, - max_vertex_buffer_array_stride: 2048, - max_push_constant_size: 0, - min_uniform_buffer_offset_alignment: 256, - min_storage_buffer_offset_alignment: 256, - max_inter_stage_shader_components: 60, - max_color_attachments: 8, - max_color_attachment_bytes_per_sample: 32, + max_uniform_buffer_binding_size: 16 << 10, // (16 KiB) + // see: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf#page=7 max_compute_workgroup_storage_size: 16352, - max_compute_invocations_per_workgroup: 256, - max_compute_workgroup_size_x: 256, - max_compute_workgroup_size_y: 256, - max_compute_workgroup_size_z: 64, - max_compute_workgroups_per_dimension: 65535, - max_buffer_size: 256 << 20, - max_non_sampler_bindings: 
1_000_000, + ..Self::defaults() } } @@ -1296,6 +1310,8 @@ impl Limits { /// max_vertex_buffers: 8, /// max_vertex_attributes: 16, /// max_vertex_buffer_array_stride: 255, // + + /// min_subgroup_size: 0, + /// max_subgroup_size: 0, /// max_push_constant_size: 0, /// min_uniform_buffer_offset_alignment: 256, /// min_storage_buffer_offset_alignment: 256, @@ -1326,6 +1342,8 @@ impl Limits { max_compute_workgroup_size_y: 0, max_compute_workgroup_size_z: 0, max_compute_workgroups_per_dimension: 0, + min_subgroup_size: 0, + max_subgroup_size: 0, // Value supported by Intel Celeron B830 on Windows (OpenGL 3.1) max_inter_stage_shader_components: 31, @@ -1418,6 +1436,10 @@ impl Limits { compare!(max_vertex_buffers, Less); compare!(max_vertex_attributes, Less); compare!(max_vertex_buffer_array_stride, Less); + if self.min_subgroup_size > 0 && self.max_subgroup_size > 0 { + compare!(min_subgroup_size, Greater); + compare!(max_subgroup_size, Less); + } compare!(max_push_constant_size, Less); compare!(min_uniform_buffer_offset_alignment, Greater); compare!(min_storage_buffer_offset_alignment, Greater); diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs index b6c824e283..024b39bca3 100644 --- a/wgpu/src/backend/webgpu.rs +++ b/wgpu/src/backend/webgpu.rs @@ -21,7 +21,7 @@ use wasm_bindgen::{prelude::*, JsCast}; use crate::{ context::{downcast_ref, ObjectId, QueueWriteBuffer, Unused}, - SurfaceTargetUnsafe, UncapturedErrorHandler, + CompilationInfo, SurfaceTargetUnsafe, UncapturedErrorHandler, }; fn create_identified(value: T) -> (Identified, Sendable) { @@ -106,6 +106,88 @@ impl crate::Error { } } +#[derive(Debug)] +pub struct WebShaderModule { + module: webgpu_sys::GpuShaderModule, + compilation_info: WebShaderCompilationInfo, +} + +#[derive(Debug, Clone)] +enum WebShaderCompilationInfo { + /// WGSL shaders get their compilation info from a native WebGPU function. + /// We need the source to be able to do UTF16 to UTF8 location remapping. + Wgsl { source: String }, + /// Transformed shaders get their compilation info from the transformer. + /// Further compilation errors are reported without a span. + Transformed { + compilation_info: crate::CompilationInfo, + }, +} + +fn map_utf16_to_utf8_offset(utf16_offset: u32, text: &str) -> u32 { + let mut utf16_i = 0; + for (utf8_index, c) in text.char_indices() { + if utf16_i >= utf16_offset { + return utf8_index as u32; + } + utf16_i += c.len_utf16() as u32; + } + if utf16_i >= utf16_offset { + text.len() as u32 + } else { + log::error!( + "UTF16 offset {} is out of bounds for string {}", + utf16_offset, + text + ); + u32::MAX + } +} + +impl crate::CompilationMessage { + fn from_js( + js_message: webgpu_sys::GpuCompilationMessage, + compilation_info: &WebShaderCompilationInfo, + ) -> Self { + let message_type = match js_message.type_() { + webgpu_sys::GpuCompilationMessageType::Error => crate::CompilationMessageType::Error, + webgpu_sys::GpuCompilationMessageType::Warning => { + crate::CompilationMessageType::Warning + } + webgpu_sys::GpuCompilationMessageType::Info => crate::CompilationMessageType::Info, + _ => crate::CompilationMessageType::Error, + }; + let utf16_offset = js_message.offset() as u32; + let utf16_length = js_message.length() as u32; + let span = match compilation_info { + WebShaderCompilationInfo::Wgsl { .. 
} if utf16_offset == 0 && utf16_length == 0 => None, + WebShaderCompilationInfo::Wgsl { source } => { + let offset = map_utf16_to_utf8_offset(utf16_offset, source); + let length = map_utf16_to_utf8_offset(utf16_length, &source[offset as usize..]); + let line_number = js_message.line_num() as u32; // That's legal, because we're counting lines the same way + + let prefix = &source[..offset as usize]; + let line_start = prefix.rfind('\n').map(|pos| pos + 1).unwrap_or(0) as u32; + let line_position = offset - line_start + 1; // Counting UTF-8 byte indices + + Some(crate::SourceLocation { + offset, + length, + line_number, + line_position, + }) + } + WebShaderCompilationInfo::Transformed { .. } => None, + }; + + crate::CompilationMessage { + message: js_message.message(), + message_type, + location: span, + } + } +} + // We need to assert that any future we return is Send to match the native API. // // This is safe on wasm32 *for now*, but similarly to the unsafe Send impls for the handle type @@ -737,6 +819,8 @@ fn map_wgt_limits(limits: webgpu_sys::GpuSupportedLimits) -> wgt::Limits { max_compute_workgroup_size_z: limits.max_compute_workgroup_size_z(), max_compute_workgroups_per_dimension: limits.max_compute_workgroups_per_dimension(), // The following are not part of WebGPU + min_subgroup_size: wgt::Limits::default().min_subgroup_size, + max_subgroup_size: wgt::Limits::default().max_subgroup_size, max_push_constant_size: wgt::Limits::default().max_push_constant_size, max_non_sampler_bindings: wgt::Limits::default().max_non_sampler_bindings, } @@ -844,6 +928,41 @@ fn future_pop_error_scope(result: JsFutureResult) -> Option { } } +fn future_compilation_info( + result: JsFutureResult, + base_compilation_info: &WebShaderCompilationInfo, +) -> crate::CompilationInfo { + let base_messages = match base_compilation_info { + WebShaderCompilationInfo::Transformed { compilation_info } => { + compilation_info.messages.iter().cloned() + } + _ => [].iter().cloned(), + }; + + let messages = match result { + Ok(js_value) => { + let info = webgpu_sys::GpuCompilationInfo::from(js_value); + base_messages + .chain(info.messages().into_iter().map(|message| { + crate::CompilationMessage::from_js( + webgpu_sys::GpuCompilationMessage::from(message), + base_compilation_info, + ) + })) + .collect() + } + Err(_v) => base_messages + .chain(std::iter::once(crate::CompilationMessage { + message: "Getting compilation info failed".to_string(), + message_type: crate::CompilationMessageType::Error, + location: None, + })) + .collect(), + }; + + crate::CompilationInfo { messages } +} + /// Calls `callback(success_value)` when the promise completes successfully, calls `callback(failure_value)` /// when the promise completes unsuccessfully. 
fn register_then_closures(promise: &Promise, callback: F, success_value: T, failure_value: T) @@ -1000,8 +1119,8 @@ impl crate::context::Context for ContextWebGpu { type DeviceData = Sendable; type QueueId = Identified; type QueueData = Sendable; - type ShaderModuleId = Identified; - type ShaderModuleData = Sendable; + type ShaderModuleId = Identified; + type ShaderModuleData = Sendable; type BindGroupLayoutId = Identified; type BindGroupLayoutData = Sendable; type BindGroupId = Identified; @@ -1036,6 +1155,10 @@ impl crate::context::Context for ContextWebGpu { type RenderBundleData = Sendable; type SurfaceId = Identified<(Canvas, webgpu_sys::GpuCanvasContext)>; type SurfaceData = Sendable<(Canvas, webgpu_sys::GpuCanvasContext)>; + type BlasData = (); + type BlasId = ObjectId; + type TlasData = (); + type TlasId = ObjectId; type SurfaceOutputDetail = SurfaceOutputDetail; type SubmissionIndex = Unused; @@ -1062,10 +1185,10 @@ impl crate::context::Context for ContextWebGpu { type PopErrorScopeFuture = MakeSendFuture Option>; - type BlasData = (); - type BlasId = ObjectId; - type TlasData = (); - type TlasId = ObjectId; + type CompilationInfoFuture = MakeSendFuture< + wasm_bindgen_futures::JsFuture, + Box CompilationInfo>, + >; fn init(_instance_desc: wgt::InstanceDescriptor) -> Self { let Some(gpu) = get_browser_gpu_property() else { @@ -1423,10 +1546,10 @@ impl crate::context::Context for ContextWebGpu { desc: crate::ShaderModuleDescriptor<'_>, _shader_bound_checks: wgt::ShaderBoundChecks, ) -> (Self::ShaderModuleId, Self::ShaderModuleData) { - let mut descriptor: webgpu_sys::GpuShaderModuleDescriptor = match desc.source { + let shader_module_result = match desc.source { #[cfg(feature = "spirv")] crate::ShaderSource::SpirV(ref spv) => { - use naga::{back, front, valid}; + use naga::front; let options = naga::front::spv::Options { adjust_coordinate_space: false, @@ -1434,18 +1557,25 @@ impl crate::context::Context for ContextWebGpu { block_ctx_dump_prefix: None, }; let spv_parser = front::spv::Frontend::new(spv.iter().cloned(), &options); - let spv_module = spv_parser.parse().unwrap(); - - let mut validator = valid::Validator::new( - valid::ValidationFlags::all(), - valid::Capabilities::all(), - ); - let spv_module_info = validator.validate(&spv_module).unwrap(); - - let writer_flags = naga::back::wgsl::WriterFlags::empty(); - let wgsl_text = - back::wgsl::write_string(&spv_module, &spv_module_info, writer_flags).unwrap(); - webgpu_sys::GpuShaderModuleDescriptor::new(wgsl_text.as_str()) + spv_parser + .parse() + .map_err(|inner| { + CompilationInfo::from(naga::error::ShaderError { + source: String::new(), + label: desc.label.map(|s| s.to_string()), + inner: Box::new(inner), + }) + }) + .and_then(|spv_module| { + validate_transformed_shader_module(&spv_module, "", &desc).map(|v| { + ( + v, + WebShaderCompilationInfo::Transformed { + compilation_info: CompilationInfo { messages: vec![] }, + }, + ) + }) + }) } #[cfg(feature = "glsl")] crate::ShaderSource::Glsl { @@ -1453,7 +1583,7 @@ impl crate::context::Context for ContextWebGpu { stage, ref defines, } => { - use naga::{back, front, valid}; + use naga::front; // Parse the given shader code and store its representation. 
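// (As in the SPIR-V arm above, parse and validation failures here are
// mapped into `CompilationInfo` rather than unwrapped, so they can be
// reported via `get_compilation_info`; on error the module is replaced
// with an empty one further down.)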
let options = front::glsl::Options { @@ -1461,45 +1591,91 @@ impl crate::context::Context for ContextWebGpu { defines: defines.clone(), }; let mut parser = front::glsl::Frontend::default(); - let glsl_module = parser.parse(&options, shader).unwrap(); - - let mut validator = valid::Validator::new( - valid::ValidationFlags::all(), - valid::Capabilities::all(), - ); - let glsl_module_info = validator.validate(&glsl_module).unwrap(); - - let writer_flags = naga::back::wgsl::WriterFlags::empty(); - let wgsl_text = - back::wgsl::write_string(&glsl_module, &glsl_module_info, writer_flags) - .unwrap(); - webgpu_sys::GpuShaderModuleDescriptor::new(wgsl_text.as_str()) + parser + .parse(&options, shader) + .map_err(|inner| { + CompilationInfo::from(naga::error::ShaderError { + source: shader.to_string(), + label: desc.label.map(|s| s.to_string()), + inner: Box::new(inner), + }) + }) + .and_then(|glsl_module| { + validate_transformed_shader_module(&glsl_module, shader, &desc).map(|v| { + ( + v, + WebShaderCompilationInfo::Transformed { + compilation_info: CompilationInfo { messages: vec![] }, + }, + ) + }) + }) } #[cfg(feature = "wgsl")] - crate::ShaderSource::Wgsl(ref code) => webgpu_sys::GpuShaderModuleDescriptor::new(code), + crate::ShaderSource::Wgsl(ref code) => { + let shader_module = webgpu_sys::GpuShaderModuleDescriptor::new(code); + Ok(( + shader_module, + WebShaderCompilationInfo::Wgsl { + source: code.to_string(), + }, + )) + } #[cfg(feature = "naga-ir")] - crate::ShaderSource::Naga(module) => { - use naga::{back, valid}; - - let mut validator = valid::Validator::new( - valid::ValidationFlags::all(), - valid::Capabilities::all(), - ); - let module_info = validator.validate(&module).unwrap(); - - let writer_flags = naga::back::wgsl::WriterFlags::empty(); - let wgsl_text = - back::wgsl::write_string(&module, &module_info, writer_flags).unwrap(); - webgpu_sys::GpuShaderModuleDescriptor::new(wgsl_text.as_str()) + crate::ShaderSource::Naga(ref module) => { + validate_transformed_shader_module(module, "", &desc).map(|v| { + ( + v, + WebShaderCompilationInfo::Transformed { + compilation_info: CompilationInfo { messages: vec![] }, + }, + ) + }) } crate::ShaderSource::Dummy(_) => { panic!("found `ShaderSource::Dummy`") } }; + + #[cfg(naga)] + fn validate_transformed_shader_module( + module: &naga::Module, + source: &str, + desc: &crate::ShaderModuleDescriptor<'_>, + ) -> Result { + use naga::{back, valid}; + let mut validator = + valid::Validator::new(valid::ValidationFlags::all(), valid::Capabilities::all()); + let module_info = validator.validate(module).map_err(|err| { + CompilationInfo::from(naga::error::ShaderError { + source: source.to_string(), + label: desc.label.map(|s| s.to_string()), + inner: Box::new(err), + }) + })?; + + let writer_flags = naga::back::wgsl::WriterFlags::empty(); + let wgsl_text = back::wgsl::write_string(module, &module_info, writer_flags).unwrap(); + Ok(webgpu_sys::GpuShaderModuleDescriptor::new( + wgsl_text.as_str(), + )) + } + let (mut descriptor, compilation_info) = match shader_module_result { + Ok(v) => v, + Err(compilation_info) => ( + webgpu_sys::GpuShaderModuleDescriptor::new(""), + WebShaderCompilationInfo::Transformed { compilation_info }, + ), + }; if let Some(label) = desc.label { descriptor.label(label); } - create_identified(device_data.0.create_shader_module(&descriptor)) + let shader_module = WebShaderModule { + module: device_data.0.create_shader_module(&descriptor), + compilation_info, + }; + let (id, data) = create_identified(shader_module); + (id, 
data) } unsafe fn device_create_shader_module_spirv( @@ -1704,7 +1880,7 @@ impl crate::context::Context for ContextWebGpu { ) -> (Self::RenderPipelineId, Self::RenderPipelineData) { let module: &::ShaderModuleData = downcast_ref(desc.vertex.module.data.as_ref()); - let mut mapped_vertex_state = webgpu_sys::GpuVertexState::new(&module.0); + let mut mapped_vertex_state = webgpu_sys::GpuVertexState::new(&module.0.module); mapped_vertex_state.entry_point(desc.vertex.entry_point); let buffers = desc @@ -1779,7 +1955,8 @@ impl crate::context::Context for ContextWebGpu { .collect::(); let module: &::ShaderModuleData = downcast_ref(frag.module.data.as_ref()); - let mut mapped_fragment_desc = webgpu_sys::GpuFragmentState::new(&module.0, &targets); + let mut mapped_fragment_desc = + webgpu_sys::GpuFragmentState::new(&module.0.module, &targets); mapped_fragment_desc.entry_point(frag.entry_point); mapped_desc.fragment(&mapped_fragment_desc); } @@ -1804,7 +1981,8 @@ impl crate::context::Context for ContextWebGpu { ) -> (Self::ComputePipelineId, Self::ComputePipelineData) { let shader_module: &::ShaderModuleData = downcast_ref(desc.module.data.as_ref()); - let mut mapped_compute_stage = webgpu_sys::GpuProgrammableStage::new(&shader_module.0); + let mut mapped_compute_stage = + webgpu_sys::GpuProgrammableStage::new(&shader_module.0.module); mapped_compute_stage.entry_point(desc.entry_point); let auto_layout = wasm_bindgen::JsValue::from(webgpu_sys::GpuAutoLayoutMode::Auto); let mut mapped_desc = webgpu_sys::GpuComputePipelineDescriptor::new( @@ -2097,6 +2275,22 @@ impl crate::context::Context for ContextWebGpu { buffer_data.0.mapping.borrow_mut().mapped_buffer = None; } + fn shader_get_compilation_info( + &self, + _shader: &Self::ShaderModuleId, + shader_data: &Self::ShaderModuleData, + ) -> Self::CompilationInfoFuture { + let compilation_info_promise = shader_data.0.module.get_compilation_info(); + let map_future = Box::new({ + let compilation_info = shader_data.0.compilation_info.clone(); + move |result| future_compilation_info(result, &compilation_info) + }); + MakeSendFuture::new( + wasm_bindgen_futures::JsFuture::from(compilation_info_promise), + map_future, + ) + } + fn texture_create_view( &self, _texture: &Self::TextureId, diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs index d897b5d8a0..9c79909e1e 100644 --- a/wgpu/src/backend/wgpu_core.rs +++ b/wgpu/src/backend/wgpu_core.rs @@ -1,7 +1,8 @@ use crate::{ context::{ObjectId, Unused}, AdapterInfo, BindGroupDescriptor, BindGroupLayoutDescriptor, BindingResource, BufferBinding, - BufferDescriptor, CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, + BufferDescriptor, CommandEncoderDescriptor, CompilationInfo, CompilationMessage, + CompilationMessageType, ComputePassDescriptor, ComputePipelineDescriptor, DownlevelCapabilities, Features, Label, Limits, LoadOp, MapMode, Operations, PipelineLayoutDescriptor, RenderBundleEncoderDescriptor, RenderPipelineDescriptor, SamplerDescriptor, ShaderModuleDescriptor, ShaderModuleDescriptorSpirV, ShaderSource, StoreOp, @@ -23,9 +24,10 @@ use std::{ sync::Arc, }; use wgc::{ - command::{bundle_ffi::*, compute_ffi::*, render_ffi::*}, + command::{bundle_ffi::*, compute_commands::*, render_commands::*}, device::DeviceLostClosure, id::{CommandEncoderId, TextureViewId}, + pipeline::CreateShaderModuleError, }; use wgt::WasmNotSendSync; @@ -441,6 +443,11 @@ pub struct Buffer { error_sink: ErrorSink, } +#[derive(Debug)] +pub struct ShaderModule { + compilation_info: 
CompilationInfo, +} + #[derive(Debug)] pub struct Texture { id: wgc::id::TextureId, @@ -493,7 +500,7 @@ impl crate::Context for ContextWgpuCore { type QueueId = wgc::id::QueueId; type QueueData = Queue; type ShaderModuleId = wgc::id::ShaderModuleId; - type ShaderModuleData = (); + type ShaderModuleData = ShaderModule; type BindGroupLayoutId = wgc::id::BindGroupLayoutId; type BindGroupLayoutData = (); type BindGroupId = wgc::id::BindGroupId; @@ -554,6 +561,7 @@ impl crate::Context for ContextWgpuCore { >; type PopErrorScopeFuture = Ready>; + type CompilationInfoFuture = Ready; fn init(instance_desc: wgt::InstanceDescriptor) -> Self { Self(wgc::global::Global::new("wgpu", instance_desc)) @@ -569,7 +577,7 @@ impl crate::Context for ContextWgpuCore { raw_window_handle, } => unsafe { self.0 - .instance_create_surface(raw_display_handle, raw_window_handle, None)? + .instance_create_surface(raw_display_handle, raw_window_handle, None) }, #[cfg(metal)] @@ -593,7 +601,7 @@ impl crate::Context for ContextWgpuCore { self.0 .instance_create_surface_from_swap_chain_panel(swap_chain_panel, None) }, - }; + }?; Ok(( id, @@ -906,16 +914,21 @@ impl crate::Context for ContextWgpuCore { let (id, error) = wgc::gfx_select!( device => self.0.device_create_shader_module(*device, &descriptor, source, None) ); - if let Some(cause) = error { - self.handle_error( - &device_data.error_sink, - cause, - LABEL, - desc.label, - "Device::create_shader_module", - ); - } - (id, ()) + let compilation_info = match error { + Some(cause) => { + self.handle_error( + &device_data.error_sink, + cause.clone(), + LABEL, + desc.label, + "Device::create_shader_module", + ); + CompilationInfo::from(cause) + } + None => CompilationInfo { messages: vec![] }, + }; + + (id, ShaderModule { compilation_info }) } unsafe fn device_create_shader_module_spirv( @@ -933,16 +946,20 @@ impl crate::Context for ContextWgpuCore { let (id, error) = wgc::gfx_select!( device => self.0.device_create_shader_module_spirv(*device, &descriptor, Borrowed(&desc.source), None) ); - if let Some(cause) = error { - self.handle_error( - &device_data.error_sink, - cause, - LABEL, - desc.label, - "Device::create_shader_module_spirv", - ); - } - (id, ()) + let compilation_info = match error { + Some(cause) => { + self.handle_error( + &device_data.error_sink, + cause.clone(), + LABEL, + desc.label, + "Device::create_shader_module_spirv", + ); + CompilationInfo::from(cause) + } + None => CompilationInfo { messages: vec![] }, + }; + (id, ShaderModule { compilation_info }) } fn device_create_bind_group_layout( @@ -1161,7 +1178,11 @@ impl crate::Context for ContextWgpuCore { stage: pipe::ProgrammableStageDescriptor { module: desc.vertex.module.id.into(), entry_point: Some(Borrowed(desc.vertex.entry_point)), - constants: Borrowed(desc.vertex.constants), + constants: Borrowed(desc.vertex.compilation_options.constants), + zero_initialize_workgroup_memory: desc + .vertex + .compilation_options + .zero_initialize_workgroup_memory, }, buffers: Borrowed(&vertex_buffers), }, @@ -1172,7 +1193,10 @@ impl crate::Context for ContextWgpuCore { stage: pipe::ProgrammableStageDescriptor { module: frag.module.id.into(), entry_point: Some(Borrowed(frag.entry_point)), - constants: Borrowed(frag.constants), + constants: Borrowed(frag.compilation_options.constants), + zero_initialize_workgroup_memory: frag + .compilation_options + .zero_initialize_workgroup_memory, }, targets: Borrowed(frag.targets), }), @@ -1221,7 +1245,10 @@ impl crate::Context for ContextWgpuCore { stage: 
pipe::ProgrammableStageDescriptor { module: desc.module.id.into(), entry_point: Some(Borrowed(desc.entry_point)), - constants: Borrowed(desc.constants), + constants: Borrowed(desc.compilation_options.constants), + zero_initialize_workgroup_memory: desc + .compilation_options + .zero_initialize_workgroup_memory, }, }; @@ -1554,6 +1581,14 @@ impl crate::Context for ContextWgpuCore { } } + fn shader_get_compilation_info( + &self, + _shader: &Self::ShaderModuleId, + shader_data: &Self::ShaderModuleData, + ) -> Self::CompilationInfoFuture { + ready(shader_data.compilation_info.clone()) + } + fn texture_create_view( &self, texture: &Self::TextureId, @@ -2331,15 +2366,7 @@ impl crate::Context for ContextWgpuCore { _bind_group_data: &Self::BindGroupData, offsets: &[wgt::DynamicOffset], ) { - unsafe { - wgpu_compute_pass_set_bind_group( - pass_data, - index, - *bind_group, - offsets.as_ptr(), - offsets.len(), - ) - } + wgpu_compute_pass_set_bind_group(pass_data, index, *bind_group, offsets); } fn compute_pass_set_push_constants( @@ -2349,14 +2376,7 @@ impl crate::Context for ContextWgpuCore { offset: u32, data: &[u8], ) { - unsafe { - wgpu_compute_pass_set_push_constant( - pass_data, - offset, - data.len().try_into().unwrap(), - data.as_ptr(), - ) - } + wgpu_compute_pass_set_push_constant(pass_data, offset, data); } fn compute_pass_insert_debug_marker( @@ -2365,10 +2385,7 @@ impl crate::Context for ContextWgpuCore { pass_data: &mut Self::ComputePassData, label: &str, ) { - unsafe { - let label = std::ffi::CString::new(label).unwrap(); - wgpu_compute_pass_insert_debug_marker(pass_data, label.as_ptr(), 0); - } + wgpu_compute_pass_insert_debug_marker(pass_data, label, 0); } fn compute_pass_push_debug_group( @@ -2377,10 +2394,7 @@ impl crate::Context for ContextWgpuCore { pass_data: &mut Self::ComputePassData, group_label: &str, ) { - unsafe { - let label = std::ffi::CString::new(group_label).unwrap(); - wgpu_compute_pass_push_debug_group(pass_data, label.as_ptr(), 0); - } + wgpu_compute_pass_push_debug_group(pass_data, group_label, 0); } fn compute_pass_pop_debug_group( @@ -2647,15 +2661,7 @@ impl crate::Context for ContextWgpuCore { _bind_group_data: &Self::BindGroupData, offsets: &[wgt::DynamicOffset], ) { - unsafe { - wgpu_render_pass_set_bind_group( - pass_data, - index, - *bind_group, - offsets.as_ptr(), - offsets.len(), - ) - } + wgpu_render_pass_set_bind_group(pass_data, index, *bind_group, offsets) } fn render_pass_set_index_buffer( @@ -2692,15 +2698,7 @@ impl crate::Context for ContextWgpuCore { offset: u32, data: &[u8], ) { - unsafe { - wgpu_render_pass_set_push_constants( - pass_data, - stages, - offset, - data.len().try_into().unwrap(), - data.as_ptr(), - ) - } + wgpu_render_pass_set_push_constants(pass_data, stages, offset, data) } fn render_pass_draw( @@ -2882,10 +2880,7 @@ impl crate::Context for ContextWgpuCore { pass_data: &mut Self::RenderPassData, label: &str, ) { - unsafe { - let label = std::ffi::CString::new(label).unwrap(); - wgpu_render_pass_insert_debug_marker(pass_data, label.as_ptr(), 0); - } + wgpu_render_pass_insert_debug_marker(pass_data, label, 0); } fn render_pass_push_debug_group( @@ -2894,10 +2889,7 @@ impl crate::Context for ContextWgpuCore { pass_data: &mut Self::RenderPassData, group_label: &str, ) { - unsafe { - let label = std::ffi::CString::new(group_label).unwrap(); - wgpu_render_pass_push_debug_group(pass_data, label.as_ptr(), 0); - } + wgpu_render_pass_push_debug_group(pass_data, group_label, 0); } fn render_pass_pop_debug_group( @@ -2962,13 +2954,7 @@ impl 
crate::Context for ContextWgpuCore { render_bundles: &mut dyn Iterator, ) { let temp_render_bundles = render_bundles.map(|(i, _)| i).collect::>(); - unsafe { - wgpu_render_pass_execute_bundles( - pass_data, - temp_render_bundles.as_ptr(), - temp_render_bundles.len(), - ) - } + wgpu_render_pass_execute_bundles(pass_data, &temp_render_bundles) } fn device_create_blas( @@ -3258,6 +3244,35 @@ fn default_error_handler(err: crate::Error) { panic!("wgpu error: {err}\n"); } +impl From for CompilationInfo { + fn from(value: CreateShaderModuleError) -> Self { + match value { + #[cfg(feature = "wgsl")] + CreateShaderModuleError::Parsing(v) => v.into(), + #[cfg(feature = "glsl")] + CreateShaderModuleError::ParsingGlsl(v) => v.into(), + #[cfg(feature = "spirv")] + CreateShaderModuleError::ParsingSpirV(v) => v.into(), + CreateShaderModuleError::Validation(v) => v.into(), + // Device errors are reported through the error sink, and are not compilation errors. + // Same goes for native shader module generation errors. + CreateShaderModuleError::Device(_) | CreateShaderModuleError::Generation => { + CompilationInfo { + messages: Vec::new(), + } + } + // Everything else is an error message without location information. + _ => CompilationInfo { + messages: vec![CompilationMessage { + message: value.to_string(), + message_type: CompilationMessageType::Error, + location: None, + }], + }, + } + } +} + #[derive(Debug)] pub struct QueueWriteBuffer { buffer_id: wgc::id::StagingBufferId, diff --git a/wgpu/src/context.rs b/wgpu/src/context.rs index ad74cd58a4..54c9a97e5b 100644 --- a/wgpu/src/context.rs +++ b/wgpu/src/context.rs @@ -9,13 +9,13 @@ use wgt::{ use crate::{ AnyWasmNotSendSync, BindGroupDescriptor, BindGroupLayoutDescriptor, Buffer, BufferAsyncError, - BufferDescriptor, CommandEncoderDescriptor, ComputePassDescriptor, ComputePipelineDescriptor, - DeviceDescriptor, Error, ErrorFilter, ImageCopyBuffer, ImageCopyTexture, Maintain, - MaintainResult, MapMode, PipelineLayoutDescriptor, QuerySetDescriptor, RenderBundleDescriptor, - RenderBundleEncoderDescriptor, RenderPassDescriptor, RenderPipelineDescriptor, - RequestAdapterOptions, RequestDeviceError, SamplerDescriptor, ShaderModuleDescriptor, - ShaderModuleDescriptorSpirV, SurfaceTargetUnsafe, Texture, TextureDescriptor, - TextureViewDescriptor, UncapturedErrorHandler, + BufferDescriptor, CommandEncoderDescriptor, CompilationInfo, ComputePassDescriptor, + ComputePipelineDescriptor, DeviceDescriptor, Error, ErrorFilter, ImageCopyBuffer, + ImageCopyTexture, Maintain, MaintainResult, MapMode, PipelineLayoutDescriptor, + QuerySetDescriptor, RenderBundleDescriptor, RenderBundleEncoderDescriptor, + RenderPassDescriptor, RenderPipelineDescriptor, RequestAdapterOptions, RequestDeviceError, + SamplerDescriptor, ShaderModuleDescriptor, ShaderModuleDescriptorSpirV, SurfaceTargetUnsafe, + Texture, TextureDescriptor, TextureViewDescriptor, UncapturedErrorHandler, }; /// Meta trait for an id tracked by a context. 
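The `compilation_options` plumbing above bottoms out in the new `PipelineCompilationOptions` struct defined later in `wgpu/src/lib.rs`. A minimal sketch of constructing it, assuming the wgpu 0.20 API; only the type and its two fields come from this change, the surrounding bindings are illustrative:

```rust
// Build per-stage options that opt out of zeroing workgroup memory.
// The WebGPU spec mandates the zeroing, so skipping it trades that
// guarantee for a little less dispatch overhead.
let options = wgpu::PipelineCompilationOptions {
    zero_initialize_workgroup_memory: false, // skip the zero-fill pass
    ..Default::default()                     // empty `constants` map
};
// `options` is then passed as the `compilation_options` field of
// `ComputePipelineDescriptor`, `VertexState`, or `FragmentState`.
```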
@@ -100,6 +100,8 @@ pub trait Context: Debug + WasmNotSendSync + Sized { + 'static; type PopErrorScopeFuture: Future> + WasmNotSend + 'static; + type CompilationInfoFuture: Future + WasmNotSend + 'static; + fn init(instance_desc: wgt::InstanceDescriptor) -> Self; unsafe fn instance_create_surface( &self, @@ -328,6 +330,11 @@ pub trait Context: Debug + WasmNotSendSync + Sized { sub_range: Range, ) -> Box; fn buffer_unmap(&self, buffer: &Self::BufferId, buffer_data: &Self::BufferData); + fn shader_get_compilation_info( + &self, + shader: &Self::ShaderModuleId, + shader_data: &Self::ShaderModuleData, + ) -> Self::CompilationInfoFuture; fn texture_create_view( &self, texture: &Self::TextureId, @@ -1160,6 +1167,11 @@ pub type DevicePopErrorFuture = Box> + Send>; #[cfg(not(send_sync))] pub type DevicePopErrorFuture = Box>>; +#[cfg(send_sync)] +pub type ShaderCompilationInfoFuture = Box + Send>; +#[cfg(not(send_sync))] +pub type ShaderCompilationInfoFuture = Box>; + #[cfg(send_sync)] pub type SubmittedWorkDoneCallback = Box; #[cfg(not(send_sync))] @@ -1382,6 +1394,11 @@ pub(crate) trait DynContext: Debug + WasmNotSendSync { sub_range: Range, ) -> Box; fn buffer_unmap(&self, buffer: &ObjectId, buffer_data: &crate::Data); + fn shader_get_compilation_info( + &self, + shader: &ObjectId, + shader_data: &crate::Data, + ) -> Pin; fn texture_create_view( &self, texture: &ObjectId, @@ -2537,6 +2554,17 @@ where Context::buffer_unmap(self, &buffer, buffer_data) } + fn shader_get_compilation_info( + &self, + shader: &ObjectId, + shader_data: &crate::Data, + ) -> Pin { + let shader = ::from(*shader); + let shader_data = downcast_ref(shader_data); + let future = Context::shader_get_compilation_info(self, &shader, shader_data); + Box::pin(future) + } + fn texture_create_view( &self, texture: &ObjectId, diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs index 47bb1f0aaa..2f8033ff45 100644 --- a/wgpu/src/lib.rs +++ b/wgpu/src/lib.rs @@ -210,12 +210,31 @@ pub struct SubmissionIndex(ObjectId, Arc); #[cfg(send_sync)] static_assertions::assert_impl_all!(SubmissionIndex: Send, Sync); -/// The main purpose of this struct is to resolve mapped ranges (convert sizes -/// to end points), and to ensure that the sub-ranges don't intersect. +/// The mapped portion of a buffer, if any, and its outstanding views. +/// +/// This ensures that views fall within the mapped range and don't overlap, and +/// also takes care of turning `Option` sizes into actual buffer +/// offsets. #[derive(Debug)] struct MapContext { + /// The overall size of the buffer. + /// + /// This is just a convenient copy of [`Buffer::size`]. total_size: BufferAddress, + + /// The range of the buffer that is mapped. + /// + /// This is `0..0` if the buffer is not mapped. This becomes non-empty when + /// the buffer is mapped at creation time, and when you call `map_async` on + /// some [`BufferSlice`] (so technically, it indicates the portion that is + /// *or has been requested to be* mapped.) + /// + /// All [`BufferView`]s and [`BufferViewMut`]s must fall within this range. initial_range: Range, + + /// The ranges covered by all outstanding [`BufferView`]s and + /// [`BufferViewMut`]s. These are non-overlapping, and are all contained + /// within `initial_range`. sub_ranges: Vec>, } @@ -228,6 +247,7 @@ impl MapContext { } } + /// Record that the buffer is no longer mapped. fn reset(&mut self) { self.initial_range = 0..0; @@ -237,12 +257,22 @@ impl MapContext { ); } + /// Record that the `size` bytes of the buffer at `offset` are now viewed. 
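+ /// (For instance, with `initial_range = 0..100`, viewing 20 bytes at
+ /// offset 10 records the sub-range `10..30`; values are hypothetical.)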
+ /// + /// Return the byte offset within the buffer of the end of the viewed range. + /// + /// # Panics + /// + /// This panics if the given range overlaps with any existing range. fn add(&mut self, offset: BufferAddress, size: Option) -> BufferAddress { let end = match size { Some(s) => offset + s.get(), None => self.initial_range.end, }; assert!(self.initial_range.start <= offset && end <= self.initial_range.end); + // This check is essential for avoiding undefined behavior: it is the + // only thing that ensures that `&mut` references to the buffer's + // contents don't alias anything else. for sub in self.sub_ranges.iter() { assert!( end <= sub.start || offset >= sub.end, @@ -253,6 +283,14 @@ impl MapContext { end } + /// Record that the `size` bytes of the buffer at `offset` are no longer viewed. + /// + /// # Panics + /// + /// This panics if the given range does not exactly match one previously + /// passed to [`add`]. + /// + /// [`add]`: MapContext::add fn remove(&mut self, offset: BufferAddress, size: Option) { let end = match size { Some(s) => offset + s.get(), @@ -274,6 +312,112 @@ impl MapContext { /// [`DeviceExt::create_buffer_init`](util::DeviceExt::create_buffer_init). /// /// Corresponds to [WebGPU `GPUBuffer`](https://gpuweb.github.io/gpuweb/#buffer-interface). +/// +/// # Mapping buffers +/// +/// If a `Buffer` is created with the appropriate [`usage`], it can be *mapped*: +/// you can make its contents accessible to the CPU as an ordinary `&[u8]` or +/// `&mut [u8]` slice of bytes. Buffers created with the +/// [`mapped_at_creation`][mac] flag set are also mapped initially. +/// +/// Depending on the hardware, the buffer could be memory shared between CPU and +/// GPU, so that the CPU has direct access to the same bytes the GPU will +/// consult; or it may be ordinary CPU memory, whose contents the system must +/// copy to/from the GPU as needed. This crate's API is designed to work the +/// same way in either case: at any given time, a buffer is either mapped and +/// available to the CPU, or unmapped and ready for use by the GPU, but never +/// both. This makes it impossible for either side to observe changes by the +/// other immediately, and any necessary transfers can be carried out when the +/// buffer transitions from one state to the other. +/// +/// There are two ways to map a buffer: +/// +/// - If [`BufferDescriptor::mapped_at_creation`] is `true`, then the entire +/// buffer is mapped when it is created. This is the easiest way to initialize +/// a new buffer. You can set `mapped_at_creation` on any kind of buffer, +/// regardless of its [`usage`] flags. +/// +/// - If the buffer's [`usage`] includes the [`MAP_READ`] or [`MAP_WRITE`] +/// flags, then you can call `buffer.slice(range).map_async(mode, callback)` +/// to map the portion of `buffer` given by `range`. This waits for the GPU to +/// finish using the buffer, and invokes `callback` as soon as the buffer is +/// safe for the CPU to access. +/// +/// Once a buffer is mapped: +/// +/// - You can call `buffer.slice(range).get_mapped_range()` to obtain a +/// [`BufferView`], which dereferences to a `&[u8]` that you can use to read +/// the buffer's contents. +/// +/// - Or, you can call `buffer.slice(range).get_mapped_range_mut()` to obtain a +/// [`BufferViewMut`], which dereferences to a `&mut [u8]` that you can use to +/// read and write the buffer's contents. +/// +/// The given `range` must fall within the mapped portion of the buffer. 
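+/// (For example, with bytes `0..32` of a buffer mapped, views of `0..16`
+/// and `16..32` can coexist, but a further view of `8..24` would overlap
+/// both; the ranges here are hypothetical.)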
If you +/// attempt to access overlapping ranges, even for shared access only, these +/// methods panic. +/// +/// For example: +/// +/// ```no_run +/// # let buffer: wgpu::Buffer = todo!(); +/// let slice = buffer.slice(10..20); +/// slice.map_async(wgpu::MapMode::Read, |result| { +/// match result { +/// Ok(()) => { +/// let view = slice.get_mapped_range(); +/// // read data from `view`, which dereferences to `&[u8]` +/// } +/// Err(e) => { +/// // handle mapping error +/// } +/// } +/// }); +/// ``` +/// +/// This example calls `Buffer::slice` to obtain a [`BufferSlice`] referring to +/// the second ten bytes of `buffer`. (To obtain access to the entire buffer, +/// you could call `buffer.slice(..)`.) The code then calls `map_async` to wait +/// for the buffer to be available, and finally calls `get_mapped_range` on the +/// slice to actually get at the bytes. +/// +/// If using `map_async` directly is awkward, you may find it more convenient to +/// use [`Queue::write_buffer`] and [`util::DownloadBuffer::read_buffer`]. +/// However, those each have their own tradeoffs; the asynchronous nature of GPU +/// execution makes it hard to avoid friction altogether. +/// +/// While a buffer is mapped, you must not submit any commands to the GPU that +/// access it. You may record command buffers that use the buffer, but you must +/// not submit such command buffers. +/// +/// When you are done using the buffer on the CPU, you must call +/// [`Buffer::unmap`] to make it available for use by the GPU again. All +/// [`BufferView`] and [`BufferViewMut`] views referring to the buffer must be +/// dropped before you unmap it; otherwise, [`Buffer::unmap`] will panic. +/// +/// ## Mapping buffers on the web +/// +/// When compiled to WebAssembly and running in a browser content process, +/// `wgpu` implements its API in terms of the browser's WebGPU implementation. +/// In this context, `wgpu` is further isolated from the GPU: +/// +/// - Depending on the browser's WebGPU implementation, mapping and unmapping +/// buffers probably entails copies between WebAssembly linear memory and the +/// graphics driver's buffers. +/// +/// - All modern web browsers isolate web content in its own sandboxed process, +/// which can only interact with the GPU via interprocess communication (IPC). +/// Although most browsers' IPC systems use shared memory for large data +/// transfers, there will still probably need to be copies into and out of the +/// shared memory buffers. +/// +/// All of these copies contribute to the cost of buffer mapping in this +/// configuration. +/// +/// [`usage`]: BufferDescriptor::usage +/// [mac]: BufferDescriptor::mapped_at_creation +/// [`MAP_READ`]: BufferUsages::MAP_READ +/// [`MAP_WRITE`]: BufferUsages::MAP_WRITE #[derive(Debug)] pub struct Buffer { context: Arc, @@ -287,14 +431,38 @@ pub struct Buffer { #[cfg(send_sync)] static_assertions::assert_impl_all!(Buffer: Send, Sync); -/// Slice into a [`Buffer`]. +/// A slice of a [`Buffer`], to be mapped, used for vertex or index data, or the like. /// -/// It can be created with [`Buffer::slice`]. To use the whole buffer, call with unbounded slice: +/// You can create a `BufferSlice` by calling [`Buffer::slice`]: /// -/// `buffer.slice(..)` +/// ```no_run +/// # let buffer: wgpu::Buffer = todo!(); +/// let slice = buffer.slice(10..20); +/// ``` /// -/// This type is unique to the Rust API of `wgpu`. In the WebGPU specification, -/// an offset and size are specified as arguments to each call working with the [`Buffer`], instead. 
+/// This returns a slice referring to the second ten bytes of `buffer`. To get a +/// slice of the entire `Buffer`: +/// +/// ```no_run +/// # let buffer: wgpu::Buffer = todo!(); +/// let whole_buffer_slice = buffer.slice(..); +/// ``` +/// +/// A [`BufferSlice`] is nothing more than a reference to the `Buffer` and a +/// starting and ending position. To access the slice's contents on the CPU, you +/// must first [map] the buffer, and then call [`BufferSlice::get_mapped_range`] +/// or [`BufferSlice::get_mapped_range_mut`] to obtain a view of the slice's +/// contents, which dereferences to a `&[u8]` or `&mut [u8]`. +/// +/// You can also pass buffer slices to methods like +/// [`RenderPass::set_vertex_buffer`] and [`RenderPass::set_index_buffer`] to +/// indicate which data a draw call should consume. +/// +/// The `BufferSlice` type is unique to the Rust API of `wgpu`. In the WebGPU +/// specification, an offset and size are specified as arguments to each call +/// working with the [`Buffer`], instead. +/// +/// [map]: Buffer#mapping-buffers #[derive(Copy, Clone, Debug)] pub struct BufferSlice<'a> { buffer: &'a Buffer, @@ -380,9 +548,19 @@ static_assertions::assert_impl_all!(SurfaceConfiguration: Send, Sync); /// serves a similar role. pub struct Surface<'window> { context: Arc, - _surface: Option>, + + /// Optionally, keep the source of the handle used for the surface alive. + /// + /// This is useful for platforms where the surface is created from a window and the surface + /// would become invalid when the window is dropped. + _handle_source: Option>, + + /// Wgpu-core surface id. id: ObjectId, - data: Box, + + /// Additional surface data returned by [`DynContext::instance_create_surface`]. + surface_data: Box, + // Stores the latest `SurfaceConfiguration` that was set using `Surface::configure`. // It is required to set the attributes of the `SurfaceTexture` in the // `Surface::get_current_texture` method. @@ -399,15 +577,15 @@ impl<'window> fmt::Debug for Surface<'window> { f.debug_struct("Surface") .field("context", &self.context) .field( - "_surface", - &if self._surface.is_some() { + "_handle_source", + &if self._handle_source.is_some() { "Some" } else { "None" }, ) .field("id", &self.id) - .field("data", &self.data) + .field("data", &self.surface_data) .field("config", &self.config) .finish() } @@ -419,7 +597,8 @@ static_assertions::assert_impl_all!(Surface<'_>: Send, Sync); impl Drop for Surface<'_> { fn drop(&mut self) { if !thread::panicking() { - self.context.surface_drop(&self.id, self.data.as_ref()) + self.context + .surface_drop(&self.id, self.surface_data.as_ref()) } } } @@ -647,6 +826,139 @@ impl Drop for ShaderModule { } } +impl ShaderModule { + /// Get the compilation info for the shader module. + pub fn get_compilation_info(&self) -> impl Future + WasmNotSend { + self.context + .shader_get_compilation_info(&self.id, self.data.as_ref()) + } +} + +/// Compilation information for a shader module. +/// +/// Corresponds to [WebGPU `GPUCompilationInfo`](https://gpuweb.github.io/gpuweb/#gpucompilationinfo). +/// The source locations use bytes, and index a UTF-8 encoded string. +#[derive(Debug, Clone)] +pub struct CompilationInfo { + /// The messages from the shader compilation process. + pub messages: Vec, +} + +/// A single message from the shader compilation process. +/// +/// Roughly corresponds to [`GPUCompilationMessage`](https://www.w3.org/TR/webgpu/#gpucompilationmessage), +/// except that the location uses UTF-8 for all positions. 
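+/// For example (values illustrative): a message covering bytes 15..20 of a
+/// shader whose second line starts at byte 10 is reported with
+/// `line_number: 2`, `line_position: 6`, `offset: 15`, and `length: 5`.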
+#[derive(Debug, Clone)] +pub struct CompilationMessage { + /// The text of the message. + pub message: String, + /// The type of the message. + pub message_type: CompilationMessageType, + /// Where in the source code the message points at. + pub location: Option, +} + +/// The type of a compilation message. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompilationMessageType { + /// An error message. + Error, + /// A warning message. + Warning, + /// An informational message. + Info, +} + +/// A human-readable representation for a span, tailored for text source. +/// +/// Roughly corresponds to the positional members of [`GPUCompilationMessage`][gcm] from +/// the WebGPU specification, except +/// - `offset` and `length` are in bytes (UTF-8 code units), instead of UTF-16 code units. +/// - `line_position` is in bytes (UTF-8 code units), and is usually not directly intended for humans. +/// +/// [gcm]: https://www.w3.org/TR/webgpu/#gpucompilationmessage +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct SourceLocation { + /// 1-based line number. + pub line_number: u32, + /// 1-based column in code units (in bytes) of the start of the span. + /// Remember to convert accordingly when displaying to the user. + pub line_position: u32, + /// 0-based Offset in code units (in bytes) of the start of the span. + pub offset: u32, + /// Length in code units (in bytes) of the span. + pub length: u32, +} + +#[cfg(all(feature = "wgsl", wgpu_core))] +impl From> for CompilationInfo { + fn from(value: naga::error::ShaderError) -> Self { + CompilationInfo { + messages: vec![CompilationMessage { + message: value.to_string(), + message_type: CompilationMessageType::Error, + location: value.inner.location(&value.source).map(Into::into), + }], + } + } +} +#[cfg(feature = "glsl")] +impl From> for CompilationInfo { + fn from(value: naga::error::ShaderError) -> Self { + let messages = value + .inner + .errors + .into_iter() + .map(|err| CompilationMessage { + message: err.to_string(), + message_type: CompilationMessageType::Error, + location: err.location(&value.source).map(Into::into), + }) + .collect(); + CompilationInfo { messages } + } +} + +#[cfg(feature = "spirv")] +impl From> for CompilationInfo { + fn from(value: naga::error::ShaderError) -> Self { + CompilationInfo { + messages: vec![CompilationMessage { + message: value.to_string(), + message_type: CompilationMessageType::Error, + location: None, + }], + } + } +} + +#[cfg(any(wgpu_core, naga))] +impl From>> + for CompilationInfo +{ + fn from(value: naga::error::ShaderError>) -> Self { + CompilationInfo { + messages: vec![CompilationMessage { + message: value.to_string(), + message_type: CompilationMessageType::Error, + location: value.inner.location(&value.source).map(Into::into), + }], + } + } +} + +#[cfg(any(wgpu_core, naga))] +impl From for SourceLocation { + fn from(value: naga::SourceLocation) -> Self { + SourceLocation { + length: value.length, + offset: value.offset, + line_number: value.line_number, + line_position: value.line_position, + } + } +} + /// Source of a shader module. /// /// The source will be parsed and validated. @@ -1484,14 +1796,10 @@ pub struct VertexState<'a> { /// The name of the entry point in the compiled shader. There must be a function with this name /// in the shader. pub entry_point: &'a str, - /// Specifies the values of pipeline-overridable constants in the shader module. 
- /// - /// If an `@id` attribute was specified on the declaration, - /// the key must be the pipeline constant ID as a decimal ASCII number; if not, - /// the key must be the constant's identifier name. + /// Advanced options for when this pipeline is compiled /// - /// The value may represent any of WGSL's concrete scalar types. - pub constants: &'a HashMap, + /// This implements `Default`, and for most users can be set to `Default::default()` + pub compilation_options: PipelineCompilationOptions<'a>, /// The format of any vertex buffers used with this pipeline. pub buffers: &'a [VertexBufferLayout<'a>], } @@ -1511,14 +1819,10 @@ pub struct FragmentState<'a> { /// The name of the entry point in the compiled shader. There must be a function with this name /// in the shader. pub entry_point: &'a str, - /// Specifies the values of pipeline-overridable constants in the shader module. - /// - /// If an `@id` attribute was specified on the declaration, - /// the key must be the pipeline constant ID as a decimal ASCII number; if not, - /// the key must be the constant's identifier name. + /// Advanced options for when this pipeline is compiled /// - /// The value may represent any of WGSL's concrete scalar types. - pub constants: &'a HashMap, + /// This implements `Default`, and for most users can be set to `Default::default()` + pub compilation_options: PipelineCompilationOptions<'a>, /// The color state of the render targets. pub targets: &'a [Option], } @@ -1591,6 +1895,41 @@ pub struct ComputePassDescriptor<'a> { #[cfg(send_sync)] static_assertions::assert_impl_all!(ComputePassDescriptor<'_>: Send, Sync); +#[derive(Clone, Debug)] +/// Advanced options for use when a pipeline is compiled +/// +/// This implements `Default`, and for most users can be set to `Default::default()` +pub struct PipelineCompilationOptions<'a> { + /// Specifies the values of pipeline-overridable constants in the shader module. + /// + /// If an `@id` attribute was specified on the declaration, + /// the key must be the pipeline constant ID as a decimal ASCII number; if not, + /// the key must be the constant's identifier name. + /// + /// The value may represent any of WGSL's concrete scalar types. + pub constants: &'a HashMap, + /// Whether workgroup scoped memory will be initialized with zero values for this stage. + /// + /// This is required by the WebGPU spec, but may have overhead which can be avoided + /// for cross-platform applications + pub zero_initialize_workgroup_memory: bool, +} + +impl<'a> Default for PipelineCompilationOptions<'a> { + fn default() -> Self { + // HashMap doesn't have a const constructor, due to the use of RandomState + // This does introduce some synchronisation costs, but these should be minor, + // and might be cheaper than the alternative of getting new random state + static DEFAULT_CONSTANTS: std::sync::OnceLock> = + std::sync::OnceLock::new(); + let constants = DEFAULT_CONSTANTS.get_or_init(Default::default); + Self { + constants, + zero_initialize_workgroup_memory: true, + } + } +} + /// Describes a compute pipeline. /// /// For use with [`Device::create_compute_pipeline`]. @@ -1608,14 +1947,10 @@ pub struct ComputePipelineDescriptor<'a> { /// The name of the entry point in the compiled shader. There must be a function with this name /// and no return value in the shader. pub entry_point: &'a str, - /// Specifies the values of pipeline-overridable constants in the shader module. 
-    ///
-    /// If an `@id` attribute was specified on the declaration,
-    /// the key must be the pipeline constant ID as a decimal ASCII number; if not,
-    /// the key must be the constant's identifier name.
+    /// Advanced options for when this pipeline is compiled.
     ///
-    /// The value may represent any of WGSL's concrete scalar types.
-    pub constants: &'a HashMap<String, f64>,
+    /// This implements `Default`, and for most users can be set to `Default::default()`.
+    pub compilation_options: PipelineCompilationOptions<'a>,
 }
 #[cfg(send_sync)]
 static_assertions::assert_impl_all!(ComputePipelineDescriptor<'_>: Send, Sync);
@@ -1983,6 +2318,8 @@ impl Instance {
     /// Creates a new surface targeting a given window/canvas/surface/etc..
     ///
+    /// Internally, this creates surfaces for all backends that are enabled for this instance.
+    ///
     /// See [`SurfaceTarget`] for what targets are supported.
     /// See [`Instance::create_surface_unsafe`] for surface creation with unsafe target variants.
     ///
@@ -1993,7 +2330,7 @@ impl Instance {
         target: impl Into<SurfaceTarget<'window>>,
     ) -> Result<Surface<'window>, CreateSurfaceError> {
         // Handle origin (i.e. window) to optionally take ownership of to make the surface outlast the window.
-        let handle_origin;
+        let handle_source;
 
         let target = target.into();
         let mut surface = match target {
@@ -2003,14 +2340,14 @@
                         inner: CreateSurfaceErrorKind::RawHandle(e),
                     })?,
                 );
-                handle_origin = Some(window);
+                handle_source = Some(window);
 
                 surface
             }?,
 
             #[cfg(any(webgpu, webgl))]
             SurfaceTarget::Canvas(canvas) => {
-                handle_origin = None;
+                handle_source = None;
 
                 let value: &wasm_bindgen::JsValue = &canvas;
                 let obj = std::ptr::NonNull::from(value).cast();
@@ -2029,7 +2366,7 @@ impl Instance {
 
             #[cfg(any(webgpu, webgl))]
             SurfaceTarget::OffscreenCanvas(canvas) => {
-                handle_origin = None;
+                handle_source = None;
 
                 let value: &wasm_bindgen::JsValue = &canvas;
                 let obj = std::ptr::NonNull::from(value).cast();
@@ -2048,13 +2385,15 @@
             }
         };
 
-        surface._surface = handle_origin;
+        surface._handle_source = handle_source;
 
         Ok(surface)
     }
 
     /// Creates a new surface targeting a given window/canvas/surface/etc. using an unsafe target.
     ///
+    /// Internally, this creates surfaces for all backends that are enabled for this instance.
+    ///
     /// See [`SurfaceTargetUnsafe`] for what targets are supported.
     /// See [`Instance::create_surface`] for surface creation with safe target variants.
     ///
@@ -2069,9 +2408,9 @@ impl Instance {
 
         Ok(Surface {
             context: Arc::clone(&self.context),
-            _surface: None,
+            _handle_source: None,
             id,
-            data,
+            surface_data: data,
             config: Mutex::new(None),
         })
     }
@@ -2245,7 +2584,7 @@ impl Adapter {
             &self.id,
             self.data.as_ref(),
             &surface.id,
-            surface.data.as_ref(),
+            surface.surface_data.as_ref(),
         )
     }
 
@@ -2933,6 +3272,18 @@ fn range_to_offset_size<S: RangeBounds<BufferAddress>>(
 }
 
 /// Read only view into a mapped buffer.
+///
+/// To get a `BufferView`, first [map] the buffer, and then
+/// call `buffer.slice(range).get_mapped_range()`.
+///
+/// `BufferView` dereferences to `&[u8]`, so you can use all the usual Rust
+/// slice methods to access the buffer's contents. It also implements
+/// `AsRef<[u8]>`, if that's more convenient.
+///
+/// If you try to create overlapping views of a buffer, mutable or
+/// otherwise, `get_mapped_range` will panic.
+///
+/// [map]: Buffer#mapping-buffers
 #[derive(Debug)]
 pub struct BufferView<'a> {
     slice: BufferSlice<'a>,
@@ -2941,8 +3292,20 @@
 
 /// Write only view into mapped buffer.
/// +/// To get a `BufferViewMut`, first [map] the buffer, and then +/// call `buffer.slice(range).get_mapped_range_mut()`. +/// +/// `BufferViewMut` dereferences to `&mut [u8]`, so you can use all the usual +/// Rust slice methods to access the buffer's contents. It also implements +/// `AsMut<[u8]>`, if that's more convenient. +/// /// It is possible to read the buffer using this view, but doing so is not /// recommended, as it is likely to be slow. +/// +/// If you try to create overlapping views of a buffer, mutable or +/// otherwise, `get_mapped_range_mut` will panic. +/// +/// [map]: Buffer#mapping-buffers #[derive(Debug)] pub struct BufferViewMut<'a> { slice: BufferSlice<'a>, @@ -4644,11 +5007,24 @@ impl<'a> Drop for QueueWriteBufferView<'a> { impl Queue { /// Schedule a data write into `buffer` starting at `offset`. /// - /// This method is intended to have low performance costs. - /// As such, the write is not immediately submitted, and instead enqueued - /// internally to happen at the start of the next `submit()` call. - /// /// This method fails if `data` overruns the size of `buffer` starting at `offset`. + /// + /// This does *not* submit the transfer to the GPU immediately. Calls to + /// `write_buffer` begin execution only on the next call to + /// [`Queue::submit`]. To get a set of scheduled transfers started + /// immediately, it's fine to call `submit` with no command buffers at all: + /// + /// ```no_run + /// # let queue: wgpu::Queue = todo!(); + /// queue.submit([]); + /// ``` + /// + /// However, `data` will be immediately copied into staging memory, so the + /// caller may discard it any time after this call completes. + /// + /// If possible, consider using [`Queue::write_buffer_with`] instead. That + /// method avoids an intermediate copy and is often able to transfer data + /// more efficiently than this one. pub fn write_buffer(&self, buffer: &Buffer, offset: BufferAddress, data: &[u8]) { DynContext::queue_write_buffer( &*self.context, @@ -4661,14 +5037,32 @@ impl Queue { ) } - /// Schedule a data write into `buffer` starting at `offset` via the returned - /// [`QueueWriteBufferView`]. + /// Write to a buffer via a directly mapped staging buffer. /// - /// Reading from this buffer is slow and will not yield the actual contents of the buffer. + /// Return a [`QueueWriteBufferView`] which, when dropped, schedules a copy + /// of its contents into `buffer` at `offset`. The returned view + /// dereferences to a `size`-byte long `&mut [u8]`, in which you should + /// store the data you would like written to `buffer`. /// - /// This method is intended to have low performance costs. - /// As such, the write is not immediately submitted, and instead enqueued - /// internally to happen at the start of the next `submit()` call. + /// This method may perform transfers faster than [`Queue::write_buffer`], + /// because the returned [`QueueWriteBufferView`] is actually the staging + /// buffer for the write, mapped into the caller's address space. Writing + /// your data directly into this staging buffer avoids the temporary + /// CPU-side buffer needed by `write_buffer`. + /// + /// Reading from the returned view is slow, and will not yield the current + /// contents of `buffer`. + /// + /// Note that dropping the [`QueueWriteBufferView`] does *not* submit the + /// transfer to the GPU immediately. The transfer begins only on the next + /// call to [`Queue::submit`] after the view is dropped. 
To get a set of
+    /// scheduled transfers started immediately, it's fine to call `submit` with
+    /// no command buffers at all:
+    ///
+    /// ```no_run
+    /// # let queue: wgpu::Queue = todo!();
+    /// queue.submit([]);
+    /// ```
     ///
     /// This method fails if `size` is greater than the size of `buffer` starting at `offset`.
     #[must_use]
@@ -4712,13 +5106,20 @@ impl Queue {
     /// texture (coordinate offset, mip level) that will be overwritten.
     /// * `size` is the size, in texels, of the region to be written.
     ///
-    /// This method is intended to have low performance costs.
-    /// As such, the write is not immediately submitted, and instead enqueued
-    /// internally to happen at the start of the next `submit()` call.
-    /// However, `data` will be immediately copied into staging memory; so the caller may
-    /// discard it any time after this call completes.
-    ///
     /// This method fails if `size` overruns the size of `texture`, or if `data` is too short.
+    ///
+    /// This does *not* submit the transfer to the GPU immediately. Calls to
+    /// `write_texture` begin execution only on the next call to
+    /// [`Queue::submit`]. To get a set of scheduled transfers started
+    /// immediately, it's fine to call `submit` with no command buffers at all:
+    ///
+    /// ```no_run
+    /// # let queue: wgpu::Queue = todo!();
+    /// queue.submit([]);
+    /// ```
+    ///
+    /// However, `data` will be immediately copied into staging memory, so the
+    /// caller may discard it any time after this call completes.
     pub fn write_texture(
         &self,
         texture: ImageCopyTexture<'_>,
@@ -4849,7 +5250,7 @@ impl Surface<'_> {
         DynContext::surface_get_capabilities(
             &*self.context,
             &self.id,
-            self.data.as_ref(),
+            self.surface_data.as_ref(),
             &adapter.id,
             adapter.data.as_ref(),
         )
@@ -4888,7 +5289,7 @@ impl Surface<'_> {
         DynContext::surface_configure(
             &*self.context,
             &self.id,
-            self.data.as_ref(),
+            self.surface_data.as_ref(),
             &device.id,
             device.data.as_ref(),
             config,
@@ -4907,8 +5308,11 @@ impl Surface<'_> {
     /// If a SurfaceTexture referencing this surface is alive when the swapchain is recreated,
     /// recreating the swapchain will panic.
     pub fn get_current_texture(&self) -> Result<SurfaceTexture, SurfaceError> {
-        let (texture_id, texture_data, status, detail) =
-            DynContext::surface_get_current_texture(&*self.context, &self.id, self.data.as_ref());
+        let (texture_id, texture_data, status, detail) = DynContext::surface_get_current_texture(
+            &*self.context,
+            &self.id,
+            self.surface_data.as_ref(),
+        );
 
         let suboptimal = match status {
             SurfaceStatus::Good => false,
@@ -4971,7 +5375,7 @@ impl Surface<'_> {
             .downcast_ref::<crate::backend::ContextWgpuCore>()
            .map(|ctx| unsafe {
                ctx.surface_as_hal::<A, F, R>(
-                    self.data.downcast_ref().unwrap(),
+                    self.surface_data.downcast_ref().unwrap(),
                     hal_surface_callback,
                 )
             })
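
A minimal sketch of consuming the `CompilationMessage`/`SourceLocation` types above. It assumes an async context, a `device: wgpu::Device`, and the WGSL text in `source: String` that was passed to `create_shader_module`; all bindings and labels are illustrative. Because `offset` and `length` are UTF-8 byte indices, they can slice the original source string directly, with no UTF-16 code-unit conversion:

```rust
let module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
    label: Some("example"),
    source: wgpu::ShaderSource::Wgsl(source.as_str().into()),
});
let info = module.get_compilation_info().await;
for message in &info.messages {
    if let Some(loc) = message.location {
        // `offset`/`length` index the same `source` string the module was built
        // from, so this slice is exactly the span the message points at.
        let span = &source[loc.offset as usize..(loc.offset + loc.length) as usize];
        eprintln!(
            "{:?} at line {}: {} (`{}`)",
            message.message_type, loc.line_number, message.message, span
        );
    }
}
```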
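A sketch of passing `PipelineCompilationOptions` explicitly when creating a compute pipeline, assuming a `device` and a compute shader `module` that declares an overridable `tuning` constant (the names and values are illustrative, not part of this diff):

```rust
use std::collections::HashMap;

let constants = HashMap::from([("tuning".to_owned(), 2.0_f64)]);
let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
    label: Some("example compute pipeline"),
    layout: None, // infer the pipeline layout from the shader
    module: &module,
    entry_point: "main",
    compilation_options: wgpu::PipelineCompilationOptions {
        constants: &constants,
        // Skipping zero-initialization can save work on some backends, but is
        // only sound if the shader never reads workgroup memory before writing
        // it; otherwise keep the spec-mandated default of `true`.
        zero_initialize_workgroup_memory: false,
    },
});
```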
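And a sketch of the `write_buffer_with` flow described in the `Queue` docs above, assuming a `queue` and a `buffer` created with `wgpu::BufferUsages::COPY_DST`; the payload is illustrative:

```rust
let payload = [1u8, 2, 3, 4];
let size = std::num::NonZeroU64::new(payload.len() as u64).unwrap();
if let Some(mut view) = queue.write_buffer_with(&buffer, 0, size) {
    // The view is the mapped staging buffer itself: write into it, don't read.
    view.copy_from_slice(&payload);
} // dropping the view schedules the copy into `buffer`
queue.submit([]); // an empty submit starts all scheduled transfers immediately
```

Writing into the view directly is what saves the intermediate CPU-side copy that plain `write_buffer` would make.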