From fe9931ab5364679f8ea3ea19907927ef69db9da8 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Fri, 1 Nov 2024 13:27:18 -0700
Subject: [PATCH 01/25] Fix asynchronous commands

---
 pyproject.toml                        |   5 +-
 tests/test_async.py                   | 105 ++++++++++
 tests/test_wgpu_native_basics.py      |   5 -
 tests/test_wgpu_native_buffer.py      |  21 ++
 wgpu/backends/wgpu_native/__init__.py |   1 +
 wgpu/backends/wgpu_native/_api.py     | 276 +++++++++++++-------------
 wgpu/backends/wgpu_native/_helpers.py |  85 ++++----
 7 files changed, 312 insertions(+), 186 deletions(-)
 create mode 100644 tests/test_async.py

diff --git a/pyproject.toml b/pyproject.toml
index f98f6ed3..d4b45d29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ imgui = ["imgui-bundle>=1.2.1"]
 build = ["build", "hatchling", "requests", "twine"]
 codegen = ["pytest", "numpy", "ruff"]
 lint = ["ruff", "pre-commit"]
-tests = ["numpy", "pytest", "psutil", "imageio"]
+tests = ["numpy", "pytest", "pytest-asyncio", "psutil", "imageio"]
 examples = []
 docs = ["sphinx>7.2", "sphinx_rtd_theme"]
 dev = ["wgpu[build,codegen,lint,tests,examples,docs]"]
@@ -63,6 +63,9 @@ artifacts = ["*.so", "*.dll", "*.dylib"]
 [tool.hatch.build.targets.wheel.hooks.custom]
 path = "tools/hatch_build.py"
 
+[tool.pytest.ini_options]
+asyncio_default_fixture_loop_scope = "function"
+
 # ===== Tooling
 
 [tool.ruff]
diff --git a/tests/test_async.py b/tests/test_async.py
new file mode 100644
index 00000000..cdd0c4f0
--- /dev/null
+++ b/tests/test_async.py
@@ -0,0 +1,105 @@
+import asyncio
+
+import pytest
+
+import wgpu.utils
+from tests.testutils import run_tests
+from wgpu import MapMode, TextureFormat
+from wgpu.backends.wgpu_native import WgpuAwaitable
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("use_async", [False, True])
+async def test_awaitable_async(use_async, loop_scope="function"):
+    count = 0
+
+    def finalizer(i):
+        return i * i
+
+    def callback(i):
+        awaitable.set_result(i)
+
+    def poll_function():
+        nonlocal count
+        count += 1
+        if count >= 3:
+            callback(10)
+
+    awaitable = WgpuAwaitable("test", callback, finalizer, poll_function)
+
+    if use_async:
+        result = await awaitable.wait_async()
+    else:
+        result = awaitable.wait_sync()
+
+
+@pytest.mark.asyncio
+async def test_asynchronous_get_device(loop_scope="function"):
+    adapter = await wgpu.gpu.request_adapter_async(power_preference="high-performance")
+    device = await adapter.request_device_async()
+    assert device is not None
+
+
+@pytest.mark.asyncio
+async def test_asynchronous_buffer_map(loop_scope="function"):
+    device = wgpu.utils.get_default_device()
+
+    data = b"1" * 10000
+    buffer1 = device.create_buffer(size=len(data), usage="MAP_WRITE|COPY_SRC")
+    buffer2 = device.create_buffer(size=len(data), usage="MAP_READ|COPY_DST")
+    await buffer1.map_async(MapMode.WRITE)
+    buffer1.write_mapped(data)
+    buffer1.unmap()
+
+    command_encoder = device.create_command_encoder()
+    command_encoder.copy_buffer_to_buffer(buffer1, 0, buffer2, 0, len(data))
+    device.queue.submit([command_encoder.finish()])
+
+    await buffer2.map_async(MapMode.READ)
+    data2 = buffer2.read_mapped()
+    buffer2.unmap()
+
+    assert bytes(data2) == data
+
+
+@pytest.mark.asyncio
+async def test_asynchronous_make_pipeline(loop_scope="function"):
+    device = wgpu.utils.get_default_device()
+
+    shader_source = """
+        @vertex
+        fn vertex_main() -> @builtin(position) vec4f {
+            return vec4f(0, 0, 0, 1.);
+        }
+    
+        @compute @workgroup_size(1)
+        fn compute_main() { }
+    """
+
+    shader = device.create_shader_module(code=shader_source)
+
+    render_pipeline, compute_pipeline = await asyncio.gather(
+        device.create_render_pipeline_async(
+            layout="auto",
+            vertex={
+                "module": shader,
+            },
+            depth_stencil={"format": TextureFormat.rgba8unorm},
+        ),
+        device.create_compute_pipeline_async(layout="auto", compute={"module": shader}),
+    )
+
+    assert compute_pipeline is not None
+    assert render_pipeline is not None
+
+    command_encoder = device.create_command_encoder()
+    compute_pass = command_encoder.begin_compute_pass()
+    compute_pass.set_pipeline(compute_pipeline)
+    compute_pass.dispatch_workgroups(10, 10)
+    compute_pass.end()
+    device.queue.submit([command_encoder.finish()])
+    await device.queue.on_submitted_work_done_async()
+
+
+if __name__ == "__main__":
+    run_tests(globals())
diff --git a/tests/test_wgpu_native_basics.py b/tests/test_wgpu_native_basics.py
index 21111d90..c30229d7 100644
--- a/tests/test_wgpu_native_basics.py
+++ b/tests/test_wgpu_native_basics.py
@@ -12,7 +12,6 @@
 from testutils import run_tests, can_use_wgpu_lib, is_ci
 from pytest import mark, raises
 
-
 is_win = sys.platform.startswith("win")
 
 
@@ -468,7 +467,3 @@ def test_limits_are_not_legal():
 
 if __name__ == "__main__":
     run_tests(globals())
-
-
-if __name__ == "__main__":
-    run_tests(globals())
diff --git a/tests/test_wgpu_native_buffer.py b/tests/test_wgpu_native_buffer.py
index 994aa6af..aa0d0704 100644
--- a/tests/test_wgpu_native_buffer.py
+++ b/tests/test_wgpu_native_buffer.py
@@ -548,5 +548,26 @@ def test_create_buffer_with_data(size):
     assert copy[size:] == bytes(buffer._nbytes - size)
 
 
+@pytest.mark.skip
+def test_show_bug_wgpu_native_305_still_not_fixed():
+    # When this bug is fixed, we can remove READ_NOSYNC, and just treat "READ" as if
+    # it were READ_NOSYNC.  No need to handle the command buffer.
+    device = wgpu.utils.get_default_device()
+    data1 = b"abcdefghijkl"
+
+    # Create buffer with data
+    buf = device.create_buffer(
+        size=len(data1), usage=wgpu.BufferUsage.MAP_READ, mapped_at_creation=True
+    )
+    buf.write_mapped(data1)
+    buf.unmap()
+
+    # Download from buffer to CPU
+    buf.map("READ_NOSYNC")
+    data2 = buf.read_mapped()
+    buf.unmap()
+    assert data1 == data2
+
+
 if __name__ == "__main__":
     run_tests(globals())
diff --git a/wgpu/backends/wgpu_native/__init__.py b/wgpu/backends/wgpu_native/__init__.py
index 87b38033..dc9116de 100644
--- a/wgpu/backends/wgpu_native/__init__.py
+++ b/wgpu/backends/wgpu_native/__init__.py
@@ -21,3 +21,4 @@
 _register_backend(gpu)
 
 from .extras import request_device_sync, request_device
+from ._helpers import WgpuAwaitable
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 9bc3b8e4..e58ad2c0 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -34,8 +34,8 @@
     get_memoryview_and_address,
     to_snake_case,
     ErrorHandler,
-    WgpuAwaitable,
     SafeLibCalls,
+    WgpuAwaitable,
 )
 
 logger = logging.getLogger("wgpu")
@@ -331,7 +331,7 @@ def request_adapter_sync(
             canvas=canvas,
         )
 
-        return awaitable.sync_wait()
+        return awaitable.wait_sync()
 
     async def request_adapter_async(
         self,
@@ -357,8 +357,7 @@ async def request_adapter_async(
             force_fallback_adapter=force_fallback_adapter,
             canvas=canvas,
         )  # no-cover
-
-        return await awaitable
+        return await awaitable.wait_async()
 
     def _request_adapter(
         self, *, power_preference=None, force_fallback_adapter=False, canvas=None
@@ -401,28 +400,25 @@ def _request_adapter(
             # not used: nextInChain
         )
 
-        awaitable_result = {}
-
         @ffi.callback("void(WGPURequestAdapterStatus, WGPUAdapter, char *, void *)")
-        def callback(status, result, message, userdata):
+        def callback(status, result, message, _userdata):
             if status != 0:
                 msg = "-" if message == ffi.NULL else ffi.string(message).decode()
-                awaitable_result["error"] = f"Request adapter failed ({status}): {msg}"
+                awaitable.set_error(f"Request adapter failed ({status}): {msg}")
             else:
-                awaitable_result["result"] = result
-
-        def poll_func():
-            pass  # actually does not need a poll func at the moment
+                awaitable.set_result(result)
 
         def finalizer(adapter_id):
             return self._create_adapter(adapter_id)
 
+        # Note that although we claim this is an asynchronous method, the callback
+        # happens within libf.wgpuInstanceRequestAdapter
+        awaitable = WgpuAwaitable("request_adapter", callback, finalizer)
+
         # H: void f(WGPUInstance instance, WGPURequestAdapterOptions const * options, WGPUInstanceRequestAdapterCallback callback, void * userdata)
         libf.wgpuInstanceRequestAdapter(get_wgpu_instance(), struct, callback, ffi.NULL)
 
-        return WgpuAwaitable(
-            "request_adapter", awaitable_result, callback, poll_func, finalizer
-        )
+        return awaitable
 
     def enumerate_adapters_sync(self):
         """Sync version of ``enumerate_adapters_async()``.
@@ -858,7 +854,7 @@ def request_device_sync(
         awaitable = self._request_device(
             label, required_features, required_limits, default_queue, ""
         )
-        return awaitable.sync_wait()
+        return awaitable.wait_sync()
 
     async def request_device_async(
         self,
@@ -871,14 +867,11 @@ async def request_device_async(
         if default_queue:
             check_struct("QueueDescriptor", default_queue)
         awaitable = self._request_device(
-            label,
-            required_features=required_features,
-            required_limits=required_limits,
-            default_queue=default_queue,
-            trace_path="",
+            label, required_features, required_limits, default_queue, ""
         )
-
-        return await awaitable
+        # Note that although we claim this function is async, the callback always
+        # happens inside the call to libf.wgpuAdapterRequestDevice
+        return await awaitable.wait_async()
 
     def _request_device(
         self,
@@ -1033,18 +1026,13 @@ def uncaptured_error_callback(c_type, c_message, userdata):
             # not used: deviceLostUserdata
         )
 
-        awaitable_result = {}
-
         @ffi.callback("void(WGPURequestDeviceStatus, WGPUDevice, char *, void *)")
         def callback(status, result, message, userdata):
             if status != 0:
                 msg = "-" if message == ffi.NULL else ffi.string(message).decode()
-                awaitable_result["error"] = f"Request device failed ({status}): {msg}"
+                awaitable.set_error(f"Request device failed ({status}): {msg}")
             else:
-                awaitable_result["result"] = result
-
-        def poll_func():
-            pass  # not needed at the moment. Replace with future poll method later.
+                awaitable.set_result(result)
 
         def finalizer(device_id):
             limits = _get_limits(device_id, device=True)
@@ -1061,12 +1049,12 @@ def finalizer(device_id):
 
             return device
 
+        awaitable = WgpuAwaitable("request_device", callback, finalizer)
+
         # H: void f(WGPUAdapter adapter, WGPUDeviceDescriptor const * descriptor, WGPUAdapterRequestDeviceCallback callback, void * userdata)
         libf.wgpuAdapterRequestDevice(self._internal, struct, callback, ffi.NULL)
 
-        return WgpuAwaitable(
-            "request_device", awaitable_result, callback, poll_func, finalizer
-        )
+        return awaitable
 
     def _release(self):
         if self._internal is not None and libf is not None:
@@ -1079,6 +1067,8 @@ class GPUDevice(classes.GPUDevice, GPUObjectBase):
     # GPUObjectBaseMixin
     _release_function = libf.wgpuDeviceRelease
 
+    CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED = True
+
     def _poll(self):
         # Internal function
         if self._internal:
@@ -1567,7 +1557,9 @@ def create_compute_pipeline(
         layout: Union[GPUPipelineLayout, enums.AutoLayoutMode],
         compute: structs.ProgrammableStage,
     ):
-        return self._create_compute_pipeline(label, layout, compute, asynchronous=False)
+        descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
+        id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
+        return GPUComputePipeline(label, id, self)
 
     async def create_compute_pipeline_async(
         self,
@@ -1576,16 +1568,43 @@ async def create_compute_pipeline_async(
         layout: Union[GPUPipelineLayout, enums.AutoLayoutMode],
         compute: structs.ProgrammableStage,
     ):
-        # TODO: wgpuDeviceCreateComputePipelineAsync is not yet implemented in wgpu-native
-        return self._create_compute_pipeline(label, layout, compute, asynchronous=False)
+        descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
+
+        if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+            id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
+            return GPUComputePipeline(label, id, self)
+
+        # This code is virtually identical to the code in create_render_pipeline_async.
+        # Can they be merged??
+        @ffi.callback(
+            "void(WGPUCreatePipelineAsyncStatus, WGPUComputePipeline, char *, void *)"
+        )
+        def callback(status, result, message, _userdata):
+            if status != 0:
+                msg = "-" if message == ffi.NULL else ffi.string(message).decode()
+                awaitable.set_error(f"create_compute_pipeline failed ({status}): {msg}")
+            else:
+                awaitable.set_result(result)
+
+        def finalizer(id):
+            return GPUComputePipeline(label, id, self)
 
-    def _create_compute_pipeline(
+        awaitable = WgpuAwaitable(
+            "create_compute_pipeline", callback, finalizer, self._device.poll
+        )
+
+        # H: void f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUDeviceCreateRenderPipelineAsyncCallback callback, void * userdata)
+        libf.wgpuDeviceCreateComputePipelineAsync(
+            self._internal, descriptor, callback, ffi.NULL
+        )
+
+        return await awaitable.wait_async()
+
+    def _create_compute_pipeline_descriptor(
         self,
         label: str,
         layout: Union[GPUPipelineLayout, enums.AutoLayoutMode],
         compute: structs.ProgrammableStage,
-        *,
-        asynchronous,
     ):
         check_struct("ProgrammableStage", compute)
         c_constants, c_constant_entries = _get_override_constant_entries(compute)
@@ -1616,13 +1635,7 @@ def _create_compute_pipeline(
             compute=c_compute_stage,
             # not used: nextInChain
         )
-
-        if asynchronous:
-            raise NotImplementedError()
-
-        # H: WGPUComputePipeline f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor)
-        id = libf.wgpuDeviceCreateComputePipeline(self._internal, struct)
-        return GPUComputePipeline(label, id, self)
+        return struct
 
     def create_render_pipeline(
         self,
@@ -1635,16 +1648,11 @@ def create_render_pipeline(
         multisample: structs.MultisampleState = {},
         fragment: structs.FragmentState = optional,
     ):
-        return self._create_render_pipeline(
-            label,
-            layout,
-            vertex,
-            primitive,
-            depth_stencil,
-            multisample,
-            fragment,
-            asynchronous=False,
+        descriptor = self._create_render_pipeline_descriptor(
+            label, layout, vertex, primitive, depth_stencil, multisample, fragment
         )
+        id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
+        return GPURenderPipeline(label, id, self)
 
     async def create_render_pipeline_async(
         self,
@@ -1658,18 +1666,40 @@ async def create_render_pipeline_async(
         fragment: structs.FragmentState = optional,
     ):
         # TODO: wgpuDeviceCreateRenderPipelineAsync is not yet implemented in wgpu-native
-        return self._create_render_pipeline(
-            label,
-            layout,
-            vertex,
-            primitive,
-            depth_stencil,
-            multisample,
-            fragment,
-            asynchronous=False,
-        )
-
-    def _create_render_pipeline(
+        descriptor = self._create_render_pipeline_descriptor(
+            label, layout, vertex, primitive, depth_stencil, multisample, fragment
+        )
+
+        if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+            id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
+            return GPURenderPipeline(label, id, self)
+
+        @ffi.callback(
+            "void(WGPUCreatePipelineAsyncStatus, WGPURenderPipeline, char *, void *)"
+        )
+        def callback(status, result, message, _userdata):
+            # Passed to wgpuDeviceCreateRenderPipelineAsync; forwards the outcome to the awaitable.
+            if status != 0:
+                msg = "-" if message == ffi.NULL else ffi.string(message).decode()
+                awaitable.set_error(f"Create renderPipeline failed ({status}): {msg}")
+            else:
+                awaitable.set_result(result)
+
+        def finalizer(id):
+            return GPURenderPipeline(label, id, self)
+
+        awaitable = WgpuAwaitable(
+            "create_render_pipeline", callback, finalizer, self._device._poll
+        )
+
+        # H: void f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUDeviceCreateRenderPipelineAsyncCallback callback, void * userdata)
+        libf.wgpuDeviceCreateRenderPipelineAsync(
+            self._internal, descriptor, callback, ffi.NULL
+        )
+
+        return await awaitable.wait_async()
+
+    def _create_render_pipeline_descriptor(
         self,
         label: str,
         layout: Union[GPUPipelineLayout, enums.AutoLayoutMode],
@@ -1678,8 +1708,6 @@ def _create_render_pipeline(
         depth_stencil: structs.DepthStencilState,
         multisample: structs.MultisampleState,
         fragment: structs.FragmentState,
-        *,
-        asynchronous,
     ):
         depth_stencil = depth_stencil or {}
         multisample = multisample or {}
@@ -1779,43 +1807,7 @@ def _create_render_pipeline(
             fragment=c_fragment_state,
             # not used: nextInChain
         )
-
-        if not asynchronous:
-            # H: WGPURenderPipeline f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor)
-            id = libf.wgpuDeviceCreateRenderPipeline(self._internal, struct)
-            return GPURenderPipeline(label, id, self)
-
-        # Else, async approach
-
-        awaitable_result = {}
-
-        @ffi.callback(
-            "void(WGPUCreatePipelineAsyncStatus, WGPURenderPipeline, char *, void *)"
-        )
-        def callback(status, result, message, userdata):
-            if status != 0:
-                msg = "-" if message == ffi.NULL else ffi.string(message).decode()
-                awaitable_result["error"] = (
-                    f"Create renderPipeline failed ({status}): {msg}"
-                )
-            else:
-                awaitable_result["result"] = result
-
-        def poll_func():
-            # H: WGPUBool f(WGPUDevice device, WGPUBool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex)
-            libf.wgpuDevicePoll(self._device._internal, False, ffi.NULL)
-
-        def finalizer(id):
-            return GPURenderPipeline(label, id, self)
-
-        # H: void f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUDeviceCreateRenderPipelineAsyncCallback callback, void * userdata)
-        libf.wgpuDeviceCreateRenderPipelineAsync(
-            self._internal, struct, callback, ffi.NULL
-        )
-
-        return WgpuAwaitable(
-            "create_render_pipeline", awaitable_result, callback, poll_func, finalizer
-        )
+        return struct
 
     def _create_color_target_state(self, target):
         if not target.get("blend", None):
@@ -2075,13 +2067,13 @@ def map_sync(
     ):
         check_can_use_sync_variants()
         awaitable = self._map(mode, offset, size)
-        return awaitable.sync_wait()
+        return awaitable.wait_sync()
 
     async def map_async(
         self, mode: flags.MapMode, offset: int = 0, size: Optional[int] = None
     ):
         awaitable = self._map(mode, offset, size)  # for now
-        return await awaitable
+        return await awaitable.wait_async()
 
     def _map(self, mode, offset=0, size=None):
         sync_on_read = True
@@ -2111,24 +2103,20 @@ def _map(self, mode, offset=0, size=None):
 
         # Setup awaitable
 
-        awaitable_result = {}
-
         @ffi.callback("void(WGPUBufferMapAsyncStatus, void*)")
-        def callback(status, user_data):
-            awaitable_result["result"] = status
-
-        def poll_func():
-            # Doing nothing here will result in timeouts. I.e. unlike request_device, this method is truly async.
-            # H: WGPUBool f(WGPUDevice device, WGPUBool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex)
-            libf.wgpuDevicePoll(self._device._internal, False, ffi.NULL)
+        def callback(status, _user_data):
+            if status != 0:
+                awaitable.set_error(f"Could not map buffer ({status}).")
+            else:
+                awaitable.set_result(status)
 
-        def finalizer(status):
-            if status != 0:  # no-cover
-                raise RuntimeError(f"Could not map buffer ({status}).")
+        def finalizer(_status):
             self._map_state = enums.BufferMapState.mapped
             self._mapped_status = offset, offset + size, mode
             self._mapped_memoryviews = []
 
+        awaitable = WgpuAwaitable("buffer.map", callback, finalizer, self._device._poll)
+
         # Map it
         self._map_state = enums.BufferMapState.pending
         # H: void f(WGPUBuffer buffer, WGPUMapModeFlags mode, size_t offset, size_t size, WGPUBufferMapAsyncCallback callback, void * userdata)
@@ -2136,9 +2124,7 @@ def finalizer(status):
             self._internal, map_mode, offset, size, callback, ffi.NULL
         )
 
-        return WgpuAwaitable(
-            "buffer.map", awaitable_result, callback, poll_func, finalizer
-        )
+        return awaitable
 
     def unmap(self):
         if self._map_state != enums.BufferMapState.mapped:
@@ -3406,7 +3392,7 @@ def read_buffer(self, buffer, buffer_offset=0, size=None):
         self.submit([command_buffer])
 
         # Download from mappable buffer
-        tmp_buffer._map("READ_NOSYNC").sync_wait()
+        tmp_buffer._map("READ_NOSYNC").wait_sync()
         data = tmp_buffer.read_mapped()
 
         # Explicit drop.
@@ -3519,7 +3505,7 @@ def read_texture(self, source, data_layout, size):
         self.submit([command_buffer])
 
         # Download from mappable buffer
-        tmp_buffer._map("READ_NOSYNC").sync_wait()
+        tmp_buffer._map("READ_NOSYNC").wait_sync()
         data = tmp_buffer.read_mapped()
 
         # Explicit drop.
@@ -3542,31 +3528,35 @@ def read_texture(self, source, data_layout, size):
 
     def on_submitted_work_done_sync(self):
         check_can_use_sync_variants()
-        # In JS, this returns a Promise that can be awaited to (async) wait
-        # for the work that is currently in the pipeline. We need to figure out
-        # how to expose these async parts.
-        raise NotImplementedError()
+        awaitable = self._on_submitted_word_done()
+        awaitable.wait_sync()
 
-        status = 999
+    async def on_submitted_work_done_async(self):
+        awaitable = self._on_submitted_word_done()
+        await awaitable.wait_async()  # must be awaited; otherwise the wait never runs and errors are lost
 
+    def _on_submitted_word_done(self):
         @ffi.callback("void(WGPUQueueWorkDoneStatus, void*)")
-        def callback(status_, user_data_p):
-            nonlocal status
-            status = status_
-            # -> here we must resolve the promise, or something
+        def callback(status, _user_data_p):
+            if status == 0:
+                awaitable.set_result(True)
+            else:
+                result = {1: "Error", 2: "Unknown", 3: "DeviceLost"}.get(
+                    status, "Other"
+                )
+                awaitable.set_error(f"Queue work done status: {result}")
 
-        # H: void f(WGPUQueue queue, WGPUQueueOnSubmittedWorkDoneCallback callback, void * userdata)
-        libf.wgpuQueueOnSubmittedWorkDone(self._internal, callback, ffi.NULL)
+        def finalizer(_value):
+            return None
 
-        # Wait for the queue to process all tasks (including the mapping of the buffer).
-        # Also see WebGPU's onSubmittedWorkDone() and C's WGPUQueueWorkDoneCallback.
-        self._device._poll()
+        awaitable = WgpuAwaitable(
+            "on_submitted_work_done", callback, finalizer, self._device._poll
+        )
 
-        if status != 0:
-            raise RuntimeError(f"Queue work done status: {status}")
+        # H: void f(WGPUQueue queue, WGPUQueueOnSubmittedWorkDoneCallback callback, void * userdata)
+        libf.wgpuQueueOnSubmittedWorkDone(self._internal, callback, ffi.NULL)
 
-    async def on_submitted_work_done_async(self):
-        raise NotImplementedError()
+        return awaitable
 
 
 class GPURenderBundle(classes.GPURenderBundle, GPUObjectBase):
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index d12ff299..d38d1cf3 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -1,5 +1,7 @@
 """Utilities used in the wgpu-native backend."""
 
+import asyncio
+import contextlib
 import sys
 import time
 import ctypes
@@ -230,53 +232,62 @@ def to_camel_case(name):
 class WgpuAwaitable:
     """An object that can be waited for, either synchronously using sync_wait() or asynchronously using await.
 
-    The purpose of this class is to implememt the asynchronous methods in a
-    truely async manner, as well as to support a synchronous version of them.
+    The purpose of this class is to implement the asynchronous methods in a
+    truly async manner, as well as to support a synchronous version of them.
     """
 
-    def __init__(self, title, result, callback, poll_function, finalizer, timeout=5.0):
+    def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
         self.title = title  # for context in error messages
-        self.result = result  # a dict that the callbacks writes to
         self.callback = callback  # only used to prevent it from being gc'd
-        self.poll_function = poll_function  # call this to poll wgpu
         self.finalizer = finalizer  # function to finish the result
-        self.maxtime = time.perf_counter() + float(timeout)
+        self.poll_function = poll_function  # call this to poll wgpu
+        self.timeout = timeout
+        self.event = asyncio.Event()
+        self.result = None
+
+    def set_result(self, result):
+        self.result = (result, None)
+        self.event.set()
+
+    def set_error(self, error):
+        self.result = (None, error)
+        self.event.set()
+
+    def wait_sync(self):
+        if not self.poll_function:
+            # The result must come back without any polling.
+            assert self.event.is_set()
+        else:
+            maxtime = time.perf_counter() + float(self.timeout)
+            while True:
+                self.poll_function()
+                if self.event.is_set() or time.perf_counter() > maxtime:
+                    break
+                time.sleep(0.100)
+        return self.finish()
 
-    def is_done(self):
-        self.poll_function()
-        if self.result:
-            return True
-        if time.perf_counter() > self.maxtime:
-            self.result["timeout"] = True
-            return True
-        return False
+    async def wait_async(self):
+        if not self.poll_function:
+            # The result must come back with the original function call.
+            assert self.event.is_set()
+        else:
+            maxtime = time.perf_counter() + float(self.timeout)
+            while True:
+                self.poll_function()
+                with contextlib.suppress(asyncio.TimeoutError):
+                    await asyncio.wait_for(self.event.wait(), 0.100)
+                if self.event.is_set() or time.perf_counter() > maxtime:
+                    break
+        return self.finish()
 
     def finish(self):
-        if "result" in self.result:
-            return self.finalizer(self.result["result"])
-        elif "error" in self.result:
-            raise RuntimeError(self.result["error"])
-        elif "timeout" in self.result:
+        if not self.result:
             raise RuntimeError(f"Waiting for {self.title} timed out.")
+        result, error = self.result
+        if error:
+            raise RuntimeError(error)
         else:
-            raise RuntimeError(f"Failed to obtain result for {self.title}.")
-
-    def sync_wait(self):
-        # Basically a spin-lock, but we sleep(0) to give other threads more room to run.
-        while not self.is_done():
-            time.sleep(0)  # yield to the GIL
-        return self.finish()
-
-    def __await__(self):
-        # With this approach the CPU is also kept busy, because the task will
-        # continuously be scheduled to do a next step, but at least other tasks
-        # will have a chance to run as well (e.g. GUI's running in the same loop
-        # stay active). In theory we could async-sleep here, but there are two
-        # problems: how long should we sleep, and using asyncio.sleep() would
-        # not work on e.g. Trio.
-        while not self.is_done():
-            yield None  # yield to the loop
-        return self.finish()
+            return self.finalizer(result)
 
 
 class ErrorHandler:

From c733c5f5b79ffcfb7b1c1ba450ede8ab0b3780e9 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Fri, 1 Nov 2024 14:57:10 -0700
Subject: [PATCH 02/25] Fix lint problems and codegen problems.

---
 tests/test_async.py               | 3 ++-
 tests/test_wgpu_native_buffer.py  | 2 +-
 wgpu/backends/wgpu_native/_api.py | 6 +++++-
 wgpu/resources/codegen_report.md  | 6 +++---
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index cdd0c4f0..ac5096bb 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -31,6 +31,7 @@ def poll_function():
         result = await awaitable.wait_async()
     else:
         result = awaitable.wait_sync()
+    assert result == 10 * 10
 
 
 @pytest.mark.asyncio
@@ -71,7 +72,7 @@ async def test_asynchronous_make_pipeline(loop_scope="function"):
         fn vertex_main() -> @builtin(position) vec4f {
             return vec4f(0, 0, 0, 1.);
         }
-    
+
         @compute @workgroup_size(1)
         fn compute_main() { }
     """
diff --git a/tests/test_wgpu_native_buffer.py b/tests/test_wgpu_native_buffer.py
index aa0d0704..f9281352 100644
--- a/tests/test_wgpu_native_buffer.py
+++ b/tests/test_wgpu_native_buffer.py
@@ -564,7 +564,7 @@ def test_show_bug_wgpu_native_305_still_not_fixed():
 
     # Download from buffer to CPU
     buf.map("READ_NOSYNC")
-    data2 = buf.read_mapped()
+    data2 = bytes(buf.read_mapped())
     buf.unmap()
     assert data1 == data2
 
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index e58ad2c0..a3bef17a 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -1558,6 +1558,7 @@ def create_compute_pipeline(
         compute: structs.ProgrammableStage,
     ):
         descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
+        # H: WGPUComputePipeline f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor)
         id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
         return GPUComputePipeline(label, id, self)
 
@@ -1571,6 +1572,7 @@ async def create_compute_pipeline_async(
         descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
 
         if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+            # H: WGPUComputePipeline f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
             return GPUComputePipeline(label, id, self)
 
@@ -1593,7 +1595,7 @@ def finalizer(id):
             "create_compute_pipeline", callback, finalizer, self._device.poll
         )
 
-        # H: void f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUDeviceCreateRenderPipelineAsyncCallback callback, void * userdata)
+        # H: void f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUDeviceCreateComputePipelineAsyncCallback callback, void * userdata)
         libf.wgpuDeviceCreateComputePipelineAsync(
             self._internal, descriptor, callback, ffi.NULL
         )
@@ -1651,6 +1653,7 @@ def create_render_pipeline(
         descriptor = self._create_render_pipeline_descriptor(
             label, layout, vertex, primitive, depth_stencil, multisample, fragment
         )
+        # H: WGPURenderPipeline f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor)
         id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
         return GPURenderPipeline(label, id, self)
 
@@ -1671,6 +1674,7 @@ async def create_render_pipeline_async(
         )
 
         if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+            # H: WGPURenderPipeline f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
             return GPURenderPipeline(label, id, self)
 
diff --git a/wgpu/resources/codegen_report.md b/wgpu/resources/codegen_report.md
index 7b53c1cf..81bf7fea 100644
--- a/wgpu/resources/codegen_report.md
+++ b/wgpu/resources/codegen_report.md
@@ -20,7 +20,7 @@
 * Diffs for GPUQueue: add read_buffer, add read_texture, hide copy_external_image_to_texture
 * Validated 37 classes, 121 methods, 46 properties
 ### Patching API for backends/wgpu_native/_api.py
-* Validated 37 classes, 120 methods, 0 properties
+* Validated 37 classes, 121 methods, 0 properties
 ## Validating backends/wgpu_native/_api.py
 * Enum field FeatureName.texture-compression-bc-sliced-3d missing in wgpu.h
 * Enum field FeatureName.clip-distances missing in wgpu.h
@@ -35,6 +35,6 @@
 * Enum CanvasAlphaMode missing in wgpu.h
 * Enum CanvasToneMappingMode missing in wgpu.h
 * Wrote 236 enum mappings and 47 struct-field mappings to wgpu_native/_mappings.py
-* Validated 140 C function calls
-* Not using 65 C functions
+* Validated 141 C function calls
+* Not using 64 C functions
 * Validated 82 C structs

From f8a71e6b1af57590d48602c797a5b246ee65cf01 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Sat, 2 Nov 2024 16:41:29 -0700
Subject: [PATCH 03/25] Remove assertion.

---
 wgpu/backends/wgpu_native/_helpers.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index d38d1cf3..47c095cf 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -230,10 +230,13 @@ def to_camel_case(name):
 
 
 class WgpuAwaitable:
-    """An object that can be waited for, either synchronously using sync_wait() or asynchronously using await.
+    """
+    Create an object representing the result of a wgpu call that requires a callback
+    to complete.  The code can then call "awaitable.wait_sync()" to wait for the result
+    synchronously, or "await awaitable.wait_async()" to perform an asynchronous wait.
 
-    The purpose of this class is to implement the asynchronous methods in a
-    truly async manner, as well as to support a synchronous version of them.
+    The callback should call "awaitable.set_result()" when it has a result, or
+    "awaitable.set_error()" when it encounters an error.
     """
 
     def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
@@ -255,7 +258,8 @@ def set_error(self, error):
 
     def wait_sync(self):
         if not self.poll_function:
-            # The result must come back without any polling.
+            if not self.event.is_set():
+                raise RuntimeError("Expected callback to have already happened")
             assert self.event.is_set()
         else:
             maxtime = time.perf_counter() + float(self.timeout)
@@ -268,7 +272,8 @@ def wait_sync(self):
 
     async def wait_async(self):
         if not self.poll_function:
-            # The result must come back with the original function call.
+            if not self.event.is_set():
+                raise RuntimeError("Expected callback to have already happened")
             assert self.event.is_set()
         else:
             maxtime = time.perf_counter() + float(self.timeout)

From bebac48d8c58190e26fd89bf19945505b440ab1f Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Mon, 4 Nov 2024 12:59:09 -0800
Subject: [PATCH 04/25] _api.GPU should be the owner of its internal
 representation.

---
 tests/test_async.py                   | 16 ++++----
 wgpu/backends/wgpu_native/_api.py     | 24 ++++++------
 wgpu/backends/wgpu_native/_helpers.py | 55 +++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index ac5096bb..7c56ea0e 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -10,7 +10,7 @@
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("use_async", [False, True])
-async def test_awaitable_async(use_async, loop_scope="function"):
+async def test_awaitable_async(use_async):
     count = 0
 
     def finalizer(i):
@@ -28,21 +28,21 @@ def poll_function():
     awaitable = WgpuAwaitable("test", callback, finalizer, poll_function)
 
     if use_async:
-        result = await awaitable.wait_async()
+        result = await awaitable
     else:
-        result = awaitable.wait_sync()
+        result = awaitable.sync_wait()
     assert result == 10 * 10
 
 
 @pytest.mark.asyncio
-async def test_asynchronous_get_device(loop_scope="function"):
+async def test_asynchronous_get_device():
     adapter = await wgpu.gpu.request_adapter_async(power_preference="high-performance")
     device = await adapter.request_device_async()
     assert device is not None
 
 
 @pytest.mark.asyncio
-async def test_asynchronous_buffer_map(loop_scope="function"):
+async def test_asynchronous_buffer_map():
     device = wgpu.utils.get_default_device()
 
     data = b"1" * 10000
@@ -64,7 +64,7 @@ async def test_asynchronous_buffer_map(loop_scope="function"):
 
 
 @pytest.mark.asyncio
-async def test_asynchronous_make_pipeline(loop_scope="function"):
+async def test_asynchronous_make_pipeline():
     device = wgpu.utils.get_default_device()
 
     shader_source = """
@@ -79,7 +79,8 @@ async def test_asynchronous_make_pipeline(loop_scope="function"):
 
     shader = device.create_shader_module(code=shader_source)
 
-    render_pipeline, compute_pipeline = await asyncio.gather(
+    compute_pipeline, render_pipeline = await asyncio.gather(
+        device.create_compute_pipeline_async(layout="auto", compute={"module": shader}),
         device.create_render_pipeline_async(
             layout="auto",
             vertex={
@@ -87,7 +88,6 @@ async def test_asynchronous_make_pipeline(loop_scope="function"):
             },
             depth_stencil={"format": TextureFormat.rgba8unorm},
         ),
-        device.create_compute_pipeline_async(layout="auto", compute={"module": shader}),
     )
 
     assert compute_pipeline is not None
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 86f22f01..df79b290 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -331,7 +331,7 @@ def request_adapter_sync(
             canvas=canvas,
         )
 
-        return awaitable.wait_sync()
+        return awaitable.sync_wait()
 
     async def request_adapter_async(
         self,
@@ -357,7 +357,7 @@ async def request_adapter_async(
             force_fallback_adapter=force_fallback_adapter,
             canvas=canvas,
         )  # no-cover
-        return await awaitable.wait_async()
+        return await awaitable
 
     def _request_adapter(
         self, *, power_preference=None, force_fallback_adapter=False, canvas=None
@@ -856,7 +856,7 @@ def request_device_sync(
         awaitable = self._request_device(
             label, required_features, required_limits, default_queue, ""
         )
-        return awaitable.wait_sync()
+        return awaitable.sync_wait()
 
     async def request_device_async(
         self,
@@ -873,7 +873,7 @@ async def request_device_async(
         )
         # Note that although we claim this function is async, the callback always
         # happens inside the call to libf.wgpuAdapterRequestDevice
-        return await awaitable.wait_async()
+        return await awaitable
 
     def _request_device(
         self,
@@ -1602,7 +1602,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable.wait_async()
+        return await awaitable
 
     def _create_compute_pipeline_descriptor(
         self,
@@ -1703,7 +1703,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable.wait_async()
+        return await awaitable
 
     def _create_render_pipeline_descriptor(
         self,
@@ -2073,13 +2073,13 @@ def map_sync(
     ):
         check_can_use_sync_variants()
         awaitable = self._map(mode, offset, size)
-        return awaitable.wait_sync()
+        return awaitable.sync_wait()
 
     async def map_async(
         self, mode: flags.MapMode, offset: int = 0, size: Optional[int] = None
     ):
         awaitable = self._map(mode, offset, size)  # for now
-        return await awaitable.wait_async()
+        return await awaitable
 
     def _map(self, mode, offset=0, size=None):
         sync_on_read = True
@@ -3398,7 +3398,7 @@ def read_buffer(self, buffer, buffer_offset=0, size=None):
         self.submit([command_buffer])
 
         # Download from mappable buffer
-        tmp_buffer._map("READ_NOSYNC").wait_sync()
+        tmp_buffer._map("READ_NOSYNC").sync_wait()
         data = tmp_buffer.read_mapped()
 
         # Explicit drop.
@@ -3511,7 +3511,7 @@ def read_texture(self, source, data_layout, size):
         self.submit([command_buffer])
 
         # Download from mappable buffer
-        tmp_buffer._map("READ_NOSYNC").wait_sync()
+        tmp_buffer._map("READ_NOSYNC").sync_wait()
         data = tmp_buffer.read_mapped()
 
         # Explicit drop.
@@ -3535,11 +3535,11 @@ def read_texture(self, source, data_layout, size):
     def on_submitted_work_done_sync(self):
         check_can_use_sync_variants()
         awaitable = self._on_submitted_word_done()
-        awaitable.wait_sync()
+        awaitable.sync_wait()
 
     async def on_submitted_work_done_async(self):
         awaitable = self._on_submitted_word_done()
-        awaitable.wait_async()
+        await awaitable
 
     def _on_submitted_word_done(self):
         @ffi.callback("void(WGPUQueueWorkDoneStatus, void*)")
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 47c095cf..717ee0fb 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -295,6 +295,61 @@ def finish(self):
             return self.finalizer(result)
 
 
+class WgpuAwaitable:
+    """An object that can be waited for, either synchronously using sync_wait() or asynchronously using await.
+
+    The purpose of this class is to implement the asynchronous methods in a
+    truly async manner, as well as to support a synchronous version of them.
+    """
+
+    SLEEP_TIME = 0.005  # 5ms
+
+    def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
+        self.title = title  # for context in error messages
+        self.callback = callback  # only used to prevent it from being gc'd
+        self.finalizer = finalizer  # function to finish the result
+        self.poll_function = poll_function  # call this to poll wgpu
+        self.maxtime = time.perf_counter() + float(timeout)
+        self.result = None
+
+    def set_result(self, result):
+        self.result = (result, None)
+
+    def set_error(self, error):
+        self.result = (None, error)
+
+    def sync_wait(self):
+        if not self.poll_function:
+            if self.result is None:
+                raise RuntimeError("Expected callback to have already happened")
+        else:
+            while not self._is_done():
+                time.sleep(self.SLEEP_TIME)
+        return self.finish()
+
+    def __await__(self):
+        if not self.poll_function:
+            if self.result is None:
+                raise RuntimeError("Expected callback to have already happened")
+        else:
+            while not self._is_done():
+                yield None
+        return self.finish()
+
+    def _is_done(self):
+        self.poll_function()
+        return self.result is not None or time.perf_counter() > self.maxtime
+
+    def finish(self):
+        if not self.result:
+            raise RuntimeError(f"Waiting for {self.title} timed out.")
+        result, error = self.result
+        if error:
+            raise RuntimeError(error)
+        else:
+            return self.finalizer(result)
+
+
 class ErrorHandler:
     """Object that logs errors, with the option to collect incoming
     errors elsewhere.

From 3ae778097f45378b49bd2d9cce8671e8353f9440 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Mon, 4 Nov 2024 13:09:17 -0800
Subject: [PATCH 05/25] Thank you, ruff!

---
 wgpu/backends/wgpu_native/_helpers.py | 75 ++-------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 717ee0fb..cc54faaa 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -1,23 +1,20 @@
 """Utilities used in the wgpu-native backend."""
 
-import asyncio
-import contextlib
+import ctypes
 import sys
 import time
-import ctypes
 from queue import deque
 
 from ._ffi import ffi, lib, lib_path
 from ..._diagnostics import DiagnosticsBase
 from ...classes import (
     GPUError,
+    GPUInternalError,
     GPUOutOfMemoryError,
-    GPUValidationError,
     GPUPipelineError,
-    GPUInternalError,
+    GPUValidationError,
 )
 
-
 ERROR_TYPES = {
     "": GPUError,
     "OutOfMemory": GPUOutOfMemoryError,
@@ -229,72 +226,6 @@ def to_camel_case(name):
     return name2
 
 
-class WgpuAwaitable:
-    """
-    Create an object representing the result of a wgpu call that requires a callback
-    to complete.  The code can then call "awaitable.wait_sync()" to wait for the result
-    synchronously, or "await awaitable.wait_async()" to perform an asynchronous wait.
-
-    The callback should call "awaitable.set_result()" when it has a result, or
-    "awaitable.set_error()" when it encounters an error.
-    """
-
-    def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
-        self.title = title  # for context in error messages
-        self.callback = callback  # only used to prevent it from being gc'd
-        self.finalizer = finalizer  # function to finish the result
-        self.poll_function = poll_function  # call this to poll wgpu
-        self.timeout = timeout
-        self.event = asyncio.Event()
-        self.result = None
-
-    def set_result(self, result):
-        self.result = (result, None)
-        self.event.set()
-
-    def set_error(self, error):
-        self.result = (None, error)
-        self.event.set()
-
-    def wait_sync(self):
-        if not self.poll_function:
-            if not self.event.is_set():
-                raise RuntimeError("Expected callback to have already happened")
-            assert self.event.is_set()
-        else:
-            maxtime = time.perf_counter() + float(self.timeout)
-            while True:
-                self.poll_function()
-                if self.event.is_set() or time.perf_counter() > maxtime:
-                    break
-                time.sleep(0.100)
-        return self.finish()
-
-    async def wait_async(self):
-        if not self.poll_function:
-            if not self.event.is_set():
-                raise RuntimeError("Expected callback to have already happened")
-            assert self.event.is_set()
-        else:
-            maxtime = time.perf_counter() + float(self.timeout)
-            while True:
-                self.poll_function()
-                with contextlib.suppress(asyncio.TimeoutError):
-                    await asyncio.wait_for(self.event.wait(), 0.100)
-                if self.event.is_set() or time.perf_counter() > maxtime:
-                    break
-        return self.finish()
-
-    def finish(self):
-        if not self.result:
-            raise RuntimeError(f"Waiting for {self.title} timed out.")
-        result, error = self.result
-        if error:
-            raise RuntimeError(error)
-        else:
-            return self.finalizer(result)
-
-
 class WgpuAwaitable:
     """An object that can be waited for, either synchronously using sync_wait() or asynchronously using await.
 

From 5fb6502d263cf70a8015a3716e0d8ed4bd2024b8 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 10:25:04 -0800
Subject: [PATCH 06/25] For now, use anyio

---
 pyproject.toml                        |  2 +-
 tests/test_async.py                   | 45 +++++++++++++++++----------
 wgpu/backends/wgpu_native/_api.py     | 12 +++----
 wgpu/backends/wgpu_native/_helpers.py | 10 ++++--
 4 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1f436376..1c7eba12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ license = { file = "LICENSE" }
 authors = [{ name = "Almar Klein" }, { name = "Korijn van Golen" }]
 keywords = ["webgpu", "wgpu", "vulkan", "metal", "DX12", "opengl"]
 requires-python = ">= 3.9"
-dependencies = ["cffi>=1.15.0", "rubicon-objc>=0.4.1; sys_platform == 'darwin'"]
+dependencies = ["cffi>=1.15.0", "rubicon-objc>=0.4.1; sys_platform == 'darwin'", "anyio>=4.6"]
 
 [project.optional-dependencies]
 # For users
diff --git a/tests/test_async.py b/tests/test_async.py
index 7c56ea0e..a58e1fd9 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -1,4 +1,4 @@
-import asyncio
+import anyio
 
 import pytest
 
@@ -8,7 +8,7 @@
 from wgpu.backends.wgpu_native import WgpuAwaitable
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 @pytest.mark.parametrize("use_async", [False, True])
 async def test_awaitable_async(use_async):
     count = 0
@@ -28,20 +28,20 @@ def poll_function():
     awaitable = WgpuAwaitable("test", callback, finalizer, poll_function)
 
     if use_async:
-        result = await awaitable
+        result = await awaitable.async_wait()
     else:
         result = awaitable.sync_wait()
     assert result == 10 * 10
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 async def test_asynchronous_get_device():
     adapter = await wgpu.gpu.request_adapter_async(power_preference="high-performance")
     device = await adapter.request_device_async()
     assert device is not None
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 async def test_asynchronous_buffer_map():
     device = wgpu.utils.get_default_device()
 
@@ -63,7 +63,7 @@ async def test_asynchronous_buffer_map():
     assert bytes(data2) == data
 
 
-@pytest.mark.asyncio
+@pytest.mark.anyio
 async def test_asynchronous_make_pipeline():
     device = wgpu.utils.get_default_device()
 
@@ -79,17 +79,28 @@ async def test_asynchronous_make_pipeline():
 
     shader = device.create_shader_module(code=shader_source)
 
-    compute_pipeline, render_pipeline = await asyncio.gather(
-        device.create_compute_pipeline_async(layout="auto", compute={"module": shader}),
-        device.create_render_pipeline_async(
-            layout="auto",
-            vertex={
-                "module": shader,
-            },
-            depth_stencil={"format": TextureFormat.rgba8unorm},
-        ),
-    )
-
+    results = [None, None]
+    async with anyio.create_task_group() as tg:
+        # anyio has no equivalent of asyncio.gather; with asyncio this would just be
+        # compute_pipeline, render_pipeline = await asyncio.gather(.....)
+        async def create_compute_pipeline():
+            results[0] = await device.create_compute_pipeline_async(
+                layout="auto", compute={"module": shader}
+            )
+
+        async def create_render_pipeline():
+            results[1] = await device.create_render_pipeline_async(
+                layout="auto",
+                vertex={
+                    "module": shader,
+                },
+                depth_stencil={"format": TextureFormat.rgba8unorm},
+            )
+
+        tg.start_soon(create_compute_pipeline)
+        tg.start_soon(create_render_pipeline)
+
+    compute_pipeline, render_pipeline = results
     assert compute_pipeline is not None
     assert render_pipeline is not None
 
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index df79b290..6afffabc 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -357,7 +357,7 @@ async def request_adapter_async(
             force_fallback_adapter=force_fallback_adapter,
             canvas=canvas,
         )  # no-cover
-        return await awaitable
+        return await awaitable.async_wait()
 
     def _request_adapter(
         self, *, power_preference=None, force_fallback_adapter=False, canvas=None
@@ -873,7 +873,7 @@ async def request_device_async(
         )
         # Note that although we claim this function is async, the callback always
         # happens inside the call to libf.wgpuAdapterRequestDevice
-        return await awaitable
+        return await awaitable.async_wait()
 
     def _request_device(
         self,
@@ -1602,7 +1602,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable
+        return await awaitable.async_wait()
 
     def _create_compute_pipeline_descriptor(
         self,
@@ -1703,7 +1703,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable
+        return await awaitable.async_wait()
 
     def _create_render_pipeline_descriptor(
         self,
@@ -2079,7 +2079,7 @@ async def map_async(
         self, mode: flags.MapMode, offset: int = 0, size: Optional[int] = None
     ):
         awaitable = self._map(mode, offset, size)  # for now
-        return await awaitable
+        return await awaitable.async_wait()
 
     def _map(self, mode, offset=0, size=None):
         sync_on_read = True
@@ -3539,7 +3539,7 @@ def on_submitted_work_done_sync(self):
 
     async def on_submitted_work_done_async(self):
         awaitable = self._on_submitted_word_done()
-        await awaitable
+        await awaitable.async_wait()
 
     def _on_submitted_word_done(self):
         @ffi.callback("void(WGPUQueueWorkDoneStatus, void*)")
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index cc54faaa..dc6f8b2a 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -5,6 +5,8 @@
 import time
 from queue import deque
 
+import anyio
+
 from ._ffi import ffi, lib, lib_path
 from ..._diagnostics import DiagnosticsBase
 from ...classes import (
@@ -241,13 +243,16 @@ def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
         self.finalizer = finalizer  # function to finish the result
         self.poll_function = poll_function  # call this to poll wgpu
         self.maxtime = time.perf_counter() + float(timeout)
+        self.event = anyio.Event()
         self.result = None
 
     def set_result(self, result):
         self.result = (result, None)
+        self.event.set()
 
     def set_error(self, error):
         self.result = (None, error)
+        self.event.set()
 
     def sync_wait(self):
         if not self.poll_function:
@@ -258,13 +263,14 @@ def sync_wait(self):
                 time.sleep(self.SLEEP_TIME)
         return self.finish()
 
-    def __await__(self):
+    async def async_wait(self):
         if not self.poll_function:
             if self.result is None:
                 raise RuntimeError("Expected callback to have already happened")
         else:
             while not self._is_done():
-                yield None
+                with anyio.move_on_after(self.SLEEP_TIME):
+                    self.event.wait()
         return self.finish()
 
     def _is_done(self):

From 780ff9bed10b9c9a7ddbc2d761797f6631358036 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 10:43:59 -0800
Subject: [PATCH 07/25] Continue to find the minimum changes

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1c7eba12..55e4a1fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ imgui = ["imgui-bundle>=1.2.1"]
 build = ["build", "hatchling", "requests", "twine"]
 codegen = ["pytest", "numpy", "ruff"]
 lint = ["ruff", "pre-commit"]
-tests = ["numpy", "pytest", "pytest-asyncio", "psutil", "imageio"]
+tests = ["numpy", "pytest", "pytest-anyio", "psutil", "imageio", "anyio"]
 examples = []
 docs = ["sphinx>7.2", "sphinx_rtd_theme"]
 dev = ["wgpu[build,codegen,lint,tests,examples,docs]"]

From f831e6db505dc6adb8601bd3ace0268f59bdc643 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 11:38:25 -0800
Subject: [PATCH 08/25] Continue to find the minimum changes

---
 pyproject.toml                        | 2 +-
 wgpu/backends/wgpu_native/_helpers.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 55e4a1fa..5399a65b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ imgui = ["imgui-bundle>=1.2.1"]
 build = ["build", "hatchling", "requests", "twine"]
 codegen = ["pytest", "numpy", "ruff"]
 lint = ["ruff", "pre-commit"]
-tests = ["numpy", "pytest", "pytest-anyio", "psutil", "imageio", "anyio"]
+tests = ["numpy", "pytest", "pytest-anyio", "psutil", "imageio", "anyio", "trio"]
 examples = []
 docs = ["sphinx>7.2", "sphinx_rtd_theme"]
 dev = ["wgpu[build,codegen,lint,tests,examples,docs]"]
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index dc6f8b2a..c2df819c 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -243,16 +243,13 @@ def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
         self.finalizer = finalizer  # function to finish the result
         self.poll_function = poll_function  # call this to poll wgpu
         self.maxtime = time.perf_counter() + float(timeout)
-        self.event = anyio.Event()
         self.result = None
 
     def set_result(self, result):
         self.result = (result, None)
-        self.event.set()
 
     def set_error(self, error):
         self.result = (None, error)
-        self.event.set()
 
     def sync_wait(self):
         if not self.poll_function:
@@ -269,8 +266,8 @@ async def async_wait(self):
                 raise RuntimeError("Expected callback to have already happened")
         else:
             while not self._is_done():
-                with anyio.move_on_after(self.SLEEP_TIME):
-                    self.event.wait()
+                # A bug in anyio prevents us from waiting on an Event()
+                await anyio.sleep(self.SLEEP_TIME)
         return self.finish()
 
     def _is_done(self):

From 81cf2f7d2f0577372121188278e74bf11e85570c Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 16:29:59 -0800
Subject: [PATCH 09/25] Allow "await WgpuAwaitable(..)"

The object returned by WgpuAwaitable is now directly awaitable. Make Almar happy.
---
 tests/test_async.py                   |  3 +-
 wgpu/backends/wgpu_native/_api.py     | 12 +++----
 wgpu/backends/wgpu_native/_helpers.py | 48 ++++++++++++++++-----------
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index a58e1fd9..2bcd3ad0 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -1,3 +1,4 @@
+
 import anyio
 
 import pytest
@@ -28,7 +29,7 @@ def poll_function():
     awaitable = WgpuAwaitable("test", callback, finalizer, poll_function)
 
     if use_async:
-        result = await awaitable.async_wait()
+        result = await awaitable
     else:
         result = awaitable.sync_wait()
     assert result == 10 * 10
diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 6afffabc..df79b290 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -357,7 +357,7 @@ async def request_adapter_async(
             force_fallback_adapter=force_fallback_adapter,
             canvas=canvas,
         )  # no-cover
-        return await awaitable.async_wait()
+        return await awaitable
 
     def _request_adapter(
         self, *, power_preference=None, force_fallback_adapter=False, canvas=None
@@ -873,7 +873,7 @@ async def request_device_async(
         )
         # Note that although we claim this function is async, the callback always
         # happens inside the call to libf.wgpuAdapterRequestDevice
-        return await awaitable.async_wait()
+        return await awaitable
 
     def _request_device(
         self,
@@ -1602,7 +1602,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable.async_wait()
+        return await awaitable
 
     def _create_compute_pipeline_descriptor(
         self,
@@ -1703,7 +1703,7 @@ def finalizer(id):
             self._internal, descriptor, callback, ffi.NULL
         )
 
-        return await awaitable.async_wait()
+        return await awaitable
 
     def _create_render_pipeline_descriptor(
         self,
@@ -2079,7 +2079,7 @@ async def map_async(
         self, mode: flags.MapMode, offset: int = 0, size: Optional[int] = None
     ):
         awaitable = self._map(mode, offset, size)  # for now
-        return await awaitable.async_wait()
+        return await awaitable
 
     def _map(self, mode, offset=0, size=None):
         sync_on_read = True
@@ -3539,7 +3539,7 @@ def on_submitted_work_done_sync(self):
 
     async def on_submitted_work_done_async(self):
         awaitable = self._on_submitted_word_done()
-        await awaitable.async_wait()
+        await awaitable
 
     def _on_submitted_word_done(self):
         @ffi.callback("void(WGPUQueueWorkDoneStatus, void*)")
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index c2df819c..4b2669b8 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -251,30 +251,11 @@ def set_result(self, result):
     def set_error(self, error):
         self.result = (None, error)
 
-    def sync_wait(self):
-        if not self.poll_function:
-            if self.result is None:
-                raise RuntimeError("Expected callback to have already happened")
-        else:
-            while not self._is_done():
-                time.sleep(self.SLEEP_TIME)
-        return self.finish()
-
-    async def async_wait(self):
-        if not self.poll_function:
-            if self.result is None:
-                raise RuntimeError("Expected callback to have already happened")
-        else:
-            while not self._is_done():
-                # A bug in anyio prevents us from waiting on an Event()
-                await anyio.sleep(self.SLEEP_TIME)
-        return self.finish()
-
     def _is_done(self):
         self.poll_function()
         return self.result is not None or time.perf_counter() > self.maxtime
 
-    def finish(self):
+    def _finish(self):
         if not self.result:
             raise RuntimeError(f"Waiting for {self.title} timed out.")
         result, error = self.result
@@ -283,6 +264,33 @@ def finish(self):
         else:
             return self.finalizer(result)
 
+    def sync_wait(self):
+        if self.result is not None:
+            pass
+        elif not self.poll_function:
+            raise RuntimeError("Expected callback to have already happened")
+        else:
+            while not self._is_done():
+                time.sleep(self.SLEEP_TIME)
+        return self._finish()
+
+    def async_wait(self):
+        return self
+
+    def __await__(self):
+        # There is no documentation on what __await__() is supposed to return, but we
+        # can certainly copy from a function that *does* know what to return
+        async def wait_for_callback():
+            if self.result is not None:
+                return
+            if not self.poll_function:
+                raise RuntimeError("Expected callback to have already happened")
+            while not self._is_done():
+                await anyio.sleep(self.SLEEP_TIME)
+
+        yield from wait_for_callback().__await__()
+        return self._finish()
+
 
 class ErrorHandler:
     """Object that logs errors, with the option to collect incoming

From b08dc8bf86a26591fccdf6a9c80a002bcfdedbab Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 16:37:35 -0800
Subject: [PATCH 10/25] Fix ruff format

---
 tests/test_async.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index 2bcd3ad0..3b8b3d86 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -1,4 +1,3 @@
-
 import anyio
 
 import pytest

From 84db283fd930117054762a8f34090b36a3462d6e Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Tue, 5 Nov 2024 23:52:10 -0800
Subject: [PATCH 11/25] Attempt to delay importing anyio

---
 wgpu/backends/wgpu_native/_helpers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 4b2669b8..66acbb14 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -5,8 +5,6 @@
 import time
 from queue import deque
 
-import anyio
-
 from ._ffi import ffi, lib, lib_path
 from ..._diagnostics import DiagnosticsBase
 from ...classes import (
@@ -278,6 +276,7 @@ def async_wait(self):
         return self
 
     def __await__(self):
+        import anyio
         # There is no documentation on what __await__() is supposed to return, but we
         # can certainly copy from a function that *does* know what to return
         async def wait_for_callback():

From 16e082eb173abb1e05b7c90106e90429f71b61cf Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Wed, 6 Nov 2024 08:33:57 -0800
Subject: [PATCH 12/25] Attempt to delay importing anyio

---
 wgpu/backends/wgpu_native/_helpers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 66acbb14..75f3c7c9 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -277,6 +277,7 @@ def async_wait(self):
 
     def __await__(self):
         import anyio
+
         # There is no documentation on what __await__() is supposed to return, but we
         # can certainly copy from a function that *does* know what to return
         async def wait_for_callback():

From 3eb1a597c09ba9bb711a1f894b66c3f963846167 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Thu, 7 Nov 2024 12:20:45 -0800
Subject: [PATCH 13/25] Add another test.

---
 tests/test_async.py | 35 ++++++++++++++++++++++++-----------
 wgpu/_classes.py    |  2 +-
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index 3b8b3d86..75c4e4e5 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -1,15 +1,15 @@
 import anyio
 
-import pytest
+from pytest import mark
 
 import wgpu.utils
-from tests.testutils import run_tests
-from wgpu import MapMode, TextureFormat
+from tests.testutils import can_use_wgpu_lib, run_tests
+from wgpu import GPUDevice, MapMode, TextureFormat
 from wgpu.backends.wgpu_native import WgpuAwaitable
 
 
-@pytest.mark.anyio
-@pytest.mark.parametrize("use_async", [False, True])
+@mark.anyio
+@mark.parametrize("use_async", [False, True])
 async def test_awaitable_async(use_async):
     count = 0
 
@@ -34,15 +34,27 @@ def poll_function():
     assert result == 10 * 10
 
 
-@pytest.mark.anyio
-async def test_asynchronous_get_device():
+@mark.skipif(not can_use_wgpu_lib, reason="Needs wgpu lib")
+@mark.anyio
+async def test_enumerate_adapters_async():
+    adapters = await wgpu.gpu.enumerate_adapters_async()
+    assert len(adapters) > 0
+    for adapter in adapters:
+        device = await adapter.request_device_async()
+        assert isinstance(device, GPUDevice)
+
+
+@mark.skipif(not can_use_wgpu_lib, reason="Needs wgpu lib")
+@mark.anyio
+async def test_request_device_async():
     adapter = await wgpu.gpu.request_adapter_async(power_preference="high-performance")
     device = await adapter.request_device_async()
     assert device is not None
 
 
-@pytest.mark.anyio
-async def test_asynchronous_buffer_map():
+@mark.skipif(not can_use_wgpu_lib, reason="Needs wgpu lib")
+@mark.anyio
+async def test_buffer_map_async():
     device = wgpu.utils.get_default_device()
 
     data = b"1" * 10000
@@ -63,8 +75,9 @@ async def test_asynchronous_buffer_map():
     assert bytes(data2) == data
 
 
-@pytest.mark.anyio
-async def test_asynchronous_make_pipeline():
+@mark.skipif(not can_use_wgpu_lib, reason="Needs wgpu lib")
+@mark.anyio
+async def make_pipeline_async():
     device = wgpu.utils.get_default_device()
 
     shader_source = """
diff --git a/wgpu/_classes.py b/wgpu/_classes.py
index 6fb98b95..cecf44e2 100644
--- a/wgpu/_classes.py
+++ b/wgpu/_classes.py
@@ -153,7 +153,7 @@ def enumerate_adapters_sync(self):
     async def enumerate_adapters_async(self):
         """Get a list of adapter objects available on the current system.
 
-        An adapter can then be selected (e.g. using it's summary), and a device
+        An adapter can then be selected (e.g. using its summary), and a device
         then created from it.
 
         The order of the devices is such that Vulkan adapters go first, then

From 54488dcf9d87880902f3f554b491cd702b15312a Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Thu, 7 Nov 2024 12:40:02 -0800
Subject: [PATCH 14/25] Changes requested by reviewers

---
 wgpu/backends/wgpu_native/_api.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 21113b24..84f3cd9d 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -1069,7 +1069,9 @@ class GPUDevice(classes.GPUDevice, GPUObjectBase):
     # GPUObjectBaseMixin
     _release_function = libf.wgpuDeviceRelease
 
-    CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED = True
+    # This flag  should be deleted once create_compute_pipeline_async() and
+    # create_render_pipeline_async() are actually implemented in the wgpu-native library.
+    _CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED = True
 
     def _poll(self):
         # Internal function
@@ -1573,7 +1575,7 @@ async def create_compute_pipeline_async(
     ):
         descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
 
-        if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+        if self._CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
             # H: WGPUComputePipeline f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
             return GPUComputePipeline(label, id, self)
@@ -1675,7 +1677,7 @@ async def create_render_pipeline_async(
             label, layout, vertex, primitive, depth_stencil, multisample, fragment
         )
 
-        if self.CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+        if self._CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
             # H: WGPURenderPipeline f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
             return GPURenderPipeline(label, id, self)

From 45361ceab40e7c238d40c73865f292dca7ca8503 Mon Sep 17 00:00:00 2001
From: Frank Yellin <fy@fyellin.com>
Date: Thu, 14 Nov 2024 13:00:41 -0800
Subject: [PATCH 15/25] Change sleep to 0

---
 wgpu/backends/wgpu_native/_helpers.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 75f3c7c9..360b8958 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -233,8 +233,6 @@ class WgpuAwaitable:
     truely async manner, as well as to support a synchronous version of them.
     """
 
-    SLEEP_TIME = 0.005  # 5ms
-
     def __init__(self, title, callback, finalizer, poll_function=None, timeout=5.0):
         self.title = title  # for context in error messages
         self.callback = callback  # only used to prevent it from being gc'd
@@ -269,7 +267,7 @@ def sync_wait(self):
             raise RuntimeError("Expected callback to have already happened")
         else:
             while not self._is_done():
-                time.sleep(self.SLEEP_TIME)
+                time.sleep(0)
         return self._finish()
 
     def async_wait(self):
@@ -286,7 +284,7 @@ async def wait_for_callback():
             if not self.poll_function:
                 raise RuntimeError("Expected callback to have already happened")
             while not self._is_done():
-                await anyio.sleep(self.SLEEP_TIME)
+                await anyio.sleep(0)
 
         yield from wait_for_callback().__await__()
         return self._finish()

From cd574c8e26e959908ed0807f23146b125bccc7a2 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Wed, 20 Nov 2024 09:51:05 +0100
Subject: [PATCH 16/25] Small tweaks/cleanup

---
 wgpu/backends/wgpu_native/_api.py     | 22 +++++++++++++---------
 wgpu/backends/wgpu_native/_helpers.py | 21 +++++++++++----------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 1aaa0ebc..3cf9a371 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -1071,10 +1071,15 @@ class GPUDevice(classes.GPUDevice, GPUObjectBase):
 
     # This flag  should be deleted once create_compute_pipeline_async() and
     # create_render_pipeline_async() are actually implemented in the wgpu-native library.
-    _CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED = True
+    _CREATE_PIPELINE_ASYNC_IS_IMPLEMENTED = False
 
     def _poll(self):
         # Internal function
+        if self._internal:
+            # H: WGPUBool f(WGPUDevice device, WGPUBool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex)
+            libf.wgpuDevicePoll(self._internal, False, ffi.NULL)
+
+    def _poll_wait(self):
         if self._internal:
             # H: WGPUBool f(WGPUDevice device, WGPUBool wait, WGPUWrappedSubmissionIndex const * wrappedSubmissionIndex)
             libf.wgpuDevicePoll(self._internal, True, ffi.NULL)
@@ -1575,7 +1580,7 @@ async def create_compute_pipeline_async(
     ):
         descriptor = self._create_compute_pipeline_descriptor(label, layout, compute)
 
-        if self._CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+        if not self._CREATE_PIPELINE_ASYNC_IS_IMPLEMENTED:
             # H: WGPUComputePipeline f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateComputePipeline(self._internal, descriptor)
             return GPUComputePipeline(label, id, self)
@@ -1596,7 +1601,7 @@ def finalizer(id):
             return GPUComputePipeline(label, id, self)
 
         awaitable = WgpuAwaitable(
-            "create_compute_pipeline", callback, finalizer, self._device.poll
+            "create_compute_pipeline", callback, finalizer, self._device._poll
         )
 
         # H: void f(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUDeviceCreateComputePipelineAsyncCallback callback, void * userdata)
@@ -1677,7 +1682,7 @@ async def create_render_pipeline_async(
             label, layout, vertex, primitive, depth_stencil, multisample, fragment
         )
 
-        if self._CREATE_PIPELINE_ASYNC_NOT_IMPLEMENTED:
+        if not self._CREATE_PIPELINE_ASYNC_IS_IMPLEMENTED:
             # H: WGPURenderPipeline f(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor)
             id = libf.wgpuDeviceCreateRenderPipeline(self._internal, descriptor)
             return GPURenderPipeline(label, id, self)
@@ -1686,7 +1691,6 @@ async def create_render_pipeline_async(
             "void(WGPUCreatePipelineAsyncStatus, WGPURenderPipeline, char *, void *)"
         )
         def callback(status, result, message, _userdata):
-            print("Callback!")
             if status != 0:
                 msg = "-" if message == ffi.NULL else ffi.string(message).decode()
                 awaitable.set_error(f"Create renderPipeline failed ({status}): {msg}")
@@ -3552,14 +3556,14 @@ def read_texture(self, source, data_layout, size):
 
     def on_submitted_work_done_sync(self):
         check_can_use_sync_variants()
-        awaitable = self._on_submitted_word_done()
+        awaitable = self._on_submitted_work_done()
         awaitable.sync_wait()
 
     async def on_submitted_work_done_async(self):
-        awaitable = self._on_submitted_word_done()
+        awaitable = self._on_submitted_work_done()
         await awaitable
 
-    def _on_submitted_word_done(self):
+    def _on_submitted_work_done(self):
         @ffi.callback("void(WGPUQueueWorkDoneStatus, void*)")
         def callback(status, _user_data_p):
             if status == 0:
@@ -3574,7 +3578,7 @@ def finalizer(_value):
             return None
 
         awaitable = WgpuAwaitable(
-            "on_submitted_work_done", callback, finalizer, self._device._poll
+            "on_submitted_work_done", callback, finalizer, self._device._poll_wait
         )
 
         # H: void f(WGPUQueue queue, WGPUQueueOnSubmittedWorkDoneCallback callback, void * userdata)
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 360b8958..49c402cf 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -252,13 +252,17 @@ def _is_done(self):
         return self.result is not None or time.perf_counter() > self.maxtime
 
     def _finish(self):
-        if not self.result:
-            raise RuntimeError(f"Waiting for {self.title} timed out.")
-        result, error = self.result
-        if error:
-            raise RuntimeError(error)
-        else:
-            return self.finalizer(result)
+        try:
+            if not self.result:
+                raise RuntimeError(f"Waiting for {self.title} timed out.")
+            result, error = self.result
+            if error:
+                raise RuntimeError(error)
+            else:
+                return self.finalizer(result)
+        finally:
+            # Reset attrs to prevent potential memory leaks
+            self.callback = self.finalizer = self.poll_function = self.result = None
 
     def sync_wait(self):
         if self.result is not None:
@@ -270,9 +274,6 @@ def sync_wait(self):
                 time.sleep(0)
         return self._finish()
 
-    def async_wait(self):
-        return self
-
     def __await__(self):
         import anyio
 

From bf5862c0e789adfe9366dead337d263bc89c0cd4 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Wed, 20 Nov 2024 11:23:55 +0100
Subject: [PATCH 17/25] Implement new_array to prevent premature substruct gc

---
 wgpu/backends/wgpu_native/_api.py | 71 ++++++++++++++++---------------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 3cf9a371..7b5f30c9 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -113,6 +113,18 @@ def _new_struct_p(ctype, **kwargs):
     return struct_p
 
 
+def new_array(ctype, elements):
+    assert ctype.endswith("[]")
+    if isinstance(elements, int):
+        return ffi.new(ctype, elements)
+    elif elements:
+        array = ffi.new(ctype, elements)
+        _refs_per_struct[array] = elements  # prevent gc
+        return array
+    else:
+        return ffi.NULL
+
+
 def _tuple_from_tuple_or_dict(ob, fields, defaults=()):
     """Given a tuple/list/dict, return a tuple. Also checks tuple size.
 
@@ -200,7 +212,7 @@ def _get_override_constant_entries(field):
         c_constant_entries.append(c_constant_entry)
     # We need to return and hold onto c_constant_entries in order to prevent the C
     # strings from being GC'ed.
-    c_constants = ffi.new("WGPUConstantEntry[]", c_constant_entries)
+    c_constants = new_array("WGPUConstantEntry[]", c_constant_entries)
     return c_constants, c_constant_entries
 
 
@@ -440,7 +452,7 @@ def _enumerate_adapters(self):
         instance = get_wgpu_instance()
         # H: size_t f(WGPUInstance instance, WGPUInstanceEnumerateAdapterOptions const * options, WGPUAdapter * adapters)
         count = libf.wgpuInstanceEnumerateAdapters(instance, ffi.NULL, ffi.NULL)
-        adapters = ffi.new("WGPUAdapter[]", count)
+        adapters = new_array("WGPUAdapter[]", count)
         # H: size_t f(WGPUInstance instance, WGPUInstanceEnumerateAdapterOptions const * options, WGPUAdapter * adapters)
         libf.wgpuInstanceEnumerateAdapters(instance, ffi.NULL, adapters)
         return [self._create_adapter(adapter) for adapter in adapters]
@@ -618,10 +630,8 @@ def _configure_screen(
 
         # Convert to C values
 
-        c_view_formats = ffi.NULL
-        if view_formats:
-            view_formats_list = [enummap["TextureFormat." + x] for x in view_formats]
-            c_view_formats = ffi.new("WGPUTextureFormat []", view_formats_list)
+        view_formats_list = [enummap["TextureFormat." + x] for x in view_formats]
+        c_view_formats = new_array("WGPUTextureFormat[]", view_formats_list)
 
         # Lookup alpha mode, needs explicit conversion because enum names mismatch
         c_alpha_mode = getattr(lib, f"WGPUCompositeAlphaMode_{alpha_mode.capitalize()}")
@@ -1020,7 +1030,7 @@ def uncaptured_error_callback(c_type, c_message, userdata):
             label=to_c_label(label),
             nextInChain=ffi.cast("WGPUChainedStruct * ", extras),
             requiredFeatureCount=len(c_features),
-            requiredFeatures=ffi.new("WGPUFeatureName []", c_features),
+            requiredFeatures=new_array("WGPUFeatureName[]", c_features),
             requiredLimits=c_required_limits,
             defaultQueue=queue_struct,
             deviceLostCallback=device_lost_callback,
@@ -1313,15 +1323,11 @@ def create_bind_group_layout(
             )
             c_entries_list.append(c_entry)
 
-        c_entries_array = ffi.NULL
-        if c_entries_list:
-            c_entries_array = ffi.new("WGPUBindGroupLayoutEntry []", c_entries_list)
-
         # H: nextInChain: WGPUChainedStruct *, label: char *, entryCount: int, entries: WGPUBindGroupLayoutEntry *
         struct = new_struct_p(
             "WGPUBindGroupLayoutDescriptor *",
             label=to_c_label(label),
-            entries=c_entries_array,
+            entries=new_array("WGPUBindGroupLayoutEntry[]", c_entries_list),
             entryCount=len(c_entries_list),
             # not used: nextInChain
         )
@@ -1391,16 +1397,12 @@ def create_bind_group(
                 raise TypeError(f"Unexpected resource type {type(resource)}")
             c_entries_list.append(c_entry)
 
-        c_entries_array = ffi.NULL
-        if c_entries_list:
-            c_entries_array = ffi.new("WGPUBindGroupEntry []", c_entries_list)
-
         # H: nextInChain: WGPUChainedStruct *, label: char *, layout: WGPUBindGroupLayout, entryCount: int, entries: WGPUBindGroupEntry *
         struct = new_struct_p(
             "WGPUBindGroupDescriptor *",
             label=to_c_label(label),
             layout=layout._internal,
-            entries=c_entries_array,
+            entries=new_array("WGPUBindGroupEntry[]", c_entries_list),
             entryCount=len(c_entries_list),
             # not used: nextInChain
         )
@@ -1421,12 +1423,12 @@ def _create_pipeline_layout(
         push_constant_layouts,
     ):
         bind_group_layouts_ids = [x._internal for x in bind_group_layouts]
+        c_layout_array = new_array("WGPUBindGroupLayout[]", bind_group_layouts_ids)
 
-        c_layout_array = ffi.new("WGPUBindGroupLayout []", bind_group_layouts_ids)
         next_in_chain = ffi.NULL
         if push_constant_layouts:
             count = len(push_constant_layouts)
-            c_push_constant_ranges = ffi.new("WGPUPushConstantRange[]", count)
+            c_push_constant_ranges = new_array("WGPUPushConstantRange[]", count)
             for layout, c_push_constant_range in zip(
                 push_constant_layouts, c_push_constant_ranges
             ):
@@ -1494,18 +1496,17 @@ def create_shader_module(
                         # H: name: char *, value: char *
                         new_struct(
                             "WGPUShaderDefine",
-                            name=ffi.new("char []", b"gl_VertexID"),
-                            value=ffi.new("char []", b"gl_VertexIndex"),
+                            name=to_c_string("gl_VertexID"),
+                            value=to_c_string("gl_VertexIndex"),
                         )
                     )
-                c_defines = ffi.new("WGPUShaderDefine []", defines)
                 # H: chain: WGPUChainedStruct, stage: WGPUShaderStage, code: char *, defineCount: int, defines: WGPUShaderDefine *
                 source_struct = new_struct_p(
                     "WGPUShaderModuleGLSLDescriptor *",
                     code=to_c_string(code),
                     stage=c_stage,
                     defineCount=len(defines),
-                    defines=c_defines,
+                    defines=new_array("WGPUShaderDefine[]", defines),
                     # not used: chain
                 )
                 source_struct[0].chain.next = ffi.NULL
@@ -1724,7 +1725,6 @@ def _create_render_pipeline_descriptor(
         depth_stencil = depth_stencil or {}
         multisample = multisample or {}
         primitive = primitive or {}
-
         check_struct("VertexState", vertex)
         check_struct("DepthStencilState", depth_stencil)
         check_struct("MultisampleState", multisample)
@@ -1734,8 +1734,8 @@ def _create_render_pipeline_descriptor(
             self._create_vertex_buffer_layout(buffer_des)
             for buffer_des in vertex.get("buffers", ())
         ]
-        c_vertex_buffer_descriptors_array = ffi.new(
-            "WGPUVertexBufferLayout []", c_vertex_buffer_layout_list
+        c_vertex_buffer_descriptors_array = new_array(
+            "WGPUVertexBufferLayout[]", c_vertex_buffer_layout_list
         )
         c_vertex_constants, c_vertex_entries = _get_override_constant_entries(vertex)
         # H: nextInChain: WGPUChainedStruct *, module: WGPUShaderModule, entryPoint: char *, constantCount: int, constants: WGPUConstantEntry *, bufferCount: int, buffers: WGPUVertexBufferLayout *
@@ -1779,8 +1779,8 @@ def _create_render_pipeline_descriptor(
                 self._create_color_target_state(target)
                 for target in fragment["targets"]
             ]
-            c_color_targets_array = ffi.new(
-                "WGPUColorTargetState []", c_color_targets_list
+            c_color_targets_array = new_array(
+                "WGPUColorTargetState[]", c_color_targets_list
             )
             check_struct("FragmentState", fragment)
             c_fragment_constants, c_fragment_entries = _get_override_constant_entries(
@@ -1819,6 +1819,7 @@ def _create_render_pipeline_descriptor(
             fragment=c_fragment_state,
             # not used: nextInChain
         )
+
         return struct
 
     def _create_color_target_state(self, target):
@@ -1865,7 +1866,7 @@ def _create_vertex_buffer_layout(self, buffer_des):
                 shaderLocation=attribute["shader_location"],
             )
             c_attributes_list.append(c_attribute)
-        c_attributes_array = ffi.new("WGPUVertexAttribute []", c_attributes_list)
+        c_attributes_array = new_array("WGPUVertexAttribute[]", c_attributes_list)
         # H: arrayStride: int, stepMode: WGPUVertexStepMode, attributeCount: int, attributes: WGPUVertexAttribute *
         c_vertex_buffer_descriptor = new_struct(
             "WGPUVertexBufferLayout",
@@ -1941,7 +1942,7 @@ def create_render_bundle_encoder(
         c_color_formats, color_formats_count = ffi.NULL, 0
         if color_formats:
             color_formats_list = [enummap["TextureFormat." + x] for x in color_formats]
-            c_color_formats = ffi.new("WGPUTextureFormat []", color_formats_list)
+            c_color_formats = new_array("WGPUTextureFormat[]", color_formats_list)
             color_formats_count = len(color_formats_list)
 
         # H: nextInChain: WGPUChainedStruct *, label: char *, colorFormatCount: int, colorFormats: WGPUTextureFormat *, depthStencilFormat: WGPUTextureFormat, sampleCount: int, depthReadOnly: WGPUBool/int, stencilReadOnly: WGPUBool/int
@@ -1981,7 +1982,7 @@ def _create_statistics_query_set(self, label, count, statistics):
     def _create_query_set(self, label, type, count, statistics):
         next_in_chain = ffi.NULL
         if statistics:
-            c_statistics = ffi.new("WGPUPipelineStatisticName[]", statistics)
+            c_statistics = new_array("WGPUPipelineStatisticName[]", statistics)
             # H: chain: WGPUChainedStruct, pipelineStatistics: WGPUPipelineStatisticName *, pipelineStatisticCount: int
             query_set_descriptor_extras = new_struct_p(
                 "WGPUQuerySetDescriptorExtras *",
@@ -2708,8 +2709,8 @@ def begin_render_pass(
             self._create_render_pass_color_attachment(color_attachment)
             for color_attachment in color_attachments
         ]
-        c_color_attachments_array = ffi.new(
-            "WGPURenderPassColorAttachment []", c_color_attachments_list
+        c_color_attachments_array = new_array(
+            "WGPURenderPassColorAttachment[]", c_color_attachments_list
         )
 
         c_depth_stencil_attachment = ffi.NULL
@@ -3263,7 +3264,7 @@ def end(self):
 
     def execute_bundles(self, bundles: List[GPURenderBundle]):
         bundle_ids = [bundle._internal for bundle in bundles]
-        c_bundle_info = ffi.new("WGPURenderBundle []", bundle_ids)
+        c_bundle_info = new_array("WGPURenderBundle[]", bundle_ids)
         # H: void f(WGPURenderPassEncoder renderPassEncoder, size_t bundleCount, WGPURenderBundle const * bundles)
         libf.wgpuRenderPassEncoderExecuteBundles(
             self._internal, len(bundles), c_bundle_info
@@ -3349,7 +3350,7 @@ class GPUQueue(classes.GPUQueue, GPUObjectBase):
 
     def submit(self, command_buffers: List[GPUCommandBuffer]):
         command_buffer_ids = [cb._internal for cb in command_buffers]
-        c_command_buffers = ffi.new("WGPUCommandBuffer []", command_buffer_ids)
+        c_command_buffers = new_array("WGPUCommandBuffer[]", command_buffer_ids)
         # H: void f(WGPUQueue queue, size_t commandCount, WGPUCommandBuffer const * commands)
         libf.wgpuQueueSubmit(self._internal, len(command_buffer_ids), c_command_buffers)
 

From 2f150c1e5be9efee7d25d7a42a0868dd0f4349cb Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Wed, 20 Nov 2024 11:25:40 +0100
Subject: [PATCH 18/25] Add a little debug function that was very useful

---
 wgpu/backends/wgpu_native/_api.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 7b5f30c9..9e1aa059 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -68,6 +68,28 @@ def check_can_use_sync_variants():
 }
 
 
+def print_struct(s, indent=""):
+    """Tool to pretty-print struct contents during debugging."""
+    for key in dir(s):
+        val = getattr(s, key)
+        if repr(val).startswith("<cdata "):
+            if "NULL" in repr(val):
+                print(indent + key + ": null")
+            elif "'char *'" in repr(val):
+                print(indent + key + ":", ffi.string(val).decode())
+            elif " *'" in repr(val):
+                print(indent + key + ": pointer")
+            elif "struct WGPU" in repr(val):
+                print(indent + key + ":", repr(val))
+            else:
+                print(indent + key + ":")
+                print(indent + "{")
+                print_struct(val, indent + "  ")
+                print(indent + "}")
+        else:
+            print(indent + key + ":", val)
+
+
 def new_struct_p(ctype, **kwargs):
     """Create a pointer to an ffi struct. Provides a flatter syntax
     and converts our string enums to int enums needed in C. The passed

From b207bb7101bb9a97c6439c60abe12897685ca4a4 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Wed, 20 Nov 2024 11:26:53 +0100
Subject: [PATCH 19/25] fix import in test script

---
 tests/test_async.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_async.py b/tests/test_async.py
index 75c4e4e5..98bdd220 100644
--- a/tests/test_async.py
+++ b/tests/test_async.py
@@ -3,7 +3,7 @@
 from pytest import mark
 
 import wgpu.utils
-from tests.testutils import can_use_wgpu_lib, run_tests
+from testutils import can_use_wgpu_lib, run_tests
 from wgpu import GPUDevice, MapMode, TextureFormat
 from wgpu.backends.wgpu_native import WgpuAwaitable
 

From b1022ed43de37d987df2d438f56daeca0ac1a36c Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Wed, 20 Nov 2024 11:30:41 +0100
Subject: [PATCH 20/25] codegen

---
 wgpu/resources/codegen_report.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wgpu/resources/codegen_report.md b/wgpu/resources/codegen_report.md
index 09d31cf7..279fba39 100644
--- a/wgpu/resources/codegen_report.md
+++ b/wgpu/resources/codegen_report.md
@@ -20,7 +20,7 @@
 * Diffs for GPUQueue: add read_buffer, add read_texture, hide copy_external_image_to_texture
 * Validated 37 classes, 121 methods, 46 properties
 ### Patching API for backends/wgpu_native/_api.py
-* Validated 37 classes, 120 methods, 0 properties
+* Validated 37 classes, 121 methods, 0 properties
 ## Validating backends/wgpu_native/_api.py
 * Enum field FeatureName.texture-compression-bc-sliced-3d missing in wgpu.h
 * Enum field FeatureName.clip-distances missing in wgpu.h
@@ -35,6 +35,6 @@
 * Enum CanvasAlphaMode missing in wgpu.h
 * Enum CanvasToneMappingMode missing in wgpu.h
 * Wrote 236 enum mappings and 47 struct-field mappings to wgpu_native/_mappings.py
-* Validated 141 C function calls
+* Validated 142 C function calls
 * Not using 64 C functions
 * Validated 82 C structs

From f650208d124c6660aa055e51743c0d3e8797ee71 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Thu, 21 Nov 2024 12:33:39 +0100
Subject: [PATCH 21/25] Make array store sub-refs, not sub-elements

---
 wgpu/backends/wgpu_native/_api.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 9e1aa059..8598d31e 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -115,7 +115,7 @@ def new_struct(ctype, **kwargs):
     assert not ctype.endswith("*")
     struct_p = _new_struct_p(ctype + " *", **kwargs)
     struct = struct_p[0]
-    _refs_per_struct[struct] = kwargs
+    _refs_per_struct[struct] = tuple(kwargs.values())
     return struct
 
 
@@ -141,7 +141,10 @@ def new_array(ctype, elements):
         return ffi.new(ctype, elements)
     elif elements:
         array = ffi.new(ctype, elements)
-        _refs_per_struct[array] = elements  # prevent gc
+        # The array is a contiguous copy of the element structs. We don't need
+        # to keep a reference to the elements, but we do to sub-structs and
+        # sub-arrays of these elements.
+        _refs_per_struct[array] = [_refs_per_struct.get(el, None) for el in elements]
         return array
     else:
         return ffi.NULL
@@ -1841,7 +1844,6 @@ def _create_render_pipeline_descriptor(
             fragment=c_fragment_state,
             # not used: nextInChain
         )
-
         return struct
 
     def _create_color_target_state(self, target):

From c4d2388f73a24c9b9abebde3ad24ebeb1f33f417 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Fri, 6 Dec 2024 16:26:45 +0100
Subject: [PATCH 22/25] use sniffio instead of anyio

---
 pyproject.toml                        | 16 ++++-
 wgpu/_classes.py                      | 56 ++++++++--------
 wgpu/backends/wgpu_native/_helpers.py | 13 +++-
 wgpu/enums.py                         | 52 +++++++--------
 wgpu/flags.py                         |  4 +-
 wgpu/gui/__init__.py                  |  4 +-
 wgpu/gui/auto.py                      |  2 +-
 wgpu/structs.py                       | 96 +++++++++++++--------------
 8 files changed, 131 insertions(+), 112 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5399a65b..447d64c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,11 @@ license = { file = "LICENSE" }
 authors = [{ name = "Almar Klein" }, { name = "Korijn van Golen" }]
 keywords = ["webgpu", "wgpu", "vulkan", "metal", "DX12", "opengl"]
 requires-python = ">= 3.9"
-dependencies = ["cffi>=1.15.0", "rubicon-objc>=0.4.1; sys_platform == 'darwin'", "anyio>=4.6"]
+dependencies = [
+    "cffi>=1.15.0",
+    "rubicon-objc>=0.4.1; sys_platform == 'darwin'",
+    "sniffio",
+]
 
 [project.optional-dependencies]
 # For users
@@ -20,7 +24,15 @@ imgui = ["imgui-bundle>=1.2.1"]
 build = ["build", "hatchling", "requests", "twine"]
 codegen = ["pytest", "numpy", "ruff"]
 lint = ["ruff", "pre-commit"]
-tests = ["numpy", "pytest", "pytest-anyio", "psutil", "imageio", "anyio", "trio"]
+tests = [
+    "numpy",
+    "pytest",
+    "pytest-anyio",
+    "psutil",
+    "imageio",
+    "anyio",
+    "trio",
+]
 examples = []
 docs = ["sphinx>7.2", "sphinx_rtd_theme"]
 dev = ["wgpu[build,codegen,lint,tests,examples,docs]"]
diff --git a/wgpu/_classes.py b/wgpu/_classes.py
index fae79ef2..b7032135 100644
--- a/wgpu/_classes.py
+++ b/wgpu/_classes.py
@@ -21,43 +21,43 @@
 
 
 __all__ = [
-    "GPUObjectBase",
-    "GPUAdapterInfo",
     "GPU",
     "GPUAdapter",
-    "GPUDevice",
-    "GPUBuffer",
-    "GPUTexture",
-    "GPUTextureView",
-    "GPUSampler",
-    "GPUBindGroupLayout",
+    "GPUAdapterInfo",
     "GPUBindGroup",
-    "GPUPipelineLayout",
-    "GPUShaderModule",
-    "GPUCompilationMessage",
-    "GPUCompilationInfo",
-    "GPUPipelineError",
-    "GPUPipelineBase",
-    "GPUComputePipeline",
-    "GPURenderPipeline",
+    "GPUBindGroupLayout",
+    "GPUBindingCommandsMixin",
+    "GPUBuffer",
+    "GPUCanvasContext",
     "GPUCommandBuffer",
-    "GPUCommandsMixin",
     "GPUCommandEncoder",
-    "GPUBindingCommandsMixin",
-    "GPUDebugCommandsMixin",
+    "GPUCommandsMixin",
+    "GPUCompilationInfo",
+    "GPUCompilationMessage",
     "GPUComputePassEncoder",
-    "GPURenderPassEncoder",
-    "GPURenderCommandsMixin",
-    "GPURenderBundle",
-    "GPURenderBundleEncoder",
-    "GPUQueue",
-    "GPUQuerySet",
-    "GPUCanvasContext",
+    "GPUComputePipeline",
+    "GPUDebugCommandsMixin",
+    "GPUDevice",
     "GPUDeviceLostInfo",
     "GPUError",
-    "GPUValidationError",
-    "GPUOutOfMemoryError",
     "GPUInternalError",
+    "GPUObjectBase",
+    "GPUOutOfMemoryError",
+    "GPUPipelineBase",
+    "GPUPipelineError",
+    "GPUPipelineLayout",
+    "GPUQuerySet",
+    "GPUQueue",
+    "GPURenderBundle",
+    "GPURenderBundleEncoder",
+    "GPURenderCommandsMixin",
+    "GPURenderPassEncoder",
+    "GPURenderPipeline",
+    "GPUSampler",
+    "GPUShaderModule",
+    "GPUTexture",
+    "GPUTextureView",
+    "GPUValidationError",
 ]
 
 logger = logging.getLogger("wgpu")
diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index 49c402cf..bbd2f742 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -5,6 +5,8 @@
 import time
 from queue import deque
 
+import sniffio
+
 from ._ffi import ffi, lib, lib_path
 from ..._diagnostics import DiagnosticsBase
 from ...classes import (
@@ -226,6 +228,13 @@ def to_camel_case(name):
     return name2
 
 
+async def async_sleep(delay):
+    """Async sleep that uses sniffio to be compatible with asyncio, trio, rendercanvas.utils.asyncadapter, and possibly more."""
+    libname = sniffio.current_async_library()
+    sleep = sys.modules[libname].sleep
+    await sleep(delay)
+
+
 class WgpuAwaitable:
     """An object that can be waited for, either synchronously using sync_wait() or asynchronously using await.
 
@@ -275,8 +284,6 @@ def sync_wait(self):
         return self._finish()
 
     def __await__(self):
-        import anyio
-
         # There is no documentation on what __await__() is supposed to return, but we
         # can certainly copy from a function that *does* know what to return
         async def wait_for_callback():
@@ -285,7 +292,7 @@ async def wait_for_callback():
             if not self.poll_function:
                 raise RuntimeError("Expected callback to have already happened")
             while not self._is_done():
-                await anyio.sleep(0)
+                await async_sleep(0)
 
         yield from wait_for_callback().__await__()
         return self._finish()
diff --git a/wgpu/enums.py b/wgpu/enums.py
index 75ea9ad2..ecfe47f0 100644
--- a/wgpu/enums.py
+++ b/wgpu/enums.py
@@ -21,40 +21,40 @@ class Enum(_BaseEnum):
 # There are 34 enums
 
 __all__ = [
-    "PowerPreference",
-    "FeatureName",
-    "BufferMapState",
-    "TextureDimension",
-    "TextureViewDimension",
-    "TextureAspect",
-    "TextureFormat",
     "AddressMode",
-    "FilterMode",
-    "MipmapFilterMode",
-    "CompareFunction",
-    "BufferBindingType",
-    "SamplerBindingType",
-    "TextureSampleType",
-    "StorageTextureAccess",
-    "CompilationMessageType",
-    "PipelineErrorReason",
     "AutoLayoutMode",
-    "PrimitiveTopology",
-    "FrontFace",
-    "CullMode",
     "BlendFactor",
     "BlendOperation",
-    "StencilOperation",
-    "IndexFormat",
-    "VertexFormat",
-    "VertexStepMode",
-    "LoadOp",
-    "StoreOp",
-    "QueryType",
+    "BufferBindingType",
+    "BufferMapState",
     "CanvasAlphaMode",
     "CanvasToneMappingMode",
+    "CompareFunction",
+    "CompilationMessageType",
+    "CullMode",
     "DeviceLostReason",
     "ErrorFilter",
+    "FeatureName",
+    "FilterMode",
+    "FrontFace",
+    "IndexFormat",
+    "LoadOp",
+    "MipmapFilterMode",
+    "PipelineErrorReason",
+    "PowerPreference",
+    "PrimitiveTopology",
+    "QueryType",
+    "SamplerBindingType",
+    "StencilOperation",
+    "StorageTextureAccess",
+    "StoreOp",
+    "TextureAspect",
+    "TextureDimension",
+    "TextureFormat",
+    "TextureSampleType",
+    "TextureViewDimension",
+    "VertexFormat",
+    "VertexStepMode",
 ]
 
 
diff --git a/wgpu/flags.py b/wgpu/flags.py
index 5bd06793..7fa73a18 100644
--- a/wgpu/flags.py
+++ b/wgpu/flags.py
@@ -23,10 +23,10 @@ class Flags(_BaseEnum):
 
 __all__ = [
     "BufferUsage",
+    "ColorWrite",
     "MapMode",
-    "TextureUsage",
     "ShaderStage",
-    "ColorWrite",
+    "TextureUsage",
 ]
 
 
diff --git a/wgpu/gui/__init__.py b/wgpu/gui/__init__.py
index cb74d27e..c35d54cd 100644
--- a/wgpu/gui/__init__.py
+++ b/wgpu/gui/__init__.py
@@ -6,7 +6,7 @@
 from .base import WgpuCanvasInterface, WgpuCanvasBase, WgpuAutoGui
 
 __all__ = [
-    "WgpuCanvasInterface",
-    "WgpuCanvasBase",
     "WgpuAutoGui",
+    "WgpuCanvasBase",
+    "WgpuCanvasInterface",
 ]
diff --git a/wgpu/gui/auto.py b/wgpu/gui/auto.py
index 65536ec1..a3a11178 100644
--- a/wgpu/gui/auto.py
+++ b/wgpu/gui/auto.py
@@ -5,7 +5,7 @@
 for e.g. wx later. Or we might decide to stick with these three.
 """
 
-__all__ = ["WgpuCanvas", "run", "call_later"]
+__all__ = ["WgpuCanvas", "call_later", "run"]
 
 import os
 import sys
diff --git a/wgpu/structs.py b/wgpu/structs.py
index cd2d3707..857a34c7 100644
--- a/wgpu/structs.py
+++ b/wgpu/structs.py
@@ -30,65 +30,65 @@ def __repr__(self):
 # There are 60 structs
 
 __all__ = [
-    "RequestAdapterOptions",
-    "DeviceDescriptor",
-    "BufferDescriptor",
-    "TextureDescriptor",
-    "TextureViewDescriptor",
-    "ExternalTextureDescriptor",
-    "SamplerDescriptor",
-    "BindGroupLayoutDescriptor",
-    "BindGroupLayoutEntry",
-    "BufferBindingLayout",
-    "SamplerBindingLayout",
-    "TextureBindingLayout",
-    "StorageTextureBindingLayout",
-    "ExternalTextureBindingLayout",
     "BindGroupDescriptor",
     "BindGroupEntry",
+    "BindGroupLayoutDescriptor",
+    "BindGroupLayoutEntry",
+    "BlendComponent",
+    "BlendState",
     "BufferBinding",
-    "PipelineLayoutDescriptor",
-    "ShaderModuleDescriptor",
-    "ShaderModuleCompilationHint",
-    "PipelineErrorInit",
-    "ProgrammableStage",
-    "ComputePipelineDescriptor",
-    "RenderPipelineDescriptor",
-    "PrimitiveState",
-    "MultisampleState",
-    "FragmentState",
+    "BufferBindingLayout",
+    "BufferDescriptor",
+    "CanvasConfiguration",
+    "CanvasToneMapping",
+    "Color",
     "ColorTargetState",
-    "BlendState",
-    "BlendComponent",
-    "DepthStencilState",
-    "StencilFaceState",
-    "VertexState",
-    "VertexBufferLayout",
-    "VertexAttribute",
-    "ImageDataLayout",
-    "ImageCopyBuffer",
-    "ImageCopyTexture",
-    "ImageCopyExternalImage",
     "CommandBufferDescriptor",
     "CommandEncoderDescriptor",
-    "ComputePassTimestampWrites",
     "ComputePassDescriptor",
-    "RenderPassTimestampWrites",
-    "RenderPassDescriptor",
+    "ComputePassTimestampWrites",
+    "ComputePipelineDescriptor",
+    "DepthStencilState",
+    "DeviceDescriptor",
+    "Extent3D",
+    "ExternalTextureBindingLayout",
+    "ExternalTextureDescriptor",
+    "FragmentState",
+    "ImageCopyBuffer",
+    "ImageCopyExternalImage",
+    "ImageCopyTexture",
+    "ImageDataLayout",
+    "MultisampleState",
+    "Origin2D",
+    "Origin3D",
+    "PipelineErrorInit",
+    "PipelineLayoutDescriptor",
+    "PrimitiveState",
+    "ProgrammableStage",
+    "QuerySetDescriptor",
+    "QueueDescriptor",
+    "RenderBundleDescriptor",
+    "RenderBundleEncoderDescriptor",
     "RenderPassColorAttachment",
     "RenderPassDepthStencilAttachment",
+    "RenderPassDescriptor",
     "RenderPassLayout",
-    "RenderBundleDescriptor",
-    "RenderBundleEncoderDescriptor",
-    "QueueDescriptor",
-    "QuerySetDescriptor",
-    "CanvasToneMapping",
-    "CanvasConfiguration",
+    "RenderPassTimestampWrites",
+    "RenderPipelineDescriptor",
+    "RequestAdapterOptions",
+    "SamplerBindingLayout",
+    "SamplerDescriptor",
+    "ShaderModuleCompilationHint",
+    "ShaderModuleDescriptor",
+    "StencilFaceState",
+    "StorageTextureBindingLayout",
+    "TextureBindingLayout",
+    "TextureDescriptor",
+    "TextureViewDescriptor",
     "UncapturedErrorEventInit",
-    "Color",
-    "Origin2D",
-    "Origin3D",
-    "Extent3D",
+    "VertexAttribute",
+    "VertexBufferLayout",
+    "VertexState",
 ]
 
 

From 7e6e87e687180e76cb6f83e7273bd2ea4238d514 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Fri, 6 Dec 2024 16:47:17 +0100
Subject: [PATCH 23/25] always do at least one sleep in __await__

---
 wgpu/backends/wgpu_native/_helpers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/wgpu/backends/wgpu_native/_helpers.py b/wgpu/backends/wgpu_native/_helpers.py
index bbd2f742..2dfc8ba1 100644
--- a/wgpu/backends/wgpu_native/_helpers.py
+++ b/wgpu/backends/wgpu_native/_helpers.py
@@ -287,6 +287,10 @@ def __await__(self):
         # There is no documentation on what __await__() is supposed to return, but we
         # can certainly copy from a function that *does* know what to return
         async def wait_for_callback():
+            # In all the async cases that I've tried, the result is either already set, or
+            # resolves after the first call to the poll function. To make sure that our
+            # sleep-logic actually works, we always do at least one sleep call.
+            await async_sleep(0)
             if self.result is not None:
                 return
             if not self.poll_function:

From f5acb6015b350edb236cda766c13aa776dcc4e95 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Fri, 6 Dec 2024 17:06:50 +0100
Subject: [PATCH 24/25] fix new_array for non-CData elements

---
 wgpu/backends/wgpu_native/_api.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/wgpu/backends/wgpu_native/_api.py b/wgpu/backends/wgpu_native/_api.py
index 8598d31e..4b2177ba 100644
--- a/wgpu/backends/wgpu_native/_api.py
+++ b/wgpu/backends/wgpu_native/_api.py
@@ -138,13 +138,18 @@ def _new_struct_p(ctype, **kwargs):
 def new_array(ctype, elements):
     assert ctype.endswith("[]")
     if isinstance(elements, int):
+        # elements == count
         return ffi.new(ctype, elements)
     elif elements:
         array = ffi.new(ctype, elements)
         # The array is a contiguous copy of the element structs. We don't need
         # to keep a reference to the elements, but we do to sub-structs and
         # sub-arrays of these elements.
-        _refs_per_struct[array] = [_refs_per_struct.get(el, None) for el in elements]
+        _refs_per_struct[array] = [
+            _refs_per_struct.get(el, None)
+            for el in elements
+            if isinstance(el, ffi.CData)
+        ]
         return array
     else:
         return ffi.NULL

From 6975af003f1fd3698a506103285aa05e415a7bd8 Mon Sep 17 00:00:00 2001
From: Almar Klein <almar@almarklein.org>
Date: Fri, 6 Dec 2024 17:18:08 +0100
Subject: [PATCH 25/25] fix wheel test: install deps before removing ./wgpu

---
 .github/workflows/cd.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index d92ad2cd..2c7e79ab 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -75,9 +75,9 @@ jobs:
     - name: Install and test wheel
       shell: bash
       run: |
-          rm -rf ./wgpu
           # Install 'normally' to install deps, then force the install from dist-folder and nothing else
-          pip install --find-links dist wgpu
+          pip install .
+          rm -rf ./wgpu
           pip install --force-reinstall --no-deps --no-index --find-links dist wgpu
           pushd $HOME
           python -c 'import wgpu.backends.wgpu_native; print(wgpu.backends.wgpu_native._ffi.lib_path)'