diff --git a/src/vulkan/common.h b/src/vulkan/common.h
index daeba201..ead439c4 100644
--- a/src/vulkan/common.h
+++ b/src/vulkan/common.h
@@ -46,6 +46,11 @@
 #define PL_VK_MAX_QUEUED_CMDS 1024
 #define PL_VK_MAX_PENDING_CMDS 1024
 
+// Compatibility alias for very old vulkan.h versions
+#ifndef VK_API_VERSION_1_2
+#define VK_API_VERSION_1_2 VK_MAKE_VERSION(1, 2, 0)
+#endif
+
 // Shared struct used to hold vulkan context information
 struct vk_ctx {
     void *ta; // allocations bound to the lifetime of this vk_ctx
@@ -205,6 +210,7 @@ struct vk_ctx {
     VK_FUN(QueueSubmit);
     VK_FUN(ResetEvent);
     VK_FUN(ResetFences);
+    VK_FUN(ResetQueryPoolEXT);
     VK_FUN(SetDebugUtilsObjectNameEXT);
     VK_FUN(SetHdrMetadataEXT);
     VK_FUN(UpdateDescriptorSets);
diff --git a/src/vulkan/context.c b/src/vulkan/context.c
index 89bef521..fac07eb8 100644
--- a/src/vulkan/context.c
+++ b/src/vulkan/context.c
@@ -24,6 +24,7 @@ const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
 
 struct vk_fun {
     const char *name;
+    const char *alias;
     size_t offset;
     bool device_level;
 };
@@ -45,6 +46,13 @@ struct vk_ext {
         .device_level = true,                    \
     }
 
+#define VK_DEV_FUN_ALIAS(N, ALIAS)               \
+    { .name = "vk" #N,                           \
+      .alias = #ALIAS,                           \
+      .offset = offsetof(struct vk_ctx, N),      \
+      .device_level = true,                      \
+    }
+
 // Table of optional vulkan instance extensions
 static const char *vk_instance_extensions[] = {
     VK_KHR_SURFACE_EXTENSION_NAME,
@@ -156,6 +164,13 @@ static const struct vk_ext vk_device_extensions[] = {
             VK_DEV_FUN(SetHdrMetadataEXT),
             {0},
         },
+    }, {
+        .name = VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
+        .core_ver = VK_API_VERSION_1_2,
+        .funs = (struct vk_fun[]) {
+            VK_DEV_FUN_ALIAS(ResetQueryPoolEXT, vkResetQueryPool),
+            {0},
+        },
     },
 };
 
@@ -173,13 +188,21 @@ const char * const pl_vulkan_recommended_extensions[] = {
 #endif
     VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
     VK_EXT_HDR_METADATA_EXTENSION_NAME,
+    VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
 };
 
 const int pl_vulkan_num_recommended_extensions =
     PL_ARRAY_SIZE(pl_vulkan_recommended_extensions);
 
+// pNext chain of features we want enabled
+static const VkPhysicalDeviceHostQueryResetFeaturesEXT host_query_reset = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT,
+    .hostQueryReset = true,
+};
+
 const VkPhysicalDeviceFeatures2KHR pl_vulkan_recommended_features = {
     .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR,
+    .pNext = (void *) &host_query_reset,
 
     .features = {
         .shaderImageGatherExtended = true,
@@ -1161,6 +1184,8 @@ static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params
             PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset);
             if (fun->device_level) {
                 *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name);
+                if (fun->alias && !*pfn)
+                    *pfn = vk->GetDeviceProcAddr(vk->dev, fun->alias);
             } else {
                 *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name);
             };
diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c
index 8af8798d..2e7140e9 100644
--- a/src/vulkan/gpu.c
+++ b/src/vulkan/gpu.c
@@ -40,9 +40,10 @@ struct pl_vk {
     struct vk_malloc *alloc;
     struct spirv_compiler *spirv;
 
-    // Some additional cached device limits
+    // Some additional cached device limits and feature checks
    uint32_t max_push_descriptors;
     size_t min_texel_alignment;
+    bool host_query_reset;
 
     // This is a pl_dispatch used (on ourselves!) for the purposes of
     // dispatching compute shaders for performing various emulation tasks
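
The alias machinery above exists because vkResetQueryPoolEXT was promoted to
core in Vulkan 1.2 under the name vkResetQueryPool, so a driver may export
either symbol. A minimal standalone sketch of the fallback lookup performed in
device_init(), where fake_get_proc_addr is a hypothetical stand-in for
vk->GetDeviceProcAddr (it is not part of this patch):

#include <stdio.h>
#include <string.h>

typedef void (*pfn_void)(void);

static void core_reset_query_pool(void) { puts("vkResetQueryPool called"); }

// Pretend driver: exports only the core 1.2 name, not the EXT-suffixed one
static pfn_void fake_get_proc_addr(const char *name)
{
    if (strcmp(name, "vkResetQueryPool") == 0)
        return core_reset_query_pool;
    return NULL;
}

int main(void)
{
    const char *name = "vkResetQueryPoolEXT";
    const char *alias = "vkResetQueryPool";

    // Same order as the patched loader: try the name first, then the alias
    pfn_void pfn = fake_get_proc_addr(name);
    if (!pfn && alias)
        pfn = fake_get_proc_addr(alias);

    if (pfn)
        pfn(); // prints "vkResetQueryPool called"
    return 0;
}
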
@@ -119,7 +120,7 @@ static inline bool supports_marks(struct vk_cmd *cmd) {
     } while (0)
 
 #define MAKE_LAZY_DESTRUCTOR(fun, argtype)                                  \
-    static void fun##_lazy(const struct pl_gpu *gpu, const argtype *arg) {  \
+    static void fun##_lazy(const struct pl_gpu *gpu, argtype *arg) {        \
         struct pl_vk *p = TA_PRIV(gpu);                                     \
         struct vk_ctx *vk = p->vk;                                          \
         if (p->cmd) {                                                       \
@@ -419,6 +420,15 @@ const struct pl_gpu *pl_gpu_create_vk(struct vk_ctx *vk)
         p->max_push_descriptors = pushd.maxPushDescriptors;
     }
 
+    if (vk->ResetQueryPoolEXT) {
+        const VkPhysicalDeviceHostQueryResetFeaturesEXT *host_query_reset;
+        host_query_reset = vk_find_struct(&vk->features,
+            VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT);
+
+        if (host_query_reset)
+            p->host_query_reset = host_query_reset->hostQueryReset;
+    }
+
     // We ostensibly support this, although it can still fail on buffer
     // creation (for certain combinations of buffers)
     gpu->caps |= PL_GPU_CAP_MAPPED_BUFFERS;
@@ -678,7 +688,7 @@ static void vk_tex_destroy(const struct pl_gpu *gpu, struct pl_tex *tex)
     talloc_free(tex);
 }
 
-MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct pl_tex)
+MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, const struct pl_tex)
 
 static const VkFilter filters[] = {
     [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
@@ -1487,7 +1497,7 @@ static void buf_flush(const struct pl_gpu *gpu, struct vk_cmd *cmd,
 }
 
 #define vk_buf_destroy vk_buf_deref
-MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct pl_buf)
+MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, const struct pl_buf)
 
 static void vk_buf_write(const struct pl_gpu *gpu, const struct pl_buf *buf,
                          size_t offset, const void *data, size_t size)
@@ -2084,7 +2094,7 @@ static void vk_pass_destroy(const struct pl_gpu *gpu, struct pl_pass *pass)
     talloc_free(pass);
 }
 
-MAKE_LAZY_DESTRUCTOR(vk_pass_destroy, struct pl_pass)
+MAKE_LAZY_DESTRUCTOR(vk_pass_destroy, const struct pl_pass)
 
 static const VkDescriptorType dsType[] = {
     [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
@@ -3079,27 +3089,29 @@ static bool vk_tex_export(const struct pl_gpu *gpu, const struct pl_tex *tex,
 #define VK_QUERY_POOL_SIZE 16
 
 struct pl_timer {
-    int refcount;
     bool recording; // true between vk_cmd_timer_begin() and vk_cmd_timer_end()
     VkQueryPool qpool; // even=start, odd=stop
     int index_write; // next index to write to
     int index_read; // next index to read from
+    uint64_t pending; // bitmask of queries that are still running
 };
 
+static inline uint64_t timer_bit(int index)
+{
+    return 1llu << (index / 2);
+}
+
 static void vk_timer_destroy(const struct pl_gpu *gpu, struct pl_timer *timer)
 {
     struct pl_vk *p = TA_PRIV(gpu);
     struct vk_ctx *vk = p->vk;
 
+    pl_assert(!timer->pending);
     vk->DestroyQueryPool(vk->dev, timer->qpool, VK_ALLOC);
 
     talloc_free(timer);
 }
 
-static void vk_timer_deref(const struct pl_gpu *gpu, struct pl_timer *timer)
-{
-    if (--timer->refcount == 0)
-        vk_timer_destroy(gpu, timer);
-}
+MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, struct pl_timer)
 
 static struct pl_timer *vk_timer_create(const struct pl_gpu *gpu)
 {
@@ -3107,9 +3119,7 @@ static struct pl_timer *vk_timer_create(const struct pl_gpu *gpu)
     struct vk_ctx *vk = p->vk;
 
     struct pl_timer *timer = talloc_ptrtype(NULL, timer);
-    *timer = (struct pl_timer) {
-        .refcount = 1,
-    };
+    *timer = (struct pl_timer) {0};
 
     struct VkQueryPoolCreateInfo qinfo = {
         .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
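
The pending bitmask replaces the old refcount scheme: each timer result
occupies two consecutive query slots (even=start, odd=stop), and timer_bit()
maps a pair's base index to one bit, so a uint64_t easily covers
VK_QUERY_POOL_SIZE / 2 = 8 pairs. A self-contained sketch of the bookkeeping
as this patch wires it up (the callback timing is simulated here; in the real
code vk_cmd_callback fires it once the GPU finishes the command):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VK_QUERY_POOL_SIZE 16

static inline uint64_t timer_bit(int index)
{
    return 1llu << (index / 2);
}

int main(void)
{
    uint64_t pending = 0;
    int index_write = 0;

    // vk_cmd_timer_end(): mark the pair as in flight, then advance
    pending |= timer_bit(index_write);
    int recorded = index_write;
    index_write = (index_write + 2) % VK_QUERY_POOL_SIZE;

    // vk_cmd_timer_begin() refuses to reuse a slot that is still running
    assert(!(pending & timer_bit(index_write)));

    // vk_timer_cb(): the completion callback clears the pair's bit
    pending &= ~timer_bit(recorded);
    printf("pending after callback: %#llx\n", (unsigned long long) pending);
    return 0;
}
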
@@ -3167,19 +3177,36 @@ static void vk_cmd_timer_begin(const struct pl_gpu *gpu, struct vk_cmd *cmd,
         return;
     }
 
+    vk_poll_commands(vk, 0);
+    if (timer->pending & timer_bit(timer->index_write))
+        return; // next query is still running, skip this timer
+
     VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
-    if (!(cmd->pool->props.queueFlags & reset_flags)) {
-        PL_TRACE(gpu, "QF %d does not support query pool resets", cmd->pool->qf);
+    if (cmd->pool->props.queueFlags & reset_flags) {
+        // Use direct command buffer resets
+        vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
+    } else if (p->host_query_reset) {
+        // Use host query resets
+        vk->ResetQueryPoolEXT(vk->dev, timer->qpool, timer->index_write, 2);
+    } else {
+        PL_TRACE(gpu, "QF %d supports no mechanism for resetting queries",
+                 cmd->pool->qf);
         return;
     }
 
-    vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
     vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                           timer->qpool, timer->index_write);
 
     timer->recording = true;
 }
 
+static void vk_timer_cb(void *ptimer, void *pindex)
+{
+    struct pl_timer *timer = ptimer;
+    int index = (uintptr_t) pindex;
+    timer->pending &= ~timer_bit(index);
+}
+
 static void vk_cmd_timer_end(const struct pl_gpu *gpu, struct vk_cmd *cmd,
                              struct pl_timer *timer)
 {
@@ -3192,18 +3219,18 @@ static void vk_cmd_timer_end(const struct pl_gpu *gpu, struct vk_cmd *cmd,
     vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                           timer->qpool, timer->index_write + 1);
 
+    timer->recording = false;
+    timer->pending |= timer_bit(timer->index_write);
+    vk_cmd_callback(cmd, (vk_cb) vk_timer_cb, timer,
+                    (void *) (uintptr_t) timer->index_write);
+
     timer->index_write = (timer->index_write + 2) % VK_QUERY_POOL_SIZE;
     if (timer->index_write == timer->index_read) {
         // forcibly drop the least recent result to make space
         timer->index_read = (timer->index_read + 2) % VK_QUERY_POOL_SIZE;
     }
-
-    timer->recording = false;
-    timer->refcount++;
-    vk_cmd_callback(cmd, (vk_cb) vk_timer_deref, gpu, timer);
 }
 
-
 static void vk_gpu_flush(const struct pl_gpu *gpu)
 {
     struct pl_vk *p = TA_PRIV(gpu);
@@ -3252,7 +3279,7 @@ static const struct pl_gpu_fns pl_fns_vk = {
     .sync_destroy = vk_sync_deref,
     .tex_export = vk_tex_export,
     .timer_create = vk_timer_create,
-    .timer_destroy = vk_timer_deref,
+    .timer_destroy = vk_timer_destroy_lazy,
     .timer_query = vk_timer_query,
     .gpu_flush = vk_gpu_flush,
     .gpu_finish = vk_gpu_finish,
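
To summarize the new reset logic: graphics/compute queues reset the query pair
directly in the command buffer, other queue families (e.g. pure transfer
queues) fall back to VK_EXT_host_query_reset when the feature was negotiated,
and with neither available the timer is skipped rather than left with stale
queries. A reduced standalone sketch of that decision ladder, with stub
functions standing in for vkCmdResetQueryPool and vkResetQueryPoolEXT:

#include <stdbool.h>
#include <stdio.h>

#define QUEUE_GRAPHICS_BIT 0x1
#define QUEUE_COMPUTE_BIT  0x2

static void cmd_reset_query_pool(void)  { puts("reset on the command buffer"); }
static void host_reset_query_pool(void) { puts("reset from the host"); }

// Returns false when the timer must be skipped, matching the early return
// in the patched vk_cmd_timer_begin()
static bool reset_queries(unsigned queue_flags, bool host_query_reset)
{
    if (queue_flags & (QUEUE_GRAPHICS_BIT | QUEUE_COMPUTE_BIT)) {
        cmd_reset_query_pool();  // preferred: reset inside the cmdbuf
    } else if (host_query_reset) {
        host_reset_query_pool(); // fallback: VK_EXT_host_query_reset
    } else {
        puts("no reset mechanism, skipping timer");
        return false;
    }
    return true;
}

int main(void)
{
    reset_queries(QUEUE_GRAPHICS_BIT, false); // e.g. a graphics queue
    reset_queries(0, true);                   // e.g. transfer-only QF, feature on
    reset_queries(0, false);                  // neither: timer is skipped
    return 0;
}
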