From 88d001383a7c5b3f27c80b9b063f29db88246e29 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 28 Aug 2025 17:53:16 -0400 Subject: [PATCH] tu: Add support for a "lazy" sparse VMA Add an extremely limited form of sparse where zeroing memory is not supported and only one BO can be fully bound to the sparse VMA immediately when it's created. This can be implemented on drm/msm even without VM_BIND, by just reserving the iova range. However, kgsl doesn't let us control iova offsets, so we have to use "real" sparse support to implement it. In effect this lets us reserve an iova range and then "lazily" allocate the BO. This will be used for transient allocations in Vulkan when we have to fall back to sysmem. As part of this we add skeleton sparse VMA support to virtio, which is just enough for lazy VMAs. Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.cc | 2 +- src/freedreno/vulkan/tu_device.cc | 2 +- src/freedreno/vulkan/tu_device.h | 4 +- src/freedreno/vulkan/tu_knl.cc | 6 +- src/freedreno/vulkan/tu_knl.h | 10 ++- src/freedreno/vulkan/tu_knl_drm_msm.cc | 51 ++++++++------ src/freedreno/vulkan/tu_knl_drm_virtio.cc | 57 ++++++++++++++-- src/freedreno/vulkan/tu_knl_kgsl.cc | 81 ++++++++++++----------- 8 files changed, 145 insertions(+), 68 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 7e016e8d8ce..a405fc1de76 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -42,7 +42,7 @@ tu_cmd_buffer_setup_status_tracking(struct tu_device *device) VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - TU_BO_ALLOC_INTERNAL_RESOURCE, "cmd_buffer_status"); + TU_BO_ALLOC_INTERNAL_RESOURCE, NULL, "cmd_buffer_status"); if (result != VK_SUCCESS) return NULL; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 6600bb8940c..cb6ba0439e3 100644 --- a/src/freedreno/vulkan/tu_device.cc 
+++ b/src/freedreno/vulkan/tu_device.cc @@ -3306,7 +3306,7 @@ tu_AllocateMemory(VkDevice _device, device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex]; result = tu_bo_init_new_explicit_iova( device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize, - client_address, mem_property, alloc_flags, name); + client_address, mem_property, alloc_flags, NULL, name); } if (result == VK_SUCCESS) { diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 02d8b7e9227..03294791f7a 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -132,6 +132,8 @@ struct tu_physical_device bool has_sparse; /* Whether TU_SPARSE_VMA_MAP_ZERO can be used. */ bool has_sparse_prr; + /* Whether lazy allocations are supported. */ + bool has_lazy_bos; uint64_t va_start; uint64_t va_size; @@ -654,7 +656,7 @@ tu_bo_init_new_cached(struct tu_device *dev, struct vk_object_base *base, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | (dev->physical_device->has_cached_coherent_memory ? 
VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0), - flags, name); + flags, NULL, name); } diff --git a/src/freedreno/vulkan/tu_knl.cc b/src/freedreno/vulkan/tu_knl.cc index d29fe9bcab4..5b8a0a31626 100644 --- a/src/freedreno/vulkan/tu_knl.cc +++ b/src/freedreno/vulkan/tu_knl.cc @@ -35,7 +35,9 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev, uint64_t size, uint64_t client_iova, VkMemoryPropertyFlags mem_property, - enum tu_bo_alloc_flags flags, const char *name) + enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, + const char *name) { MESA_TRACE_FUNC(); struct tu_instance *instance = dev->physical_device->instance; @@ -44,7 +46,7 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev, VkResult result = dev->instance->knl->bo_init(dev, base, out_bo, size, client_iova, - mem_property, flags, name); + mem_property, flags, lazy_vma, name); if (result != VK_SUCCESS) return result; diff --git a/src/freedreno/vulkan/tu_knl.h b/src/freedreno/vulkan/tu_knl.h index edaca35f3e9..60373b7bbce 100644 --- a/src/freedreno/vulkan/tu_knl.h +++ b/src/freedreno/vulkan/tu_knl.h @@ -57,6 +57,7 @@ struct tu_bo { bool implicit_sync : 1; bool never_unmap : 1; bool cached_non_coherent : 1; + bool lazy : 1; bool dump; @@ -67,6 +68,7 @@ struct tu_bo { }; enum tu_sparse_vma_flags { + TU_SPARSE_VMA_NONE = 0, TU_SPARSE_VMA_REPLAYABLE = 1 << 0, /* Make unmapped pages in the memory region map to the PRR NULL page. 
This @@ -120,10 +122,13 @@ struct tu_knl { VkResult (*bo_init)(struct tu_device *dev, struct vk_object_base *base, struct tu_bo **out_bo, uint64_t size, uint64_t client_iova, VkMemoryPropertyFlags mem_property, - enum tu_bo_alloc_flags flags, const char *name); + enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, + const char *name); VkResult (*bo_init_dmabuf)(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, int prime_fd); int (*bo_export_dmabuf)(struct tu_device *dev, struct tu_bo *bo); + VkResult (*bo_alloc_lazy)(struct tu_device *dev, struct tu_bo *bo); VkResult (*bo_map)(struct tu_device *dev, struct tu_bo *bo, void *placed_addr); void (*bo_allow_dump)(struct tu_device *dev, struct tu_bo *bo); void (*bo_finish)(struct tu_device *dev, struct tu_bo *bo); @@ -177,6 +182,7 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev, uint64_t client_iova, VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, const char *name); static inline VkResult @@ -189,7 +195,7 @@ tu_bo_init_new(struct tu_device *dev, struct vk_object_base *base, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, - flags, name); + flags, NULL, name); } VkResult diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc index 59d30540719..8c69dc6d37a 100644 --- a/src/freedreno/vulkan/tu_knl_drm_msm.cc +++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc @@ -822,13 +822,17 @@ msm_bo_init(struct tu_device *dev, uint64_t client_iova, VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, const char *name) { MESA_TRACE_FUNC(); - VkResult result; + VkResult result = VK_SUCCESS; uint64_t iova; - result = tu_allocate_iova(dev, 0, size, client_iova, flags, &iova); + if (lazy_vma) + iova = lazy_vma->msm.iova; + else + result = tu_allocate_iova(dev, 0, size, client_iova, flags, &iova); if (result != 
VK_SUCCESS) return result; @@ -857,9 +861,11 @@ msm_bo_init(struct tu_device *dev, int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req, sizeof(req)); if (ret) { - msm_vma_lock(dev); - util_vma_heap_free(&dev->vma, iova, size); - msm_vma_unlock(dev); + if (!lazy_vma) { + msm_vma_lock(dev); + util_vma_heap_free(&dev->vma, iova, size); + msm_vma_unlock(dev); + } return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); } @@ -877,10 +883,13 @@ msm_bo_init(struct tu_device *dev, TU_RMV(internal_resource_create, dev, bo); TU_RMV(resource_name, dev, bo, name); } + bo->lazy = !!lazy_vma; } else { - msm_vma_lock(dev); - util_vma_heap_free(&dev->vma, iova, size); - msm_vma_unlock(dev); + if (!lazy_vma) { + msm_vma_lock(dev); + util_vma_heap_free(&dev->vma, iova, size); + msm_vma_unlock(dev); + } memset(bo, 0, sizeof(*bo)); } @@ -1084,12 +1093,14 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo) TU_RMV(bo_destroy, dev, bo); if (dev->physical_device->has_vm_bind) { - tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0, - bo->size); + if (!bo->lazy) { + tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0, + bo->size); - mtx_lock(&dev->vma_mutex); - util_vma_heap_free(&dev->vma, bo->iova, bo->size); - mtx_unlock(&dev->vma_mutex); + mtx_lock(&dev->vma_mutex); + util_vma_heap_free(&dev->vma, bo->iova, bo->size); + mtx_unlock(&dev->vma_mutex); + } msm_bo_gem_close(dev, bo); } else if (dev->physical_device->has_set_iova) { @@ -1119,15 +1130,14 @@ msm_sparse_vma_init(struct tu_device *dev, out_vma->msm.size = size; - mtx_lock(&dev->vma_mutex); - result = tu_allocate_userspace_iova(dev, size, client_iova, bo_flags, - &out_vma->msm.iova); - mtx_unlock(&dev->vma_mutex); + result = tu_allocate_iova(dev, 0, size, client_iova, bo_flags, + &out_vma->msm.iova); if (result != VK_SUCCESS) return result; if (flags & TU_SPARSE_VMA_MAP_ZERO) { + assert(dev->physical_device->has_vm_bind); result = tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP_NULL, 0, out_vma->msm.iova, 0, 0, 
size); } @@ -1141,8 +1151,10 @@ static void msm_sparse_vma_finish(struct tu_device *dev, struct tu_sparse_vma *vma) { - tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, vma->msm.iova, 0, 0, - vma->msm.size); + if (dev->physical_device->has_vm_bind) { + tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, vma->msm.iova, 0, 0, + vma->msm.size); + } mtx_lock(&dev->vma_mutex); util_vma_heap_free(&dev->vma, vma->msm.iova, vma->msm.size); @@ -1549,6 +1561,7 @@ tu_knl_drm_msm_load(struct tu_instance *instance, device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start, &device->va_size); + device->has_lazy_bos = device->has_set_iova; device->has_raytracing = tu_drm_get_raytracing(device); device->has_sparse_prr = tu_drm_get_prr(device); diff --git a/src/freedreno/vulkan/tu_knl_drm_virtio.cc b/src/freedreno/vulkan/tu_knl_drm_virtio.cc index dc5d80b6733..4cbb0c51638 100644 --- a/src/freedreno/vulkan/tu_knl_drm_virtio.cc +++ b/src/freedreno/vulkan/tu_knl_drm_virtio.cc @@ -678,6 +678,7 @@ virtio_bo_init(struct tu_device *dev, uint64_t client_iova, VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, const char *name) { MESA_TRACE_FUNC(); @@ -686,7 +687,7 @@ virtio_bo_init(struct tu_device *dev, .hdr = MSM_CCMD(GEM_NEW, sizeof(req)), .size = size, }; - VkResult result; + VkResult result = VK_SUCCESS; uint32_t res_id; struct tu_bo *bo; @@ -716,10 +717,14 @@ virtio_bo_init(struct tu_device *dev, assert(!(flags & TU_BO_ALLOC_DMABUF)); - mtx_lock(&dev->vma_mutex); - result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova, - flags, &req.iova); - mtx_unlock(&dev->vma_mutex); + if (lazy_vma) { + req.iova = lazy_vma->msm.iova; + } else { + mtx_lock(&dev->vma_mutex); + result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova, + flags, &req.iova); + mtx_unlock(&dev->vma_mutex); + } if (result != VK_SUCCESS) return result; @@ -910,6 +915,45 @@ virtio_bo_finish(struct tu_device *dev, struct tu_bo *bo) 
u_rwlock_rdunlock(&dev->dma_bo_lock); } +static VkResult +virtio_sparse_vma_init(struct tu_device *dev, + struct vk_object_base *base, + struct tu_sparse_vma *out_vma, + uint64_t *out_iova, + enum tu_sparse_vma_flags flags, + uint64_t size, uint64_t client_iova) +{ + VkResult result; + enum tu_bo_alloc_flags bo_flags = + (flags & TU_SPARSE_VMA_REPLAYABLE) ? TU_BO_ALLOC_REPLAYABLE : + (enum tu_bo_alloc_flags)0; + + out_vma->msm.size = size; + + mtx_lock(&dev->vma_mutex); + result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova, + bo_flags, &out_vma->msm.iova); + mtx_unlock(&dev->vma_mutex); + + if (result != VK_SUCCESS) + return result; + + assert(!(flags & TU_SPARSE_VMA_MAP_ZERO)); + + *out_iova = out_vma->msm.iova; + + return result; +} + +static void +virtio_sparse_vma_finish(struct tu_device *dev, + struct tu_sparse_vma *vma) +{ + mtx_lock(&dev->vma_mutex); + util_vma_heap_free(&dev->vma, vma->msm.iova, vma->msm.size); + mtx_unlock(&dev->vma_mutex); +} + static VkResult setup_fence_cmds(struct tu_device *dev) { @@ -1156,6 +1200,8 @@ static const struct tu_knl virtio_knl_funcs = { .submit_add_entries = msm_submit_add_entries, .queue_submit = virtio_queue_submit, .queue_wait_fence = virtio_queue_wait_fence, + .sparse_vma_init = virtio_sparse_vma_init, + .sparse_vma_finish = virtio_sparse_vma_finish, }; VkResult @@ -1282,6 +1328,7 @@ tu_knl_drm_virtio_load(struct tu_instance *instance, device->va_size = caps.u.msm.va_size; device->ubwc_config.highest_bank_bit = caps.u.msm.highest_bank_bit; device->has_set_iova = true; + device->has_lazy_bos = true; device->has_preemption = has_preemption; device->uche_trap_base = uche_trap_base; diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc index 28b5acff0cd..a16a5f9207d 100644 --- a/src/freedreno/vulkan/tu_knl_kgsl.cc +++ b/src/freedreno/vulkan/tu_knl_kgsl.cc @@ -197,6 +197,39 @@ kgsl_bo_user_map(struct tu_device *dev, struct tu_bo *bo, uint64_t client_iova) return 
VK_SUCCESS; } +static VkResult +kgsl_sparse_vma_map(struct tu_device *dev, + struct tu_sparse_vma *vma, + struct tu_bo *bo, uint64_t bo_offset) +{ + struct kgsl_gpumem_bind_range range = { + .child_offset = bo_offset, + .target_offset = 0, + .length = vma->kgsl.virtual_bo->size, + .child_id = bo->gem_handle, + .op = KGSL_GPUMEM_RANGE_OP_BIND, + }; + + struct kgsl_gpumem_bind_ranges req = { + .ranges = (uint64_t)(uintptr_t)&range, + .ranges_nents = 1, + .ranges_size = sizeof(range), + .id = vma->kgsl.virtual_bo->gem_handle, + .flags = 0, + }; + + int ret; + + ret = safe_ioctl(dev->physical_device->local_fd, + IOCTL_KGSL_GPUMEM_BIND_RANGES, &req); + if (ret) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "GPUMEM_BIND_RANGES failed (%s)", strerror(errno)); + } + + return VK_SUCCESS; +} + static VkResult kgsl_bo_init(struct tu_device *dev, struct vk_object_base *base, @@ -205,6 +238,7 @@ kgsl_bo_init(struct tu_device *dev, uint64_t client_iova, VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, + struct tu_sparse_vma *lazy_vma, const char *name) { if (flags & TU_BO_ALLOC_SHAREABLE) { @@ -269,12 +303,17 @@ kgsl_bo_init(struct tu_device *dev, .base = base, }; - if (flags & TU_BO_ALLOC_REPLAYABLE) { - VkResult result = kgsl_bo_user_map(dev, bo, client_iova); - if (result != VK_SUCCESS) - return result; + VkResult result = VK_SUCCESS; + + if (lazy_vma) { + result = kgsl_sparse_vma_map(dev, lazy_vma, bo, 0); + } else if (flags & TU_BO_ALLOC_REPLAYABLE) { + result = kgsl_bo_user_map(dev, bo, client_iova); } + if (result != VK_SUCCESS) + return result; + tu_dump_bo_init(dev, bo); *out_bo = bo; @@ -478,39 +517,6 @@ kgsl_sparse_vma_init(struct tu_device *dev, return VK_SUCCESS; } -static VkResult -kgsl_sparse_vma_map(struct tu_device *dev, - struct tu_sparse_vma *vma, - struct tu_bo *bo, uint64_t bo_offset) -{ - struct kgsl_gpumem_bind_range range = { - .child_offset = bo_offset, - .target_offset = 0, - .length = vma->kgsl.virtual_bo->size, - 
.child_id = bo->gem_handle, - .op = KGSL_GPUMEM_RANGE_OP_BIND, - }; - - struct kgsl_gpumem_bind_ranges req = { - .ranges = (uint64_t)(uintptr_t)&range, - .ranges_nents = 1, - .ranges_size = sizeof(range), - .id = vma->kgsl.virtual_bo->gem_handle, - .flags = 0, - }; - - int ret; - - ret = safe_ioctl(dev->physical_device->local_fd, - IOCTL_KGSL_GPUMEM_BIND_RANGES, &req); - if (ret) { - return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, - "GPUMEM_BIND_RANGES failed (%s)", strerror(errno)); - } - - return VK_SUCCESS; -} - static void kgsl_sparse_vma_finish(struct tu_device *dev, struct tu_sparse_vma *vma) @@ -1809,6 +1815,7 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd) device->has_sparse = kgsl_is_virtual_bo_supported(fd); device->has_sparse_prr = device->has_sparse; + device->has_lazy_bos = device->has_sparse; get_kgsl_prop(fd, KGSL_PROP_GPU_VA64_SIZE, &device->va_size, sizeof(device->va_size)); /* We don't actually use the VMA, but set a fake offset so that it doesn't