tu: Add support for a "lazy" sparse VMA

Add an extremely limited form of sparse where zeroing memory is not
supported and only one BO can be fully bound to the sparse VMA
immediately when it's created. This can be implemented on drm/msm even
without VM_BIND, by just reserving the iova range. However kgsl doesn't
let us control iova offsets, so we have to use "real" sparse support to
implement it. In effect this lets us reserve an iova range and then
"lazily" allocate the BO. This will be used for transient allocations in
Vulkan when we have to fall back to sysmem.

As part of this we add skeleton sparse VMA support to virtio, which is
just enough for lazy VMAs.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37151>
This commit is contained in:
Connor Abbott 2025-08-28 17:53:16 -04:00 committed by Marge Bot
parent 93a80f4bb9
commit 88d001383a
8 changed files with 145 additions and 68 deletions

View file

@ -42,7 +42,7 @@ tu_cmd_buffer_setup_status_tracking(struct tu_device *device)
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
TU_BO_ALLOC_INTERNAL_RESOURCE, "cmd_buffer_status");
TU_BO_ALLOC_INTERNAL_RESOURCE, NULL, "cmd_buffer_status");
if (result != VK_SUCCESS)
return NULL;

View file

@ -3306,7 +3306,7 @@ tu_AllocateMemory(VkDevice _device,
device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex];
result = tu_bo_init_new_explicit_iova(
device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize,
client_address, mem_property, alloc_flags, name);
client_address, mem_property, alloc_flags, NULL, name);
}
if (result == VK_SUCCESS) {

View file

@ -132,6 +132,8 @@ struct tu_physical_device
bool has_sparse;
/* Whether TU_SPARSE_VMA_MAP_ZERO can be used. */
bool has_sparse_prr;
/* Whether lazy allocations are supported. */
bool has_lazy_bos;
uint64_t va_start;
uint64_t va_size;
@ -654,7 +656,7 @@ tu_bo_init_new_cached(struct tu_device *dev, struct vk_object_base *base,
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
(dev->physical_device->has_cached_coherent_memory ?
VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0),
flags, name);
flags, NULL, name);
}

View file

@ -35,7 +35,9 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
uint64_t size,
uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags, const char *name)
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name)
{
MESA_TRACE_FUNC();
struct tu_instance *instance = dev->physical_device->instance;
@ -44,7 +46,7 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
VkResult result =
dev->instance->knl->bo_init(dev, base, out_bo, size, client_iova,
mem_property, flags, name);
mem_property, flags, lazy_vma, name);
if (result != VK_SUCCESS)
return result;

View file

@ -57,6 +57,7 @@ struct tu_bo {
bool implicit_sync : 1;
bool never_unmap : 1;
bool cached_non_coherent : 1;
bool lazy : 1;
bool dump;
@ -67,6 +68,7 @@ struct tu_bo {
};
enum tu_sparse_vma_flags {
TU_SPARSE_VMA_NONE = 0,
TU_SPARSE_VMA_REPLAYABLE = 1 << 0,
/* Make unmapped pages in the memory region map to the PRR NULL page. This
@ -120,10 +122,13 @@ struct tu_knl {
VkResult (*bo_init)(struct tu_device *dev, struct vk_object_base *base,
struct tu_bo **out_bo, uint64_t size, uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags, const char *name);
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name);
VkResult (*bo_init_dmabuf)(struct tu_device *dev, struct tu_bo **out_bo,
uint64_t size, int prime_fd);
int (*bo_export_dmabuf)(struct tu_device *dev, struct tu_bo *bo);
VkResult (*bo_alloc_lazy)(struct tu_device *dev, struct tu_bo *bo);
VkResult (*bo_map)(struct tu_device *dev, struct tu_bo *bo, void *placed_addr);
void (*bo_allow_dump)(struct tu_device *dev, struct tu_bo *bo);
void (*bo_finish)(struct tu_device *dev, struct tu_bo *bo);
@ -177,6 +182,7 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name);
static inline VkResult
@ -189,7 +195,7 @@ tu_bo_init_new(struct tu_device *dev, struct vk_object_base *base,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
flags, name);
flags, NULL, name);
}
VkResult

View file

@ -822,13 +822,17 @@ msm_bo_init(struct tu_device *dev,
uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name)
{
MESA_TRACE_FUNC();
VkResult result;
VkResult result = VK_SUCCESS;
uint64_t iova;
result = tu_allocate_iova(dev, 0, size, client_iova, flags, &iova);
if (lazy_vma)
iova = lazy_vma->msm.iova;
else
result = tu_allocate_iova(dev, 0, size, client_iova, flags, &iova);
if (result != VK_SUCCESS)
return result;
@ -857,9 +861,11 @@ msm_bo_init(struct tu_device *dev,
int ret = drmCommandWriteRead(dev->fd,
DRM_MSM_GEM_NEW, &req, sizeof(req));
if (ret) {
msm_vma_lock(dev);
util_vma_heap_free(&dev->vma, iova, size);
msm_vma_unlock(dev);
if (!lazy_vma) {
msm_vma_lock(dev);
util_vma_heap_free(&dev->vma, iova, size);
msm_vma_unlock(dev);
}
return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
@ -877,10 +883,13 @@ msm_bo_init(struct tu_device *dev,
TU_RMV(internal_resource_create, dev, bo);
TU_RMV(resource_name, dev, bo, name);
}
bo->lazy = !!lazy_vma;
} else {
msm_vma_lock(dev);
util_vma_heap_free(&dev->vma, iova, size);
msm_vma_unlock(dev);
if (!lazy_vma) {
msm_vma_lock(dev);
util_vma_heap_free(&dev->vma, iova, size);
msm_vma_unlock(dev);
}
memset(bo, 0, sizeof(*bo));
}
@ -1084,12 +1093,14 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
TU_RMV(bo_destroy, dev, bo);
if (dev->physical_device->has_vm_bind) {
tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0,
bo->size);
if (!bo->lazy) {
tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0,
bo->size);
mtx_lock(&dev->vma_mutex);
util_vma_heap_free(&dev->vma, bo->iova, bo->size);
mtx_unlock(&dev->vma_mutex);
mtx_lock(&dev->vma_mutex);
util_vma_heap_free(&dev->vma, bo->iova, bo->size);
mtx_unlock(&dev->vma_mutex);
}
msm_bo_gem_close(dev, bo);
} else if (dev->physical_device->has_set_iova) {
@ -1119,15 +1130,14 @@ msm_sparse_vma_init(struct tu_device *dev,
out_vma->msm.size = size;
mtx_lock(&dev->vma_mutex);
result = tu_allocate_userspace_iova(dev, size, client_iova, bo_flags,
&out_vma->msm.iova);
mtx_unlock(&dev->vma_mutex);
result = tu_allocate_iova(dev, 0, size, client_iova, bo_flags,
&out_vma->msm.iova);
if (result != VK_SUCCESS)
return result;
if (flags & TU_SPARSE_VMA_MAP_ZERO) {
assert(dev->physical_device->has_vm_bind);
result = tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP_NULL, 0,
out_vma->msm.iova, 0, 0, size);
}
@ -1141,8 +1151,10 @@ static void
msm_sparse_vma_finish(struct tu_device *dev,
struct tu_sparse_vma *vma)
{
tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, vma->msm.iova, 0, 0,
vma->msm.size);
if (dev->physical_device->has_vm_bind) {
tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, vma->msm.iova, 0, 0,
vma->msm.size);
}
mtx_lock(&dev->vma_mutex);
util_vma_heap_free(&dev->vma, vma->msm.iova, vma->msm.size);
@ -1549,6 +1561,7 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start,
&device->va_size);
device->has_lazy_bos = device->has_set_iova;
device->has_raytracing = tu_drm_get_raytracing(device);
device->has_sparse_prr = tu_drm_get_prr(device);

View file

@ -678,6 +678,7 @@ virtio_bo_init(struct tu_device *dev,
uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name)
{
MESA_TRACE_FUNC();
@ -686,7 +687,7 @@ virtio_bo_init(struct tu_device *dev,
.hdr = MSM_CCMD(GEM_NEW, sizeof(req)),
.size = size,
};
VkResult result;
VkResult result = VK_SUCCESS;
uint32_t res_id;
struct tu_bo *bo;
@ -716,10 +717,14 @@ virtio_bo_init(struct tu_device *dev,
assert(!(flags & TU_BO_ALLOC_DMABUF));
mtx_lock(&dev->vma_mutex);
result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova,
flags, &req.iova);
mtx_unlock(&dev->vma_mutex);
if (lazy_vma) {
req.iova = lazy_vma->msm.iova;
} else {
mtx_lock(&dev->vma_mutex);
result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova,
flags, &req.iova);
mtx_unlock(&dev->vma_mutex);
}
if (result != VK_SUCCESS)
return result;
@ -910,6 +915,45 @@ virtio_bo_finish(struct tu_device *dev, struct tu_bo *bo)
u_rwlock_rdunlock(&dev->dma_bo_lock);
}
static VkResult
virtio_sparse_vma_init(struct tu_device *dev,
struct vk_object_base *base,
struct tu_sparse_vma *out_vma,
uint64_t *out_iova,
enum tu_sparse_vma_flags flags,
uint64_t size, uint64_t client_iova)
{
VkResult result;
enum tu_bo_alloc_flags bo_flags =
(flags & TU_SPARSE_VMA_REPLAYABLE) ? TU_BO_ALLOC_REPLAYABLE :
(enum tu_bo_alloc_flags)0;
out_vma->msm.size = size;
mtx_lock(&dev->vma_mutex);
result = virtio_allocate_userspace_iova_locked(dev, 0, size, client_iova,
bo_flags, &out_vma->msm.iova);
mtx_unlock(&dev->vma_mutex);
if (result != VK_SUCCESS)
return result;
assert(!(flags & TU_SPARSE_VMA_MAP_ZERO));
*out_iova = out_vma->msm.iova;
return result;
}
/* Release the iova range that virtio_sparse_vma_init() reserved for |vma|.
 * Note there is no unmap here: on virtio only "lazy" sparse is supported, so
 * presumably any BO bound into this range is torn down separately — TODO
 * confirm against the BO teardown path.
 */
static void
virtio_sparse_vma_finish(struct tu_device *dev,
                         struct tu_sparse_vma *vma)
{
   /* The iova heap is shared with BO allocation; free under the VMA mutex. */
   mtx_lock(&dev->vma_mutex);
   util_vma_heap_free(&dev->vma, vma->msm.iova, vma->msm.size);
   mtx_unlock(&dev->vma_mutex);
}
static VkResult
setup_fence_cmds(struct tu_device *dev)
{
@ -1156,6 +1200,8 @@ static const struct tu_knl virtio_knl_funcs = {
.submit_add_entries = msm_submit_add_entries,
.queue_submit = virtio_queue_submit,
.queue_wait_fence = virtio_queue_wait_fence,
.sparse_vma_init = virtio_sparse_vma_init,
.sparse_vma_finish = virtio_sparse_vma_finish,
};
VkResult
@ -1282,6 +1328,7 @@ tu_knl_drm_virtio_load(struct tu_instance *instance,
device->va_size = caps.u.msm.va_size;
device->ubwc_config.highest_bank_bit = caps.u.msm.highest_bank_bit;
device->has_set_iova = true;
device->has_lazy_bos = true;
device->has_preemption = has_preemption;
device->uche_trap_base = uche_trap_base;

View file

@ -197,6 +197,39 @@ kgsl_bo_user_map(struct tu_device *dev, struct tu_bo *bo, uint64_t client_iova)
return VK_SUCCESS;
}
static VkResult
kgsl_sparse_vma_map(struct tu_device *dev,
struct tu_sparse_vma *vma,
struct tu_bo *bo, uint64_t bo_offset)
{
struct kgsl_gpumem_bind_range range = {
.child_offset = bo_offset,
.target_offset = 0,
.length = vma->kgsl.virtual_bo->size,
.child_id = bo->gem_handle,
.op = KGSL_GPUMEM_RANGE_OP_BIND,
};
struct kgsl_gpumem_bind_ranges req = {
.ranges = (uint64_t)(uintptr_t)&range,
.ranges_nents = 1,
.ranges_size = sizeof(range),
.id = vma->kgsl.virtual_bo->gem_handle,
.flags = 0,
};
int ret;
ret = safe_ioctl(dev->physical_device->local_fd,
IOCTL_KGSL_GPUMEM_BIND_RANGES, &req);
if (ret) {
return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"GPUMEM_BIND_RANGES failed (%s)", strerror(errno));
}
return VK_SUCCESS;
}
static VkResult
kgsl_bo_init(struct tu_device *dev,
struct vk_object_base *base,
@ -205,6 +238,7 @@ kgsl_bo_init(struct tu_device *dev,
uint64_t client_iova,
VkMemoryPropertyFlags mem_property,
enum tu_bo_alloc_flags flags,
struct tu_sparse_vma *lazy_vma,
const char *name)
{
if (flags & TU_BO_ALLOC_SHAREABLE) {
@ -269,12 +303,17 @@ kgsl_bo_init(struct tu_device *dev,
.base = base,
};
if (flags & TU_BO_ALLOC_REPLAYABLE) {
VkResult result = kgsl_bo_user_map(dev, bo, client_iova);
if (result != VK_SUCCESS)
return result;
VkResult result = VK_SUCCESS;
if (lazy_vma) {
result = kgsl_sparse_vma_map(dev, lazy_vma, bo, 0);
} else if (flags & TU_BO_ALLOC_REPLAYABLE) {
result = kgsl_bo_user_map(dev, bo, client_iova);
}
if (result != VK_SUCCESS)
return result;
tu_dump_bo_init(dev, bo);
*out_bo = bo;
@ -478,39 +517,6 @@ kgsl_sparse_vma_init(struct tu_device *dev,
return VK_SUCCESS;
}
/* Bind |bo|, starting at |bo_offset|, into the virtual BO backing |vma|,
 * covering the full length of the virtual BO from target offset 0 — so the
 * child BO must back the whole VMA from |bo_offset| onward. Uses the kgsl
 * GPUMEM_BIND_RANGES ioctl.
 */
static VkResult
kgsl_sparse_vma_map(struct tu_device *dev,
                    struct tu_sparse_vma *vma,
                    struct tu_bo *bo, uint64_t bo_offset)
{
   /* One BIND op spanning the entire virtual BO. */
   struct kgsl_gpumem_bind_range range = {
      .child_offset = bo_offset,
      .target_offset = 0,
      .length = vma->kgsl.virtual_bo->size,
      .child_id = bo->gem_handle,
      .op = KGSL_GPUMEM_RANGE_OP_BIND,
   };

   /* The ioctl takes a user pointer to an array of ranges; we pass one. */
   struct kgsl_gpumem_bind_ranges req = {
      .ranges = (uint64_t)(uintptr_t)&range,
      .ranges_nents = 1,
      .ranges_size = sizeof(range),
      .id = vma->kgsl.virtual_bo->gem_handle,
      .flags = 0,
   };

   int ret;

   ret = safe_ioctl(dev->physical_device->local_fd,
                    IOCTL_KGSL_GPUMEM_BIND_RANGES, &req);
   if (ret) {
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "GPUMEM_BIND_RANGES failed (%s)", strerror(errno));
   }

   return VK_SUCCESS;
}
static void
kgsl_sparse_vma_finish(struct tu_device *dev,
struct tu_sparse_vma *vma)
@ -1809,6 +1815,7 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
device->has_sparse = kgsl_is_virtual_bo_supported(fd);
device->has_sparse_prr = device->has_sparse;
device->has_lazy_bos = device->has_sparse;
get_kgsl_prop(fd, KGSL_PROP_GPU_VA64_SIZE, &device->va_size,
sizeof(device->va_size));
/* We don't actually use the VMA, but set a fake offset so that it doesn't