From 764b3d9161c27afb57d20536c6968fa5085f34f1 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 2 Sep 2025 16:28:54 -0400 Subject: [PATCH] tu: Implement transient attachments and lazily allocated memory Transient attachments have been in Vulkan since 1.0, and are a way to avoid allocating memory for attachments that can be stored entirely in tile memory. The driver exposes a memory type with LAZILY_ALLOCATED_BIT, and apps use this type to allocate images with TRANSIENT_ATTACHMENT usage, which are restricted to color/depth/stencil/input attachment usage. The driver is supposed to then delay allocating memory until it knows that one of the images bound to the VkDeviceMemory must have actual backing memory. Implement this using the "lazy VMA" mechanism added earlier. We reserve an iova range for lazy BOs, and only allocate them if we chose sysmem rendering or there is a LOAD_OP_LOAD/STORE_OP_STORE. Because we never split render passes and force sysmem instead, we don't have to deal with the additional complexity of that here and just allocate everything. 
Part-of: --- src/freedreno/vulkan/tu_buffer.cc | 2 +- src/freedreno/vulkan/tu_cmd_buffer.cc | 40 ++++++++- src/freedreno/vulkan/tu_device.cc | 121 ++++++++++++++++++++++---- src/freedreno/vulkan/tu_device.h | 14 +++ src/freedreno/vulkan/tu_formats.cc | 3 +- src/freedreno/vulkan/tu_image.cc | 13 ++- 6 files changed, 172 insertions(+), 21 deletions(-) diff --git a/src/freedreno/vulkan/tu_buffer.cc b/src/freedreno/vulkan/tu_buffer.cc index 55964401b4f..429e8f28309 100644 --- a/src/freedreno/vulkan/tu_buffer.cc +++ b/src/freedreno/vulkan/tu_buffer.cc @@ -121,7 +121,7 @@ tu_GetDeviceBufferMemoryRequirements( pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .size = MAX2(align64(size, alignment), size), .alignment = alignment, - .memoryTypeBits = (1 << device->physical_device->memory.type_count) - 1, + .memoryTypeBits = (1 << device->physical_device->memory.non_lazy_type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 1e8f25b0cd7..3436f886d58 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -2481,7 +2481,7 @@ tu_trace_end_render_pass(struct tu_cmd_buffer *cmd, bool gmem) if (cmd->state.lrz.image_view) { struct tu_image *image = cmd->state.lrz.image_view->image; addr.bo = image->mem->bo; - addr.offset = (image->iova - image->mem->bo->iova) + + addr.offset = (image->iova - image->mem->iova) + image->lrz_layout.lrz_fc_offset + offsetof(fd_lrzfc_layout, dir_track); } @@ -3092,6 +3092,31 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, } } +static VkResult +tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_render_pass *rp = cmd->state.pass; + + for (unsigned i = 0; i < fb->attachment_count; i++) { + const struct tu_image_view *iview = cmd->state.attachments[i]; + if (iview && 
!(iview->image->vk.create_flags & + VK_IMAGE_CREATE_SPARSE_BINDING_BIT) && + !iview->image->mem->bo && + (sysmem || rp->attachments[i].load || + rp->attachments[i].load_stencil || + rp->attachments[i].store || + rp->attachments[i].store_stencil)) { + VkResult result = tu_allocate_lazy_memory(cmd->device, + iview->image->mem); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, @@ -3102,6 +3127,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); const struct tu_image_view *fdm = NULL; + VkResult result = tu_allocate_transient_attachments(cmd, false); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) { fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment]; } @@ -3199,6 +3230,13 @@ static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) { + VkResult result = tu_allocate_transient_attachments(cmd, true); + + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + tu_trace_start_render_pass(cmd); tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index cb6ba0439e3..cb1c54c6381 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1617,6 +1617,14 @@ tu_physical_device_init(struct tu_physical_device *device, device->memory.type_count++; } + device->memory.non_lazy_type_count = device->memory.type_count; + if (device->has_lazy_bos) { + device->memory.types[device->memory.type_count] = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT; + device->memory.type_count++; + } + /* Provide fallback UBWC config values if the kernel doesn't 
support * providing them. This should match what the kernel programs. */ @@ -3208,6 +3216,20 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) return tu_GetInstanceProcAddr(instance, pName); } +static VkResult +tu_add_to_heap(struct tu_device *dev, struct tu_bo *bo) +{ + struct tu_memory_heap *mem_heap = &dev->physical_device->heap; + uint64_t mem_heap_used = p_atomic_add_return(&mem_heap->used, bo->size); + if (mem_heap_used > mem_heap->size) { + p_atomic_add(&mem_heap->used, -bo->size); + tu_bo_finish(dev, bo); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + } + return VK_SUCCESS; +} + VKAPI_ATTR VkResult VKAPI_CALL tu_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, @@ -3237,6 +3259,8 @@ tu_AllocateMemory(VkDevice _device, return VK_SUCCESS; } + mem->size = pAllocateInfo->allocationSize; + const VkImportMemoryFdInfoKHR *fd_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); @@ -3304,19 +3328,28 @@ tu_AllocateMemory(VkDevice _device, (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024)); VkMemoryPropertyFlags mem_property = device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex]; - result = tu_bo_init_new_explicit_iova( - device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize, - client_address, mem_property, alloc_flags, NULL, name); + + if (mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) { + mem->lazy = true; + mtx_init(&mem->lazy_mutex, mtx_plain); + enum tu_sparse_vma_flags sparse_flags = + (alloc_flags & TU_BO_ALLOC_REPLAYABLE) ? 
+ TU_SPARSE_VMA_REPLAYABLE : TU_SPARSE_VMA_NONE; + result = tu_sparse_vma_init(device, &mem->vk.base, + &mem->lazy_vma, &mem->iova, + sparse_flags, + pAllocateInfo->allocationSize, + client_address); + } else { + result = tu_bo_init_new_explicit_iova( + device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize, + client_address, mem_property, alloc_flags, NULL, name); + } } - if (result == VK_SUCCESS) { - mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); - if (mem_heap_used > mem_heap->size) { - p_atomic_add(&mem_heap->used, -mem->bo->size); - tu_bo_finish(device, mem->bo); - result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, - "Out of heap memory"); - } + if (result == VK_SUCCESS && !mem->lazy) { + result = tu_add_to_heap(device, mem->bo); + /* On failure tu_add_to_heap() has already finished the BO */ + if (result == VK_SUCCESS) + mem->iova = mem->bo->iova; + else + mem->bo = NULL; } if (result != VK_SUCCESS) { @@ -3339,6 +3372,53 @@ tu_AllocateMemory(VkDevice _device, return VK_SUCCESS; } +VkResult +tu_allocate_lazy_memory(struct tu_device *dev, + struct tu_device_memory *mem) +{ + assert(mem->lazy); + + if (mem->lazy_initialized) { + if (mem->bo) + return VK_SUCCESS; + else + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + VkResult result = VK_SUCCESS; + mtx_lock(&mem->lazy_mutex); + if (!mem->lazy_initialized) { + char name[64] = "lazy vkAllocateMemory()"; + if (dev->bo_sizes) + snprintf(name, ARRAY_SIZE(name), "lazy vkAllocateMemory(%ldkb)", + (long)DIV_ROUND_UP(mem->size, 1024)); + result = + tu_bo_init_new_explicit_iova(dev, &mem->vk.base, + &mem->bo, mem->size, 0, 0, + TU_BO_ALLOC_NO_FLAGS, + &mem->lazy_vma, name); + mem->lazy_initialized = true; + + if (result == VK_SUCCESS) { + result = tu_add_to_heap(dev, mem->bo); + + if (result != VK_SUCCESS) { + /* tu_add_to_heap() already finished the BO */ + mem->bo = NULL; + } + } + } + mtx_unlock(&mem->lazy_mutex); + + /* Fail if another thread won the race and failed to allocate a BO */ + if (result == VK_SUCCESS && !mem->bo) { + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + return result; 
+} + + VKAPI_ATTR void VKAPI_CALL tu_FreeMemory(VkDevice _device, VkDeviceMemory _mem, @@ -3352,8 +3432,16 @@ tu_FreeMemory(VkDevice _device, TU_RMV(resource_destroy, device, mem); - p_atomic_add(&device->physical_device->heap.used, -mem->bo->size); - tu_bo_finish(device, mem->bo); + if (mem->bo) { + p_atomic_add(&device->physical_device->heap.used, -mem->bo->size); + tu_bo_finish(device, mem->bo); + } + + if (mem->lazy) { + tu_sparse_vma_finish(device, &mem->lazy_vma); + mtx_destroy(&mem->lazy_mutex); + } + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); } @@ -3438,10 +3526,11 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device, VKAPI_ATTR void VKAPI_CALL tu_GetDeviceMemoryCommitment(VkDevice device, - VkDeviceMemory memory, + VkDeviceMemory _memory, VkDeviceSize *pCommittedMemoryInBytes) { - *pCommittedMemoryInBytes = 0; + VK_FROM_HANDLE(tu_device_memory, memory, _memory); + *pCommittedMemoryInBytes = memory->lazy_initialized ? memory->size : 0; } VKAPI_ATTR VkResult VKAPI_CALL @@ -3581,7 +3670,7 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device, VK_FROM_HANDLE(tu_device, device, _device); assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); pMemoryFdProperties->memoryTypeBits = - (1 << device->physical_device->memory.type_count) - 1; + (1 << device->physical_device->memory.non_lazy_type_count) - 1; return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 03294791f7a..593a9ee0262 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -146,6 +146,7 @@ struct tu_physical_device bool has_preemption; struct { + uint32_t non_lazy_type_count; uint32_t type_count; VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES]; } memory; @@ -478,6 +479,15 @@ struct tu_device_memory { struct vk_device_memory vk; + uint64_t iova; + uint64_t size; + + /* For lazy memory */ + bool lazy; + bool lazy_initialized; + struct tu_sparse_vma lazy_vma; + mtx_t lazy_mutex; + struct tu_bo 
*bo; /* for dedicated allocations */ @@ -486,6 +496,10 @@ struct tu_device_memory VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory, VK_OBJECT_TYPE_DEVICE_MEMORY) +VkResult +tu_allocate_lazy_memory(struct tu_device *dev, + struct tu_device_memory *mem); + struct tu_attachment_info { struct tu_image_view *attachment; diff --git a/src/freedreno/vulkan/tu_formats.cc b/src/freedreno/vulkan/tu_formats.cc index 4c8357cf52f..a1768017157 100644 --- a/src/freedreno/vulkan/tu_formats.cc +++ b/src/freedreno/vulkan/tu_formats.cc @@ -563,7 +563,8 @@ tu_get_image_format_properties( } } - if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + if (image_usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT)) { if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) { diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc index d8453a10896..e644c9c3e0f 100644 --- a/src/freedreno/vulkan/tu_image.cc +++ b/src/freedreno/vulkan/tu_image.cc @@ -1005,10 +1005,11 @@ tu_image_bind(struct tu_device *device, } image->mem = mem; image->mem_offset = offset; - image->iova = mem->bo->iova + offset; + image->iova = mem->iova + offset; if (image->vk.usage & (VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT | VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT)) { + assert(mem->bo); /* Transient images cannot have these usages */ if (!mem->bo->map) { result = tu_bo_map(device, mem->bo, NULL); if (result != VK_SUCCESS) @@ -1063,6 +1064,14 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image, if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) alignment = 65536; + /* Only expose the lazy memory type for images with TRANSIENT_ATTACHMENT + * usage. + */ + uint32_t type_count = + (image->vk.usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) ? 
+ dev->physical_device->memory.type_count : + dev->physical_device->memory.non_lazy_type_count; + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { /* Due to how we fake the sparse tile size, the real size may not be * aligned. CTS doesn't like this, and real apps may also be surprised, @@ -1070,7 +1079,7 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image, */ .size = align64(image->total_size, alignment), .alignment = alignment, - .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1, + .memoryTypeBits = (1 << type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) {