diff --git a/src/freedreno/vulkan/tu_buffer.cc b/src/freedreno/vulkan/tu_buffer.cc index 55964401b4f..429e8f28309 100644 --- a/src/freedreno/vulkan/tu_buffer.cc +++ b/src/freedreno/vulkan/tu_buffer.cc @@ -121,7 +121,7 @@ tu_GetDeviceBufferMemoryRequirements( pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .size = MAX2(align64(size, alignment), size), .alignment = alignment, - .memoryTypeBits = (1 << device->physical_device->memory.type_count) - 1, + .memoryTypeBits = (1 << device->physical_device->memory.non_lazy_type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) { diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 1e8f25b0cd7..3436f886d58 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -2481,7 +2481,7 @@ tu_trace_end_render_pass(struct tu_cmd_buffer *cmd, bool gmem) if (cmd->state.lrz.image_view) { struct tu_image *image = cmd->state.lrz.image_view->image; addr.bo = image->mem->bo; - addr.offset = (image->iova - image->mem->bo->iova) + + addr.offset = (image->iova - image->mem->iova) + image->lrz_layout.lrz_fc_offset + offsetof(fd_lrzfc_layout, dir_track); } @@ -3092,6 +3092,31 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe, } } +static VkResult +tu_allocate_transient_attachments(struct tu_cmd_buffer *cmd, bool sysmem) +{ + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_render_pass *rp = cmd->state.pass; + + for (unsigned i = 0; i < fb->attachment_count; i++) { + const struct tu_image_view *iview = cmd->state.attachments[i]; + if (iview && !(iview->image->vk.create_flags & + VK_IMAGE_CREATE_SPARSE_BINDING_BIT) && + !iview->image->mem->bo && + (sysmem || rp->attachments[i].load || + rp->attachments[i].load_stencil || + rp->attachments[i].store || + rp->attachments[i].store_stencil)) { + VkResult result = tu_allocate_lazy_memory(cmd->device, + iview->image->mem); + if (result != 
VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + template static void tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, @@ -3102,6 +3127,12 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); const struct tu_image_view *fdm = NULL; + VkResult result = tu_allocate_transient_attachments(cmd, false); + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) { fdm = cmd->state.attachments[cmd->state.pass->fragment_density_map.attachment]; } @@ -3199,6 +3230,13 @@ static void tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, struct tu_renderpass_result *autotune_result) { + VkResult result = tu_allocate_transient_attachments(cmd, true); + + if (result != VK_SUCCESS) { + vk_command_buffer_set_error(&cmd->vk, result); + return; + } + tu_trace_start_render_pass(cmd); tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index cb6ba0439e3..cb1c54c6381 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1617,6 +1617,14 @@ tu_physical_device_init(struct tu_physical_device *device, device->memory.type_count++; } + device->memory.non_lazy_type_count = device->memory.type_count; + if (device->has_lazy_bos) { + device->memory.types[device->memory.type_count] = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT; + device->memory.type_count++; + } + /* Provide fallback UBWC config values if the kernel doesn't support * providing them. This should match what the kernel programs. 
*/ @@ -3208,6 +3216,20 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) return tu_GetInstanceProcAddr(instance, pName); } +static VkResult +tu_add_to_heap(struct tu_device *dev, struct tu_bo *bo) +{ + struct tu_memory_heap *mem_heap = &dev->physical_device->heap; + uint64_t mem_heap_used = p_atomic_add_return(&mem_heap->used, bo->size); + if (mem_heap_used > mem_heap->size) { + p_atomic_add(&mem_heap->used, -bo->size); + tu_bo_finish(dev, bo); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + } + return VK_SUCCESS; +} + VKAPI_ATTR VkResult VKAPI_CALL tu_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, @@ -3237,6 +3259,8 @@ tu_AllocateMemory(VkDevice _device, return VK_SUCCESS; } + mem->size = pAllocateInfo->allocationSize; + const VkImportMemoryFdInfoKHR *fd_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); @@ -3304,19 +3328,28 @@ tu_AllocateMemory(VkDevice _device, (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024)); VkMemoryPropertyFlags mem_property = device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex]; - result = tu_bo_init_new_explicit_iova( - device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize, - client_address, mem_property, alloc_flags, NULL, name); + + if (mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) { + mem->lazy = true; + mtx_init(&mem->lazy_mutex, mtx_plain); + enum tu_sparse_vma_flags sparse_flags = + (alloc_flags & TU_BO_ALLOC_REPLAYABLE) ? 
+ TU_SPARSE_VMA_REPLAYABLE : TU_SPARSE_VMA_NONE; + result = tu_sparse_vma_init(device, &mem->vk.base, + &mem->lazy_vma, &mem->iova, + sparse_flags, + pAllocateInfo->allocationSize, + client_address); + } else { + result = tu_bo_init_new_explicit_iova( + device, &mem->vk.base, &mem->bo, pAllocateInfo->allocationSize, + client_address, mem_property, alloc_flags, NULL, name); + } } - if (result == VK_SUCCESS) { - mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); - if (mem_heap_used > mem_heap->size) { - p_atomic_add(&mem_heap->used, -mem->bo->size); - tu_bo_finish(device, mem->bo); - result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, - "Out of heap memory"); - } + if (result == VK_SUCCESS && !mem->lazy) { + mem->iova = mem->bo->iova; + result = tu_add_to_heap(device, mem->bo); + } if (result != VK_SUCCESS) { @@ -3339,6 +3372,53 @@ tu_AllocateMemory(VkDevice _device, return VK_SUCCESS; } +VkResult +tu_allocate_lazy_memory(struct tu_device *dev, + struct tu_device_memory *mem) +{ + assert(mem->lazy); + + if (mem->lazy_initialized) { + if (mem->bo) + return VK_SUCCESS; + else + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + VkResult result = VK_SUCCESS; + mtx_lock(&mem->lazy_mutex); + if (!mem->lazy_initialized) { + char name[64] = "lazy vkAllocateMemory()"; + if (dev->bo_sizes) + snprintf(name, ARRAY_SIZE(name), "lazy vkAllocateMemory(%ldkb)", + (long)DIV_ROUND_UP(mem->size, 1024)); + result = + tu_bo_init_new_explicit_iova(dev, &mem->vk.base, + &mem->bo, mem->size, 0, 0, + TU_BO_ALLOC_NO_FLAGS, + &mem->lazy_vma, name); + mem->lazy_initialized = true; + + if (result == VK_SUCCESS) { + result = tu_add_to_heap(dev, mem->bo); + + if (result != VK_SUCCESS) { + /* tu_add_to_heap() already finished the BO on failure */ + mem->bo = NULL; + } + } + } + mtx_unlock(&mem->lazy_mutex); + + /* Fail if another thread won the race and failed to allocate a BO */ + if (result == VK_SUCCESS && !mem->bo) { + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + return result; 
+} + + VKAPI_ATTR void VKAPI_CALL tu_FreeMemory(VkDevice _device, VkDeviceMemory _mem, @@ -3352,8 +3432,16 @@ tu_FreeMemory(VkDevice _device, TU_RMV(resource_destroy, device, mem); - p_atomic_add(&device->physical_device->heap.used, -mem->bo->size); - tu_bo_finish(device, mem->bo); + if (mem->bo) { + p_atomic_add(&device->physical_device->heap.used, -mem->bo->size); + tu_bo_finish(device, mem->bo); + } + + if (mem->lazy) { + tu_sparse_vma_finish(device, &mem->lazy_vma); + mtx_destroy(&mem->lazy_mutex); + } + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); } @@ -3438,10 +3526,11 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device, VKAPI_ATTR void VKAPI_CALL tu_GetDeviceMemoryCommitment(VkDevice device, - VkDeviceMemory memory, + VkDeviceMemory _memory, VkDeviceSize *pCommittedMemoryInBytes) { - *pCommittedMemoryInBytes = 0; + VK_FROM_HANDLE(tu_device_memory, memory, _memory); + *pCommittedMemoryInBytes = memory->lazy_initialized ? memory->size : 0; } VKAPI_ATTR VkResult VKAPI_CALL @@ -3581,7 +3670,7 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device, VK_FROM_HANDLE(tu_device, device, _device); assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); pMemoryFdProperties->memoryTypeBits = - (1 << device->physical_device->memory.type_count) - 1; + (1 << device->physical_device->memory.non_lazy_type_count) - 1; return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 03294791f7a..593a9ee0262 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -146,6 +146,7 @@ struct tu_physical_device bool has_preemption; struct { + uint32_t non_lazy_type_count; uint32_t type_count; VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES]; } memory; @@ -478,6 +479,15 @@ struct tu_device_memory { struct vk_device_memory vk; + uint64_t iova; + uint64_t size; + + /* For lazy memory */ + bool lazy; + bool lazy_initialized; + struct tu_sparse_vma lazy_vma; + mtx_t lazy_mutex; + struct tu_bo 
*bo; /* for dedicated allocations */ @@ -486,6 +496,10 @@ struct tu_device_memory VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory, VK_OBJECT_TYPE_DEVICE_MEMORY) +VkResult +tu_allocate_lazy_memory(struct tu_device *dev, + struct tu_device_memory *mem); + struct tu_attachment_info { struct tu_image_view *attachment; diff --git a/src/freedreno/vulkan/tu_formats.cc b/src/freedreno/vulkan/tu_formats.cc index 4c8357cf52f..a1768017157 100644 --- a/src/freedreno/vulkan/tu_formats.cc +++ b/src/freedreno/vulkan/tu_formats.cc @@ -563,7 +563,8 @@ tu_get_image_format_properties( } } - if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + if (image_usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT)) { if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) { diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc index d8453a10896..e644c9c3e0f 100644 --- a/src/freedreno/vulkan/tu_image.cc +++ b/src/freedreno/vulkan/tu_image.cc @@ -1005,10 +1005,11 @@ tu_image_bind(struct tu_device *device, } image->mem = mem; image->mem_offset = offset; - image->iova = mem->bo->iova + offset; + image->iova = mem->iova + offset; if (image->vk.usage & (VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT | VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT)) { + assert(mem->bo); /* Transient images cannot have these usages */ if (!mem->bo->map) { result = tu_bo_map(device, mem->bo, NULL); if (result != VK_SUCCESS) @@ -1063,6 +1064,14 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image, if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) alignment = 65536; + /* Only expose the lazy memory type for images with TRANSIENT_ATTACHMENT + * usage. + */ + uint32_t type_count = + (image->vk.usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) ? 
+ dev->physical_device->memory.type_count : + dev->physical_device->memory.non_lazy_type_count; + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { /* Due to how we fake the sparse tile size, the real size may not be * aligned. CTS doesn't like this, and real apps may also be surprised, @@ -1070,7 +1079,7 @@ tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image, */ .size = align64(image->total_size, alignment), .alignment = alignment, - .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1, + .memoryTypeBits = (1 << type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) {