From: Hans-Kristian Arntzen
Subject: [PATCH v2 1/2] vkd3d: Optimize GPU VA allocator.
Message-Id: <20191008082910.4958-1-post@arntzen-software.no>
Date: Tue, 8 Oct 2019 10:29:09 +0200

The GPU VA allocator was allocating memory in a way where dereferencing
a GPU VA required a lock plus a bsearch() to find the right VA range.

Rather than going this route, we turn the common case into O(1) and
lockless by creating a slab allocator, which allows us to look up a
pointer directly from a GPU VA with (VA - Base) / PageSize.

The number of allocations in the fast path must be limited, since we
cannot trivially grow the allocator while remaining lock-free for
dereferences.

Signed-off-by: Hans-Kristian Arntzen
---
 libs/vkd3d/device.c        | 240 +++++++++++++++++++++++++++++++------
 libs/vkd3d/resource.c      |   2 +-
 libs/vkd3d/vkd3d_private.h |  31 +++--
 3 files changed, 227 insertions(+), 46 deletions(-)
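
Note for reviewers: the fast-path dereference described above boils down
to a subtract, a shift, a bounds check and an array read. Below is a
stand-alone toy sketch of that arithmetic (illustration only, not part of
the patch; the names and constants merely mirror the VKD3D_* defines
introduced in device.c):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SLAB_ALLOCATIONS (64 * 1024)   /* VKD3D_MAX_VA_SLAB_ALLOCATIONS */
    #define BASE_VA_SLAB 0x1000000000ull       /* VKD3D_BASE_VA_SLAB */
    #define SLAB_ALLOCATION_SIZE_LOG2 32       /* VKD3D_SLAB_ALLOCATION_SIZE_LOG2 */

    struct slab_entry { void *ptr; size_t size; };

    /* Lockless dereference: the slab array is allocated once at init time
     * and never reallocated, so reading it needs no lock. */
    static void *dereference_slab(const struct slab_entry *slabs, uint64_t va)
    {
        uint64_t offset = va - BASE_VA_SLAB;
        uint64_t index = offset >> SLAB_ALLOCATION_SIZE_LOG2; /* (VA - Base) / SlabSize */

        if (index >= MAX_SLAB_ALLOCATIONS)
            return NULL;
        /* The low 32 bits are the offset within the 4 GiB slab region. */
        if ((offset & 0xffffffffull) >= slabs[index].size)
            return NULL;
        return slabs[index].ptr;
    }

    int main(void)
    {
        static struct slab_entry slabs[MAX_SLAB_ALLOCATIONS];
        int resource = 42;

        /* Pretend slab index 3 holds a 64 KiB resource. */
        slabs[3].ptr = &resource;
        slabs[3].size = 64 * 1024;

        printf("%p\n", dereference_slab(slabs, BASE_VA_SLAB + 3 * 0x100000000ull + 0x1000));
        printf("%p\n", dereference_slab(slabs, BASE_VA_SLAB + 3 * 0x100000000ull + 0x20000)); /* NULL */
        return 0;
    }

Since each slab region is 2^32 bytes, the division by the slab size
reduces to a shift, and the offset within the region is just the low
32 bits.
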
diff --git a/libs/vkd3d/device.c b/libs/vkd3d/device.c
index 3da4273..0ecac9a 100644
--- a/libs/vkd3d/device.c
+++ b/libs/vkd3d/device.c
@@ -1822,42 +1822,106 @@ static void d3d12_device_destroy_pipeline_cache(struct d3d12_device *device)
     pthread_mutex_destroy(&device->mutex);
 }
 
-D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr)
+#define VKD3D_MAX_VA_SLAB_ALLOCATIONS (64 * 1024)
+#define VKD3D_BASE_VA_SLAB (0x1000000000ull)
+#define VKD3D_BASE_VA_FALLBACK (0x8000000000000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE (0x100000000ull)
+#define VKD3D_SLAB_ALLOCATION_SIZE_LOG2 32
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_fallback(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
 {
     D3D12_GPU_VIRTUAL_ADDRESS ceiling = ~(D3D12_GPU_VIRTUAL_ADDRESS)0;
     struct vkd3d_gpu_va_allocation *allocation;
-    int rc;
 
-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    if (!vkd3d_array_reserve((void **)&allocator->fallback_mem_allocations, &allocator->fallback_mem_allocations_size,
+            allocator->fallback_mem_allocation_count + 1, sizeof(*allocator->fallback_mem_allocations)))
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }
 
-    if (!vkd3d_array_reserve((void **)&allocator->allocations, &allocator->allocations_size,
-            allocator->allocation_count + 1, sizeof(*allocator->allocations)))
+    allocator->fallback_mem_floor = (allocator->fallback_mem_floor + alignment - 1) & ~((D3D12_GPU_VIRTUAL_ADDRESS)alignment - 1);
+
+    if (size > ceiling || ceiling - size < allocator->fallback_mem_floor)
     {
-        pthread_mutex_unlock(&allocator->mutex);
         return 0;
     }
 
-    if (size > ceiling || ceiling - size < allocator->floor)
+    allocation = &allocator->fallback_mem_allocations[allocator->fallback_mem_allocation_count++];
+    allocation->base = allocator->fallback_mem_floor;
+    allocation->size = size;
+    allocation->ptr = ptr;
+
+    /* The floor is only ever bumped, and never lowered on a free. However, allocation
+     * will only start failing once we have exhausted 63 bits of address space. */
+    allocator->fallback_mem_floor += size;
+
+    return allocation->base;
+}
+
+static D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate_slab(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    int rc;
+    unsigned int vacant_index;
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address = 0;
+
+    if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
-        pthread_mutex_unlock(&allocator->mutex);
+        ERR("Failed to lock mutex, error %d.\n", rc);
         return 0;
     }
 
-    allocation = &allocator->allocations[allocator->allocation_count++];
-    allocation->base = allocator->floor;
-    allocation->size = size;
-    allocation->ptr = ptr;
+    TRACE("Allocating %zu bytes (%zu align) of VA from slab allocator.\n", size, alignment);
+    if (allocator->mem_vacant_count > 0)
+    {
+        vacant_index = allocator->mem_vacant[--allocator->mem_vacant_count];
+
+        /* It is critical that the multiplication happens in 64-bit to avoid overflow. */
+        virtual_address = VKD3D_BASE_VA_SLAB + vacant_index * VKD3D_SLAB_ALLOCATION_SIZE;
+        TRACE("Allocating VA: 0x%llx: vacant index %u from slab.\n",
+                (unsigned long long)virtual_address, vacant_index);
+        assert(!allocator->slab_mem_allocations[vacant_index].ptr);
+        allocator->slab_mem_allocations[vacant_index].ptr = ptr;
+        allocator->slab_mem_allocations[vacant_index].size = size;
+    }
 
-    allocator->floor += size;
+    if (virtual_address == 0)
+    {
+        TRACE("Slab allocator exhausted, allocating %zu bytes (%zu align) of VA from fallback allocator.\n",
+                size, alignment);
+        /* Fall back to the slow allocator. */
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+    }
 
     pthread_mutex_unlock(&allocator->mutex);
+    return virtual_address;
+}
 
-    return allocation->base;
+D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
+        size_t size, size_t alignment, void *ptr)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS virtual_address;
+    int rc;
+    size_t aligned_size;
+
+    aligned_size = size > alignment ? size : alignment;
+
+    if (aligned_size > VKD3D_SLAB_ALLOCATION_SIZE)
+    {
+        /* For massive VA allocations, go straight to high memory with the slower allocator. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return 0;
+        }
+        virtual_address = vkd3d_gpu_va_allocator_allocate_fallback(allocator, size, alignment, ptr);
+        pthread_mutex_unlock(&allocator->mutex);
+    }
+    else
+        virtual_address = vkd3d_gpu_va_allocator_allocate_slab(allocator, size, alignment, ptr);
+
+    return virtual_address;
 }
 
 static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
@@ -1872,24 +1936,93 @@ static int vkd3d_gpu_va_allocation_compare(const void *k, const void *e)
     return 0;
 }
 
+static void *vkd3d_gpu_va_allocator_dereference_slab(struct vkd3d_gpu_va_allocator *allocator,
+        D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    uint64_t base_index;
+    const struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Accessed slab index out of range.\n");
+        return NULL;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    base_offset -= base_index * VKD3D_SLAB_ALLOCATION_SIZE;
+    if (base_offset >= slab->size)
+    {
+        ERR("Accessed slab out of range.\n");
+        return NULL;
+    }
+    return slab->ptr;
+}
+
+static void vkd3d_gpu_va_allocator_free_slab(struct vkd3d_gpu_va_allocator *allocator,
+        D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base_offset;
+    unsigned int base_index;
+    struct vkd3d_gpu_va_slab_entry *slab;
+
+    base_offset = address - VKD3D_BASE_VA_SLAB;
+    base_index = base_offset >> VKD3D_SLAB_ALLOCATION_SIZE_LOG2;
+
+    if (base_index >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Freed slab index out of range.\n");
+        return;
+    }
+
+    slab = &allocator->slab_mem_allocations[base_index];
+    if (slab->ptr == NULL)
+    {
+        ERR("Attempting to free NULL VA.\n");
+        return;
+    }
+
+    if (allocator->mem_vacant_count >= VKD3D_MAX_VA_SLAB_ALLOCATIONS)
+    {
+        ERR("Invalid free, slab allocator is already fully freed.\n");
+        return;
+    }
+
+    TRACE("Freeing VA: 0x%llx: index %u from slab.\n",
+            (unsigned long long)address, base_index);
+
+    slab->ptr = NULL;
+    allocator->mem_vacant[allocator->mem_vacant_count++] = base_index;
+}
+
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator,
         D3D12_GPU_VIRTUAL_ADDRESS address)
 {
     struct vkd3d_gpu_va_allocation *allocation;
     int rc;
 
-    if ((rc = pthread_mutex_lock(&allocator->mutex)))
+    /* If we land in the non-fallback region, dereferencing a VA is lockless. The base pointer is
+     * immutable, and the only way we can have a data race is if some other thread is poking into
+     * the slab_mem_allocations[base_index] block. That can only happen if someone is trying to free
+     * the entry while we are dereferencing it, which would be a serious application bug. */
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        ERR("Failed to lock mutex, error %d.\n", rc);
-        return NULL;
+        return vkd3d_gpu_va_allocator_dereference_slab(allocator, address);
     }
+    else
+    {
+        /* Slow fallback. */
+        if ((rc = pthread_mutex_lock(&allocator->mutex)))
+        {
+            ERR("Failed to lock mutex, error %d.\n", rc);
+            return NULL;
+        }
 
-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-
-    pthread_mutex_unlock(&allocator->mutex);
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
 
-    return allocation ? allocation->ptr : NULL;
+        pthread_mutex_unlock(&allocator->mutex);
+        return allocation ? allocation->ptr : NULL;
+    }
 }
 
 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12_GPU_VIRTUAL_ADDRESS address)
@@ -1904,16 +2037,23 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
         return;
     }
 
-    allocation = bsearch(&address, allocator->allocations, allocator->allocation_count,
-            sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
-    if (allocation && allocation->base == address)
+    if (address < VKD3D_BASE_VA_FALLBACK)
     {
-        index = allocation - allocator->allocations;
-        --allocator->allocation_count;
-        if (index != allocator->allocation_count)
+        vkd3d_gpu_va_allocator_free_slab(allocator, address);
+    }
+    else
+    {
+        allocation = bsearch(&address, allocator->fallback_mem_allocations, allocator->fallback_mem_allocation_count,
+                sizeof(*allocation), vkd3d_gpu_va_allocation_compare);
+        if (allocation && allocation->base == address)
         {
-            memmove(&allocator->allocations[index], &allocator->allocations[index + 1],
-                    (allocator->allocation_count - index) * sizeof(*allocation));
+            index = allocation - allocator->fallback_mem_allocations;
+            --allocator->fallback_mem_allocation_count;
+            if (index != allocator->fallback_mem_allocation_count)
+            {
+                memmove(&allocator->fallback_mem_allocations[index], &allocator->fallback_mem_allocations[index + 1],
+                        (allocator->fallback_mem_allocation_count - index) * sizeof(*allocation));
+            }
         }
     }
 
@@ -1923,29 +2063,59 @@ void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator, D3D12
 static bool vkd3d_gpu_va_allocator_init(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;
+    int i;
 
     memset(allocator, 0, sizeof(*allocator));
-    allocator->floor = 0x1000;
+    allocator->fallback_mem_floor = VKD3D_BASE_VA_FALLBACK;
+
+    /* To remain lockless, we cannot grow these lists after the fact. If we commit to a maximum
+     * number of allocations here, we can dereference without taking a lock, as the base pointer
+     * never changes. We would be able to grow more seamlessly using an array of pointers, but
+     * that would make dereferencing slightly less efficient. */
+    allocator->slab_mem_allocations = vkd3d_calloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS, sizeof(*allocator->slab_mem_allocations));
+    if (!allocator->slab_mem_allocations)
+        goto error;
+
+    /* Otherwise we would need 32-bit indices. */
+    assert(VKD3D_MAX_VA_SLAB_ALLOCATIONS <= 64 * 1024);
+
+    allocator->mem_vacant = vkd3d_malloc(VKD3D_MAX_VA_SLAB_ALLOCATIONS * sizeof(uint16_t));
+    if (!allocator->mem_vacant)
+        goto error;
+
+    /* Build a stack of which slab indices are available for allocation.
+     * Place the lowest indices last (first to be popped off the stack). */
+    for (i = 0; i < VKD3D_MAX_VA_SLAB_ALLOCATIONS; i++)
+        allocator->mem_vacant[i] = (VKD3D_MAX_VA_SLAB_ALLOCATIONS - 1) - i;
+    allocator->mem_vacant_count = VKD3D_MAX_VA_SLAB_ALLOCATIONS;
 
     if ((rc = pthread_mutex_init(&allocator->mutex, NULL)))
     {
         ERR("Failed to initialize mutex, error %d.\n", rc);
-        return false;
+        goto error;
    }
 
     return true;
+
+error:
+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+    return false;
 }
 
 static void vkd3d_gpu_va_allocator_cleanup(struct vkd3d_gpu_va_allocator *allocator)
 {
     int rc;
 
+    vkd3d_free(allocator->slab_mem_allocations);
+    vkd3d_free(allocator->mem_vacant);
+
     if ((rc = pthread_mutex_lock(&allocator->mutex)))
     {
         ERR("Failed to lock mutex, error %d.\n", rc);
         return;
     }
-    vkd3d_free(allocator->allocations);
+    vkd3d_free(allocator->fallback_mem_allocations);
     pthread_mutex_unlock(&allocator->mutex);
     pthread_mutex_destroy(&allocator->mutex);
 }
diff --git a/libs/vkd3d/resource.c b/libs/vkd3d/resource.c
index ccd1230..6c9564b 100644
--- a/libs/vkd3d/resource.c
+++ b/libs/vkd3d/resource.c
@@ -1710,7 +1710,7 @@ static HRESULT d3d12_resource_init(struct d3d12_resource *resource, struct d3d12
             &resource->desc, &resource->u.vk_buffer)))
         return hr;
     if (!(resource->gpu_address = vkd3d_gpu_va_allocator_allocate(&device->gpu_va_allocator,
-            desc->Width, resource)))
+            desc->Width, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT, resource)))
     {
         ERR("Failed to allocate GPU VA.\n");
         d3d12_resource_destroy(resource, device);
diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h
index 59f0eac..a5f7c81 100644
--- a/libs/vkd3d/vkd3d_private.h
+++ b/libs/vkd3d/vkd3d_private.h
@@ -202,24 +202,35 @@ HRESULT vkd3d_fence_worker_start(struct vkd3d_fence_worker *worker,
 HRESULT vkd3d_fence_worker_stop(struct vkd3d_fence_worker *worker,
         struct d3d12_device *device) DECLSPEC_HIDDEN;
 
+struct vkd3d_gpu_va_allocation
+{
+    D3D12_GPU_VIRTUAL_ADDRESS base;
+    SIZE_T size;
+    void *ptr;
+};
+
+struct vkd3d_gpu_va_slab_entry
+{
+    void *ptr;
+    SIZE_T size;
+};
+
 struct vkd3d_gpu_va_allocator
 {
     pthread_mutex_t mutex;
 
-    D3D12_GPU_VIRTUAL_ADDRESS floor;
+    struct vkd3d_gpu_va_slab_entry *slab_mem_allocations;
+    uint16_t *mem_vacant;
+    size_t mem_vacant_count;
 
-    struct vkd3d_gpu_va_allocation
-    {
-        D3D12_GPU_VIRTUAL_ADDRESS base;
-        SIZE_T size;
-        void *ptr;
-    } *allocations;
-    size_t allocations_size;
-    size_t allocation_count;
+    struct vkd3d_gpu_va_allocation *fallback_mem_allocations;
+    size_t fallback_mem_allocations_size;
+    size_t fallback_mem_allocation_count;
+    D3D12_GPU_VIRTUAL_ADDRESS fallback_mem_floor;
 };
 
 D3D12_GPU_VIRTUAL_ADDRESS vkd3d_gpu_va_allocator_allocate(struct vkd3d_gpu_va_allocator *allocator,
-        size_t size, void *ptr) DECLSPEC_HIDDEN;
+        size_t size, size_t alignment, void *ptr) DECLSPEC_HIDDEN;
 void *vkd3d_gpu_va_allocator_dereference(struct vkd3d_gpu_va_allocator *allocator,
         D3D12_GPU_VIRTUAL_ADDRESS address) DECLSPEC_HIDDEN;
 void vkd3d_gpu_va_allocator_free(struct vkd3d_gpu_va_allocator *allocator,
-- 
2.23.0
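
P.S. The mem_vacant array set up in vkd3d_gpu_va_allocator_init() is a plain
LIFO stack of free slab indices. A stand-alone toy illustration of the
initialization and pop/push order (again, illustration only, not vkd3d code):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_SLAB_ALLOCATIONS (64 * 1024)

    int main(void)
    {
        static uint16_t vacant[MAX_SLAB_ALLOCATIONS];
        size_t vacant_count, i;
        unsigned int first, second;

        /* Lowest indices go last so they are popped first; fresh
         * allocations therefore cluster near the base VA. */
        for (i = 0; i < MAX_SLAB_ALLOCATIONS; i++)
            vacant[i] = (MAX_SLAB_ALLOCATIONS - 1) - i;
        vacant_count = MAX_SLAB_ALLOCATIONS;

        first = vacant[--vacant_count];   /* allocate -> index 0 */
        second = vacant[--vacant_count];  /* allocate -> index 1 */
        vacant[vacant_count++] = first;   /* free index 0; next pop returns it */

        printf("%u %u %u\n", first, second, (unsigned int)vacant[vacant_count - 1]); /* 0 1 0 */
        return 0;
    }

Initializing the stack back-to-front means index 0 is popped first, so early
allocations land closest to VKD3D_BASE_VA_SLAB.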