blob: 7ddca2534a913841159ca7307a04bca359a3cafe [file] [log] [blame]
//
// Copyright 2021 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// CLDeviceVk.cpp: Implements the class methods for CLDeviceVk.
//
#ifdef UNSAFE_BUFFERS_BUILD
# pragma allow_unsafe_buffers
#endif
#include "libANGLE/renderer/vulkan/CLDeviceVk.h"
#include "libANGLE/renderer/driver_utils.h"
#include "libANGLE/renderer/vulkan/clspv_utils.h"
#include "libANGLE/renderer/vulkan/vk_renderer.h"
#include "libANGLE/renderer/cl_types.h"
#include "libANGLE/renderer/driver_utils.h"
#include "libANGLE/cl_utils.h"
#include "common/mathutil.h"
namespace rx
{
// Constructs the Vulkan-backed CL device and pre-populates the four lookup tables
// (mInfoString, mInfoSizeT, mInfoULong, mInfoUInt) that back the getInfo*() queries.
//
// Values come from three places: the Vulkan physical device properties, the renderer's
// feature flags, and hardcoded placeholders (see the TODOs below).
//
// device:   front-end cl::Device this implementation belongs to (forwarded to CLDeviceImpl).
// renderer: Vulkan renderer that is queried for properties/features; also stored in mRenderer.
CLDeviceVk::CLDeviceVk(const cl::Device &device, vk::Renderer *renderer)
: CLDeviceImpl(device), mRenderer(renderer), mSpirvVersion(ClspvGetSpirvVersion(renderer))
{
const VkPhysicalDeviceProperties &props = mRenderer->getPhysicalDeviceProperties();
// Setup initial device mInfo fields
// TODO(aannestrand) Create cl::Caps and use for device creation
// http://anglebug.com/42266954
// String-valued queries.
mInfoString = {
{cl::DeviceInfo::Name, std::string(props.deviceName)},
{cl::DeviceInfo::Vendor, mRenderer->getVendorString()},
{cl::DeviceInfo::DriverVersion, mRenderer->getVersionString(true)},
{cl::DeviceInfo::Version, std::string("OpenCL 3.0 " + mRenderer->getVersionString(true))},
{cl::DeviceInfo::Profile, std::string("FULL_PROFILE")},
{cl::DeviceInfo::OpenCL_C_Version, std::string("OpenCL C 1.2 ")},
{cl::DeviceInfo::LatestConformanceVersionPassed, std::string("FIXME")}};
// size_t-valued queries. MaxWorkGroupSize mirrors the Vulkan compute invocation limit; it is
// read back further down to derive MaxNumSubGroups, and again in selectWorkGroupSize().
mInfoSizeT = {
{cl::DeviceInfo::MaxWorkGroupSize, props.limits.maxComputeWorkGroupInvocations},
{cl::DeviceInfo::MaxGlobalVariableSize, 0},
{cl::DeviceInfo::GlobalVariablePreferredTotalSize, 0},
// TODO(aannestrand) Update these hardcoded platform/device queries
// http://anglebug.com/42266935
{cl::DeviceInfo::MaxParameterSize, 1024},
{cl::DeviceInfo::ProfilingTimerResolution, 1},
{cl::DeviceInfo::PrintfBufferSize, 1024 * 1024},
{cl::DeviceInfo::PreferredWorkGroupSizeMultiple, 16},
};
// Minimum float configs/support required
// Half and double configs stay 0 unless the matching renderer feature is enabled below.
cl_ulong halfFPConfig = 0;
cl_ulong singleFPConfig = CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN | CL_FP_FMA;
cl_ulong doubleFPConfig = 0;
if (mRenderer->getFeatures().supportsShaderFloat16.enabled)
{
halfFPConfig |= CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN;
}
if (mRenderer->getFeatures().supportsShaderFloat64.enabled)
{
doubleFPConfig |= CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM;
}
// cl_ulong-valued queries (memory sizes and capability bitfields).
mInfoULong = {
{cl::DeviceInfo::LocalMemSize, props.limits.maxComputeSharedMemorySize},
{cl::DeviceInfo::SVM_Capabilities, 0},
{cl::DeviceInfo::QueueOnDeviceProperties, 0},
{cl::DeviceInfo::PartitionAffinityDomain, 0},
{cl::DeviceInfo::DeviceEnqueueCapabilities, 0},
{cl::DeviceInfo::QueueOnHostProperties, CL_QUEUE_PROFILING_ENABLE},
// TODO(aannestrand) Update these hardcoded platform/device queries
// http://anglebug.com/42266935
{cl::DeviceInfo::HalfFpConfig, halfFPConfig},
{cl::DeviceInfo::DoubleFpConfig, doubleFPConfig},
{cl::DeviceInfo::GlobalMemCacheSize, 0},
{cl::DeviceInfo::GlobalMemSize, 1024 * 1024 * 1024},
{cl::DeviceInfo::MaxConstantBufferSize, 64 * 1024},
{cl::DeviceInfo::SingleFpConfig, singleFPConfig},
{cl::DeviceInfo::AtomicMemoryCapabilities,
CL_DEVICE_ATOMIC_ORDER_RELAXED | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP |
CL_DEVICE_ATOMIC_ORDER_ACQ_REL | CL_DEVICE_ATOMIC_SCOPE_DEVICE |
CL_DEVICE_ATOMIC_ORDER_SEQ_CST},
// TODO (http://anglebug.com/379669750) Add these based on the Vulkan features query
{cl::DeviceInfo::AtomicFenceCapabilities, CL_DEVICE_ATOMIC_ORDER_RELAXED |
CL_DEVICE_ATOMIC_ORDER_ACQ_REL |
CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP |
// non-mandatory
CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM},
};
// Sub-group count is only reported when the cl_khr_subgroups feature is enabled: it is the
// max work-group size divided by the Vulkan subgroup size, and stays 0 otherwise.
cl_uint maxNumSubGroups = 0u;
if (mRenderer->getFeatures().supportsClKhrSubgroups.enabled)
{
const uint32_t subgroupSize = mRenderer->getPhysicalDeviceSubgroupProperties().subgroupSize;
// Guards the division below against a zero divisor.
ASSERT(subgroupSize > 0);
maxNumSubGroups =
static_cast<uint32_t>(mInfoSizeT[cl::DeviceInfo::MaxWorkGroupSize]) / subgroupSize;
}
// cl_uint-valued queries (also used for cl_bool style values such as CL_TRUE/CL_FALSE).
mInfoUInt = {
{cl::DeviceInfo::VendorID, props.vendorID},
{cl::DeviceInfo::MaxReadImageArgs, cl::IMPLEMENATION_MAX_READ_IMAGES},
{cl::DeviceInfo::MaxWriteImageArgs, cl::IMPLEMENATION_MAX_WRITE_IMAGES},
// NOTE(review): read-write image args reuse the write-image limit - presumably intended.
{cl::DeviceInfo::MaxReadWriteImageArgs, cl::IMPLEMENATION_MAX_WRITE_IMAGES},
{cl::DeviceInfo::GlobalMemCachelineSize,
static_cast<cl_uint>(props.limits.nonCoherentAtomSize)},
{cl::DeviceInfo::Available, CL_TRUE},
{cl::DeviceInfo::LinkerAvailable, CL_TRUE},
{cl::DeviceInfo::CompilerAvailable, CL_TRUE},
{cl::DeviceInfo::MaxOnDeviceQueues, 0},
{cl::DeviceInfo::MaxOnDeviceEvents, 0},
{cl::DeviceInfo::QueueOnDeviceMaxSize, 0},
{cl::DeviceInfo::QueueOnDevicePreferredSize, 0},
{cl::DeviceInfo::MaxPipeArgs, 0},
{cl::DeviceInfo::PipeMaxPacketSize, 0},
{cl::DeviceInfo::PipeSupport, CL_FALSE},
{cl::DeviceInfo::PipeMaxActiveReservations, 0},
{cl::DeviceInfo::ErrorCorrectionSupport, CL_FALSE},
{cl::DeviceInfo::PreferredInteropUserSync, CL_TRUE},
{cl::DeviceInfo::ExecutionCapabilities, CL_EXEC_KERNEL},
// TODO(aannestrand) Update these hardcoded platform/device queries
// http://anglebug.com/42266935
// 64-bit addressing is reported only when buffer device addresses are supported.
{cl::DeviceInfo::AddressBits,
mRenderer->getFeatures().supportsBufferDeviceAddress.enabled ? 64 : 32},
{cl::DeviceInfo::EndianLittle, CL_TRUE},
{cl::DeviceInfo::LocalMemType, CL_LOCAL},
// TODO (http://anglebug.com/379669750) Vulkan reports a big sampler count number, we don't
// need that many and set it to the minimum required for now.
{cl::DeviceInfo::MaxSamplers, 16u},
{cl::DeviceInfo::MaxConstantArgs, 8},
{cl::DeviceInfo::MaxNumSubGroups, maxNumSubGroups},
{cl::DeviceInfo::MaxComputeUnits, 4},
{cl::DeviceInfo::MaxClockFrequency, 555},
{cl::DeviceInfo::MaxWorkItemDimensions, 3},
{cl::DeviceInfo::MinDataTypeAlignSize, 128},
{cl::DeviceInfo::GlobalMemCacheType, CL_NONE},
{cl::DeviceInfo::HostUnifiedMemory, CL_TRUE},
{cl::DeviceInfo::NativeVectorWidthChar, 4},
{cl::DeviceInfo::NativeVectorWidthShort, 2},
{cl::DeviceInfo::NativeVectorWidthInt, 1},
{cl::DeviceInfo::NativeVectorWidthLong, 1},
{cl::DeviceInfo::NativeVectorWidthFloat, 1},
{cl::DeviceInfo::NativeVectorWidthDouble, mRenderer->getNativeVectorWidthDouble()},
{cl::DeviceInfo::NativeVectorWidthHalf, mRenderer->getNativeVectorWidthHalf()},
{cl::DeviceInfo::PartitionMaxSubDevices, 0},
{cl::DeviceInfo::PreferredVectorWidthChar, 4},
{cl::DeviceInfo::PreferredVectorWidthShort, 8},
{cl::DeviceInfo::PreferredVectorWidthInt, 1},
{cl::DeviceInfo::PreferredVectorWidthLong, 1},
{cl::DeviceInfo::PreferredVectorWidthFloat, 1},
{cl::DeviceInfo::PreferredVectorWidthDouble, mRenderer->getPreferredVectorWidthDouble()},
{cl::DeviceInfo::PreferredVectorWidthHalf, mRenderer->getPreferredVectorWidthHalf()},
{cl::DeviceInfo::PreferredLocalAtomicAlignment, 0},
{cl::DeviceInfo::PreferredGlobalAtomicAlignment, 0},
{cl::DeviceInfo::PreferredPlatformAtomicAlignment, 0},
{cl::DeviceInfo::NonUniformWorkGroupSupport, CL_TRUE},
{cl::DeviceInfo::GenericAddressSpaceSupport, CL_FALSE},
// Reported as supported whenever a non-zero sub-group count was computed above.
{cl::DeviceInfo::SubGroupIndependentForwardProgress,
maxNumSubGroups > 0 ? CL_TRUE : CL_FALSE},
{cl::DeviceInfo::WorkGroupCollectiveFunctionsSupport, CL_FALSE},
};
}
// Defaulted destructor: this class performs no explicit teardown of its own here.
CLDeviceVk::~CLDeviceVk() = default;
// Builds the CLDeviceImpl::Info record describing this device.
//
// The record combines Vulkan physical-device limits (work-item sizes, image dimensions),
// hardcoded placeholders (see the TODO below), the versioned extension list and the list of
// supported OpenCL C feature macros. Extensions and features are added according to the
// renderer's feature flags.
//
// type: the CL device type the returned Info is constructed with.
// Returns the fully populated Info by value.
CLDeviceImpl::Info CLDeviceVk::createInfo(cl::DeviceType type) const
{
    Info info(type);

    const VkPhysicalDeviceProperties &properties = mRenderer->getPhysicalDeviceProperties();

    info.maxWorkItemSizes.push_back(properties.limits.maxComputeWorkGroupSize[0]);
    info.maxWorkItemSizes.push_back(properties.limits.maxComputeWorkGroupSize[1]);
    info.maxWorkItemSizes.push_back(properties.limits.maxComputeWorkGroupSize[2]);

    // TODO(aannestrand) Update these hardcoded platform/device queries
    // http://anglebug.com/42266935
    info.maxMemAllocSize  = 1 << 30;
    info.memBaseAddrAlign = 1024;

    info.imageSupport = CL_TRUE;

    info.image2D_MaxWidth  = properties.limits.maxImageDimension2D;
    info.image2D_MaxHeight = properties.limits.maxImageDimension2D;
    info.image3D_MaxWidth  = properties.limits.maxImageDimension3D;
    info.image3D_MaxHeight = properties.limits.maxImageDimension3D;
    info.image3D_MaxDepth  = properties.limits.maxImageDimension3D;
    // Max number of pixels for a 1D image created from a buffer object.
    info.imageMaxBufferSize = properties.limits.maxTexelBufferElements;
    info.imageMaxArraySize  = properties.limits.maxImageArrayLayers;
    // The following are queried when image2d is created from buffer. We mimic its support for now
    // by doing a copy and as such don't have alignment requirements.
    info.imagePitchAlignment       = 1u;
    info.imageBaseAddressAlignment = 1u;

    info.execCapabilities     = CL_EXEC_KERNEL;
    info.queueOnDeviceMaxSize = 0u;
    info.builtInKernels       = "";
    info.version              = CL_MAKE_VERSION(3, 0, 0);
    info.versionStr           = "OpenCL 3.0 " + mRenderer->getVersionString(true);
    info.OpenCL_C_AllVersions = {{CL_MAKE_VERSION(1, 0, 0), "OpenCL C"},
                                 {CL_MAKE_VERSION(1, 1, 0), "OpenCL C"},
                                 {CL_MAKE_VERSION(1, 2, 0), "OpenCL C"},
                                 {CL_MAKE_VERSION(3, 0, 0), "OpenCL C"}};

    info.OpenCL_C_Features         = {};
    info.ILsWithVersion            = {};
    info.builtInKernelsWithVersion = {};
    info.partitionProperties       = {};
    info.partitionType             = {};
    info.IL_Version                = "";

    // Below extensions are required as of OpenCL 1.1, add their versioned strings
    NameVersionVector versionedExtensionList = {
        cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                        .name    = "cl_khr_byte_addressable_store"},
        cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                        .name    = "cl_khr_global_int32_base_atomics"},
        cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                        .name    = "cl_khr_global_int32_extended_atomics"},
        cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                        .name    = "cl_khr_local_int32_base_atomics"},
        cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                        .name    = "cl_khr_local_int32_extended_atomics"},
    };

    CLExtensions::ExternalMemoryHandleBitset supportedHandles;
    supportedHandles.set(cl::ExternalMemoryHandle::OpaqueFd, supportsExternalMemoryFd());
    supportedHandles.set(cl::ExternalMemoryHandle::DmaBuf, supportsExternalMemoryDmaBuf());

    // Populate other extensions based on feature support
    if (info.populateSupportedExternalMemoryHandleTypes(supportedHandles))
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_external_memory"});

        // cl_arm_import_memory_dma_buf is layered on top of cl_arm_import_memory, so the base
        // extension string is reported whenever a layered one is.
        bool reportBaseArmImportMemString = false;
        if (supportedHandles.test(cl::ExternalMemoryHandle::DmaBuf))
        {
            versionedExtensionList.push_back(cl_name_version{
                .version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_arm_import_memory_dma_buf"});
            reportBaseArmImportMemString = true;
        }
        if (reportBaseArmImportMemString)
        {
            // NOTE(review): 1.11.0 differs from the 1.0.0 used for every other extension here -
            // presumably it tracks the Arm extension revision; confirm.
            versionedExtensionList.push_back(cl_name_version{.version = CL_MAKE_VERSION(1, 11, 0),
                                                             .name    = "cl_arm_import_memory"});
        }
    }

    if (mRenderer->getFeatures().supportsShaderFloat16.enabled)
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_fp16"});
    }

    if (mRenderer->getFeatures().supportsShaderFloat64.enabled)
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_fp64"});
    }

    if (info.imageSupport && info.image3D_MaxDepth > 1)
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_3d_image_writes"});
    }

    if (mRenderer->getQueueFamilyProperties().queueCount > 1)
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_priority_hints"});
    }

    // The dot-product capability/acceleration fields are filled unconditionally; only the
    // extension string depends on the feature flag.
    info.integerDotProductCapabilities = getIntegerDotProductCapabilities();
    info.integerDotProductAccelerationProperties8Bit =
        getIntegerDotProductAccelerationProperties8Bit();
    info.integerDotProductAccelerationProperties4x8BitPacked =
        getIntegerDotProductAccelerationProperties4x8BitPacked();
    if (mRenderer->getFeatures().supportsShaderIntegerDotProduct.enabled)
    {
        versionedExtensionList.push_back(cl_name_version{.version = CL_MAKE_VERSION(2, 0, 0),
                                                         .name = "cl_khr_integer_dot_product"});
    }

    // cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
    if (mRenderer->getFeatures().supportsShaderAtomicInt64.enabled)
    {
        versionedExtensionList.push_back(cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                                                         .name    = "cl_khr_int64_base_atomics"});
        versionedExtensionList.push_back(cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                                                         .name = "cl_khr_int64_extended_atomics"});
    }

    // cl_khr_depth_images (the call also records the supported depth channel types in info)
    if (setupAndReportDepthImageSupport(info))
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_depth_images"});
    }

    // cl_khr_image2d_from_buffer
    if (info.version >= CL_MAKE_VERSION(3, 0, 0) && info.imageSupport)
    {
        versionedExtensionList.push_back(cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0),
                                                         .name    = "cl_khr_image2d_from_buffer"});
    }

    // cl_khr_subgroups
    if (mRenderer->getFeatures().supportsClKhrSubgroups.enabled)
    {
        versionedExtensionList.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(1, 0, 0), .name = "cl_khr_subgroups"});
    }

    info.initializeVersionedExtensions(std::move(versionedExtensionList));

    if (!mRenderer->getFeatures().supportsUniformBufferStandardLayout.enabled)
    {
        ERR() << "VK_KHR_uniform_buffer_standard_layout extension support is needed to properly "
                 "support uniform buffers. Otherwise, you must disable OpenCL.";
    }

    // Populate supported features. Each entry carries the OpenCL C version that introduced the
    // feature macro; the feature macros below are all OpenCL C 3.0 additions, hence 3.0.0.
    if (info.imageSupport)
    {
        info.OpenCL_C_Features.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0), .name = "__opencl_c_images"});
        info.OpenCL_C_Features.push_back(cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                                                         .name    = "__opencl_c_3d_image_writes"});
        info.OpenCL_C_Features.push_back(cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                                                         .name = "__opencl_c_read_write_images"});
    }

    if (mRenderer->getEnabledFeatures().features.shaderInt64)
    {
        info.OpenCL_C_Features.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0), .name = "__opencl_c_int64"});
    }

    if (mRenderer->getFeatures().supportsShaderIntegerDotProduct.enabled)
    {
        info.OpenCL_C_Features.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                            .name    = "__opencl_c_integer_dot_product_input_4x8bit"});
        info.OpenCL_C_Features.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                            .name    = "__opencl_c_integer_dot_product_input_4x8bit_packed"});
    }

    info.OpenCL_C_Features.push_back(cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                                                     .name = "__opencl_c_atomic_order_acq_rel"});
    info.OpenCL_C_Features.push_back(cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                                                     .name = "__opencl_c_atomic_order_seq_cst"});
    info.OpenCL_C_Features.push_back(cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0),
                                                     .name = "__opencl_c_atomic_scope_device"});

    if (mRenderer->getFeatures().supportsClKhrSubgroups.enabled)
    {
        // Reported as 3.0.0 like the other feature macros (previously 1.0.0, which is the
        // extension version, not the OpenCL C version that introduced the macro).
        info.OpenCL_C_Features.push_back(
            cl_name_version{.version = CL_MAKE_VERSION(3, 0, 0), .name = "__opencl_c_subgroups"});
    }

    return info;
}
// Fetches a cl_uint device property from the table filled in by the constructor.
//
// name:  property to look up.
// value: receives the stored value on success; untouched on failure.
// Returns Continue on success; raises CL_INVALID_VALUE when the property is not in the table.
angle::Result CLDeviceVk::getInfoUInt(cl::DeviceInfo name, cl_uint *value) const
{
    const auto entry = mInfoUInt.find(name);
    if (entry == mInfoUInt.end())
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    *value = entry->second;
    return angle::Result::Continue;
}
// Fetches a cl_ulong device property from the table filled in by the constructor.
//
// name:  property to look up.
// value: receives the stored value on success; untouched on failure.
// Returns Continue on success; raises CL_INVALID_VALUE when the property is not in the table.
angle::Result CLDeviceVk::getInfoULong(cl::DeviceInfo name, cl_ulong *value) const
{
    const auto entry = mInfoULong.find(name);
    if (entry == mInfoULong.end())
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    *value = entry->second;
    return angle::Result::Continue;
}
// Fetches a size_t device property from the table filled in by the constructor.
//
// name:  property to look up.
// value: receives the stored value on success; untouched on failure.
// Returns Continue on success; raises CL_INVALID_VALUE when the property is not in the table.
angle::Result CLDeviceVk::getInfoSizeT(cl::DeviceInfo name, size_t *value) const
{
    const auto entry = mInfoSizeT.find(name);
    if (entry == mInfoSizeT.end())
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    *value = entry->second;
    return angle::Result::Continue;
}
// Reports how many bytes are needed to hold a string device property.
//
// name:  property to look up.
// value: receives the string length plus one (room for the terminating NUL) on success.
// Returns Continue on success; raises CL_INVALID_VALUE when the property is not in the table.
angle::Result CLDeviceVk::getInfoStringLength(cl::DeviceInfo name, size_t *value) const
{
    const auto entry = mInfoString.find(name);
    if (entry == mInfoString.end())
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    *value = entry->second.length() + 1;
    return angle::Result::Continue;
}
// Copies a string device property, including its terminating NUL, into a caller buffer.
//
// name:  property to look up.
// size:  capacity of `value` in bytes.
// value: destination buffer; written only on success.
// Returns Continue on success. Raises CL_INVALID_VALUE when the property is not in the table,
// when `value` is null, or when `size` is smaller than the length reported by
// getInfoStringLength() - previously `size` was ignored and a short buffer was overrun.
angle::Result CLDeviceVk::getInfoString(cl::DeviceInfo name, size_t size, char *value) const
{
    const auto entry = mInfoString.find(name);
    if (entry == mInfoString.end())
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    const std::string &text = entry->second;

    // The copy below writes length() + 1 bytes; never write past what the caller provided.
    if (value == nullptr || size < text.length() + 1)
    {
        ANGLE_CL_RETURN_ERROR(CL_INVALID_VALUE);
    }

    std::strcpy(value, text.c_str());
    return angle::Result::Continue;
}
// True when the renderer has the supportsExternalMemoryFd feature enabled.
bool CLDeviceVk::supportsExternalMemoryFd() const
{
    const auto &features = mRenderer->getFeatures();
    return features.supportsExternalMemoryFd.enabled;
}
// True when the renderer has the supportsExternalMemoryDmaBuf feature enabled.
bool CLDeviceVk::supportsExternalMemoryDmaBuf() const
{
    const auto &features = mRenderer->getFeatures();
    return features.supportsExternalMemoryDmaBuf.enabled;
}
// Sub-device partitioning is not implemented for this back end.
//
// None of the parameters are read: the call flags UNIMPLEMENTED() and then reports
// CL_OUT_OF_RESOURCES through ANGLE_CL_RETURN_ERROR. (The constructor advertises
// PartitionMaxSubDevices as 0.)
angle::Result CLDeviceVk::createSubDevices(const cl_device_partition_property *properties,
cl_uint numDevices,
CreateFuncs &subDevices,
cl_uint *numDevicesRet)
{
UNIMPLEMENTED();
ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}
// Picks a local work-group size for the given ND-range.
//
// The total size is capped at 64 work-items (or the device's MaxWorkGroupSize if smaller),
// but raised to the subgroup size when the cl_khr_subgroups feature is enabled so that at
// least one full subgroup fits. The actual shape is then produced by either the uniform-fit
// or the simple power-of-two generator, depending on the clBestUniformFitWGS feature.
cl::WorkgroupSize CLDeviceVk::selectWorkGroupSize(const cl::NDRange &ndrange) const
{
    const uint32_t deviceLimit =
        static_cast<uint32_t>(mInfoSizeT.at(cl::DeviceInfo::MaxWorkGroupSize));
    const uint32_t cappedLimit = std::min(64u, deviceLimit);

    // Renderer SIMD width; stays 0 (i.e. no influence) when subgroups are not exposed.
    uint32_t simdWidth = 0;
    if (mRenderer->getFeatures().supportsClKhrSubgroups.enabled)
    {
        simdWidth = mRenderer->getPhysicalDeviceSubgroupProperties().subgroupSize;
    }

    // Allow at least one full subgroup.
    const uint32_t sizeBudget = std::max(simdWidth, cappedLimit);

    if (getRenderer()->getFeatures().clBestUniformFitWGS.enabled)
    {
        return CalculateUniformFitWGS(ndrange, sizeBudget);
    }
    return CalculateSimplePow2WGS(ndrange, sizeBudget);
}
// Simple power-of-two work-group size generator.
//
// Starts from the always-valid WGS {1, 1, 1} and keeps doubling each used dimension in turn
// for as long as the doubled dimension still fits inside the global work size of that
// dimension and the total number of work-items stays within maxSize.
//
// ndrange: supplies workDimensions and globalWorkSize.
// maxSize: upper bound on the product of the three returned dimensions.
cl::WorkgroupSize CLDeviceVk::CalculateSimplePow2WGS(const cl::NDRange &ndrange,
                                                     const uint32_t maxSize)
{
    cl::WorkgroupSize wgs = {1, 1, 1};

    // Repeat full sweeps over the dimensions until a sweep grows nothing.
    for (bool grewThisSweep = true; grewThisSweep;)
    {
        grewThisSweep = false;
        for (uint32_t axis = 0; axis < ndrange.workDimensions; ++axis)
        {
            cl::WorkgroupSize candidate = wgs;
            candidate[axis] *= 2;

            const uint32_t candidateThreads = candidate[0] * candidate[1] * candidate[2];
            const bool fitsGlobalSize       = candidate[axis] <= ndrange.globalWorkSize[axis];
            if (fitsGlobalSize && candidateThreads <= maxSize)
            {
                wgs           = candidate;
                grewThisSweep = true;
            }
        }
    }

    return wgs;
}
// Uniform-fit work-group size generator.
//
// Prioritizes a WGS that divides the GWS evenly. Non-uniform dispatches can be costly because
// they are "chunked" into uniform regions, and every new/unique WGS leads to the creation of a
// new compute pipeline (the WGS is treated as a VK spec-constant).
//
// ndrange: supplies workDimensions and globalWorkSize.
// maxSize: upper bound on the product of the three returned dimensions.
//
// NOTE(review): the shrinking pass only runs while the product exceeds maxSize, so a dimension
// that was merely clamped to maxSize at the start is returned as-is even if it does not divide
// its GWS (e.g. GWS 100 with maxSize 64 yields 64) - confirm whether that is intended.
cl::WorkgroupSize CLDeviceVk::CalculateUniformFitWGS(const cl::NDRange &ndrange,
                                                     const uint32_t maxSize)
{
    const auto &gws = ndrange.globalWorkSize;

    // Start from the GWS itself, clamped per dimension to the size budget.
    cl::WorkgroupSize wgs = {std::min(gws[0], maxSize), std::min(gws[1], maxSize),
                             std::min(gws[2], maxSize)};

    const auto totalThreads = [&wgs]() -> uint32_t { return wgs[2] * wgs[1] * wgs[0]; };

    // 1st-pass: while over budget, shrink the currently largest dimension down to the next
    // smaller value that evenly divides its GWS (or to 1 when there is none).
    while (totalThreads() > maxSize)
    {
        // Ties keep the lowest dimension index.
        uint32_t widestAxis = 0;
        for (uint32_t axis = 1; axis < ndrange.workDimensions; ++axis)
        {
            if (wgs[axis] > wgs[widestAxis])
            {
                widestAxis = axis;
            }
        }

        if (wgs[widestAxis] > 1)
        {
            // Always back off by at least one, then keep going until it divides evenly.
            do
            {
                --wgs[widestAxis];
            } while (wgs[widestAxis] > 1 && (gws[widestAxis] % wgs[widestAxis]) != 0);
        }
    }

    // Anything other than the degenerate {1, 1, 1} result is accepted as-is.
    if (!(wgs == cl::WorkgroupSize{1, 1, 1}))
    {
        return wgs;
    }

    // 2nd-pass: nothing worked so far, try pinning the dimension with the most threads and
    // ignore the others.
    uint32_t busiestAxis   = 0;
    uint32_t busiestExtent = 1;
    for (uint32_t axis = 0; axis < ndrange.workDimensions; ++axis)
    {
        if (gws[axis] > busiestExtent)
        {
            busiestExtent = gws[axis];
            busiestAxis   = axis;
        }
    }

    if (busiestExtent > maxSize)
    {
        // An even fit is not feasible within the budget - fall back to the simpler pow2
        // generator.
        WARN() << "could not perform even-fit for WGS, falling back to simple pow2 WGS";
        return CalculateSimplePow2WGS(ndrange, maxSize);
    }

    wgs[busiestAxis] = busiestExtent;
    return wgs;
}
// Returns the cl_khr_integer_dot_product capability bits for this device: both the 4x8bit and
// the 4x8bit-packed input bits when the renderer's supportsShaderIntegerDotProduct feature is
// enabled, and no bits otherwise.
cl_device_integer_dot_product_capabilities_khr CLDeviceVk::getIntegerDotProductCapabilities() const
{
    cl_device_integer_dot_product_capabilities_khr caps = {};

    if (!mRenderer->getFeatures().supportsShaderIntegerDotProduct.enabled)
    {
        return caps;
    }

    // If the VK extension is supported, then all the caps mentioned in the CL spec are
    // supported by default.
    caps = (CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR |
            CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR);
    return caps;
}
// Translates the renderer's Vulkan 8-bit integer-dot-product acceleration flags into the
// corresponding CL acceleration-properties structure, field by field.
cl_device_integer_dot_product_acceleration_properties_khr
CLDeviceVk::getIntegerDotProductAccelerationProperties8Bit() const
{
    const VkPhysicalDeviceShaderIntegerDotProductProperties &vkProps =
        mRenderer->getPhysicalDeviceShaderIntegerDotProductProperties();

    cl_device_integer_dot_product_acceleration_properties_khr accel = {};

    // Plain dot products.
    accel.signed_accelerated           = vkProps.integerDotProduct8BitSignedAccelerated;
    accel.unsigned_accelerated         = vkProps.integerDotProduct8BitUnsignedAccelerated;
    accel.mixed_signedness_accelerated = vkProps.integerDotProduct8BitMixedSignednessAccelerated;

    // Accumulating, saturating dot products.
    accel.accumulating_saturating_signed_accelerated =
        vkProps.integerDotProductAccumulatingSaturating8BitSignedAccelerated;
    accel.accumulating_saturating_unsigned_accelerated =
        vkProps.integerDotProductAccumulatingSaturating8BitUnsignedAccelerated;
    accel.accumulating_saturating_mixed_signedness_accelerated =
        vkProps.integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated;

    return accel;
}
// Translates the renderer's Vulkan packed 4x8-bit integer-dot-product acceleration flags into
// the corresponding CL acceleration-properties structure, field by field.
cl_device_integer_dot_product_acceleration_properties_khr
CLDeviceVk::getIntegerDotProductAccelerationProperties4x8BitPacked() const
{
    const VkPhysicalDeviceShaderIntegerDotProductProperties &vkProps =
        mRenderer->getPhysicalDeviceShaderIntegerDotProductProperties();

    cl_device_integer_dot_product_acceleration_properties_khr accel = {};

    // Plain dot products.
    accel.signed_accelerated   = vkProps.integerDotProduct4x8BitPackedSignedAccelerated;
    accel.unsigned_accelerated = vkProps.integerDotProduct4x8BitPackedUnsignedAccelerated;
    accel.mixed_signedness_accelerated =
        vkProps.integerDotProduct4x8BitPackedMixedSignednessAccelerated;

    // Accumulating, saturating dot products.
    accel.accumulating_saturating_signed_accelerated =
        vkProps.integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated;
    accel.accumulating_saturating_unsigned_accelerated =
        vkProps.integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated;
    accel.accumulating_saturating_mixed_signedness_accelerated =
        vkProps.integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated;

    return accel;
}
// Records which depth image channel types the device can back and reports whether that is
// enough to advertise cl_khr_depth_images.
//
// info: its supportedDepthOrderTypes set gains every channel type whose depth format maps to a
//       known angle format that has both the sampled-image and depth-stencil-attachment
//       feature bits. Left untouched on Nvidia, where support is disabled outright.
// Returns true only when both CL_FLOAT and CL_UNORM_INT16 ended up in that set.
bool CLDeviceVk::setupAndReportDepthImageSupport(Info &info) const
{
    const auto vendorID = getRenderer()->getPhysicalDeviceProperties().vendorID;
    if (IsNvidia(vendorID))
    {
        // TODO(aannestrand) CTS validation issue with (cl_copy_images.2D use_pitches) on nvidia
        // platform, disable its cl_khr_depth_images support for now
        // http://anglebug.com/472472687
        return false;
    }

    constexpr VkFlags kRequiredFormatFeatures =
        VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT;

    // for reporting the extension string, we only need CL_FLOAT and CL_UNORM_INT16
    // https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#minimum-list-of-supported-image-formats
    CLExtensions::SupportedDepthOrderTypes requiredTypes;
    requiredTypes.set(cl::ImageChannelType::Float);
    requiredTypes.set(cl::ImageChannelType::UnormInt16);

    for (const cl::ImageChannelType channelType : angle::AllEnums<cl::ImageChannelType>())
    {
        const angle::FormatID depthFormat =
            angle::Format::CLDEPTHFormatToID(cl::ToCLenum(channelType));
        if (depthFormat == angle::FormatID::NONE)
        {
            continue;
        }
        if (!mRenderer->hasImageFormatFeatureBits(depthFormat, kRequiredFormatFeatures))
        {
            continue;
        }
        info.supportedDepthOrderTypes.set(channelType);
    }

    // Support is reported only when every required type made it into the supported set.
    const auto requiredAndSupported = info.supportedDepthOrderTypes & requiredTypes;
    return requiredAndSupported == requiredTypes;
}
} // namespace rx