// ExecutionTest.cpp
// Copyright (C) Microsoft Corporation. All rights reserved. //
// This file is distributed under the University of Illinois Open Source //
// License. See LICENSE.TXT for details. //
// //
// These tests run by executing compiled programs, and thus involve more //
// moving parts, like the runtime and drivers. //
// //
// We need to keep & fix these warnings to integrate smoothly with HLK
#pragma warning(error: 4100 4146 4242 4244 4267 4701 4389)
#include <algorithm>
#include <memory>
#include <array>
#include <vector>
#include <string>
#include <map>
#include <unordered_set>
#include <strstream>
#include <iomanip>
#include "dxc/Test/CompilationResult.h"
#include "dxc/Test/HLSLTestData.h"
#include <Shlwapi.h>
#include <atlcoll.h>
#include <locale>
#include <algorithm>
#include <bitset>
#undef _read
#include "WexTestClass.h"
#include "dxc/Test/HlslTestUtils.h"
#include "dxc/Test/DxcTestUtils.h"
#include "dxc/Support/Global.h"
#include "dxc/Support/WinIncludes.h"
#include "dxc/Support/FileIOHelper.h"
#include "dxc/Support/Unicode.h"
// d3d12.h and dxgi1_4.h are included in the Windows 10 SDK
#include <d3d12.h>
#include <dxgi1_4.h>
#include <DXGIDebug.h>
#include "dxc/Support/d3dx12.h"
#include <DirectXMath.h>
#include <strsafe.h>
#include <d3dcompiler.h>
#include <wincodec.h>
#include "ShaderOpTest.h"
#include <libloaderapi.h>
#pragma comment(lib, "d3dcompiler.lib")
#pragma comment(lib, "windowscodecs.lib")
#pragma comment(lib, "dxguid.lib")
#pragma comment(lib, "version.lib")
// A more recent Windows SDK than currently required is needed for these.
// Function-pointer type for an entry point that takes a feature count, an
// array of feature IIDs, and optional per-feature configuration structs with
// their sizes, and returns an HRESULT.
// NOTE(review): presumably matches the D3D12EnableExperimentalFeatures export
// of d3d12.dll, resolved at runtime - confirm at the call site.
typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
UINT NumFeatures,
__in_ecount(NumFeatures) const IID* pIIDs,
__in_ecount_opt(NumFeatures) void* pConfigurationStructs,
__in_ecount_opt(NumFeatures) UINT* pConfigurationStructSizes);
// GUID 76f5573e-f13a-40f5-b297-81ce9e18933f, spelled out field by field.
// NOTE(review): judging by its name this identifies the experimental shader
// models feature passed to D3D12EnableExperimentalFeaturesFn - confirm at the
// point of use.
static const GUID D3D12ExperimentalShaderModelsID = { /* 76f5573e-f13a-40f5-b297-81ce9e18933f */
    0x76f5573e,                                        // Data1
    0xf13a,                                            // Data2
    0x40f5,                                            // Data3
    { 0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f } // Data4
};
// Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically.
// Function-pointer type taking a class ID, an interface ID and an out pointer
// that receives the requested interface; returns an HRESULT.
typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID rclsid, REFIID riid, void **ppvDebug);
// Fallback declaration of ID3D12SDKConfiguration and its IID/CLSID. It is only
// compiled when the guard macro is not already defined, and the guard is then
// defined here.
// NOTE(review): the interface declaration below looks truncated in this
// excerpt (no braces or method name are visible) - compare against the
// Agility SDK d3d12.h before relying on it.
#ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__
// Copied from AgilitySDK D3D12.h to programmatically enable when in developer mode.
#define __ID3D12SDKConfiguration_INTERFACE_DEFINED__
EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = {0xe9eb5314,0x33aa,0x42b2, {0xa7,0x18,0xd7,0x7f,0x58,0xb1,0xf1,0xc7}};
EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = {0x7cda6aca, 0xa03e, 0x49c8, {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}};
ID3D12SDKConfiguration : public IUnknown
UINT SDKVersion,
_In_z_ LPCSTR SDKPath) = 0;
#endif /* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */
using namespace DirectX;
using namespace hlsl_test;
// Returns true when `val` compares equal to at least one element of `s`.
// `s` may be anything usable with std::cbegin/std::cend (standard containers,
// built-in arrays). It is taken by const reference so the lookup does not copy
// the container and built-in arrays are not decayed to pointers.
template <typename TSequence, typename T>
static bool contains(const TSequence &s, const T &val) {
  return std::cend(s) != std::find(std::cbegin(s), std::cend(s), val);
}
// Iterator-range overload: returns true when std::find locates an element
// equal to `val` in [b, e).
template <typename InputIterator, typename T>
static bool contains(InputIterator b, InputIterator e, const T &val) {
return e != std::find(b, e, val);
// Acquires the DXGI debug interface (IDXGIDebug1) and returns S_OK.
// IFR presumably returns the failing HRESULT early - confirm against its
// definition.
// NOTE(review): no call on pDebug is visible in this excerpt, so as shown the
// function only obtains the interface.
static HRESULT ReportLiveObjects() {
CComPtr<IDXGIDebug1> pDebug;
IFR(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&pDebug)));
return S_OK;
// Writes the description of each message stored in pInfoQueue through
// pOutputStrFn (called with pStrCtx), each followed by "\r\n". When any
// retrieval step failed, a final "Failed to retrieve some messages." line is
// written as well.
// NOTE(review): the failure branches show no `continue` or closing braces in
// this excerpt, so the control flow after a failure cannot be confirmed here.
static void WriteInfoQueueMessages(void *pStrCtx, st::OutputStringFn pOutputStrFn, ID3D12InfoQueue *pInfoQueue) {
bool allMessagesOK = true;
UINT64 count = pInfoQueue->GetNumStoredMessages();
// Scratch buffer shared by all iterations; it only ever grows.
CAtlArray<BYTE> message;
for (UINT64 i = 0; i < count; ++i) {
// 'GetMessageA' rather than 'GetMessage' is an artifact of user32 headers.
// First call passes a null buffer to obtain the required size in msgLen.
SIZE_T msgLen = 0;
if (FAILED(pInfoQueue->GetMessageA(i, nullptr, &msgLen))) {
allMessagesOK = false;
if (message.GetCount() < msgLen) {
if (!message.SetCount(msgLen)) {
allMessagesOK = false;
// Second call fills the buffer with the message itself.
D3D12_MESSAGE *pMessage = (D3D12_MESSAGE *)message.GetData();
if (FAILED(pInfoQueue->GetMessageA(i, pMessage, &msgLen))) {
allMessagesOK = false;
// Convert the narrow description (CP_ACP) to wide characters for the callback.
CA2W msgW(pMessage->pDescription, CP_ACP);
pOutputStrFn(pStrCtx, msgW.m_psz);
pOutputStrFn(pStrCtx, L"\r\n");
if (!allMessagesOK) {
pOutputStrFn(pStrCtx, L"Failed to retrieve some messages.\r\n");
// Scoped COM initialization for the calling thread.
// Init() calls CoInitializeEx(COINIT_MULTITHREADED); Dispose(), which the
// destructor also runs, calls CoUninitialize only after a successful Init(),
// so the two calls stay balanced.
class CComContext {
private:
  // True while a successful CoInitializeEx still awaits its CoUninitialize.
  bool m_init;

public:
  CComContext() : m_init(false) {}
  ~CComContext() { Dispose(); }

  // Non-copyable: a copy would issue a second CoUninitialize for a single
  // CoInitializeEx.
  CComContext(const CComContext &) = delete;
  CComContext &operator=(const CComContext &) = delete;

  // Undoes a prior successful Init(); does nothing otherwise.
  void Dispose() {
    if (!m_init)
      return;
    m_init = false;
    CoUninitialize();
  }

  // Initializes COM for the multithreaded apartment and returns the
  // CoInitializeEx result. Any success code arms Dispose().
  HRESULT Init() {
    HRESULT hr = CoInitializeEx(0, COINIT_MULTITHREADED);
    if (SUCCEEDED(hr)) {
      m_init = true;
    }
    return hr;
  }
};
// Encodes an m_width x m_height pixel buffer as a BMP through WIC into an
// in-memory stream and writes that stream's contents to pFileName.
// `format` selects the WIC pixel format and per-pixel byte size from the
// local Vals[] table; an unmapped format fails the VERIFY_ARE_NOT_EQUAL check.
// NOTE(review): PF::Format and the Vals[] entries are not visible in this
// excerpt, and no call to ctx.Init() is visible either.
static void SavePixelsToFile(LPCVOID pPixels, DXGI_FORMAT format, UINT32 m_width, UINT32 m_height, LPCWSTR pFileName) {
CComContext ctx;
CComPtr<IWICImagingFactory> pFactory;
CComPtr<IWICBitmap> pBitmap;
CComPtr<IWICBitmapEncoder> pEncoder;
CComPtr<IWICBitmapFrameEncode> pFrameEncode;
CComPtr<hlsl::AbstractMemoryStream> pStream;
CComPtr<IMalloc> pMalloc;
// Table entry mapping a DXGI format to a WIC pixel format GUID and pixel size;
// operator== lets std::find search the table by DXGI_FORMAT.
struct PF {
GUID PixelFormat;
UINT32 PixelSize;
bool operator==(DXGI_FORMAT F) const {
return F == Format;
} Vals[] = {
// Add more pixel format mappings as needed.
PF *pFormat = std::find(Vals, Vals + _countof(Vals), format);
VERIFY_SUCCEEDED(CoCreateInstance(CLSID_WICImagingFactory, NULL, CLSCTX_INPROC_SERVER, IID_IWICImagingFactory, (LPVOID*)&pFactory));
VERIFY_SUCCEEDED(CoGetMalloc(1, &pMalloc));
VERIFY_SUCCEEDED(hlsl::CreateMemoryStream(pMalloc, &pStream));
VERIFY_ARE_NOT_EQUAL(pFormat, Vals + _countof(Vals));
// Stride is width * pixel size; total buffer size is stride * height.
VERIFY_SUCCEEDED(pFactory->CreateBitmapFromMemory(m_width, m_height, pFormat->PixelFormat, m_width * pFormat->PixelSize, m_width * m_height * pFormat->PixelSize, (BYTE *)pPixels, &pBitmap));
VERIFY_SUCCEEDED(pFactory->CreateEncoder(GUID_ContainerFormatBmp, nullptr, &pEncoder));
VERIFY_SUCCEEDED(pEncoder->Initialize(pStream, WICBitmapEncoderNoCache));
VERIFY_SUCCEEDED(pEncoder->CreateNewFrame(&pFrameEncode, nullptr));
VERIFY_SUCCEEDED(pFrameEncode->WriteSource(pBitmap, nullptr));
hlsl::WriteBinaryFile(pFileName, pStream->GetPtr(), pStream->GetPtrSize());
// Checks if the given warp version supports the given operation.
bool IsValidWarpDllVersion(unsigned int minBuildNumber) {
HMODULE pLibrary = LoadLibrary("D3D10Warp.dll");
if (pLibrary) {
char path[MAX_PATH];
DWORD length = GetModuleFileName(pLibrary, path, MAX_PATH);
if (length) {
DWORD dwVerHnd = 0;
DWORD dwVersionInfoSize = GetFileVersionInfoSize(path, &dwVerHnd);
std::unique_ptr<int[]> VffInfo(new int[dwVersionInfoSize]);
if (GetFileVersionInfo(path, NULL, dwVersionInfoSize, VffInfo.get())) {
LPVOID versionInfo;
UINT size;
if (VerQueryValue(VffInfo.get(), "\\", &versionInfo, &size)) {
if (size) {
unsigned int warpBuildNumber = verInfo->dwFileVersionLS >> 16 & 0xffff;
if (verInfo->dwSignature == 0xFEEF04BD && warpBuildNumber >= minBuildNumber) {
return true;
return false;
// Local definitions for the D3D12_OPTIONS3 feature query: the feature
// enumerant (21), the NTDDI_WIN10_RS3 version constant, and the result struct.
// NOTE(review): presumably provided for SDKs whose headers predate these
// names; any surrounding version guards and the struct's braces are not
// visible in this excerpt.
#define D3D12_FEATURE_D3D12_OPTIONS3 ((D3D12_FEATURE)21)
#define NTDDI_WIN10_RS3 0x0A000004 /* ABRACADABRA_WIN10_RS2 */
typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS3
_Out_ BOOL CopyQueueTimestampQueriesSupported;
_Out_ BOOL CastingFullyTypedFormatSupported;
_Out_ DWORD WriteBufferImmediateSupportFlags;
_Out_ D3D12_VIEW_INSTANCING_TIER ViewInstancingTier;
_Out_ BOOL BarycentricsSupported;
// Local definitions for the D3D12_OPTIONS4 feature query: the feature
// enumerant (23) and the result struct, which includes the
// Native16BitShaderOpsSupported flag.
// NOTE(review): the struct's braces and closing typedef name are not visible
// in this excerpt.
#define D3D12_FEATURE_D3D12_OPTIONS4 ((D3D12_FEATURE)23)
typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS4
_Out_ BOOL ReservedBufferPlacementSupported;
_Out_ D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER SharedResourceCompatibilityTier;
_Out_ BOOL Native16BitShaderOpsSupported;
// Virtual class to compute the expected result given a set of inputs
struct TableParameter;
// Test class for the shader execution tests.
// NOTE(review): only TEST_*_PROPERTY lines are visible below; the TEST_METHOD
// declarations they belong to do not appear in this excerpt. Each "DataSource"
// property names a table inside ShaderOpArithTable.xml.
class ExecutionTest {
// By default, ignore these tests, which require a recent build to run properly.
TEST_CLASS_PROPERTY(L"Parallel", L"true")
// Wave intrinsics tables (active, prefix and multi-prefix; int and uint).
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveIntTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveUintTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixIntTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixUintTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsMultiPrefixIntTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsMultiPrefixUintTable")
// TAEF data-driven tests.
// Unary/binary/tertiary operation tables per element type.
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryFloatOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryFloatOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryFloatOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryHalfOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryHalfOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryHalfOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryIntOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryIntOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryIntOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUintOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUintOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUintOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryInt16OpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryInt16OpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryInt16OpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUint16OpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUint16OpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUint16OpTable")
// Dot-product, Msad4, denorm and pack/unpack tables.
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot2AddHalfOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot4AddI8PackedOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot4AddU8PackedOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Msad4Table")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DenormBinaryFloatOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DenormTertiaryFloatOpTable")
TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#PackUnpackOpTable")
// DXC DLL loader; CompileFromText creates its compiler and library objects
// through it.
dxc::DxcDllSupport m_support;
// NOTE(review): usage of m_ver is not visible in this excerpt.
VersionSupportInfo m_ver;
// Set on the first DivergentClassSetup() call so the one-time D3D setup is not
// repeated.
bool m_D3DInitCompleted = false;
// NOTE(review): presumably recorded by EnableExperimentalMode and
// EnableAgilitySDK respectively - confirm where they are assigned.
bool m_ExperimentalModeEnabled = false;
bool m_AgilitySDKEnabled = false;
// RGBA clear value; used as the optimized clear value of the render target
// created in CreateRenderTargetAndReadback.
const float ClearColor[4] = { 0.0f, 0.2f, 0.4f, 1.0f };
// One-time class setup: loads d3d12.dll, then tries in turn to enable the
// Agility SDK, shader experimental mode and the debug layer, logging for each
// whether it failed, was skipped (S_FALSE) or succeeded.
// Returns false when d3d12.dll cannot be loaded; the visible tail returns true.
// NOTE(review): the declaration of `hr` and the closing braces are not visible
// in this excerpt.
bool DivergentClassSetup() {
// Run this only once.
if (!m_D3DInitCompleted) {
m_D3DInitCompleted = true;
HMODULE hRuntime = LoadLibraryW(L"d3d12.dll");
if (hRuntime == NULL)
return false;
// Do not: FreeLibrary(hRuntime);
// If we actually free the library, it defeats the purpose of
// EnableAgilitySDK and EnableExperimentalMode.
hr = EnableAgilitySDK(hRuntime);
if (FAILED(hr)) {
LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", hr);
} else if (hr == S_FALSE) {
LogCommentFmt(L"Agility SDK not enabled.");
} else {
LogCommentFmt(L"Agility SDK enabled.");
hr = EnableExperimentalMode(hRuntime);
if (FAILED(hr)) {
LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.", hr);
} else if (hr == S_FALSE) {
LogCommentFmt(L"Experimental mode not enabled.");
} else {
LogCommentFmt(L"Experimental mode enabled.");
hr = EnableDebugLayer();
if (FAILED(hr)) {
LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", hr);
} else if (hr == S_FALSE) {
LogCommentFmt(L"Debug layer not enabled.");
} else {
LogCommentFmt(L"Debug layer enabled.");
return true;
// Do not remove the following line - it is used by
// MARKER: ExecutionTest/DxilConf Shared Implementation Start
// This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we only
// require the Windows 10 SDK.
// Each value encodes the shader model as 0x<major><minor>; CreateDevice masks
// with 0x0f to recover the minor version for its log messages.
typedef enum D3D_SHADER_MODEL {
D3D_SHADER_MODEL_5_1 = 0x51,
D3D_SHADER_MODEL_6_0 = 0x60,
D3D_SHADER_MODEL_6_1 = 0x61,
D3D_SHADER_MODEL_6_2 = 0x62,
D3D_SHADER_MODEL_6_3 = 0x63,
D3D_SHADER_MODEL_6_4 = 0x64,
D3D_SHADER_MODEL_6_5 = 0x65,
D3D_SHADER_MODEL_6_6 = 0x66,
D3D_SHADER_MODEL_6_7 = 0x67,
// Reports whether shaders should go through the DXBC path: always false under
// _HLK_CONF, otherwise the value of the "DXBC" test parameter.
// NOTE(review): the #else/#endif separating the two returns are not visible in
// this excerpt.
bool UseDxbc() {
#ifdef _HLK_CONF
return false;
return GetTestParamBool(L"DXBC");
// Default for WARP adapter selection, passed to GetTestParamUseWARP in
// CreateDevice: false under _HLK_CONF, true otherwise.
// NOTE(review): the #else/#endif separating the two returns are not visible in
// this excerpt.
bool UseWarpByDefault() {
#ifdef _HLK_CONF
return false;
return true;
// Always true; CreateDevice queries the device for ID3D12InfoQueue when this
// returns true.
bool UseDebugIfaces() {
return true;
// Returns the value of the "SaveImages" boolean test parameter.
bool SaveImages() {
return GetTestParamBool(L"SaveImages");
// Base class used by raw gather test for polymorphic assignments
// Pure-virtual interface: implementations own the element storage and report
// its dimensions and DXGI format. DoRawGatherTest takes a pointer to it.
struct RawGatherTexture {
// Set Element <i> to a format-appropriate value derived from 2D coords <x,y>
virtual void SetElement(int i, int x, int y) = 0;
// Retrieve pointer to the elements
virtual void *GetElements() = 0;
// Get dimensions/format
virtual unsigned GetXDim() = 0;
virtual unsigned GetYDim() = 0;
virtual DXGI_FORMAT GetFormat() = 0;
// Declarations of test helpers whose definitions are not part of this excerpt.
// NOTE(review): the descriptions below are inferred from names and signatures
// only - confirm against the definitions.

// Raw gather test over rawTex viewed as viewFormat.
template<typename GatherType>
void DoRawGatherTest(ID3D12Device *pDevice, RawGatherTexture *rawTex, DXGI_FORMAT viewFormat);
void RunResourceTest(ID3D12Device *pDevice, const char *pShader, const wchar_t *sm, bool isDynamic);
// Data-driven wave intrinsics helpers taking a parameter table.
template <class T1, class T2>
void WaveIntrinsicsActivePrefixTest(TableParameter *pParameterList,
size_t numParameter, bool isPrefix);
template <typename T>
void WaveIntrinsicsMultiPrefixOpTest(TableParameter *pParameterList,
size_t numParameters);
void BasicTriangleTestSetup(LPCSTR OpName, LPCWSTR FileName, D3D_SHADER_MODEL testModel);
void RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel);
// Raw buffer load/store test support.
// NOTE(review): the enumerators of RawBufferLdStType are not visible here.
enum class RawBufferLdStType {
// One scalar plus 2-, 3- and 4-element arrays of Ty.
template <class Ty>
struct RawBufferLdStTestData {
Ty v1, v2[2], v3[3], v4[4];
// Three RawBufferLdStTestData records: input, output and srvOut.
template <class Ty>
struct RawBufferLdStUavData {
RawBufferLdStTestData<Ty> input, output, srvOut;
template <class Ty>
void RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);
template <class Ty>
void RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);
template <class Ty>
void VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData);
bool SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, CComPtr<ID3D12Device> &pDevice,
CComPtr<IStream> &pStream, char *&sTy, char *&additionalOptions);
template <class Ty>
void RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice, const char *pShaderModelStr, const char *pShader, Ty *pInputDataPairs, unsigned inputDataCount);
template <class Ty>
const wchar_t* BasicShaderModelTest_GetFormatString();
// Compiles the HLSL source text pText with DXC for the given entry point and
// target profile, passing pOptions/numOptions as extra compiler arguments, and
// returns the compiled blob through ppBlob.
// The source is wrapped as a pinned UTF-8 blob (no copy), so pText must stay
// valid for the duration of the call.
// NOTE(review): resultCode is tested below but no assignment to it is visible
// in this excerpt; neither is the retrieval of `errors`.
void CompileFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob, LPCWSTR *pOptions = nullptr, int numOptions = 0) {
CComPtr<IDxcCompiler> pCompiler;
CComPtr<IDxcLibrary> pLibrary;
CComPtr<IDxcBlobEncoding> pTextBlob;
CComPtr<IDxcOperationResult> pResult;
HRESULT resultCode;
VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcCompiler, &pCompiler));
VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pText, (UINT32)strlen(pText), CP_UTF8, &pTextBlob));
VERIFY_SUCCEEDED(pCompiler->Compile(pTextBlob, L"hlsl.hlsl", pEntryPoint, pTargetProfile, pOptions, numOptions, nullptr, 0, nullptr, &pResult));
// On a failed compile, log the compiler's error text (non-HLK builds).
if (FAILED(resultCode)) {
CComPtr<IDxcBlobEncoding> errors;
#ifndef _HLK_CONF
LogCommentFmt(L"Failed to compile shader: %s", BlobToWide(errors).data());
VERIFY_SUCCEEDED(pResult->GetResult((IDxcBlob **)ppBlob));
// Creates a command queue of the given list type on pDevice, with no queue
// flags, and returns it through ppCommandQueue.
// NOTE(review): pName is not referenced in the lines visible here.
void CreateCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue, D3D12_COMMAND_LIST_TYPE type) {
D3D12_COMMAND_QUEUE_DESC queueDesc = {};
queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
queueDesc.Type = type;
VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
// Convenience wrapper: CreateCommandQueue with D3D12_COMMAND_LIST_TYPE_COMPUTE.
void CreateComputeCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue) {
CreateCommandQueue(pDevice, pName, ppCommandQueue, D3D12_COMMAND_LIST_TYPE_COMPUTE);
// Compiles the "main" entry point of pShader for pTargetProfile and creates a
// compute pipeline state object using pRootSignature, returned through
// ppComputeState. With UseDxbc() the shader goes through DXBCFromText (only
// compiled in when _HLK_CONF is not defined); otherwise it goes through
// CompileFromText, which also receives pOptions/numOptions.
// NOTE(review): the declaration of computePsoDesc is not visible in this
// excerpt.
void CreateComputePSO(ID3D12Device *pDevice, ID3D12RootSignature *pRootSignature, LPCSTR pShader, LPCWSTR pTargetProfile, ID3D12PipelineState **ppComputeState, LPCWSTR *pOptions = nullptr, int numOptions = 0) {
CComPtr<ID3DBlob> pComputeShader;
// Load and compile shaders.
if (UseDxbc()) {
#ifndef _HLK_CONF
DXBCFromText(pShader, L"main", pTargetProfile, &pComputeShader);
else {
CompileFromText(pShader, L"main", pTargetProfile, &pComputeShader, pOptions, numOptions);
// Describe and create the compute pipeline state object (PSO).
computePsoDesc.pRootSignature = pRootSignature;
computePsoDesc.CS = CD3DX12_SHADER_BYTECODE(pComputeShader);
VERIFY_SUCCEEDED(pDevice->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(ppComputeState)));
// Creates a D3D12 device suitable for running shaders of `testModel` and
// returns it through ppDevice (set to nullptr before any creation attempt).
//
// The device is created on WARP or on a hardware adapter according to
// GetTestParamUseWARP(UseWarpByDefault()); on the hardware path an "Adapter"
// runtime parameter, when present, selects the adapter. Unless UseDxbc() is
// set, the device is then asked whether it supports `testModel`.
//
// Returns true when a device was produced. The visible `return false` sites
// are: testModel above HIGHEST_SHADER_MODEL, WARP device creation failure,
// and missing shader model support (each guarded by skipUnsupported), plus a
// null device.
// NOTE(review): several statements are cut off in this excerpt (factory
// creation, WARP adapter enumeration, the tails of the D3D12CreateDevice
// calls, the SMData declaration, the info-queue body); the skipUnsupported ==
// false paths cannot be confirmed from here.
bool CreateDevice(_COM_Outptr_ ID3D12Device **ppDevice,
D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0, bool skipUnsupported = true) {
if (testModel > HIGHEST_SHADER_MODEL) {
// The low nibble of the enum value is the minor version.
UINT minor = (UINT)testModel & 0x0f;
LogCommentFmt(L"Installed SDK does not support "
L"shader model 6.%1u", minor);
if (skipUnsupported) {
return false;
CComPtr<IDXGIFactory4> factory;
CComPtr<ID3D12Device> pDevice;
*ppDevice = nullptr;
if (GetTestParamUseWARP(UseWarpByDefault())) {
CComPtr<IDXGIAdapter> warpAdapter;
HRESULT createHR = D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0,
if (FAILED(createHR)) {
LogCommentFmt(L"The available version of WARP does not support d3d12.");
if (skipUnsupported) {
return false;
} else {
CComPtr<IDXGIAdapter1> hardwareAdapter;
WEX::Common::String AdapterValue;
HRESULT hr = WEX::TestExecution::RuntimeParameters::TryGetValue(L"Adapter",
if (SUCCEEDED(hr)) {
GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter);
} else {
L"Using default hardware adapter with D3D12 support.");
VERIFY_SUCCEEDED(D3D12CreateDevice(hardwareAdapter, D3D_FEATURE_LEVEL_11_0,
// retrieve adapter information
LUID adapterID = pDevice->GetAdapterLuid();
CComPtr<IDXGIAdapter> adapter;
factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter));
LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description);
if (pDevice == nullptr)
return false;
if (!UseDxbc()) {
// Check for DXIL support.
typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
_Inout_ D3D_SHADER_MODEL HighestShaderModel;
// HighestShaderModel is in/out: it is seeded with the model under test and
// compared against testModel after the query.
SMData.HighestShaderModel = testModel;
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL,
&SMData, sizeof(SMData))) ||
SMData.HighestShaderModel < testModel) {
UINT minor = (UINT)testModel & 0x0f;
LogCommentFmt(L"The selected device does not support "
L"shader model 6.%1u", minor);
if (skipUnsupported) {
return false;
if (UseDebugIfaces()) {
CComPtr<ID3D12InfoQueue> pInfoQueue;
if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
// Hand the reference to the caller.
*ppDevice = pDevice.Detach();
return true;
// Creates a command queue with no flags on pDevice and returns it through
// ppCommandQueue.
// NOTE(review): no assignment to queueDesc.Type is visible in this excerpt; a
// zero-initialized Type field is D3D12_COMMAND_LIST_TYPE_DIRECT.
void CreateGraphicsCommandQueue(ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue) {
D3D12_COMMAND_QUEUE_DESC queueDesc = {};
queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
// Creates the graphics command queue via CreateGraphicsCommandQueue and, per
// the visible argument fragment, a direct command list that uses *ppAllocator
// and pPSO.
// NOTE(review): the calls that create the allocator and the command list are
// cut off in this excerpt.
void CreateGraphicsCommandQueueAndList(
ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue,
ID3D12CommandAllocator **ppAllocator,
ID3D12GraphicsCommandList **ppCommandList, ID3D12PipelineState *pPSO) {
CreateGraphicsCommandQueue(pDevice, ppCommandQueue);
0, D3D12_COMMAND_LIST_TYPE_DIRECT, *ppAllocator, pPSO,
// Compiles the "VSMain" and "PSMain" entry points of pShaders (vs_6_0 /
// ps_6_0) and creates a graphics pipeline state object, returned via ppPSO.
// Fixed state: default rasterizer and blend state, depth and stencil disabled,
// triangle topology, one R8G8B8A8_UNORM render target, sample count 1.
// NOTE(review): the declaration of psoDesc and the macro wrapping the final
// CreateGraphicsPipelineState call are not visible in this excerpt.
void CreateGraphicsPSO(ID3D12Device *pDevice,
D3D12_INPUT_LAYOUT_DESC *pInputLayout,
ID3D12RootSignature *pRootSignature, LPCSTR pShaders,
ID3D12PipelineState **ppPSO) {
CComPtr<ID3DBlob> vertexShader;
CComPtr<ID3DBlob> pixelShader;
if (UseDxbc()) {
#ifndef _HLK_CONF
DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
} else {
CompileFromText(pShaders, L"VSMain", L"vs_6_0", &vertexShader);
CompileFromText(pShaders, L"PSMain", L"ps_6_0", &pixelShader);
// Describe and create the graphics pipeline state object (PSO).
psoDesc.InputLayout = *pInputLayout;
psoDesc.pRootSignature = pRootSignature;
psoDesc.VS = CD3DX12_SHADER_BYTECODE(vertexShader);
psoDesc.PS = CD3DX12_SHADER_BYTECODE(pixelShader);
psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
psoDesc.DepthStencilState.DepthEnable = FALSE;
psoDesc.DepthStencilState.StencilEnable = FALSE;
psoDesc.SampleMask = UINT_MAX;
psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
psoDesc.NumRenderTargets = 1;
psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
psoDesc.SampleDesc.Count = 1;
pDevice->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(ppPSO)));
// Creates a width x height 2D render target (cleared to ClearColor as its
// optimized clear value), creates its render target view in pHeap, and creates
// a buffer of width * height * 4 bytes. Both resources are returned through
// ppRenderTarget and ppBuffer.
// NOTE(review): the declarations of `format`, `rtvHandle`, `readHeap` and
// `readDesc`, and the CreateCommittedResource calls, are cut off in this
// excerpt; judging by the function name and `readHeap`, the buffer is a
// readback buffer - confirm.
void CreateRenderTargetAndReadback(ID3D12Device *pDevice,
ID3D12DescriptorHeap *pHeap, UINT width,
UINT height,
ID3D12Resource **ppRenderTarget,
ID3D12Resource **ppBuffer) {
// Bytes per render target element.
const size_t formatElementSize = 4;
CComPtr<ID3D12Resource> pRenderTarget;
CComPtr<ID3D12Resource> pBuffer;
CD3DX12_RESOURCE_DESC::Tex2D(format, width, height));
CD3DX12_CLEAR_VALUE rtClearVal(format, ClearColor);
&rtClearVal, IID_PPV_ARGS(&pRenderTarget)));
pDevice->CreateRenderTargetView(pRenderTarget, nullptr, rtvHandle);
// rtvHandle.Offset(1, rtvDescriptorSize); // Not needed for a single
// resource.
CD3DX12_RESOURCE_DESC::Buffer(width * height * formatElementSize));
&readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
*ppRenderTarget = pRenderTarget.Detach();
*ppBuffer = pBuffer.Detach();
// Serializes a root signature description (version 1) and creates the root
// signature from the serialized blob.
// NOTE(review): the `pDesc` parameter line and the CreateRootSignature call
// are cut off in this excerpt.
void CreateRootSignatureFromDesc(ID3D12Device *pDevice,
ID3D12RootSignature **pRootSig) {
CComPtr<ID3DBlob> signature;
CComPtr<ID3DBlob> error;
VERIFY_SUCCEEDED(D3D12SerializeRootSignature(pDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
0, signature->GetBufferPointer(), signature->GetBufferSize(),
// Builds a root signature with up to two descriptor-table root parameters,
// both visible to all shader stages: one for the resource ranges and, when
// sampCt is non-zero, one for the sampler ranges. The description is handed to
// CreateRootSignatureFromDesc.
// NOTE(review): the `resRanges`, `resCt` and `flags` parameters are not
// visible in this excerpt; whether the resource table is conditional on resCt
// cannot be confirmed here.
void CreateRootSignatureFromRanges(ID3D12Device *pDevice, ID3D12RootSignature **pRootSig,
CD3DX12_DESCRIPTOR_RANGE *sampRanges = nullptr, UINT sampCt = 0,
// Number of root parameters filled in so far.
UINT paramCt = 0;
CD3DX12_ROOT_PARAMETER rootParameters[2];
rootParameters[paramCt++].InitAsDescriptorTable(resCt, resRanges, D3D12_SHADER_VISIBILITY_ALL);
if (sampCt)
rootParameters[paramCt++].InitAsDescriptorTable(sampCt, sampRanges, D3D12_SHADER_VISIBILITY_ALL);
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(paramCt, rootParameters, 0, nullptr, flags);
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, pRootSig);
// Creates a descriptor heap with numDescriptors entries, returned via
// pRtvHeap, and, when rtvDescriptorSize is non-null, stores the device's
// descriptor handle increment size there.
// NOTE(review): the heap Type/Flags assignments and the heap-type argument of
// GetDescriptorHandleIncrementSize are cut off in this excerpt; the function
// name suggests an RTV heap.
void CreateRtvDescriptorHeap(ID3D12Device *pDevice, UINT numDescriptors,
ID3D12DescriptorHeap **pRtvHeap, UINT *rtvDescriptorSize) {
D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {};
rtvHeapDesc.NumDescriptors = numDescriptors;
pDevice->CreateDescriptorHeap(&rtvHeapDesc, IID_PPV_ARGS(pRtvHeap)));
if (rtvDescriptorSize != nullptr) {
*rtvDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(
// Copy common fields from desc0 to desc1 and zero out the new one
// desc1: destination D3D12_RESOURCE_DESC1, every field shown below is
//        overwritten.
// desc0: source D3D12_RESOURCE_DESC.
// The only field that is not copied, SamplerFeedbackMipRegion, is
// value-initialized.
void CopyDesc0ToDesc1(D3D12_RESOURCE_DESC1 &desc1, const D3D12_RESOURCE_DESC &desc0) {
desc1.Dimension = desc0.Dimension;
desc1.Alignment = desc0.Alignment;
desc1.Width = desc0.Width;
desc1.Height = desc0.Height;
desc1.DepthOrArraySize = desc0.DepthOrArraySize;
desc1.MipLevels = desc0.MipLevels;
desc1.Format = desc0.Format;
desc1.SampleDesc = desc0.SampleDesc;
desc1.Layout = desc0.Layout;
desc1.Flags = desc0.Flags;
desc1.SamplerFeedbackMipRegion = {};
// Create resources for the given <resDesc> described main resource
// creating and returning the resource, the upload resource,
// and the readback resource if requested, populating with <values> of size
// <valueSizeInBytes> using <pCommandList> and <pDevice>
// A pointer to a single <castFormat> target may be specified
// where CreateCommittedResource3 is available
// Parameters:
//   values / valueSizeInBytes - initial data and its size; only used when
//                               ppUploadResource is non-null.
//   resDesc                   - description of the main resource.
//   ppResource                - receives the main resource (always written).
//   ppUploadResource          - optional; receives the upload resource.
//   ppReadBuffer              - optional; receives a buffer of
//                               valueSizeInBytes created in the COPY_DEST state.
//   castFormat                - optional; when set, the resource is created
//                               through a D3D12_RESOURCE_DESC1 copy of resDesc
//                               with this single castable format.
// The upload buffer width comes from GetCopyableFootprints for subresource 0.
// NOTE(review): the CreateCommittedResource calls, the heap property
// declarations and the condition selecting between the two transition barriers
// below are cut off in this excerpt. RowPitch divides by resDesc.Height, so a
// non-zero Height is assumed.
void CreateTestResources(ID3D12Device *pDevice,
ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
UINT64 valueSizeInBytes, D3D12_RESOURCE_DESC resDesc,
ID3D12Resource **ppResource,
ID3D12Resource **ppUploadResource,
ID3D12Resource **ppReadBuffer = nullptr,
DXGI_FORMAT *castFormat = nullptr) {
CComPtr<ID3D12Resource> pResource;
CComPtr<ID3D12Resource> pReadBuffer;
CComPtr<ID3D12Resource> pUploadResource;
D3D12_SUBRESOURCE_DATA transferData;
D3D12_RESOURCE_DESC uploadBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
CD3DX12_RESOURCE_DESC readDesc(CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes));
// Replace the upload width with the total bytes needed to copy subresource 0.
pDevice->GetCopyableFootprints(&resDesc, 0, 1/*mipleveles*/, 0, nullptr, nullptr, nullptr, &uploadBufferDesc.Width);
uploadBufferDesc.Height = 1;
if (castFormat) {
CComPtr<ID3D12Device10> pDevice10;
// Copy resDesc0 to resDesc1 zeroing anything new
D3D12_RESOURCE_DESC1 resDesc1 = {0};
CopyDesc0ToDesc1(resDesc1, resDesc);
1, castFormat,
} else
if (ppUploadResource)
if (ppReadBuffer)
&readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pReadBuffer)));
if (ppUploadResource) {
// Treat the data as resDesc.Height rows of equal size in a single slice.
transferData.pData = values;
transferData.RowPitch = (LONG_PTR)(valueSizeInBytes/resDesc.Height);
transferData.SlicePitch = (LONG_PTR)valueSizeInBytes;
UpdateSubresources<1>(pCommandList, pResource.p, pUploadResource.p, 0, 0, 1, &transferData);
RecordTransitionBarrier(pCommandList, pResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
RecordTransitionBarrier(pCommandList, pResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
// Transfer ownership of the created resources to the caller.
*ppResource = pResource.Detach();
if (ppUploadResource)
*ppUploadResource = pUploadResource.Detach();
if (ppReadBuffer)
*ppReadBuffer = pReadBuffer.Detach();
// Buffer convenience wrapper around CreateTestResources: forwards `values`
// and the output pointers together with a local buffer description.
// NOTE(review): the declaration of bufferDesc is not visible in this excerpt;
// by the function name it presumably describes a UAV-capable buffer of
// valueSizeInBytes - confirm.
void CreateTestUavs(ID3D12Device *pDevice,
ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
UINT64 valueSizeInBytes, ID3D12Resource **ppUavResource,
ID3D12Resource **ppUploadResource = nullptr,
ID3D12Resource **ppReadBuffer = nullptr) {
CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, bufferDesc,
ppUavResource, ppUploadResource, ppReadBuffer);
// Create and return descriptor heaps for the given device
// with the given number of resources and samples.
// using some reasonable defaults
// Creates two descriptor heaps, one with NumResources descriptors and one with
// NumSamplers descriptors, and returns them through ppResHeap and ppSampHeap.
// The heaps are returned as raw pointers.
// NOTE(review): the declaration of heapDesc and its Type/Flags assignments are
// not visible in this excerpt.
void CreateDefaultDescHeaps(ID3D12Device *pDevice,
int NumResources, int NumSamplers,
ID3D12DescriptorHeap **ppResHeap, ID3D12DescriptorHeap **ppSampHeap) {
// Describe and create descriptor heaps.
ID3D12DescriptorHeap *pResHeap, *pSampHeap;
heapDesc.NumDescriptors = NumResources;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pResHeap)));
heapDesc.NumDescriptors = NumSamplers;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pSampHeap)));
*ppResHeap = pResHeap;
*ppSampHeap = pSampHeap;
// Creates a shader resource view of pResource at baseHandle, with the given
// format and view dimension, the default component mapping, and
// dimension-specific fields set per view type (buffer, 1D texture, 2D texture).
// For buffers, R32_TYPELESS with a stride of 0 marks the view as raw.
// NOTE(review): the srvDesc declaration, the `case` labels / `break`s and any
// `else` between the two Buffer.Flags assignments are not visible in this
// excerpt. baseHandle is taken by non-const reference and descriptorSize is
// computed, so the handle is presumably advanced after the view is created -
// confirm.
void CreateSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &baseHandle,
DXGI_FORMAT format, D3D12_SRV_DIMENSION viewDimension, UINT numElements, UINT stride,
const CComPtr<ID3D12Resource> pResource) {
UINT descriptorSize = pDevice->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
// Create SRV
srvDesc.Format = format;
srvDesc.ViewDimension = viewDimension;
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
switch (viewDimension) {
srvDesc.Buffer.FirstElement = 0;
srvDesc.Buffer.NumElements = numElements;
srvDesc.Buffer.StructureByteStride = stride;
if (format == DXGI_FORMAT_R32_TYPELESS && stride == 0)
srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE;
srvDesc.Texture1D.MostDetailedMip = 0;
srvDesc.Texture1D.MipLevels = 1;
srvDesc.Texture1D.ResourceMinLODClamp = 0;
srvDesc.Texture2D.MostDetailedMip = 0;
srvDesc.Texture2D.MipLevels = 1;
srvDesc.Texture2D.PlaneSlice = 0;
srvDesc.Texture2D.ResourceMinLODClamp = 0;
pDevice->CreateShaderResourceView(pResource, &srvDesc, baseHandle);
// Raw buffer SRV: CreateSRV with R32_TYPELESS and stride 0, the combination
// CreateSRV treats as a raw view.
void CreateRawSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, const CComPtr<ID3D12Resource> pResource) {
CreateSRV(pDevice, heapStart, DXGI_FORMAT_R32_TYPELESS, D3D12_SRV_DIMENSION_BUFFER, numElements, 0, pResource);
// Structured buffer SRV: CreateSRV with DXGI_FORMAT_UNKNOWN and the given
// element stride.
void CreateStructSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, UINT stride, const CComPtr<ID3D12Resource> pResource) {
CreateSRV(pDevice, heapStart, DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, numElements, stride, pResource);
// Typed buffer SRV: CreateSRV with the caller's format and stride 0.
void CreateTypedSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_BUFFER, numElements, 0, pResource);
// 1D texture SRV: CreateSRV with D3D12_SRV_DIMENSION_TEXTURE1D.
// numElements is forwarded, but the visible Texture1D setup in CreateSRV does
// not read it.
void CreateTex1DSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_TEXTURE1D, numElements, 0, pResource);
// 2D texture SRV: CreateSRV with D3D12_SRV_DIMENSION_TEXTURE2D; element count
// and stride are passed as 0.
void CreateTex2DSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_TEXTURE2D, 0/*numElements*/, 0/*stride*/, pResource);
// Creates an unordered access view of pResource at baseHandle (no counter
// resource), with the given format and view dimension and dimension-specific
// fields set per view type (buffer, 1D texture, 2D texture, 2D texture array).
// For buffers, R32_TYPELESS with a stride of 0 marks the view as raw; for 2D
// texture arrays numElements is the array size.
// NOTE(review): the uavDesc declaration, the `case` labels / `break`s and any
// `else` between the two Buffer.Flags assignments are not visible in this
// excerpt. baseHandle is taken by non-const reference and descriptorSize is
// computed, so the handle is presumably advanced after the view is created -
// confirm.
void CreateUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &baseHandle,
DXGI_FORMAT format, D3D12_UAV_DIMENSION viewDimension, UINT numElements, UINT stride,
const CComPtr<ID3D12Resource> pResource) {
UINT descriptorSize = pDevice->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
uavDesc.Format = format;
uavDesc.ViewDimension = viewDimension;
switch (viewDimension) {
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = numElements;
uavDesc.Buffer.StructureByteStride = stride;
if (format == DXGI_FORMAT_R32_TYPELESS && stride == 0)
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
uavDesc.Texture1D.MipSlice = 0;
uavDesc.Texture2D.MipSlice = 0;
uavDesc.Texture2D.PlaneSlice = 0;
uavDesc.Texture2DArray.MipSlice = 0;
uavDesc.Texture2DArray.PlaneSlice = 0;
uavDesc.Texture2DArray.FirstArraySlice = 0;
uavDesc.Texture2DArray.ArraySize = numElements;
pDevice->CreateUnorderedAccessView(pResource, nullptr, &uavDesc, baseHandle);
// Raw buffer UAV: CreateUAV with R32_TYPELESS and stride 0, the combination
// CreateUAV treats as a raw view.
void CreateRawUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, DXGI_FORMAT_R32_TYPELESS, D3D12_UAV_DIMENSION_BUFFER, numElements, 0/*stride*/, pResource);
// Structured buffer UAV: CreateUAV with DXGI_FORMAT_UNKNOWN and the given
// element stride.
void CreateStructUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, UINT stride, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, DXGI_FORMAT_UNKNOWN, D3D12_UAV_DIMENSION_BUFFER, numElements, stride, pResource);
// Typed buffer UAV: CreateUAV with the caller's format and stride 0.
void CreateTypedUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_BUFFER, numElements, 0/*stride*/, pResource);
// 1D texture UAV: CreateUAV with D3D12_UAV_DIMENSION_TEXTURE1D; element count
// and stride are passed as 0.
void CreateTex1DUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE1D, 0/*numElements*/, 0/*stride*/, pResource);
// 2D texture UAV: CreateUAV with D3D12_UAV_DIMENSION_TEXTURE2D; element count
// and stride are passed as 0.
void CreateTex2DUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE2D, 0/*numElements*/, 0/*stride*/, pResource);
// 2D texture array UAV: CreateUAV with D3D12_UAV_DIMENSION_TEXTURE2DARRAY;
// numElements becomes the view's ArraySize.
void CreateTex2DArrayUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE2DARRAY, numElements, 0/*stride*/, pResource);
// Multisampled 2D texture UAV: CreateUAV with the view dimension given as the
// literal 6 (D3D12_UAV_DIMENSION_TEXTURE2DMS per the inline comment).
// NOTE(review): the literal is presumably used because the enumerant is
// missing from the SDK headers in use - confirm. No Texture2DMS-specific field
// setup is visible in CreateUAV.
void CreateTex2DMSUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
CreateUAV(pDevice, heapStart, format, (D3D12_UAV_DIMENSION)6 /*D3D12_UAV_DIMENSION_TEXTURE2DMS*/, 0 /*numElements*/, 0/*stride*/, pResource);
// Create Samplers for <pDevice> given the filter and border color information provided
// using some reasonable defaults.
//
//   pDevice                - device used to create the samplers.
//   heapStart              - CPU descriptor handle the first sampler is written to.
//   filters                - one filter per sampler (indexed 0..NumSamplers-1).
//   perSamplerBorderColors - optional; one float per sampler. May be null, in
//                            which case BorderColor keeps its zero-initialized value.
//   NumSamplers            - number of samplers to create.
void CreateDefaultSamplers(ID3D12Device *pDevice, D3D12_CPU_DESCRIPTOR_HANDLE heapStart,
D3D12_FILTER filters[], float *perSamplerBorderColors, int NumSamplers) {
CD3DX12_CPU_DESCRIPTOR_HANDLE sampHandle(heapStart);
UINT descriptorSize = pDevice->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
// Settings shared by every sampler; only Filter and BorderColor vary per sampler.
D3D12_SAMPLER_DESC sampDesc = {};
// NOTE(review): addrMode is not declared in the lines visible here - confirm
// where it comes from.
sampDesc.AddressU = sampDesc.AddressV = sampDesc.AddressW = addrMode;
sampDesc.MipLODBias = 0;
sampDesc.MaxAnisotropy = 1;
sampDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_EQUAL;
sampDesc.MinLOD = 0;
sampDesc.MaxLOD = 0;
for (int i = 0; i < NumSamplers; i++) {
sampDesc.Filter = filters[i];
if (perSamplerBorderColors) {
// The sampler's single float is replicated into all four border color channels.
for (int j = 0; j < 4; j++)
sampDesc.BorderColor[j] = perSamplerBorderColors[i];
// Write this sampler, then step the handle by one sampler descriptor.
pDevice->CreateSampler(&sampDesc, sampHandle);
sampHandle = sampHandle.Offset(descriptorSize);
// Copies a fixed-size array of vertices into a newly created vertex buffer
// resource and fills in the matching vertex buffer view.
//
//   pDevice           - device parameter (its use is not visible in the lines shown).
//   vertices          - reference to an array of len TVertex elements to upload.
//   ppVertexBuffer    - receives ownership of the created resource (via Detach).
//   pVertexBufferView - receives GPU address, stride (sizeof(TVertex)) and
//                       total size in bytes.
template <typename TVertex, int len>
void CreateVertexBuffer(ID3D12Device *pDevice, TVertex(&vertices)[len],
ID3D12Resource **ppVertexBuffer,
D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView) {
// sizeof on the array reference yields the byte size of the whole array.
size_t vertexBufferSize = sizeof(vertices);
CComPtr<ID3D12Resource> pVertexBuffer;
// NOTE(review): the start of the call these arguments belong to (and the
// declarations of heapProps / bufferDesc) is not visible here - confirm.
&heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
UINT8 *pVertexDataBegin;
// An empty (0, 0) read range is passed to the mapping call below.
CD3DX12_RANGE readRange(0, 0);
// NOTE(review): the start of this call is not visible; presumably it maps
// pVertexBuffer and stores the CPU pointer in pVertexDataBegin - confirm.
0, &readRange, reinterpret_cast<void **>(&pVertexDataBegin)));
memcpy(pVertexDataBegin, vertices, vertexBufferSize);
pVertexBuffer->Unmap(0, nullptr);
// Initialize the vertex buffer view.
pVertexBufferView->BufferLocation = pVertexBuffer->GetGPUVirtualAddress();
pVertexBufferView->StrideInBytes = sizeof(TVertex);
pVertexBufferView->SizeInBytes = (UINT)vertexBufferSize;
// Hand the reference over to the caller without releasing it.
*ppVertexBuffer = pVertexBuffer.Detach();
// Requires Anniversary Edition headers, so simplifying things for current setup.
// Local numeric feature id; it is cast to D3D12_FEATURE when passed to
// CheckFeatureSupport by helpers in this file.
const UINT D3D12_FEATURE_D3D12_OPTIONS1 = 8;
// NOTE(review): the declarations below look like the members of a locally
// declared feature-data struct for the OPTIONS1 query, but the struct header
// is not visible here - confirm.
BOOL WaveOps;
UINT WaveLaneCountMin;
UINT WaveLaneCountMax;
UINT TotalLaneCount;
BOOL ExpandedComputeResourceStates;
BOOL Int64ShaderOps;
// Returns true when the adapter description has DXGI_ADAPTER_FLAG_SOFTWARE
// set, or has vendor id 0x1414 together with device id 0x8c or 0x8d.
// NOTE(review): 0x1414 / 0x8c / 0x8d presumably identify Microsoft's basic
// render adapters - confirm.
bool IsDeviceBasicAdapter(ID3D12Device *pDevice) {
CComPtr<IDXGIFactory4> factory;
// NOTE(review): no call that creates `factory` is visible here before it is
// dereferenced below - confirm.
LUID adapterID = pDevice->GetAdapterLuid();
CComPtr<IDXGIAdapter1> adapter;
// Look up the DXGI adapter whose LUID matches the device's adapter LUID.
factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter));
// NOTE(review): AdapterDesc is neither declared nor filled in within the
// visible lines - confirm.
return (AdapterDesc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) ||
(AdapterDesc.VendorId == 0x1414 &&
(AdapterDesc.DeviceId == 0x8c || AdapterDesc.DeviceId == 0x8d));
// Returns true when the OPTIONS1 feature query succeeds and reports
// Int64ShaderOps as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O` is not visible in these lines - confirm.
bool DoesDeviceSupportInt64(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &O, sizeof(O))))
return false;
return O.Int64ShaderOps != FALSE;
// Returns true when the OPTIONS feature query succeeds and reports
// DoublePrecisionFloatShaderOps as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O` is not visible in these lines - confirm.
bool DoesDeviceSupportDouble(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS, &O, sizeof(O))))
return false;
return O.DoublePrecisionFloatShaderOps != FALSE;
// Returns true when the OPTIONS1 feature query succeeds and reports WaveOps
// as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O` is not visible in these lines - confirm.
bool DoesDeviceSupportWaveOps(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &O, sizeof(O))))
return false;
return O.WaveOps != FALSE;
// Returns true when the OPTIONS3 feature query succeeds and reports
// BarycentricsSupported as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O` is not visible in these lines - confirm.
bool DoesDeviceSupportBarycentrics(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS3, &O, sizeof(O))))
return false;
return O.BarycentricsSupported != FALSE;
// Returns true when the OPTIONS4 feature query succeeds and reports
// Native16BitShaderOpsSupported as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O` is not visible in these lines - confirm.
bool DoesDeviceSupportNative16bitOps(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS4, &O, sizeof(O))))
return false;
return O.Native16BitShaderOpsSupported != FALSE;
// Returns true when the OPTIONS7 feature query succeeds and MeshShaderTier is
// anything other than D3D12_MESH_SHADER_TIER_NOT_SUPPORTED; returns false
// when the query fails.
// NOTE(review): the declaration of `O7` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably it is the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportMeshShaders(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS7, &O7, sizeof(O7))))
return false;
return O7.MeshShaderTier != D3D12_MESH_SHADER_TIER_NOT_SUPPORTED;
return false;
// Returns true when the OPTIONS5 feature query succeeds and RaytracingTier is
// anything other than D3D12_RAYTRACING_TIER_NOT_SUPPORTED; returns false when
// the query fails.
// NOTE(review): the declaration of `O5` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably it is the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportRayTracing(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS5, &O5, sizeof(O5))))
return false;
return O5.RaytracingTier != D3D12_RAYTRACING_TIER_NOT_SUPPORTED;
return false;
// Returns true only when both the OPTIONS7 and OPTIONS9 feature queries
// succeed, MeshShaderTier is not D3D12_MESH_SHADER_TIER_NOT_SUPPORTED, and
// DerivativesInMeshAndAmplificationShadersSupported is non-FALSE. Returns
// false when either query fails.
// NOTE(review): the declarations of `O7` / `O9` are not visible in these
// lines, and the trailing `return false;` is unreachable as shown -
// presumably the fallback arm of a conditional-compilation guard; confirm.
bool DoesDeviceSupportMeshAmpDerivatives(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS7, &O7, sizeof(O7))) ||
FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &O9, sizeof(O9))))
return false;
return O7.MeshShaderTier != D3D12_MESH_SHADER_TIER_NOT_SUPPORTED &&
O9.DerivativesInMeshAndAmplificationShadersSupported != FALSE;
return false;
// Returns true when the OPTIONS9 feature query succeeds and reports
// AtomicInt64OnTypedResourceSupported as non-FALSE; returns false when the
// query fails.
// NOTE(review): the declaration of `O9` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportTyped64Atomics(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &O9, sizeof(O9))))
return false;
return O9.AtomicInt64OnTypedResourceSupported != FALSE;
return false;
// Returns true when the OPTIONS11 feature query succeeds and reports
// AtomicInt64OnDescriptorHeapResourceSupported as non-FALSE; returns false
// when the query fails.
// NOTE(review): the declaration of `O11` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportHeap64Atomics(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS11, &O11, sizeof(O11))))
return false;
return O11.AtomicInt64OnDescriptorHeapResourceSupported != FALSE;
return false;
// Returns true when the OPTIONS9 feature query succeeds and reports
// AtomicInt64OnGroupSharedSupported as non-FALSE; returns false when the
// query fails.
// NOTE(review): the declaration of `O9` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportShared64Atomics(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &O9, sizeof(O9))))
return false;
return O9.AtomicInt64OnGroupSharedSupported != FALSE;
return false;
// Returns true when the OPTIONS14 feature query succeeds and reports
// AdvancedTextureOpsSupported as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O14` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportAdvancedTexOps(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS14, &O14, sizeof(O14))))
return false;
return O14.AdvancedTextureOpsSupported != FALSE;
return false;
// Returns true when the OPTIONS14 feature query succeeds and reports
// WriteableMSAATexturesSupported as non-FALSE; returns false when the query
// fails.
// NOTE(review): the declaration of `O14` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportWritableMSAA(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS14, &O14, sizeof(O14))))
return false;
return O14.WriteableMSAATexturesSupported != FALSE;
return false;
// Returns true when the OPTIONS12 feature query succeeds and reports
// EnhancedBarriersSupported as non-FALSE; returns false when the query fails.
// NOTE(review): the declaration of `O12` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportEnhancedBarriers(ID3D12Device *pDevice) {
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS12, &O12, sizeof(O12))))
return false;
return O12.EnhancedBarriersSupported != FALSE;
return false;
// Returns true when the device supports enhanced barriers (checked first via
// DoesDeviceSupportEnhancedBarriers), the OPTIONS12 feature query succeeds,
// and RelaxedFormatCastingSupported is non-FALSE. Returns false otherwise.
// NOTE(review): the declaration of `O12` is not visible in these lines, and
// the trailing `return false;` is unreachable as shown - presumably the
// fallback arm of a conditional-compilation guard not visible here; confirm.
bool DoesDeviceSupportRelaxedFormatCasting(ID3D12Device *pDevice) {
if (!DoesDeviceSupportEnhancedBarriers(pDevice))
return false;
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS12, &O12, sizeof(O12))))
return false;
return O12.RelaxedFormatCastingSupported != FALSE;
return false;
bool IsFallbackPathEnabled(){
// Enable fallback paths with: /p:"EnableFallback=1"
UINT EnableFallbackValue = 0;
WEX::TestExecution::RuntimeParameters::TryGetValue(L"EnableFallback", EnableFallbackValue);
return EnableFallbackValue != 0;
#ifndef _HLK_CONF
// Compiles HLSL source text with D3DCompile, with the USING_DXBC macro
// defined as "1", and writes the resulting blob to *ppBlob.
//
//   pText          - null-terminated HLSL source.
//   pEntryPoint    - entry point name (wide string).
//   pTargetProfile - target profile name (wide string).
//   ppBlob         - receives the compiled blob from D3DCompile.
// NOTE(review): no check of `hr` is visible in these lines - confirm.
void DXBCFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) {
// Convert the wide entry point / profile names to UTF-8 narrow strings for D3DCompile.
CW2A pEntryPointA(pEntryPoint, CP_UTF8);
CW2A pTargetProfileA(pTargetProfile, CP_UTF8);
CComPtr<ID3DBlob> pErrors;
// Two entries, zeroed up front: entry 0 carries the define, entry 1 stays all-null.
D3D_SHADER_MACRO d3dMacro[2];
ZeroMemory(d3dMacro, sizeof(d3dMacro));
d3dMacro[0].Definition = "1";
d3dMacro[0].Name = "USING_DXBC";
HRESULT hr = D3DCompile(pText, strlen(pText), "hlsl.hlsl", d3dMacro, nullptr, pEntryPointA, pTargetProfileA, 0, 0, ppBlob, &pErrors);
// Log whatever the compiler put in the error blob.
if (pErrors != nullptr) {
CA2W errors((char *)pErrors->GetBufferPointer(), CP_ACP);
LogCommentFmt(L"Compilation failure: %s", errors.m_szBuffer);
// When UseDebugIfaces() is true, requests the ID3D12Debug interface via
// D3D12GetDebugInterface and returns an HRESULT describing the outcome.
// NOTE(review): the declaration of `hr` and any call made on debugController
// are not visible in these lines - confirm.
HRESULT EnableDebugLayer() {
// The debug layer does not yet validate DXIL programs that require rewriting,
// but basic logging should work properly.
if (UseDebugIfaces()) {
CComPtr<ID3D12Debug> debugController;
hr = D3D12GetDebugInterface(IID_PPV_ARGS(&debugController));
if (SUCCEEDED(hr)) {
hr = S_OK;
return hr;
static std::wstring GetModuleName() {
wchar_t moduleName[MAX_PATH+1] = {0};
DWORD length = GetModuleFileNameW(NULL, moduleName, MAX_PATH);
if (length == 0 || length == MAX_PATH) {
return std::wstring(); // Error condition
return std::wstring(moduleName, length);
// Resolves an SDK path that starts with ".\" against the directory of the
// path returned by GetModuleName(). The leading '.' is replaced by everything
// before the last backslash of that module path.
// SDKPath is returned unchanged when the module path contains no backslash
// or when SDKPath does not begin with ".\".
static std::wstring ComputeSDKFullPath(std::wstring SDKPath) {
  const std::wstring exePath = GetModuleName();
  const size_t lastSeparator = exePath.rfind('\\');
  const bool startsWithDotSlash = SDKPath.compare(0, 2, L".\\") == 0;
  if (lastSeparator == std::wstring::npos || !startsWithDotSlash)
    return SDKPath;
  // Keep the module directory (without its trailing backslash) and append
  // SDKPath from its backslash onward.
  std::wstring resolved = exePath.substr(0, lastSeparator);
  resolved += SDKPath.substr(1);
  return resolved;
}
// Try to automatically get the D3D12SDKVersion from D3D12Core.dll.
//
//   SDKPath - directory containing D3D12Core.dll; a leading ".\" is resolved
//             by ComputeSDKFullPath.
//
// Returns the UINT exported as "D3D12SDKVersion", or 0 when the DLL cannot be
// loaded or does not export that symbol.
static UINT GetD3D12SDKVersion(std::wstring SDKPath) {
  UINT SDKVersion = 0;
  std::wstring D3DCorePath = ComputeSDKFullPath(SDKPath);
  // The version lives in the DLL itself, so the file name has to be part of
  // the path handed to LoadLibraryW; add a separator if the caller left it off.
  if (!D3DCorePath.empty() && D3DCorePath.back() != L'\\')
    D3DCorePath.push_back(L'\\');
  D3DCorePath.append(L"D3D12Core.dll");
  HMODULE hCore = LoadLibraryW(D3DCorePath.c_str());
  if (hCore) {
    if (UINT *pSDKVersion = (UINT *)GetProcAddress(hCore, "D3D12SDKVersion"))
      SDKVersion = *pSDKVersion;
    // The module was loaded only to read the export; drop our reference so
    // the handle is not leaked.
    FreeLibrary(hCore);
  }
  return SDKVersion;
}
// Applies an Agility SDK version/path through ID3D12SDKConfiguration and then
// probes that the setting took effect by clearing experimental features.
// Returns the HRESULT of that probe, or an earlier failure.
// NOTE(review): the remainder of the parameter list (SDKPath is used below),
// the call that obtains pD3D12SDKConfiguration, and the first line of each
// GetProcAddress cast are not visible in these lines - confirm.
static HRESULT EnableAgilitySDK(HMODULE hRuntime, UINT SDKVersion,
D3D12GetInterfaceFn pD3D12GetInterface =
(D3D12GetInterfaceFn)GetProcAddress(hRuntime, "D3D12GetInterface");
CComPtr<ID3D12SDKConfiguration> pD3D12SDKConfiguration;
// SetSDKVersion takes a narrow path, hence the CW2A conversion.
IFR(pD3D12SDKConfiguration->SetSDKVersion(SDKVersion, CW2A(SDKPath)));
// Currently, it appears that the SetSDKVersion will succeed even when
// D3D12Core is not found, or its version doesn't match. When that's the
// case, will cause a failure in the very next thing that actually requires
// D3D12Core.dll to be loaded instead. So, we attempt to clear experimental
// features next, which is a valid use case and a no-op at this point. This
// requires D3D12Core to be loaded. If this fails, we know the AgilitySDK
// setting actually failed.
D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
hRuntime, "D3D12EnableExperimentalFeatures");
if (pD3D12EnableExperimentalFeatures == nullptr) {
// If this failed, D3D12 must be too old for AgilitySDK. But if that's
// the case, creating D3D12SDKConfiguration should have failed. So while
// this case shouldn't be hit, fail if it is.
return HRESULT_FROM_WIN32(GetLastError());
// Zero features with null arrays: the "clear experimental features" probe.
return pD3D12EnableExperimentalFeatures(0, nullptr, nullptr, nullptr);
// Enables the experimental shader models feature by calling the
// "D3D12EnableExperimentalFeatures" export of hRuntime with
// D3D12ExperimentalShaderModelsID. Returns that call's HRESULT, or the last
// Win32 error as an HRESULT when the export is not found.
// NOTE(review): the first line of the GetProcAddress cast is not visible in
// these lines - confirm.
static HRESULT EnableExperimentalShaderModels(HMODULE hRuntime) {
D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
hRuntime, "D3D12EnableExperimentalFeatures");
if (pD3D12EnableExperimentalFeatures == nullptr) {
return HRESULT_FROM_WIN32(GetLastError());
// One feature id, no configuration structs.
return pD3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModelsID,
nullptr, nullptr);
// Convenience overload: loads d3d12.dll and enables experimental shader
// models through it. Returns E_FAIL when the DLL cannot be loaded; otherwise
// returns the result of the HMODULE overload.
static HRESULT EnableExperimentalShaderModels() {
  if (HMODULE hD3D12 = LoadLibraryW(L"d3d12.dll"))
    return EnableExperimentalShaderModels(hD3D12);
  return E_FAIL;
}
// Reads the D3D12SDKVersion / D3D12SDKPath runtime parameters, auto-detects
// the version from the SDK directory when asked to, and applies the result
// through the three-argument EnableAgilitySDK overload.
//
// Returns (as far as the visible lines show):
//   E_FAIL  - a version was requested but none could be detected.
//   S_FALSE - no SDK was requested and none was found, or applying it failed
//             without having been requested.
//   hr      - the overload's result otherwise; on S_OK the success is logged
//             and m_AgilitySDKEnabled is set.
// NOTE(review): several lines (the TryGetValue call head for the version, the
// body of the trailing-backslash fix-up, closing braces) are not visible here
// - confirm against the full file.
HRESULT EnableAgilitySDK(HMODULE hRuntime) {
// D3D12SDKVersion > 1 will use provided version, otherwise, auto-detect.
// D3D12SDKVersion == 1 means fail if we can't auto-detect.
UINT SDKVersion = 0;
L"D3D12SDKVersion", SDKVersion);
// SDKPath must be relative path from .exe, which means relative to
// TE.exe location, and must start with ".\\", such as with the
// default: ".\\D3D12\\"
WEX::Common::String SDKPath;
if (SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(
L"D3D12SDKPath", SDKPath))) {
// Make sure path ends in backslash
if (!SDKPath.IsEmpty() && SDKPath.Right(1) != "\\") {
// No path supplied: fall back to the default SDK directory.
if (SDKPath.IsEmpty()) {
SDKPath = L".\\D3D12\\";
// Any non-zero requested version means failure to find the SDK is an error.
bool mustFind = SDKVersion > 0;
if (SDKVersion <= 1) {
// lookup version from D3D12Core.dll
SDKVersion = GetD3D12SDKVersion((LPCWSTR)SDKPath);
if (mustFind && SDKVersion == 0) {
LogErrorFmt(L"Agility SDK not found in relative path: %s", (LPCWSTR)SDKPath);
return E_FAIL;
// Not found, not asked for.
if (SDKVersion == 0)
return S_FALSE;
HRESULT hr= EnableAgilitySDK(hRuntime, SDKVersion, (LPCWSTR)SDKPath);
if (FAILED(hr)) {
// If SDKVersion provided, fail if not successful.
// 1 means we should find it, and fill in the version automatically.
if (mustFind) {
LogErrorFmt(L"Failed to set Agility SDK version %d at path: %s", SDKVersion, (LPCWSTR)SDKPath);
return hr;
return S_FALSE;
if (hr == S_OK) {
LogCommentFmt(L"Agility SDK version set to: %d", SDKVersion);
m_AgilitySDKEnabled = true;
return hr;
// Enables experimental shader models on hRuntime when the
// "ExperimentalShaders" test parameter is true, and records success in
// m_ExperimentalModeEnabled so later calls return S_OK immediately.
// NOTE(review): the declaration and initial value of `hr` are not visible in
// these lines, so the result when the parameter is false cannot be stated
// from here - confirm.
HRESULT EnableExperimentalMode(HMODULE hRuntime) {
// Already enabled by an earlier call: nothing to do.
if (m_ExperimentalModeEnabled) {
return S_OK;
bool bExperimentalShaderModels = GetTestParamBool(L"ExperimentalShaders");
if (bExperimentalShaderModels) {
hr = EnableExperimentalShaderModels(hRuntime);
if (SUCCEEDED(hr)) {
m_ExperimentalModeEnabled = true;
return hr;
// Bundles the objects used for fence-based synchronization: the event handle
// waited on, the fence itself, and the next fence value to use.
// The destructor closes the event handle if one was stored.
struct FenceObj {
  HANDLE m_fenceEvent = NULL;
  CComPtr<ID3D12Fence> m_fence;
  // Initialized so the value is never indeterminate if it is read before
  // InitFenceObj assigns the real starting value.
  UINT64 m_fenceValue = 0;
  ~FenceObj() {
    if (m_fenceEvent)
      CloseHandle(m_fenceEvent);
  }
};
// Prepares a FenceObj for use: sets the starting fence value to 1 and
// creates the event stored in m_fenceEvent.
// NOTE(review): no fence creation using pDevice is visible in these lines,
// and the body of the null-event check is missing - confirm.
void InitFenceObj(ID3D12Device *pDevice, FenceObj *pObj) {
pObj->m_fenceValue = 1;
// Create an event handle to use for frame synchronization.
// (Unnamed auto-reset event, initially non-signaled, default security.)
pObj->m_fenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
if (pObj->m_fenceEvent == nullptr) {
// Opens an HLSL data file as a read-only stream.
//
//   relativePath - path passed to GetPathToHlslDataFile to locate the file.
//   ppStream     - receives ownership of the new stream (via Detach).
//
// Each step is wrapped in VERIFY_SUCCEEDED. The expressions inside the VERIFY
// macros are kept verbatim because the macros may log their text.
void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) {
  std::wstring path = GetPathToHlslDataFile(relativePath);

  // Library object used to create both the blob and the stream.
  CComPtr<IDxcLibrary> pLibrary;
  VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));

  // Load the file contents into a blob.
  CComPtr<IDxcBlobEncoding> pBlob;
  VERIFY_SUCCEEDED(pLibrary->CreateBlobFromFile(path.c_str(), nullptr, &pBlob));

  // Wrap the blob in a read-only stream and hand it to the caller.
  CComPtr<IStream> pStream;
  VERIFY_SUCCEEDED(pLibrary->CreateStreamFromBlobReadOnly(pBlob, &pStream));
  *ppStream = pStream.Detach();
}
// Records, on pList, the commands to draw one instanced triangle into
// pRenderTarget and copy the result into pReadBuffer.
//
//   pList             - command list the commands are recorded on.
//   pRtvHeap          - heap whose first descriptor is used as the render target view.
//   rtvDescriptorSize - RTV descriptor increment size.
//   instanceCount     - instance count passed to DrawInstanced (3 vertices each).
//   pVertexBufferView - vertex buffer bound to slot 0.
//   pRootSig          - optional root signature (checked for null below).
//   pRenderTarget     - target; moved COPY_DEST -> RENDER_TARGET -> COPY_SOURCE.
//   pReadBuffer       - destination of the final texture copy.
// NOTE(review): the body of the pRootSig check and the declarations of
// ClearColor and Footprint are not visible in these lines - confirm.
void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList,
ID3D12DescriptorHeap *pRtvHeap,
UINT rtvDescriptorSize,
UINT instanceCount,
D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView,
ID3D12RootSignature *pRootSig,
ID3D12Resource *pRenderTarget,
ID3D12Resource *pReadBuffer) {
// Viewport and scissor cover the whole render target.
D3D12_RESOURCE_DESC rtDesc = pRenderTarget->GetDesc();
D3D12_VIEWPORT viewport;
D3D12_RECT scissorRect;
memset(&viewport, 0, sizeof(viewport));
viewport.Height = (float)rtDesc.Height;
viewport.Width = (float)rtDesc.Width;
viewport.MaxDepth = 1.0f;
memset(&scissorRect, 0, sizeof(scissorRect));
scissorRect.right = (long)rtDesc.Width;
scissorRect.bottom = rtDesc.Height;
if (pRootSig != nullptr) {
pList->RSSetViewports(1, &viewport);
pList->RSSetScissorRects(1, &scissorRect);
// Indicate that the buffer will be used as a render target.
RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_RENDER_TARGET);
CD3DX12_CPU_DESCRIPTOR_HANDLE rtvHandle(pRtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, rtvDescriptorSize);
pList->OMSetRenderTargets(1, &rtvHandle, FALSE, nullptr);
pList->ClearRenderTargetView(rtvHandle, ClearColor, 0, nullptr);
pList->IASetVertexBuffers(0, 1, pVertexBufferView);
pList->DrawInstanced(3, instanceCount, 0, 0);
// Transition to copy source and copy into read-back buffer.
RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_COPY_SOURCE);
// Copy into read-back buffer.
// Row pitch is computed as 4 bytes per pixel, matching the R8G8B8A8_UNORM
// footprint format used below.
UINT64 rowPitch = rtDesc.Width * 4;
Footprint.Offset = 0;
Footprint.Footprint = CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT_R8G8B8A8_UNORM, (UINT)rtDesc.Width, rtDesc.Height, 1, (UINT)rowPitch);
CD3DX12_TEXTURE_COPY_LOCATION DstLoc(pReadBuffer, Footprint);
CD3DX12_TEXTURE_COPY_LOCATION SrcLoc(pRenderTarget, 0);
pList->CopyTextureRegion(&DstLoc, 0, 0, 0, &SrcLoc, nullptr);
// Declarations of test drivers that are defined out of line as
// ExecutionTest:: members in this file.

// Runs `shader` as a compute shader over a raw UAV seeded from `values` and
// copies the buffer contents back into `values`.
void RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR shader, std::vector<uint32_t> &values);
// Picks a target profile from shaderModel / useLibTarget and dispatches to
// the compute or library variant below.
void RunLifetimeIntrinsicTest(ID3D12Device *pDevice, LPCSTR shader, D3D_SHADER_MODEL shaderModel, bool useLibTarget, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values);
// Compute variant: compiles with the given profile/options and runs the shader.
void RunLifetimeIntrinsicComputeTest(ID3D12Device *pDevice, LPCSTR pShader, CComPtr<ID3D12DescriptorHeap>& pUavHeap, CComPtr<ID3D12RootSignature>& pRootSignature,
LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values);
// Library variant: compiles the shader library and creates a state object.
void RunLifetimeIntrinsicLibTest(ID3D12Device *pDevice0, LPCSTR pShader, CComPtr<ID3D12RootSignature>& pRootSignature,
LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions);
// Binds exactly one descriptor heap on the command list.
//
//   pCommandList - command list to bind on.
//   pHeap        - the single heap passed to SetDescriptorHeaps.
void SetDescriptorHeap(ID3D12GraphicsCommandList *pCommandList, ID3D12DescriptorHeap *pHeap) {
  // The address of the parameter serves as a one-element heap array.
  pCommandList->SetDescriptorHeaps(1, &pHeap);
}
// Forwards to the global ::WaitForSignal using the fence, event and current
// fence value stored in FO, and advances FO.m_fenceValue by one.
void WaitForSignal(ID3D12CommandQueue *pCQ, FenceObj &FO) {
  // Pass the current value and leave the stored value one higher, exactly as
  // a post-increment in the argument list would.
  const UINT64 valueToSignal = FO.m_fenceValue;
  FO.m_fenceValue = valueToSignal + 1;
  ::WaitForSignal(pCQ, FO.m_fence, FO.m_fenceEvent, valueToSignal);
}
"#ifdef USING_DXBC\r\n" \
"uint WaveGetLaneIndex() { return 1; }\r\n" \
"uint WaveReadLaneFirst(uint u) { return u; }\r\n" \
"bool WaveIsFirstLane() { return true; }\r\n" \
"uint WaveGetLaneCount() { return 1; }\r\n" \
"uint WaveReadLaneAt(uint n, uint u) { return u; }\r\n" \
"bool WaveActiveAnyTrue(bool b) { return b; }\r\n" \
"bool WaveActiveAllTrue(bool b) { return false; }\r\n" \
"uint WaveActiveAllEqual(uint u) { return u; }\r\n" \
"uint4 WaveActiveBallot(bool b) { return 1; }\r\n" \
"uint WaveActiveCountBits(uint u) { return 1; }\r\n" \
"uint WaveActiveSum(uint u) { return 1; }\r\n" \
"uint WaveActiveProduct(uint u) { return 1; }\r\n" \
"uint WaveActiveBitAnd(uint u) { return 1; }\r\n" \
"uint WaveActiveBitOr(uint u) { return 1; }\r\n" \
"uint WaveActiveBitXor(uint u) { return 1; }\r\n" \
"uint WaveActiveMin(uint u) { return 1; }\r\n" \
"uint WaveActiveMax(uint u) { return 1; }\r\n" \
"uint WavePrefixCountBits(uint u) { return 1; }\r\n" \
"uint WavePrefixSum(uint u) { return 1; }\r\n" \
"uint WavePrefixProduct(uint u) { return 1; }\r\n" \
"uint QuadReadLaneAt(uint a, uint u) { return 1; }\r\n" \
"uint QuadReadAcrossX(uint u) { return 1; }\r\n" \
"uint QuadReadAcrossY(uint u) { return 1; }\r\n" \
"uint QuadReadAcrossDiagonal(uint u) { return 1; }\r\n" \
// Fills `values` with the ascending pattern 0, 1, 2, ..., count - 1.
//
//   values - resized to exactly `count` elements; previous contents are
//            overwritten.
//   count  - number of uint32_t elements to produce (an element count, not a
//            byte size).
static void SetupComputeValuePattern(std::vector<uint32_t> &values,
                                     size_t count) {
  values.resize(count);
  for (size_t i = 0; i < count; ++i) {
    values[i] = (uint32_t)i;
  }
}
// Class-level setup hook: delegates to DivergentClassSetup() and reports its
// result.
bool ExecutionTest::ExecutionTestClassSetup() {
  const bool setupSucceeded = DivergentClassSetup();
  return setupSucceeded;
}
// Runs pShader as a cs_6_0 compute shader with a single raw UAV bound at
// descriptor-table slot 0, dispatching one thread group, then copies the UAV
// into a read-back buffer.
//
//   pDevice - device used to create every object below.
//   pShader - HLSL source text of the compute shader.
//   values  - sizes the UAV (values.size() 32-bit elements).
// NOTE(review): several lines are malformed or missing as shown here - the
// declarations of heapDesc, ranges and uavDesc are not visible, the
// CreateTestUavs call has an empty third argument, and the read-back
// cast/memcpy have empty operands. How `values` is uploaded and refilled
// therefore cannot be stated from these lines - confirm against the full file.
void ExecutionTest::RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR pShader, std::vector<uint32_t> &values) {
// A single thread group is dispatched.
static const int DispatchGroupX = 1;
static const int DispatchGroupY = 1;
static const int DispatchGroupZ = 1;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12DescriptorHeap> pUavHeap;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
UINT uavDescriptorSize;
FenceObj FO;
const UINT valueSizeInBytes = (UINT)values.size() * sizeof(uint32_t);
CreateComputeCommandQueue(pDevice, L"RunRWByteBufferComputeTest Command Queue", &pCommandQueue);
InitFenceObj(pDevice, &FO);
// Describe and create a UAV descriptor heap.
heapDesc.NumDescriptors = 1;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
// Create root signature.
// One root parameter: a descriptor table holding a single UAV range.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
CD3DX12_ROOT_PARAMETER rootParameters[1];
rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
// Create pipeline state object.
CComPtr<ID3D12PipelineState> pComputeState;
CreateComputePSO(pDevice, pRootSignature, pShader, L"cs_6_0", &pComputeState);
// Create a command allocator and list for compute.
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
pCommandList->SetName(L"ExecutionTest::RunRWByteButterComputeTest Command List");
// Set up UAV resource.
CComPtr<ID3D12Resource> pUavResource;
CComPtr<ID3D12Resource> pReadBuffer;
CComPtr<ID3D12Resource> pUploadResource;
// NOTE(review): the third argument is empty as shown - confirm.
CreateTestUavs(pDevice, pCommandList,, valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);
VERIFY_SUCCEEDED(pUavResource->SetName(L"RunRWByteBufferComputeText UAV"));
VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunRWByteBufferComputeText UAV Read Buffer"));
VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunRWByteBufferComputeText UAV Upload Buffer"));
// Close the command list and execute it to perform the GPU setup.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
// Run the compute shader and copy the results back to readable memory.
// The view is a raw buffer view: R32_TYPELESS format with the RAW flag, one
// element per entry of `values`.
uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = (UINT)values.size();
uavDesc.Buffer.StructureByteStride = 0;
uavDesc.Buffer.CounterOffsetInBytes = 0;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
SetDescriptorHeap(pCommandList, pUavHeap);
pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
// Make the UAV readable as a copy source, then copy it to the read-back buffer.
RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
pCommandList->CopyResource(pReadBuffer, pUavResource);
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
MappedData mappedData(pReadBuffer, valueSizeInBytes);
// NOTE(review): the cast operand and the memcpy destination are empty as
// shown - confirm.
uint32_t *pData = (uint32_t *);
memcpy(, pData, (size_t)valueSizeInBytes);
WaitForSignal(pCommandQueue, FO);
// Compute variant of the lifetime-intrinsic test: compiles pShader for
// pTargetProfile with the given options, binds a raw UAV from pUavHeap at
// descriptor-table slot 0, dispatches one thread group, and copies the UAV
// into a read-back buffer.
//
//   pDevice        - device used to create every object below.
//   pShader        - HLSL source text.
//   pUavHeap       - caller-created heap that receives the UAV descriptor.
//   pRootSignature - caller-created root signature used for the PSO.
//   pTargetProfile - compile target passed to CreateComputePSO.
//   pOptions / numOptions - extra compiler options.
//   values         - sizes the UAV (values.size() 32-bit elements).
// NOTE(review): the declaration of uavDesc is not visible, the CreateTestUavs
// call has an empty third argument, and the read-back cast/memcpy have empty
// operands as shown here - confirm against the full file.
void ExecutionTest::RunLifetimeIntrinsicComputeTest(ID3D12Device *pDevice, LPCSTR pShader, CComPtr<ID3D12DescriptorHeap>& pUavHeap, CComPtr<ID3D12RootSignature>& pRootSignature,
LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values) {
// Create command queue.
CComPtr<ID3D12CommandQueue> pCommandQueue;
CreateComputeCommandQueue(pDevice, L"RunLifetimeIntrinsicTest Command Queue", &pCommandQueue);
FenceObj FO;
InitFenceObj(pDevice, &FO);
// Compile shader "main" and create pipeline state object.
CComPtr<ID3D12PipelineState> pComputeState;
CreateComputePSO(pDevice, pRootSignature, pShader, pTargetProfile, &pComputeState, pOptions, numOptions);
// Create a command allocator and list for compute.
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
pCommandList->SetName(L"ExecutionTest::RunLifetimeIntrinsicTest Command List");
// Set up UAV resource.
const UINT valueSizeInBytes = (UINT)values.size() * sizeof(uint32_t);
CComPtr<ID3D12Resource> pUavResource;
CComPtr<ID3D12Resource> pReadBuffer;
CComPtr<ID3D12Resource> pUploadResource;
// NOTE(review): the third argument is empty as shown - confirm.
CreateTestUavs(pDevice, pCommandList,, valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);
VERIFY_SUCCEEDED(pUavResource->SetName(L"RunLifetimeIntrinsicTest UAV"));
VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunLifetimeIntrinsicTest UAV Read Buffer"));
VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunLifetimeIntrinsicTest UAV Upload Buffer"));
// Close the command list and execute it to perform the GPU setup.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
// Run the compute shader and copy the results back to readable memory.
// The view is a raw buffer view: R32_TYPELESS format with the RAW flag, one
// element per entry of `values`.
uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = (UINT)values.size();
uavDesc.Buffer.StructureByteStride = 0;
uavDesc.Buffer.CounterOffsetInBytes = 0;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
SetDescriptorHeap(pCommandList, pUavHeap);
pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
// A single thread group is dispatched.
static const int DispatchGroupX = 1;
static const int DispatchGroupY = 1;
static const int DispatchGroupZ = 1;
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
// Make the UAV readable as a copy source, then copy it to the read-back buffer.
RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
pCommandList->CopyResource(pReadBuffer, pUavResource);
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
MappedData mappedData(pReadBuffer, valueSizeInBytes);
// NOTE(review): the cast operand and the memcpy destination are empty as
// shown - confirm.
uint32_t *pData = (uint32_t *);
memcpy(, pData, (size_t)valueSizeInBytes);
WaitForSignal(pCommandQueue, FO);
// Library (DXR) variant of the lifetime-intrinsic test: compiles pShader as a
// library, builds a raytracing state object from it, and submits a command
// list without dispatching any rays.
//
//   pDevice0       - base device interface supplied by the caller.
//   pShader        - HLSL source text.
//   pRootSignature - caller-created root signature.
//   pTargetProfile - compile target passed to CompileFromText.
//   pOptions / numOptions - extra compiler options.
// NOTE(review): how pDevice is obtained from pDevice0, the declaration of
// stateObjectDesc, and the statements that use lib, byteCode,
// localRootSigSubObj, x and maxRecursion are not visible in these lines -
// confirm against the full file.
void ExecutionTest::RunLifetimeIntrinsicLibTest(ID3D12Device *pDevice0, LPCSTR pShader, CComPtr<ID3D12RootSignature>& pRootSignature,
LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions) {
CComPtr<ID3D12Device5> pDevice;
// Create command queue.
CComPtr<ID3D12CommandQueue> pCommandQueue;
CreateCommandQueue(pDevice, L"RunLifetimeIntrinsicTest Command Queue", &pCommandQueue, D3D12_COMMAND_LIST_TYPE_DIRECT);
FenceObj FO;
InitFenceObj(pDevice, &FO);
// Compile raygen shader.
CComPtr<ID3DBlob> pShaderLib;
CompileFromText(pShader, L"RayGen", pTargetProfile, &pShaderLib, pOptions, numOptions);
// Describe and create the RT pipeline state object (RTPSO).
auto lib = stateObjectDesc.CreateSubobject<CD3DX12_DXIL_LIBRARY_SUBOBJECT>();
CD3DX12_SHADER_BYTECODE byteCode(pShaderLib);
// Payload and attribute sizes are expressed as counts of floats.
const int payloadCount = 4;
const int attributeCount = 2;
const int maxRecursion = 2;
stateObjectDesc.CreateSubobject<CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT>()->Config(payloadCount * sizeof(float), attributeCount * sizeof(float));
// Create (local!) root sig subobject and associate with shader.
auto localRootSigSubObj = stateObjectDesc.CreateSubobject<CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT>();
auto x = stateObjectDesc.CreateSubobject<CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT>();
CComPtr<ID3D12StateObject> pStateObject;
VERIFY_SUCCEEDED(pDevice->CreateStateObject(stateObjectDesc, IID_PPV_ARGS(&pStateObject)));
// Create a command allocator and list.
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
CComPtr<ID3D12GraphicsCommandList4> pCommandList;
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&pCommandAllocator)));
VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));
pCommandList->SetName(L"ExecutionTest::RunLifetimeIntrinsicTest Command List");
// Close the command list and execute it to kick-off compilation in the driver.
// NOTE: We don't care about anything else, so we're not setting up any resources and don't actually execute the shader.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
// Entry point for one lifetime-intrinsic configuration: maps shaderModel and
// useLibTarget to a target profile string, creates a one-descriptor UAV heap
// and a root signature with a single UAV descriptor table, then runs either
// the library (DXR) variant or the compute variant.
//
//   pDevice      - device used for all object creation.
//   pShader      - HLSL source text.
//   shaderModel  - selects the profile version; unlisted values fall back to
//                  lib_6_3 (library) or cs_6_0 (compute).
//   useLibTarget - true for a lib_* profile and the library variant.
//   pOptions / numOptions - extra compiler options.
//   values       - passed to the compute variant only.
// NOTE(review): the declarations of heapDesc, ranges and rootSigFlag are not
// visible in these lines - confirm against the full file.
void ExecutionTest::RunLifetimeIntrinsicTest(ID3D12Device *pDevice, LPCSTR pShader, D3D_SHADER_MODEL shaderModel, bool useLibTarget,
LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values) {
LPCWSTR pTargetProfile;
switch (shaderModel) {
default: pTargetProfile = useLibTarget ? L"lib_6_3" : L"cs_6_0"; break; // Default to 6.3 for lib, 6.0 otherwise.
case D3D_SHADER_MODEL_6_0: pTargetProfile = useLibTarget ? L"lib_6_0" : L"cs_6_0"; break;
case D3D_SHADER_MODEL_6_3: pTargetProfile = useLibTarget ? L"lib_6_3" : L"cs_6_3"; break;
case D3D_SHADER_MODEL_6_5: pTargetProfile = useLibTarget ? L"lib_6_5" : L"cs_6_5"; break;
case D3D_SHADER_MODEL_6_6: pTargetProfile = useLibTarget ? L"lib_6_6" : L"cs_6_6"; break;
// Describe a UAV descriptor heap.
heapDesc.NumDescriptors = 1;
// Create the UAV descriptor heap.
CComPtr<ID3D12DescriptorHeap> pUavHeap;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
// Create root signature.
// One root parameter: a descriptor table holding a single UAV range.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
CD3DX12_ROOT_PARAMETER rootParameters[1];
rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, rootSigFlag);
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
// The library variant does not take the UAV heap or `values`.
if (useLibTarget) {
RunLifetimeIntrinsicLibTest(pDevice, pShader, pRootSignature, pTargetProfile,
pOptions, numOptions);
} else {
RunLifetimeIntrinsicComputeTest(pDevice, pShader, pUavHeap, pRootSignature, pTargetProfile,
pOptions, numOptions, values);
// Runs one HLSL shader through RunLifetimeIntrinsicTest under several
// compiler-option / target combinations and checks after each run that
// values[1] still equals 1.
//
// Flow, as far as the visible lines show:
//   1. Create a device, trying shader model 6.6, then 6.3, then 6.0.
//   2. Log which parts are skipped (WARP / basic adapter, no SM 6.6, no DXR).
//   3. Seed `values` with an ascending pattern, one entry per thread
//      (8 * 8 * 1 threads, one group).
//   4. Run the cs_6_0 configurations, plus the lib_* (DXR) and 6.6
//      configurations when the device supports them.
//
// NOTE(review): in the embedded HLSL, RayGen declares `d` but then
// initializes `g` from `g` itself (`g > 64 ? 63 : g`); presumably `d` was
// intended - confirm.
// NOTE(review): closing braces, the end of the raw string literal and some
// statements are not visible in these lines, so the exact nesting of the
// configurations cannot be stated from here.
TEST_F(ExecutionTest, LifetimeIntrinsicTest) {
// The only thing we test here is that existence of lifetime intrinsics or
// their fallback replacement (store undef or store zeroinitializer) do not
// cause any issues in the runtime and driver stack.
// The easiest way to force placement of intrinsics is to create an array in
// a local scope that is dynamically indexed. It must not be optimized away,
// so we do some bogus initialization that prevents this. Since all the code
// is guarded by a conditional that is dynamically always false, the actual
// effect of the shader is that the same value that was read is written back.
static const char* pShader = R"(
RWByteAddressBuffer g_bab : register(u0);
void fn(uint GI) {
const uint addr = GI * 4;
const int val = g_bab.Load(addr);
int res = val;
if (val < 0) { // Never true.
int arr[200];
for (int i = 0; i < 200; ++i) {
arr[i] = arr[val - i];
res += arr[val];
g_bab.Store(addr, (uint)res);
void main(uint GI : SV_GroupIndex) {
void RayGen() {
const uint d = DispatchRaysIndex().x;
const uint g = g > 64 ? 63 : g;
static const int NumThreadsX = 8;
static const int NumThreadsY = 8;
static const int NumThreadsZ = 1;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
static const int DispatchGroupCount = 1;
CComPtr<ID3D12Device> pDevice;
bool bSM_6_6_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6, false);
bool bSM_6_3_Supported = bSM_6_6_Supported;
if (!bSM_6_6_Supported) {
// Try 6.3 for downlevel DXR case
bSM_6_3_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_3, false);
if (!bSM_6_3_Supported) {
// Otherwise, 6.0 better be supported for compute case
VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0, false));
bool bDXRSupported = bSM_6_3_Supported && DoesDeviceSupportRayTracing(pDevice);
if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
WEX::Logging::Log::Comment(L"WARP has a known issue with LifetimeIntrinsicTest.");
if (!bSM_6_6_Supported) {
WEX::Logging::Log::Comment(L"Native lifetime markers skipped, device does not support SM 6.6");
if (!bDXRSupported) {
WEX::Logging::Log::Comment(L"DXR lifetime tests skipped, device does not support DXR");
std::vector<uint32_t> values;
SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
// Run a number of tests for different configurations that will cause
// lifetime intrinsics to be:
// - placed directly
// - translated to an undef store
// - translated to a zeroinitializer store
// against compute and DXR targets, downlevel and SM 6.6:
// - downlevel: cs_6_0, lib_6_3 (DXR)
// - cs_6_6, lib_6_6 (DXR)
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
LPCWSTR optsBase[] = {L"-enable-lifetime-markers"};
LPCWSTR optsZeroStore[] = {L"-enable-lifetime-markers", L"-force-zero-store-lifetimes"};
WEX::Logging::Log::Comment(L"==== cs_6_0 with default translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_0, false,
optsBase, _countof(optsBase), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
if (bDXRSupported) {
WEX::Logging::Log::Comment(L"==== DXR lib_6_3 with default translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_3, true,
optsBase, _countof(optsBase), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
WEX::Logging::Log::Comment(L"==== cs_6_0 with zeroinitializer translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_0, false,
optsZeroStore, _countof(optsZeroStore), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
if (bDXRSupported) {
WEX::Logging::Log::Comment(L"==== DXR lib_6_3 with zeroinitializer translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_3, true,
optsZeroStore, _countof(optsZeroStore), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
if (bSM_6_6_Supported) {
WEX::Logging::Log::Comment(L"==== cs_6_6 with zeroinitializer translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_6, false,
optsZeroStore, _countof(optsZeroStore), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
if (bDXRSupported) {
WEX::Logging::Log::Comment(L"==== DXR lib_6_6 with zeroinitializer translation");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_6, true,
optsZeroStore, _countof(optsZeroStore), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
WEX::Logging::Log::Comment(L"==== cs_6_6 with native lifetime markers");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_6, false,
optsBase, _countof(optsBase), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
if (bDXRSupported) {
WEX::Logging::Log::Comment(L"==== DXR lib_6_6 with native lifetime markers");
RunLifetimeIntrinsicTest(pDevice, pShader, D3D_SHADER_MODEL_6_6, true,
optsBase, _countof(optsBase), values);
VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
// Seeds a buffer with an ascending pattern, runs a compute shader that loads
// each thread's 32-bit value and stores it back incremented by one, and
// verifies that element 0 goes from 0 to 1.
// NOTE(review): the statement governed by the CreateDevice check, the end of
// the shader string and the closing of the _HLK_CONF guard are not visible in
// these lines - confirm against the full file.
TEST_F(ExecutionTest, BasicComputeTest) {
#ifndef _HLK_CONF
// BasicComputeTest is a simple compute shader that can be used as the basis
// for more interesting compute execution tests.
// The HLSL is compatible with shader models <=5.1 to allow using the DXBC
// rendering code paths for comparison.
static const char pShader[] =
"RWByteAddressBuffer g_bab : register(u0);\r\n"
"void main(uint GI : SV_GroupIndex) {"
"  uint addr = GI * 4;\r\n"
"  uint val = g_bab.Load(addr);\r\n"
"  DeviceMemoryBarrierWithGroupSync();\r\n"
"  g_bab.Store(addr, val + 1);\r\n"
// One group of 8 x 8 x 1 threads; one buffer element per thread.
static const int NumThreadsX = 8;
static const int NumThreadsY = 8;
static const int NumThreadsZ = 1;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
static const int DispatchGroupCount = 1;
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice))
std::vector<uint32_t> values;
SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
// Element 0 starts at 0 and is expected to read back as 1 after the run.
VERIFY_ARE_EQUAL(values[0], (uint32_t)0);
RunRWByteBufferComputeTest(pDevice, pShader, values);
VERIFY_ARE_EQUAL(values[0], (uint32_t)1);
// Renders one triangle into a 320x200 render target, reads the target back,
// and checks two pixels: the top-center pixel against 0xff663300 and the
// middle-center pixel against 0xffffffff (the pixel shader returns 1).
// NOTE(review): this copy of the test looks incomplete (several calls have
// truncated argument lists and closing braces are absent) - confirm against
// the upstream file before changing the logic.
TEST_F(ExecutionTest, BasicTriangleTest) {
// The body is compiled only when _HLK_CONF is not defined.
#ifndef _HLK_CONF
// FrameCount sizes the RTV descriptor heap created below.
static const UINT FrameCount = 2;
static const UINT m_width = 320;
static const UINT m_height = 200;
// Used to scale the triangle's y coordinates.
static const float m_aspectRatio = static_cast<float>(m_width) / static_cast<float>(m_height);
// CPU-side vertex: a 3-float position followed by a 4-float color.
struct Vertex {
XMFLOAT3 position;
XMFLOAT4 color;
// Pipeline objects.
CComPtr<ID3D12Device> pDevice;
CComPtr<ID3D12Resource> pRenderTarget;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12RootSignature> pRootSig;
CComPtr<ID3D12DescriptorHeap> pRtvHeap;
CComPtr<ID3D12PipelineState> pPipelineState;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
CComPtr<ID3D12Resource> pReadBuffer;
UINT rtvDescriptorSize;
CComPtr<ID3D12Resource> pVertexBuffer;
D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
// Synchronization objects.
FenceObj FO;
// Shaders.
// VSMain forwards position and color; PSMain ignores the interpolated color
// and returns 1.
static const char pShaders[] =
"struct PSInput {\r\n"
" float4 position : SV_POSITION;\r\n"
" float4 color : COLOR;\r\n"
"PSInput VSMain(float4 position : POSITION, float4 color : COLOR) {\r\n"
" PSInput result;\r\n"
" result.position = position;\r\n"
" result.color = color;\r\n"
" return result;\r\n"
"float4 PSMain(PSInput input) : SV_TARGET {\r\n"
" return 1; //input.color;\r\n"
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Diagnostic helper: unless SetOK(true) was called, its destructor inspects
// the device's info queue and logs what it finds.
struct BasicTestChecker {
CComPtr<ID3D12Device> m_pDevice;
CComPtr<ID3D12InfoQueue> m_pInfoQueue;
bool m_OK = false;
void SetOK(bool value) { m_OK = value; }
// Keeps the device and tries to obtain its ID3D12InfoQueue interface.
BasicTestChecker(ID3D12Device *pDevice) : m_pDevice(pDevice) {
if (FAILED(m_pDevice.QueryInterface(&m_pInfoQueue)))
// Only does work when the test was not marked OK and an info queue exists.
~BasicTestChecker() {
if (!m_OK && m_pInfoQueue != nullptr) {
UINT64 count = m_pInfoQueue->GetNumStoredMessages();
bool invalidBytecodeFound = false;
// Scratch buffer reused across messages; only ever grown.
CAtlArray<BYTE> m_pBytes;
for (UINT64 i = 0; i < count; ++i) {
// First call with a null buffer asks for the message length.
SIZE_T len = 0;
if (FAILED(m_pInfoQueue->GetMessageA(i, nullptr, &len)))
if (m_pBytes.GetCount() < len && !m_pBytes.SetCount(len))
// Second call fills the buffer with the message itself.
D3D12_MESSAGE *pMsg = (D3D12_MESSAGE *)m_pBytes.GetData();
if (FAILED(m_pInfoQueue->GetMessageA(i, pMsg, &len)))
invalidBytecodeFound = true;
if (invalidBytecodeFound) {
LogCommentFmt(L"%s", L"Found an invalid bytecode message. This "
L"typically indicates that experimental mode "
L"is not set up properly.");
if (!GetTestParamBool(L"ExperimentalShaders")) {
LogCommentFmt(L"Note that the ExperimentalShaders test parameter isn't set.");
else {
LogCommentFmt(L"Did not find corrupt pixel or vertex shaders in "
L"queue - dumping complete queue.");
WriteInfoQueueMessages(nullptr, OutputFn, m_pInfoQueue);
// Callback handed to WriteInfoQueueMessages; logs each message it receives.
static void __stdcall OutputFn(void *pCtx, const wchar_t *pMsg) {
LogCommentFmt(L"%s", pMsg);
BasicTestChecker BTC(pDevice);
InitFenceObj(pDevice, &FO);
CreateRtvDescriptorHeap(pDevice, FrameCount, &pRtvHeap, &rtvDescriptorSize);
CreateRenderTargetAndReadback(pDevice, pRtvHeap, m_width, m_height, &pRenderTarget, &pReadBuffer);
// Create an empty root signature.
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
0, nullptr, 0, nullptr,
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSig);
// Create the pipeline state, which includes compiling and loading shaders.
// Define the vertex input layout.
D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = {
{"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12,
D3D12_INPUT_LAYOUT_DESC InputLayout = { inputElementDescs, _countof(inputElementDescs) };
CreateGraphicsPSO(pDevice, &InputLayout, pRootSig, pShaders, &pPipelineState);
CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue,
&pCommandAllocator, &pCommandList,
// Define the geometry for a triangle.
Vertex triangleVertices[] = {
{ { 0.0f, 0.25f * m_aspectRatio, 0.0f },{ 1.0f, 0.0f, 0.0f, 1.0f } },
{ { 0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 1.0f, 0.0f, 1.0f } },
{ { -0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 0.0f, 1.0f, 1.0f } } };
CreateVertexBuffer(pDevice, triangleVertices, &pVertexBuffer, &vertexBufferView);
WaitForSignal(pCommandQueue, FO);
// Render and execute the command list.
RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, 1,
&vertexBufferView, pRootSig, pRenderTarget,
ExecuteCommandList(pCommandQueue, pCommandList);
// Wait for previous frame.
WaitForSignal(pCommandQueue, FO);
// At this point, we've verified that execution succeeded with DXIL.
// Read back to CPU and examine contents.
// The mapped size is width * height * 4, i.e. four bytes per pixel.
MappedData data(pReadBuffer, m_width * m_height * 4);
const uint32_t *pPixels = (uint32_t *);
// Optionally dump the readback as a bitmap.
if (SaveImages()) {
SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, m_width, m_height, L"basic.bmp");
uint32_t top = pPixels[m_width / 2]; // Top center.
uint32_t mid = pPixels[m_width / 2 + m_width * (m_height / 2)]; // Middle center.
VERIFY_ARE_EQUAL(0xff663300, top); // clear color
VERIFY_ARE_EQUAL(0xffffffff, mid); // white
// Exercises 64-bit integer arithmetic in a compute shader: each thread squares
// its loaded value as a uint64_t and stores the upper 32 bits of the product.
// The test checks that element 0 is 0 both before and after the dispatch.
// NOTE(review): this copy of the test looks incomplete (no statements under
// the guards, no shader terminator, no closing brace) - confirm against the
// upstream file before changing the logic.
TEST_F(ExecutionTest, Int64Test) {
static const char pShader[] =
"RWByteAddressBuffer g_bab : register(u0);\r\n"
"void main(uint GI : SV_GroupIndex) {"
" uint addr = GI * 4;\r\n"
" uint val = g_bab.Load(addr);\r\n"
" uint64_t u64 = val;\r\n"
" u64 *= val;\r\n"
" g_bab.Store(addr, (uint)(u64 >> 32));\r\n"
// Host-side thread-group shape; the product below sizes the value buffer.
static const int NumThreadsX = 8;
static const int NumThreadsY = 8;
static const int NumThreadsZ = 1;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
static const int DispatchGroupCount = 1;
CComPtr<ID3D12Device> pDevice;
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Guard on int64 support; only a comment is logged when it is missing.
if (!DoesDeviceSupportInt64(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
std::vector<uint32_t> values;
SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);
VERIFY_ARE_EQUAL(values[0], (uint32_t)0);
RunRWByteBufferComputeTest(pDevice, pShader, values);
// 0 * 0 has no high bits, so element 0 is still expected to be 0.
VERIFY_ARE_EQUAL(values[0], (uint32_t)0);
// Checks the HLSL sign() intrinsic on integers: eight inputs
// (-3, -2, -1, 0, 1, 2, 3, 4) are replaced in place by sign(val), and the
// results are compared against -1, -1, -1, 0, 1, 1, 1, 1.
// NOTE(review): this copy of the test looks incomplete (no statement under the
// CreateDevice guard, no shader terminator, no closing brace) - confirm
// against the upstream file before changing the logic.
TEST_F(ExecutionTest, SignTest) {
static const char pShader[] =
"RWByteAddressBuffer g_bab : register(u0);\r\n"
"void main(uint GI : SV_GroupIndex) {"
" uint addr = GI * 4;\r\n"
" int val = g_bab.Load(addr);\r\n"
" g_bab.Store(addr, (uint)(sign(val)));\r\n"
// Host-side thread-group shape. These constants are not referenced again in
// the visible part of this test.
static const int NumThreadsX = 8;
static const int NumThreadsY = 1;
static const int NumThreadsZ = 1;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
static const int DispatchGroupCount = 1;
CComPtr<ID3D12Device> pDevice;
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Negative inputs are stored as their unsigned 32-bit bit patterns.
const uint32_t neg1 = (uint32_t)-1;
uint32_t origValues[] = { (uint32_t)-3, (uint32_t)-2, neg1, 0, 1, 2, 3, 4 };
std::vector<uint32_t> values(origValues, origValues + _countof(origValues));
RunRWByteBufferComputeTest(pDevice, pShader, values);
// Negative inputs -> -1.
VERIFY_ARE_EQUAL(values[0], neg1);
VERIFY_ARE_EQUAL(values[1], neg1);
VERIFY_ARE_EQUAL(values[2], neg1);
// Zero -> 0.
VERIFY_ARE_EQUAL(values[3], (uint32_t)0);
// Positive inputs -> 1.
VERIFY_ARE_EQUAL(values[4], (uint32_t)1);
VERIFY_ARE_EQUAL(values[5], (uint32_t)1);
VERIFY_ARE_EQUAL(values[6], (uint32_t)1);
VERIFY_ARE_EQUAL(values[7], (uint32_t)1);
// Sanity-checks the wave capability data the device reports through
// CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS1): min lane count must not
// exceed max, both must be non-zero when wave ops are supported, and both must
// be zero when they are not.
// NOTE(review): this copy of the test looks incomplete (the declaration of `O`
// and the statements under the guards are not visible) - confirm against the
// upstream file before changing the logic.
TEST_F(ExecutionTest, WaveIntrinsicsDDITest) {
// The body is compiled only when _HLK_CONF is not defined.
#ifndef _HLK_CONF
CComPtr<ID3D12Device> pDevice;
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Guard on the feature query itself failing.
if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &O, sizeof(O))))
bool waveSupported = O.WaveOps;
UINT laneCountMin = O.WaveLaneCountMin;
UINT laneCountMax = O.WaveLaneCountMax;
LogCommentFmt(L"WaveOps %i, WaveLaneCountMin %u, WaveLaneCountMax %u", waveSupported, laneCountMin, laneCountMax);
VERIFY_IS_TRUE(laneCountMin <= laneCountMax);
if (waveSupported) {
VERIFY_IS_TRUE(laneCountMin > 0 && laneCountMax > 0);
else {
VERIFY_IS_TRUE(laneCountMin == 0 && laneCountMax == 0);
// Runs a cs_6_0 compute shader that records the result of many wave
// intrinsics per thread into a structured buffer, reads the buffer back, groups
// threads into waves by the value of firstLaneId, and recomputes the expected
// per-wave values on the CPU to compare against what the GPU wrote.
// NOTE(review): this copy of the test looks incomplete (several calls have
// empty argument slots and closing braces are absent) - confirm against the
// upstream file before changing the logic.
TEST_F(ExecutionTest, WaveIntrinsicsTest) {
// The body is compiled only when _HLK_CONF is not defined.
#ifndef _HLK_CONF
// Only failed verifications are logged for the lifetime of this object.
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
// CPU-side mirror of the HLSL PerThreadData struct declared in pShader; the
// fields are listed in the same order (uint4 ballot <-> uint32_t ballot[4]).
struct PerThreadData {
uint32_t id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;
uint32_t allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;
uint32_t pfBC, pfSum, pfProd;
uint32_t ballot[4];
uint32_t diver; // divergent value, used in calculation
int32_t i_diver; // divergent value, used in calculation
int32_t i_allMax, i_allMin, i_allSum, i_allProd;
int32_t i_pfSum, i_pfProd;
static const char pShader[] =
"struct PerThreadData {\r\n"
" uint id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;\r\n"
" uint allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;\r\n"
" uint pfBC, pfSum, pfProd;\r\n"
" uint4 ballot;\r\n"
" uint diver;\r\n"
" int i_diver;\r\n"
" int i_allMax, i_allMin, i_allSum, i_allProd;\r\n"
" int i_pfSum, i_pfProd;\r\n"
"RWStructuredBuffer<PerThreadData> g_sb : register(u0);\r\n"
"void main(uint GI : SV_GroupIndex, uint3 GTID : SV_GroupThreadID) {"
" PerThreadData pts = g_sb[GI];\r\n"
" uint diver = GTID.x + 2;\r\n"
" pts.diver = diver;\r\n"
" pts.flags = 0;\r\n"
" pts.preds = 0;\r\n"
" if (WaveIsFirstLane()) pts.flags |= 1;\r\n"
" pts.laneIndex = WaveGetLaneIndex();\r\n"
" pts.laneCount = WaveGetLaneCount();\r\n"
" pts.firstLaneId = WaveReadLaneFirst(;\r\n"
" pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);\r\n"
" pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);\r\n"
" pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);\r\n"
" pts.preds |= ((WaveActiveAllEqual(GTID.z) ? 1 : 0) << 3);\r\n"
" pts.preds |= ((WaveActiveAllEqual(WaveReadLaneFirst(diver)) ? 1 : 0) << 4);\r\n"
" pts.ballot = WaveActiveBallot(diver > 3);\r\n"
" pts.firstlaneX = WaveReadLaneFirst(GTID.x);\r\n"
" pts.lane1X = WaveReadLaneAt(GTID.x, 1);\r\n"
" pts.allBC = WaveActiveCountBits(diver > 3);\r\n"
" pts.allSum = WaveActiveSum(diver);\r\n"
" pts.allProd = WaveActiveProduct(diver);\r\n"
" pts.allAND = WaveActiveBitAnd(diver);\r\n"
" pts.allOR = WaveActiveBitOr(diver);\r\n"
" pts.allXOR = WaveActiveBitXor(diver);\r\n"
" pts.allMin = WaveActiveMin(diver);\r\n"
" pts.allMax = WaveActiveMax(diver);\r\n"
" pts.pfBC = WavePrefixCountBits(diver > 3);\r\n"
" pts.pfSum = WavePrefixSum(diver);\r\n"
" pts.pfProd = WavePrefixProduct(diver);\r\n"
" int i_diver = pts.i_diver;\r\n"
" pts.i_allMax = WaveActiveMax(i_diver);\r\n"
" pts.i_allMin = WaveActiveMin(i_diver);\r\n"
" pts.i_allSum = WaveActiveSum(i_diver);\r\n"
" pts.i_allProd = WaveActiveProduct(i_diver);\r\n"
" pts.i_pfSum = WavePrefixSum(i_diver);\r\n"
" pts.i_pfProd = WavePrefixProduct(i_diver);\r\n"
" g_sb[GI] = pts;\r\n"
// Host-side thread-group shape; the product below sizes the value buffer.
static const int NumtheadsX = 8;
static const int NumtheadsY = 8;
static const int NumtheadsZ = 1;
static const int ThreadsPerGroup = NumtheadsX * NumtheadsY * NumtheadsZ;
static const int DispatchGroupCount = 1;
CComPtr<ID3D12Device> pDevice;
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Guard on wave-op support; only a comment is logged when it is missing.
if (!DoesDeviceSupportWaveOps(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
// Seed the input: id is the element index, i_diver is the index with its sign
// alternating (negative for even indices, positive for odd ones).
std::vector<PerThreadData> values;
values.resize(ThreadsPerGroup * DispatchGroupCount);
for (size_t i = 0; i < values.size(); ++i) {
memset(&values[i], 0, sizeof(PerThreadData));
values[i].id = (uint32_t)i;
values[i].i_diver = (int)i;
values[i].i_diver *= (i % 2) ? 1 : -1;
static const int DispatchGroupX = 1;
static const int DispatchGroupY = 1;
static const int DispatchGroupZ = 1;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12DescriptorHeap> pUavHeap;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
UINT uavDescriptorSize;
FenceObj FO;
// When true, the wave-count check further down is skipped.
bool dxbc = UseDxbc();
const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData);
CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue", &pCommandQueue);
InitFenceObj(pDevice, &FO);
// Describe and create a UAV descriptor heap.
heapDesc.NumDescriptors = 1;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
// Create root signature.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);
CD3DX12_ROOT_PARAMETER rootParameters[1];
rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
CComPtr<ID3DBlob> signature;
CComPtr<ID3DBlob> error;
VERIFY_SUCCEEDED(D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
VERIFY_SUCCEEDED(pDevice->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(), IID_PPV_ARGS(&pRootSignature)));
// Create pipeline state object.
CComPtr<ID3D12PipelineState> pComputeState;
CreateComputePSO(pDevice, pRootSignature, pShader, L"cs_6_0", &pComputeState);
// Create a command allocator and list for compute.
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
// Set up UAV resource.
CComPtr<ID3D12Resource> pUavResource;
CComPtr<ID3D12Resource> pReadBuffer;
CComPtr<ID3D12Resource> pUploadResource;
CreateTestUavs(pDevice, pCommandList,, (UINT)valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);
// Close the command list and execute it to perform the GPU setup.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
// Run the compute shader and copy the results back to readable memory.
uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = (UINT)values.size();
uavDesc.Buffer.StructureByteStride = sizeof(PerThreadData);
uavDesc.Buffer.CounterOffsetInBytes = 0;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
SetDescriptorHeap(pCommandList, pUavHeap);
pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
pCommandList->CopyResource(pReadBuffer, pUavResource);
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
MappedData mappedData(pReadBuffer, (UINT)valueSizeInBytes);
PerThreadData *pData = (PerThreadData *);
memcpy(, pData, valueSizeInBytes);
// Gather some general data.
// The 'firstLaneId' captures a unique number per first-lane per wave.
// Counting the number distinct firstLaneIds gives us the number of waves.
std::vector<uint32_t> firstLaneIds;
for (size_t i = 0; i < values.size(); ++i) {
PerThreadData &pts = values[i];
uint32_t firstLaneId = pts.firstLaneId;
if (!contains(firstLaneIds, firstLaneId)) {
// Waves should cover 4 threads or more.
// The format string previously had two %u specifiers for a single size_t
// argument; it now has exactly one specifier and an explicitly narrowed value.
LogCommentFmt(L"Found %u distinct lane ids", (unsigned)firstLaneIds.size());
if (!dxbc) {
VERIFY_IS_GREATER_THAN_OR_EQUAL(values.size() / 4, firstLaneIds.size());
// Now, group threads into waves.
std::map<uint32_t, std::unique_ptr<std::vector<PerThreadData *> > > waves;
for (size_t i = 0; i < firstLaneIds.size(); ++i) {
waves[firstLaneIds[i]] = std::make_unique<std::vector<PerThreadData *> >();
for (size_t i = 0; i < values.size(); ++i) {
PerThreadData &pts = values[i];
std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
// Verify that all the wave values are coherent across the wave.
for (size_t i = 0; i < values.size(); ++i) {
PerThreadData &pts = values[i];
std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
// Sort the lanes by increasing lane ID.
struct LaneIdOrderPred {
bool operator()(PerThreadData *a, PerThreadData *b) {
return a->laneIndex < b->laneIndex;
std::sort(wave.get()->begin(), wave.get()->end(), LaneIdOrderPred());
// Verify some interesting properties of the first lane.
uint32_t pfBC, pfSum, pfProd;
int32_t i_pfSum, i_pfProd;
int32_t i_allMax, i_allMin;
PerThreadData *ptdFirst = wave->front();
VERIFY_IS_TRUE(0 != (ptdFirst->flags & 1)); // FirstLane sets this bit.
// Prefix operations exclude the executing lane, so the first lane sees the
// identity value of each operation (0 for sums and counts, 1 for products).
VERIFY_IS_TRUE(0 == ptdFirst->pfBC);
VERIFY_IS_TRUE(0 == ptdFirst->pfSum);
VERIFY_IS_TRUE(1 == ptdFirst->pfProd);
VERIFY_IS_TRUE(0 == ptdFirst->i_pfSum);
VERIFY_IS_TRUE(1 == ptdFirst->i_pfProd);
// Seed the running prefix values with the first lane's contribution.
pfBC = (ptdFirst->diver > 3) ? 1 : 0;
pfSum = ptdFirst->diver;
pfProd = ptdFirst->diver;
i_pfSum = ptdFirst->i_diver;
i_pfProd = ptdFirst->i_diver;
i_allMax = i_allMin = ptdFirst->i_diver;
// Calculate values which take into consideration all lanes.
uint32_t preds = 0;
preds |= 1 << 1; // AllTrue starts true, switches to false if needed.
preds |= 1 << 2; // AllEqual starts true, switches to false if needed.
preds |= 1 << 3; // WaveActiveAllEqual(GTID.z) is always true
preds |= 1 << 4; // (WaveActiveAllEqual(WaveReadLaneFirst(diver)) is always true
uint32_t ballot[4] = { 0, 0, 0, 0 };
int32_t i_allSum = 0, i_allProd = 1;
for (size_t n = 0; n < wave->size(); ++n) {
std::vector<PerThreadData *> &lanes = *wave.get();
// pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);
if (lanes[n]->diver == 1) preds |= (1 << 0);
// pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);
if (lanes[n]->diver != 1) preds &= ~(1 << 1);
// pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);
if (lanes[0]->diver != lanes[n]->diver) preds &= ~(1 << 2);
// pts.ballot = WaveActiveBallot(diver > 3);\r\n"
if (lanes[n]->diver > 3) {
// This is the uint4 result layout:
// .x -> bits 0 .. 31
// .y -> bits 32 .. 63
// .z -> bits 64 .. 95
// .w -> bits 96 ..127
uint32_t component = lanes[n]->laneIndex / 32;
uint32_t bit = lanes[n]->laneIndex % 32;
// Unsigned literal: bit can be 31, which does not fit a signed int shift.
ballot[component] |= 1u << bit;
i_allMax = std::max(lanes[n]->i_diver, i_allMax);
i_allMin = std::min(lanes[n]->i_diver, i_allMin);
i_allProd *= lanes[n]->i_diver;
i_allSum += lanes[n]->i_diver;
for (size_t n = 1; n < wave->size(); ++n) {
// 'All' operations are uniform across the wave.
std::vector<PerThreadData *> &lanes = *wave.get();
VERIFY_IS_TRUE(0 == (lanes[n]->flags & 1)); // non-firstlanes do not set this bit
VERIFY_ARE_EQUAL(lanes[0]->allBC, lanes[n]->allBC);
VERIFY_ARE_EQUAL(lanes[0]->allSum, lanes[n]->allSum);
VERIFY_ARE_EQUAL(lanes[0]->allProd, lanes[n]->allProd);
VERIFY_ARE_EQUAL(lanes[0]->allAND, lanes[n]->allAND);
VERIFY_ARE_EQUAL(lanes[0]->allOR, lanes[n]->allOR);
VERIFY_ARE_EQUAL(lanes[0]->allXOR, lanes[n]->allXOR);
VERIFY_ARE_EQUAL(lanes[0]->allMin, lanes[n]->allMin);
VERIFY_ARE_EQUAL(lanes[0]->allMax, lanes[n]->allMax);
VERIFY_ARE_EQUAL(i_allMax, lanes[n]->i_allMax);
VERIFY_ARE_EQUAL(i_allMin, lanes[n]->i_allMin);
VERIFY_ARE_EQUAL(i_allProd, lanes[n]->i_allProd);
VERIFY_ARE_EQUAL(i_allSum, lanes[n]->i_allSum);
// first-lane reads and uniform reads are uniform across the wave.
VERIFY_ARE_EQUAL(lanes[0]->firstlaneX, lanes[n]->firstlaneX);
VERIFY_ARE_EQUAL(lanes[0]->lane1X, lanes[n]->lane1X);
// the lane count is uniform across the wave.
VERIFY_ARE_EQUAL(lanes[0]->laneCount, lanes[n]->laneCount);
// The predicates are uniform across the wave.
VERIFY_ARE_EQUAL(lanes[n]->preds, preds);
// the lane index is distinct per thread.
for (size_t prior = 0; prior < n; ++prior) {
VERIFY_ARE_NOT_EQUAL(lanes[prior]->laneIndex, lanes[n]->laneIndex);
// Ballot results are uniform across the wave.
VERIFY_ARE_EQUAL(0, memcmp(ballot, lanes[n]->ballot, sizeof(ballot)));
// Keep running total of prefix calculation. Prefix values are exclusive to
// the executing lane.
VERIFY_ARE_EQUAL(pfBC, lanes[n]->pfBC);
VERIFY_ARE_EQUAL(pfSum, lanes[n]->pfSum);
VERIFY_ARE_EQUAL(pfProd, lanes[n]->pfProd);
VERIFY_ARE_EQUAL(i_pfSum, lanes[n]->i_pfSum);
VERIFY_ARE_EQUAL(i_pfProd, lanes[n]->i_pfProd);
pfBC += (lanes[n]->diver > 3) ? 1 : 0;
pfSum += lanes[n]->diver;
pfProd *= lanes[n]->diver;
i_pfSum += lanes[n]->i_diver;
i_pfProd *= lanes[n]->i_diver;
// TODO: add divergent branching and verify that the otherwise uniform values properly diverge
// Compare each value of each per-thread element.
for (size_t i = 0; i < values.size(); ++i) {
PerThreadData &pts = values[i];
VERIFY_ARE_EQUAL(i,; // ID is unchanged.
// This test is assuming that the adapter implements WaveReadLaneFirst correctly
// Renders a single triangle with a pixel shader that appends one PerPixelData
// record per invocation (wave and quad intrinsic results) to a UAV at u1, then
// reads the append buffer back and validates the quad intrinsics for waves in
// which every pixel id was produced by exactly one invocation.
// NOTE(review): this copy of the test looks incomplete (several calls have
// empty argument slots and closing braces are absent) - confirm against the
// upstream file before changing the logic.
TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
// Only failed verifications are logged for the lifetime of this object.
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
struct Vertex {
XMFLOAT3 position;
// CPU-side mirror of the HLSL PerPixelData struct declared in pShaders; the
// fields are listed in the same order.
struct PerPixelData {
XMFLOAT4 position;
uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;
uint32_t id0, id1, id2, id3;
uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;
const UINT RTWidth = 128;
const UINT RTHeight = 128;
// Shaders.
// pos_to_id folds a pixel position into one integer (x * 128 + y).
static const char pShaders[] =
"struct PSInput {\r\n"
" float4 position : SV_POSITION;\r\n"
"PSInput VSMain(float4 position : POSITION) {\r\n"
" PSInput result;\r\n"
" result.position = position;\r\n"
" return result;\r\n"
"uint pos_to_id(float4 pos) { return pos.x * 128 + pos.y; }\r\n"
"struct PerPixelData {\r\n"
" float4 position;\r\n"
" uint id, flags, laneIndex, laneCount, firstLaneId, sum1;\r\n"
" uint id0, id1, id2, id3;\r\n"
" uint acrossX, acrossY, acrossDiag, quadActiveCount;\r\n"
"AppendStructuredBuffer<PerPixelData> g_sb : register(u1);\r\n"
"float4 PSMain(PSInput input) : SV_TARGET {\r\n"
" uint one = 1;\r\n"
" PerPixelData d;\r\n"
" d.position = input.position;\r\n"
" = pos_to_id(input.position);\r\n"
" d.flags = 0;\r\n"
" if (WaveIsFirstLane()) d.flags |= 1;\r\n"
" d.laneIndex = WaveGetLaneIndex();\r\n"
" d.laneCount = WaveGetLaneCount();\r\n"
" d.firstLaneId = WaveReadLaneFirst(;\r\n"
" d.sum1 = WaveActiveSum(one);\r\n"
" d.id0 = QuadReadLaneAt(, 0);\r\n"
" d.id1 = QuadReadLaneAt(, 1);\r\n"
" d.id2 = QuadReadLaneAt(, 2);\r\n"
" d.id3 = QuadReadLaneAt(, 3);\r\n"
" d.acrossX = QuadReadAcrossX(;\r\n"
" d.acrossY = QuadReadAcrossY(;\r\n"
" d.acrossDiag = QuadReadAcrossDiagonal(;\r\n"
" d.quadActiveCount = one + QuadReadAcrossX(one) + QuadReadAcrossY(one) + QuadReadAcrossDiagonal(one);\r\n"
" g_sb.Append(d);\r\n"
" return 1;\r\n"
CComPtr<ID3D12Device> pDevice;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12DescriptorHeap> pUavHeap, pRtvHeap;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
CComPtr<ID3D12PipelineState> pPSO;
CComPtr<ID3D12Resource> pRenderTarget, pReadBuffer;
UINT uavDescriptorSize, rtvDescriptorSize;
CComPtr<ID3D12Resource> pVertexBuffer;
D3D12_VERTEX_BUFFER_VIEW vertexBufferView;
// Guard on device creation.
if (!CreateDevice(&pDevice))
// Guard on wave-op support; only a comment is logged when it is missing.
if (!DoesDeviceSupportWaveOps(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
FenceObj FO;
InitFenceObj(pDevice, &FO);
// Describe and create a UAV descriptor heap.
heapDesc.NumDescriptors = 1;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);
CreateRtvDescriptorHeap(pDevice, 1, &pRtvHeap, &rtvDescriptorSize);
// Width first, then height, matching the other caller of this helper in the
// file. Both are 128 here, so the previous swapped order had no visible effect.
CreateRenderTargetAndReadback(pDevice, pRtvHeap, RTWidth, RTHeight, &pRenderTarget, &pReadBuffer);
// Create root signature: one UAV.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0, 0);
CD3DX12_ROOT_PARAMETER rootParameters[1];
rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
D3D12_INPUT_ELEMENT_DESC elementDesc[] = {
D3D12_INPUT_LAYOUT_DESC InputLayout = {elementDesc, _countof(elementDesc)};
CreateGraphicsPSO(pDevice, &InputLayout, pRootSignature, pShaders, &pPSO);
CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue, &pCommandAllocator,
&pCommandList, pPSO);
// Single triangle covering half the target.
Vertex vertices[] = {
{ { -1.0f, 1.0f, 0.0f } },
{ { 1.0f, 1.0f, 0.0f } },
{ { -1.0f, -1.0f, 0.0f } } };
const UINT TriangleCount = _countof(vertices) / 3;
CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView);
// When true, validation stops after the readback (see the check below).
bool dxbc = UseDxbc();
// Set up UAV resource.
// Room for two records per render-target pixel.
std::vector<PerPixelData> values;
values.resize(RTWidth * RTHeight * 2);
UINT valueSizeInBytes = (UINT)values.size() * sizeof(PerPixelData);
memset(, 0, valueSizeInBytes);
CComPtr<ID3D12Resource> pUavResource;
CComPtr<ID3D12Resource> pUavReadBuffer;
CComPtr<ID3D12Resource> pUploadResource;
CreateTestUavs(pDevice, pCommandList,, valueSizeInBytes, &pUavResource, &pUploadResource, &pUavReadBuffer);
// Set up the append counter resource.
CComPtr<ID3D12Resource> pUavCounterResource;
CComPtr<ID3D12Resource> pReadCounterBuffer;
CComPtr<ID3D12Resource> pUploadCounterResource;
BYTE zero[sizeof(UINT)] = { 0 };
CreateTestUavs(pDevice, pCommandList, zero, sizeof(zero), &pUavCounterResource, &pUploadCounterResource, &pReadCounterBuffer);
// Close the command list and execute it to perform the GPU setup.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));
SetDescriptorHeap(pCommandList, pUavHeap);
uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
uavDesc.Buffer.FirstElement = 0;
uavDesc.Buffer.NumElements = (UINT)values.size();
uavDesc.Buffer.StructureByteStride = sizeof(PerPixelData);
uavDesc.Buffer.CounterOffsetInBytes = 0;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
// The counter resource is attached to the view so Append() has a counter.
pDevice->CreateUnorderedAccessView(pUavResource, pUavCounterResource, &uavDesc, uavHandle);
pCommandList->SetGraphicsRootDescriptorTable(0, uavHandleGpu);
RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, TriangleCount, &vertexBufferView, nullptr, pRenderTarget, pReadBuffer);
RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
RecordTransitionBarrier(pCommandList, pUavCounterResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
pCommandList->CopyResource(pUavReadBuffer, pUavResource);
pCommandList->CopyResource(pReadCounterBuffer, pUavCounterResource);
LogCommentFmt(L"Rendering to %u by %u", RTWidth, RTHeight);
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
MappedData data(pReadBuffer, RTWidth * RTHeight * 4);
const uint32_t *pPixels = (uint32_t *);
// Optionally dump the readback as a bitmap.
if (SaveImages()) {
SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, RTWidth, RTHeight, L"psintrin.bmp");
// Number of records the shader appended, read from the counter readback.
uint32_t appendCount;
MappedData mappedData(pReadCounterBuffer, sizeof(uint32_t));
appendCount = *((uint32_t *);
LogCommentFmt(L"%u elements in append buffer", appendCount);
// Map the full byte size that the memcpy below reads; the element count was
// passed here before, unlike the other MappedData uses in this file, which
// pass byte sizes.
MappedData mappedData(pUavReadBuffer, valueSizeInBytes);
PerPixelData *pData = (PerPixelData *);
memcpy(, pData, valueSizeInBytes);
// DXBC is handy to test pipeline setup, but interesting functions are
// stubbed out, so there is no point in further validation.
if (dxbc)
// Largest WaveActiveSum(1) and WaveGetLaneCount() seen across all records.
uint32_t maxActiveLaneCount = 0;
uint32_t maxLaneCount = 0;
for (uint32_t i = 0; i < appendCount; ++i) {
maxActiveLaneCount = std::max(maxActiveLaneCount, values[i].sum1);
maxLaneCount = std::max(maxLaneCount, values[i].laneCount);
uint32_t peerOfHelperLanes = 0;
for (uint32_t i = 0; i < appendCount; ++i) {
if (values[i].sum1 != maxActiveLaneCount) {
L"Found: %u threads. Waves reported up to %u total lanes, up "
L"to %u active lanes, and %u threads had helper/inactive lanes.",
appendCount, maxLaneCount, maxActiveLaneCount, peerOfHelperLanes);
// Group threads into quad invocations.
uint32_t singlePixelCount = 0;
uint32_t multiPixelCount = 0;
std::unordered_set<uint32_t> ids;
// idGroups: records keyed by pixel id; firstIdGroups: records keyed by the
// firstLaneId value, i.e. one group per wave.
std::multimap<uint32_t, PerPixelData *> idGroups;
std::multimap<uint32_t, PerPixelData *> firstIdGroups;
for (uint32_t i = 0; i < appendCount; ++i) {
idGroups.insert(std::make_pair(values[i].id, &values[i]));
firstIdGroups.insert(std::make_pair(values[i].firstLaneId, &values[i]));
for (uint32_t id : ids) {
if (idGroups.count(id) == 1)
LogCommentFmt(L"%u pixels were processed by a single thread. %u invocations were for shared pixels.",
singlePixelCount, multiPixelCount);
// Multiple threads may have tried to shade the same pixel. (Is this true even if we have only one triangle?)
// Where every pixel is distinct, it's very straightforward to validate.
auto cur = firstIdGroups.begin(), end = firstIdGroups.end();
while (cur != end) {
// A wave is "simple" when none of its pixel ids occurs more than once.
bool simpleWave = true;
uint32_t firstId = (*cur).first;
auto groupEnd = cur;
while (groupEnd != end && (*groupEnd).first == firstId) {
if (idGroups.count((*groupEnd).second->id) > 1)
simpleWave = false;
if (simpleWave) {
// Break the wave into quads.
struct QuadData {
unsigned count;
PerPixelData *data[4];
std::map<uint32_t, QuadData> quads;
for (auto i = cur; i != groupEnd; ++i) {
// assuming that it is a simple wave, idGroups has a unique id for each entry.
uint32_t laneId = (*i).second->id;
uint32_t laneIds[4] = {(*i).second->id0, (*i).second->id1,
(*i).second->id2, (*i).second->id3};
// Since this is a simple wave, each lane has an unique id and
// therefore should not have any ids in there.
VERIFY_IS_TRUE(quads.find(laneId) == quads.end());
// check if QuadReadLaneAt is returning same values in a single quad.
bool newQuad = true;
for (unsigned quadIndex = 0; quadIndex < 4; ++quadIndex) {
auto match = quads.find(laneIds[quadIndex]);
if (match != quads.end()) {
(*match)[(*match).second.count++] = (*i).second;
newQuad = false;
auto quadMemberData = idGroups.find(laneIds[quadIndex]);
if (quadMemberData != idGroups.end()) {
VERIFY_IS_TRUE((*quadMemberData).second->id0 == laneIds[0]);
VERIFY_IS_TRUE((*quadMemberData).second->id1 == laneIds[1]);
VERIFY_IS_TRUE((*quadMemberData).second->id2 == laneIds[2]);
VERIFY_IS_TRUE((*quadMemberData).second->id3 == laneIds[3]);
if (newQuad) {
QuadData qdata;
qdata.count = 1;[0] = (*i).second;
quads.insert(std::make_pair(laneId, qdata));
for (auto quadPair : quads) {
unsigned count = quadPair.second.count;
// There could be only one pixel data on the edge of the triangle
if (count < 2) continue;
PerPixelData **data =;
bool isTop[4];
bool isLeft[4];
// Stand-in record for quad positions that have no recorded pixel.
// memset takes (dest, value, count): zero the whole object so its id reads
// as 0. The value and count arguments were swapped before, which zeroed
// nothing.
PerPixelData helperData;
memset(&helperData, 0, sizeof(helperData));
PerPixelData *layout[4]; // tl,tr,bl,br
// Same fix: every slot must start as nullptr for the check in fnToLayoutData.
memset(layout, 0, sizeof(layout));
// Maps (top, left) to the matching slot of layout[].
auto fnToLayout = [&](bool top, bool left) -> PerPixelData ** {
int idx = top ? 0 : 2;
idx += left ? 0 : 1;
return &layout[idx];
// Same lookup, but falls back to helperData for an empty slot.
auto fnToLayoutData = [&](bool top, bool left) -> PerPixelData * {
PerPixelData **pResult = fnToLayout(top, left);
if (*pResult == nullptr) return &helperData;
return *pResult;
VERIFY_IS_TRUE(count <= 4);
if (count == 2) {
// With two samples, equal coordinates put both on the same row/column.
isTop[0] = data[0]->position.y < data[1]->position.y;
isTop[1] = (data[0]->position.y == data[1]->position.y) ? isTop[0] : !isTop[0];
isLeft[0] = data[0]->position.x < data[1]->position.x;
isLeft[1] = (data[0]->position.x == data[1]->position.x) ? isLeft[0] : !isLeft[0];
else {
// with at least three samples, we have distinct x and y coordinates.
float left = std::min(data[0]->position.x, data[1]->position.x);
left = std::min(data[2]->position.x, left);
float top = std::min(data[0]->position.y, data[1]->position.y);
top = std::min(data[2]->position.y, top);
for (unsigned i = 0; i < count; ++i) {
isTop[i] = data[i]->position.y == top;
isLeft[i] = data[i]->position.x == left;
for (unsigned i = 0; i < count; ++i) {
*(fnToLayout(isTop[i], isLeft[i])) = data[i];
// Finally, we have a proper quad reconstructed. Validate.
for (unsigned i = 0; i < count; ++i) {
PerPixelData *d = data[i];
VERIFY_ARE_EQUAL(d->id0, fnToLayoutData(true, true)->id);
VERIFY_ARE_EQUAL(d->id1, fnToLayoutData(true, false)->id);
VERIFY_ARE_EQUAL(d->id2, fnToLayoutData(false, true)->id);
VERIFY_ARE_EQUAL(d->id3, fnToLayoutData(false, false)->id);
VERIFY_ARE_EQUAL(d->acrossX, fnToLayoutData(isTop[i], !isLeft[i])->id);
VERIFY_ARE_EQUAL(d->acrossY, fnToLayoutData(!isTop[i], isLeft[i])->id);
VERIFY_ARE_EQUAL(d->acrossDiag, fnToLayoutData(!isTop[i], !isLeft[i])->id);
VERIFY_ARE_EQUAL(d->quadActiveCount, count);
cur = groupEnd;
// TODO: provide validation for quads where the same pixel was shaded multiple times
// Consider: for pixels that were shaded multiple times, check whether
// some grouping of threads into quads satisfies all value requirements.
// Bundles the pieces a shader-op test run hands back to its caller: the
// selected operation, the set that owns it, and the test object.
struct ShaderOpTestResult {
  // Operation that was selected for the run (raw, non-owning pointer).
  st::ShaderOp *ShaderOp;
  // Parsed set of operations; shared so it stays alive with the result.
  std::shared_ptr<st::ShaderOpSet> ShaderOpSet;
  // Test object callers use for read-back (e.g. Test->GetReadBackData).
  std::shared_ptr<st::ShaderOpTest> Test;
};
// Plain aggregate of four floats: two values plus their "_o" counterparts.
// NOTE(review): the "_o" suffix presumably marks output slots written by a
// shader; confirm against the code that uses this struct.
struct SPrimitives {
  float f_float;
  float f_float2;
  float f_float_o;
  float f_float2_o;
};
// Selects one shader operation from an already-parsed ShaderOpSet and returns
// it, the set, and a newly created st::ShaderOpTest in a ShaderOpTestResult.
//   pName == nullptr : the set must contain exactly one op, which is selected.
//   otherwise        : the op is looked up by name; if the lookup fails, a
//                      message listing the available op names is assembled.
// NOTE(review): pName is read below but is not in the visible parameter list,
// and pDevice, support and pInitCallback are not used by any visible line.
// The listing appears to be missing the return type, a parameter, closing
// braces and the statements that actually run the test — confirm against the
// full file before relying on this description.
RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
st::ShaderOpTest::TInitCallbackFn pInitCallback,
std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
st::ShaderOp *pShaderOp;
if (pName == nullptr) {
// Without a name the choice is only unambiguous for a one-element set.
if (ShaderOpSet->ShaderOps.size() != 1) {
VERIFY_FAIL(L"Expected a single shader operation.");
pShaderOp = ShaderOpSet->ShaderOps[0].get();
else {
pShaderOp = ShaderOpSet->GetShaderOp(pName);
if (pShaderOp == nullptr) {
// Build a diagnostic that names the missing op and every op in the set;
// ops without a name are shown as "[n/a]".
std::string msg = "Unable to find shader op ";
msg += pName;
msg += "; available ops";
const char sep = ':';
for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
msg += sep;
msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
// Wide-character copy of the message.
// NOTE(review): no visible line consumes msgWide; presumably it is passed
// to a failure/logging call — confirm.
CA2W msgWide(msg.c_str());
// This won't actually be used since we're supplying the device,
// but let's make it consistent.
pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
// NOTE(review): the initializer of `result` is not visible here.
std::shared_ptr<ShaderOpTestResult> result =
result->ShaderOpSet = ShaderOpSet;
result->Test = test;
result->ShaderOp = pShaderOp;
return result;
// Parses a shader-op set out of pStream and forwards it, together with the
// remaining arguments, to RunShaderOpTestAfterParse.
//   pStream       : stream holding the shader-op description; must not be null.
//   pName         : name of the operation to run (forwarded unchanged).
//   pInitCallback : forwarded unchanged.
// NOTE(review): the return type and the initializer of ShaderOpSet are not
// visible in this listing; as shown, the parse call receives
// ShaderOpSet.get() as its destination — confirm against the full file.
RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
IStream *pStream, LPCSTR pName,
st::ShaderOpTest::TInitCallbackFn pInitCallback) {
DXASSERT_NOMSG(pStream != nullptr);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback, ShaderOpSet);
// Runs the "OOB" shader op from ShaderOpArith.xml and checks that the first
// pixel read back from the "RTarget" resource is 0xff0000ff.
TEST_F(ExecutionTest, OutOfBoundsTest) {
// Only failed verifications are logged for the duration of this test.
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
// Single operation test at the moment.
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement guarded by this check is not visible;
// presumably the test returns early when no device can be created.
if (!CreateDevice(&pDevice))
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
// NOTE(review): `data` is declared twice below; presumably the second
// declaration lives in a nested scope whose braces are not visible.
MappedData data;
// Read back to CPU and examine contents - should get pure red.
MappedData data;
test->Test->GetReadBackData("RTarget", &data);
// NOTE(review): the operand of this cast is missing in the listing;
// presumably it is the mapped pointer of `data` — confirm.
const uint32_t *pPixels = (uint32_t *);
uint32_t first = *pPixels;
VERIFY_ARE_EQUAL(0xff0000ff, first); // pure red - only first component is read
// Runs the "Saturate" shader op from ShaderOpArith.xml and compares the floats
// read back from "U0" against a table of nine expected results.
TEST_F(ExecutionTest, SaturateTest) {
// Only failed verifications are logged for the duration of this test.
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
// Single operation test at the moment.
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement guarded by this check is not visible;
// presumably the test returns early when no device can be created.
if (!CreateDevice(&pDevice))
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Saturate", nullptr);
MappedData data;
test->Test->GetReadBackData("U0", &data);
// NOTE(review): the operand of this cast is missing in the listing;
// presumably it is the mapped pointer of `data` — confirm.
const float *pValues = (float *);
// Everything is zero except for 1.5f and +Inf, which saturate to 1.0f
const float ExpectedCases[9] = {
0.0f, 0.0f, 0.0f, 0.0f, // -inf, -1.5, -denorm, -0
0.0f, 0.0f, 1.0f, 1.0f, // 0, denorm, 1.5f, inf
0.0f // nan
// NOTE(review): no visible line advances pValues inside this loop, so as
// listed every iteration compares the same element; presumably an increment
// is missing from the listing — confirm.
for (size_t i = 0; i < _countof(ExpectedCases); ++i) {
VERIFY_IS_TRUE(ifdenorm_flushf_eq(*pValues, ExpectedCases[i]));
// Shared body of the basic-triangle tests: runs the shader op named by
// ShaderOpName from ShaderOpArith.xml, reads back "RTarget", optionally saves
// the image, and checks two pixels (top center and middle center).
//   ShaderOpName : name of the shader op to run.
//   FileName     : file the read-back pixels are saved to when SaveImages()
//                  is true.
//   testModel    : shader model passed to CreateDevice; for
//                  D3D_SHADER_MODEL_6_2 the device must also support native
//                  16-bit ops.
void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName, LPCWSTR FileName, D3D_SHADER_MODEL testModel) {
// NOTE(review): the matching #else/#endif (and whatever the _HLK_CONF branch
// contains) are not visible in this listing.
#ifdef _HLK_CONF
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
// Single operation test at the moment.
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement guarded by this check is not visible;
// presumably the test returns early when no device can be created.
if (!CreateDevice(&pDevice, testModel))
// As this is used, 6.2 requirement always comes with requiring native 16-bit ops
if (testModel == D3D_SHADER_MODEL_6_2 && !DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr);
MappedData data;
// Image dimensions come from the render target's resource description.
D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
UINT width = (UINT)D.Width;
UINT height = D.Height;
test->Test->GetReadBackData("RTarget", &data);
// NOTE(review): the operand of this cast is missing in the listing;
// presumably it is the mapped pointer of `data` — confirm.
const uint32_t *pPixels = (uint32_t *);
// NOTE(review): the saved image size is hard-coded to 320x200 although
// width/height were just read from the resource description; consider
// passing those instead so the two cannot drift apart.
if (SaveImages()) {
SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, 320, 200, FileName);
// Row 0, middle column, and the middle column of the middle row.
uint32_t top = pPixels[width / 2]; // Top center.
uint32_t mid = pPixels[width / 2 + width * (height / 2)]; // Middle center.
VERIFY_ARE_EQUAL(0xff663300, top); // clear color
VERIFY_ARE_EQUAL(0xffffffff, mid); // white
// This is the basic validation test for shader operations, so it's good to
// check this here at least for this one test case.
// Runs the "Triangle" shader op with shader model 6.0; when image saving is
// enabled the result goes to basic-triangle.bmp.
TEST_F(ExecutionTest, BasicTriangleOpTest) {
  BasicTriangleTestSetup("Triangle", L"basic-triangle.bmp", D3D_SHADER_MODEL_6_0);
}
// Same check as the basic triangle test, but runs the "TriangleHalf" shader op
// with shader model 6.2; when image saving is enabled the result goes to
// basic-triangle-half.bmp.
TEST_F(ExecutionTest, BasicTriangleOpTestHalf) {
  BasicTriangleTestSetup("TriangleHalf", L"basic-triangle-half.bmp", D3D_SHADER_MODEL_6_2);
}
// Checks the four derivative values stored for the center pixel.
//   pPixels      : read-back floats; the four values starting at offsetCenter
//                  are ddx_fine, ddy_fine, ddx_coarse, ddy_coarse in that order.
//   offsetCenter : index (in floats) of the center pixel's first value.
// The fine derivatives determine which 2x2 quad the center pixel was grouped
// into; the coarse derivatives are then checked against the values that quad
// allows. All comparisons use a 1-ULP tolerance.
void VerifyDerivResults(const float *pPixels, UINT offsetCenter) {
// pixel at the center
float CenterDDXFine = pPixels[offsetCenter];
float CenterDDYFine = pPixels[offsetCenter + 1];
float CenterDDXCoarse = pPixels[offsetCenter + 2];
float CenterDDYCoarse = pPixels[offsetCenter + 3];
// NOTE(review): the call that consumes this format string and its arguments
// is not visible in the listing; presumably a formatted log call — confirm.
L"center ddx_fine: %8f, ddy_fine: %8f, ddx_coarse: %8f, ddy_coarse: %8f",
CenterDDXFine, CenterDDYFine, CenterDDXCoarse, CenterDDYCoarse);
// The texture for the 9 pixels in the center should look like the following
// 256 32 64
// 2048 256 512
// 1 .125 .25
// In D3D12 there is no guarantee of how the adapter is grouping 2x2 pixels
// So for fine derivatives there can be up to two possible results for the center pixel,
// while for coarse derivatives there can be up to six possible results.
int ulpTolerance = 1;
// 256 - 2048 (paired with the left neighbor) or 512 - 256 (paired with the right)
bool left = CompareFloatULP(CenterDDXFine, -1792.0f, ulpTolerance);
VERIFY_IS_TRUE(left || CompareFloatULP(CenterDDXFine, 256.0f, ulpTolerance));
// 256 - 32 (paired with the neighbor above) or .125 - 256 (paired with the one below)
bool top = CompareFloatULP(CenterDDYFine, 224.0f, ulpTolerance);
VERIFY_IS_TRUE(top || CompareFloatULP(CenterDDYFine, -255.875, ulpTolerance));
// Center pixel grouped with its left and upper neighbors.
if (top && left) {
VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -224.0f, ulpTolerance) ||
CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance)) &&
(CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
CompareFloatULP(CenterDDYCoarse, 1792.0f, ulpTolerance)));
else if (top) { // top right quad
VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance) ||
CompareFloatULP(CenterDDXCoarse, 32.0f, ulpTolerance)) &&
(CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
CompareFloatULP(CenterDDYCoarse, 448.0f, ulpTolerance)));
else if (left) { // bottom left quad
VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance) ||
CompareFloatULP(CenterDDXCoarse, -.875f, ulpTolerance)) &&
(CompareFloatULP(CenterDDYCoarse, -2047.0f, ulpTolerance) ||
CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance)));
else { // bottom right
VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance) ||
CompareFloatULP(CenterDDXCoarse, .125f, ulpTolerance)) &&
(CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance) ||
CompareFloatULP(CenterDDYCoarse, -511.75f, ulpTolerance)));
// Rendering two right triangles forming a square and assigning a texture value
// for each pixel to calculate derivatives.
// Runs the "DerivFine" shader op from ShaderOpArith.xml, reads back "RTarget"
// and hands the center pixel's values to VerifyDerivResults.
TEST_F(ExecutionTest, PartialDerivTest) {
// Only failed verifications are logged for the duration of this test.
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement guarded by this check is not visible;
// presumably the test returns early when no device can be created.
if (!CreateDevice(&pDevice))
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "DerivFine", nullptr);
MappedData data;
D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
UINT width = (UINT)D.Width;
UINT height = D.Height;
// Pixel size in 4-byte units, since the pixel data is indexed as floats.
UINT pixelSize = GetByteSizeForFormat(D.Format) / 4;
test->Test->GetReadBackData("RTarget", &data);
// NOTE(review): the operand of this cast is missing in the listing;
// presumably it is the mapped pointer of `data` — confirm.
const float *pPixels = (float *);
// Half the pixel count minus half a row.
UINT centerIndex = (UINT64)width * height / 2 - width / 2;
UINT offsetCenter = centerIndex * pixelSize;
VerifyDerivResults(pPixels, offsetCenter);
// Three integer extents describing one dispatch configuration. RunDispatch
// copies them into the shader op's DispatchX/Y/Z and into the compiler
// arguments.
struct Dispatch {
  int width, height, depth;
};
// Configures pShaderOp for one dispatch size and returns a new
// st::ShaderOpTest.
//   pShaderOp : operation to configure; every entry in its Shaders list gets
//               its Arguments pointed at the formatted compiler options, and
//               DispatchX/Y/Z are set from D.
//   D         : dispatch extents to apply.
// NOTE(review): the return type, the sprintf_s format string (and whatever its
// result is compared against), and the statements that use pDevice/support to
// run the test are not visible in this listing. Also note S.Arguments is
// assigned the address of the local compilerOptions buffer, so it is only
// valid while this function's frame is alive — confirm the run happens here.
RunDispatch(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
st::ShaderOp *pShaderOp, const Dispatch D) {
char compilerOptions[256];
std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
// format compiler args
VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions),
D.width, D.height, D.depth));
for (st::ShaderOpShader &S : pShaderOp->Shaders)
S.Arguments = compilerOptions;
pShaderOp->DispatchX = D.width;
pShaderOp->DispatchY = D.height;
pShaderOp->DispatchZ = D.depth;
return test;
// Runs the "Derivatives" shader op from ShaderOpArith.xml over several
// dispatch sizes and checks the center values with VerifyDerivResults:
//   - compute shader results are read from "U0" for each entry of `dispatches`;
//   - when the device supports mesh/amplification derivatives, the CS is
//     disabled and results are read from "U1" (mesh) and "U2" (amplification)
//     for each entry of `meshDispatches`;
//   - finally each entry of `badDispatches` is run without result checks.
// NOTE(review): closing braces, the early return after CreateDevice, the
// make_shared initializer of ShaderOpSet and the operands of the pointer casts
// are not visible in this listing.
TEST_F(ExecutionTest, DerivativesTest) {
const UINT pixelSize = 4; // always float4
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("Derivatives");
// Dispatch sizes used for the compute shader runs.
std::vector<Dispatch> dispatches =
{40, 1, 1},
{1000, 1, 1},
{32, 32, 1},
{16, 64, 1},
{4, 12, 4},
{4, 64, 1},
{16, 16, 3},
{32, 8, 2}
// Dispatch sizes used for the mesh/amplification shader runs.
std::vector<Dispatch> meshDispatches =
{60, 1, 1},
{128, 1, 1},
{8, 8, 1},
{32, 8, 1},
{8, 16, 4},
{8, 64, 1},
{8, 8, 3},
// Sizes that are only run, never verified (see the final loop).
std::vector<Dispatch> badDispatches =
{16, 3, 1},
{2, 16, 1},
{33, 1, 1}
pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
// Remember the compute shader so it can be restored after being disabled.
LPCSTR CS = pShaderOp->CS;
MappedData data;
for (Dispatch &D : dispatches) {
// Test Compute Shader
std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);
test->GetReadBackData("U0", &data);
float *pPixels = (float *);;
UINT centerIndex = 0;
if (D.height == 1) {
// Single-row case: halve the thread count, round down to a multiple of 16,
// then add 10.
centerIndex = (((UINT64)(D.width * D.height * D.depth) / 2) & ~0xF) + 10;
} else {
// To find roughly the center for compute, divide the height and width in half,
// truncate to the previous multiple of 4 to get to the start of the repeating pattern
// and then add 2 rows to get to the second row of quads and 2 to get to the first texel
// of the second row of that quad row
UINT centerRow = ((D.height/2UL) & ~0x3) + 2;
UINT centerCol = ((D.width/2UL) & ~0x3) + 2;
centerIndex = centerRow * D.width + centerCol;
UINT offsetCenter = centerIndex * pixelSize;
LogCommentFmt(L"Verifying derivatives in compute shader results");
VerifyDerivResults(pPixels, offsetCenter);
if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
// Disable CS so mesh goes forward
pShaderOp->CS = nullptr;
for (Dispatch &D : meshDispatches) {
std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);
test->GetReadBackData("U1", &data);
const float *pPixels = (float *);
// Same center computation as the single-row compute case above.
UINT centerIndex = (((UINT64)(D.width * D.height * D.depth)/2) & ~0xF) + 10;
UINT offsetCenter = centerIndex * pixelSize;
LogCommentFmt(L"Verifying derivatives in mesh shader results");
VerifyDerivResults(pPixels, offsetCenter);
test->GetReadBackData("U2", &data);
pPixels = (float *);
LogCommentFmt(L"Verifying derivatives in amplification shader results");
VerifyDerivResults(pPixels, offsetCenter);
// Final tests with invalid dispatch size just to make sure they run
for (Dispatch &D : badDispatches) {
// Test Compute Shader
pShaderOp->CS = CS;
std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);
if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
pShaderOp->CS = nullptr;
test = RunDispatch(pDevice, m_support, pShaderOp, D);
// Verify the results for the quad starting with the given index.
//   pPixels   : read-back data holding four UINTs per lane. Slot 0 is the
//               lane's own value; slots 1..3 are what the lane read across X,
//               across Y and across the diagonal.
//   quadIndex : index of the first lane of the quad; lanes quadIndex ..
//               quadIndex + 3 are checked.
// For each lane the three reads must equal the lane's own value with bit 0,
// bit 1, or both bits flipped, respectively.
void VerifyQuadReadResults(const UINT *pPixels, UINT quadIndex) {
  for (UINT i = 0; i < 4; i++) {
    UINT ix = quadIndex + i;
    UINT lix = pPixels[4*ix];
    VERIFY_ARE_EQUAL(pPixels[4*ix + 1], (lix^1));// ReadAcrossX
    VERIFY_ARE_EQUAL(pPixels[4*ix + 2], (lix^2));// ReadAcrossY
    VERIFY_ARE_EQUAL(pPixels[4*ix + 3], (lix^3));// ReadAcrossDiagonal
  }
}
// Runs the "QuadRead" shader op from ShaderOpArith.xml for several dispatch
// sizes and checks the first, second and center quads with
// VerifyQuadReadResults: compute results from "U0" and, when the device
// supports mesh/amplification derivatives, mesh results from "U1" and
// amplification results from "U2".
// NOTE(review): closing braces, the statements guarded by the device checks
// (presumably early returns), the make_shared initializer of ShaderOpSet, the
// sprintf_s format string, the initializers of mwidth/mheight/mdepth
// (presumably D.mx/D.my/D.mz) and the operands of the pointer casts are not
// visible in this listing.
TEST_F(ExecutionTest, QuadReadTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice))
if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
WEX::Logging::Log::Comment(L"WARP does not support QuadRead in compute shaders.");
if (!DoesDeviceSupportWaveOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("QuadRead");
// Remember the compute shader so it can be restored on each iteration.
LPCSTR CS = pShaderOp->CS;
// Local type (distinct from the file-level Dispatch): x/y/z feed the
// dispatch size; mx/my/mz are a second triple.
// NOTE(review): presumably the mesh-shader dimensions — confirm.
struct Dispatch {
int x, y, z;
int mx, my, mz;
//std::vector<std::tuple<int, int, int, int, int>> dispatches =
std::vector<Dispatch> dispatches =
{32, 32, 1, 8, 8, 1},
{64, 4, 1, 64, 2, 1},
{64, 1, 1, 64, 1, 1},
{16, 16, 3, 4, 4, 3},
for (Dispatch &D : dispatches) {
UINT width = D.x;
UINT height = D.y;
UINT depth = D.z;
UINT mwidth =;
UINT mheight =;
UINT mdepth =;
// format compiler args
char compilerOptions[256];
VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions),
width, height, depth, mwidth, mheight, mdepth));
for (st::ShaderOpShader &S : pShaderOp->Shaders)
S.Arguments = compilerOptions;
pShaderOp->DispatchX = width;
pShaderOp->DispatchY = height;
pShaderOp->DispatchZ = depth;
// Test Compute Shader
pShaderOp->CS = CS;
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, ShaderOpSet);
MappedData data;
test->Test->GetReadBackData("U0", &data);
const UINT *pPixels = (UINT *);
// To find roughly the center for compute, divide the pixel count in half
// and round down to a multiple of 4 to start at a quad
UINT offsetCenter = ((UINT64)(width * height * depth)/2) & ~0x3;
// Test first, second and center quads
LogCommentFmt(L"Verifying QuadRead* in compute shader results");
VerifyQuadReadResults(pPixels, 0);
VerifyQuadReadResults(pPixels, 4);
VerifyQuadReadResults(pPixels, offsetCenter);
if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
// Center quad is recomputed from the second dimension triple.
offsetCenter = ((UINT64)(mwidth * mheight * mdepth)/2) & ~0x3;
// Disable CS so mesh goes forward
pShaderOp->CS = nullptr;
test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, ShaderOpSet);
test->Test->GetReadBackData("U1", &data);
pPixels = (UINT *);
// Test first, second and center quads
LogCommentFmt(L"Verifying QuadRead* in mesh shader results");
VerifyQuadReadResults(pPixels, 0);
VerifyQuadReadResults(pPixels, 4);
VerifyQuadReadResults(pPixels, offsetCenter);
test->Test->GetReadBackData("U2", &data);
pPixels = (UINT *);
// Test first, second and center quads
LogCommentFmt(L"Verifying QuadRead* in amplification shader results");
VerifyQuadReadResults(pPixels, 0);
VerifyQuadReadResults(pPixels, 4);
VerifyQuadReadResults(pPixels, offsetCenter);
// Checks the per-pixel sample/LOD results produced by the ComputeSample
// shaders.
//   pPixels : read-back data, four UINTs per pixel. Slots 0/1 and slots 2/3
//             must match pairwise; slot 0 and slot 2 must never decrease from
//             one pixel to the next.
//   width   : number of pixels to check.
void VerifySampleResults(const UINT *pPixels, UINT width) {
// Highest value seen so far in slot 0 (xlod) and slot 2 (ylod).
UINT xlod = 0;
UINT ylod = 0;
// Each pixel contains 4 samples and 4 LOD calculations.
// 2 of these (called 'left' and 'right') have X values that vary and a constant Y
// 2 others (called 'top' and 'bot') have Y values that vary and a constant X
// Only one of the X variant sample results and one of the Y variant results
// are actually reported for the pixel.
// The other 2 serve as "helpers" to the other pixels in the quad.
// On the left side of the quad, the 'left' samples are reported.
// On the top of the quad, the 'top' samples are reported and so on.
// The varying coordinate values alternate between zero and a
// value whose magnitude increases with the index.
// As a result, the LOD level should steadily increase.
// Due to vagaries of implementation, the same derivatives
// in both directions might result in different levels for different locations
// in the quad. So only comparisons between sample results and LOD calculations
// and ensuring that the LOD increased and reaches the max can be tested reliably.
for (unsigned i = 0; i < width; i++) {
// CalculateLOD and Sample from texture with mip levels containing LOD index should match
VERIFY_ARE_EQUAL(pPixels[4*i + 0], pPixels[4*i + 1]);
VERIFY_ARE_EQUAL(pPixels[4*i + 2], pPixels[4*i + 3]);
// Make sure LODs are ever climbing as magnitudes increase
VERIFY_IS_TRUE(pPixels[4*i] >= xlod);
xlod = pPixels[4*i];
VERIFY_IS_TRUE(pPixels[4*i + 2] >= ylod);
ylod = pPixels[4*i + 2];
// Make sure we reached the max lod level for both tracks
// NOTE(review): the checks this comment announces are not visible in the
// listing; presumably xlod and ylod are compared against the maximum LOD.
// Runs the "ComputeSample" shader op from ShaderOpArith.xml and checks the
// results with VerifySampleResults: first with the op's default compute
// shader, then with the shader named "CS2", and, when the device supports
// mesh/amplification derivatives, with the CS disabled (reading "U1" and "U2")
// using the default and then the "AS2"/"MS2" shaders.
// NOTE(review): closing braces, the early return after CreateDevice, the
// make_shared initializer of ShaderOpSet, the statement that sizes/obtains the
// init buffer and the operands of the pointer casts are not visible in this
// listing.
TEST_F(ExecutionTest, ComputeSampleTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("ComputeSample");
// Initialize texture with the LOD number in each corresponding mip level
auto SampleInitFn = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
// Only the resource named "T0" is expected to be initialized here.
VERIFY_ARE_EQUAL(0, _stricmp(Name, "T0"));
D3D12_RESOURCE_DESC &texDesc = pShaderOp->GetResourceByName("T0")->Desc;
UINT texWidth = (UINT)texDesc.Width;
UINT texHeight = (UINT)texDesc.Height;
// Twice the size of the top level, in bytes.
// NOTE(review): presumably to leave room for the whole mip chain — confirm.
size_t size = sizeof(float) * texWidth * texHeight * 2;
float *pPrimitives = (float *);
float lod = 0.0;
int ix = 0;
// Each pass fills one level with the current lod value, then halves both
// dimensions.
// NOTE(review): the two `if(!...)` clamps cannot fire under the visible loop
// condition, which already requires both dimensions to be non-zero.
while (texHeight > 0 && texWidth > 0) {
if(!texHeight) texHeight = 1;
if(!texWidth) texWidth = 1;
for (size_t j = 0; j < texHeight; ++j) {
for (size_t i = 0; i < texWidth; ++i) {
pPrimitives[ix++] = lod;
lod += 1.0;
texHeight >>= 1;
texWidth >>= 1;
// Locate the alternate shaders by name among the op's shaders.
LPCSTR CS2 = nullptr, AS2 = nullptr, MS2 = nullptr;
for (st::ShaderOpShader &S : pShaderOp->Shaders) {
if (!strcmp(S.Name, "CS2")) CS2 = S.Name;
if (!strcmp(S.Name, "AS2")) AS2 = S.Name;
if (!strcmp(S.Name, "MS2")) MS2 = S.Name;
// Test 1D compute shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
MappedData data;
test->Test->GetReadBackData("U0", &data);
const UINT *pPixels = (UINT *);
VerifySampleResults(pPixels, 84*4);
// Test 2D compute shader
pShaderOp->CS = CS2;
test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
test->Test->GetReadBackData("U0", &data);
pPixels = (UINT *);
VerifySampleResults(pPixels, 84*4);
if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
// Disable CS so mesh goes forward
pShaderOp->CS = nullptr;
test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
test->Test->GetReadBackData("U1", &data);
pPixels = (UINT *);
VerifySampleResults(pPixels, 116);
test->Test->GetReadBackData("U2", &data);
pPixels = (UINT *);
VerifySampleResults(pPixels, 84);
// Repeat the mesh/amplification run with the alternate shaders.
pShaderOp->AS = AS2;
pShaderOp->MS = MS2;
test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
test->Test->GetReadBackData("U1", &data);
pPixels = (UINT *);
VerifySampleResults(pPixels, 116);
test->Test->GetReadBackData("U2", &data);
pPixels = (UINT *);
VerifySampleResults(pPixels, 84);
// Writes per-sample values into a UAV texture with one compute shader, copies
// them into a structured buffer with a second compute shader, reads the buffer
// back and checks that every element equals x * y * (sample + 1).
// Steps visible below: create queue/fence/root signature/command list, create
// the output buffer and the texture, upload, build the two PSOs, create the
// two UAV descriptors, dispatch the write shader, dispatch the copy shader,
// copy to the read-back buffer and verify.
// NOTE(review): this listing is heavily truncated. `sm`, `ranges`, `heapDesc`
// and `tex2dDesc` are used without a visible declaration; several statements
// have lost their first line; closing braces are absent; and a number of lines
// appear in two variants back to back (RWTexture2DMS vs RWTexture2DArray,
// NumSamples/ArraySize, numsamp, target, the two UAV-creation calls) —
// presumably the arms of preprocessor conditionals or a loop over shader
// models whose control lines are not shown. Confirm against the full file.
TEST_F(ExecutionTest, ATOWriteMSAATest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, sm))
if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
if (!DoesDeviceSupportWritableMSAA(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support Writable MSAA.");
// Shader that stores id.x*id.y*(i+1) for each of the SAMPLES samples/slices.
static const char pWriteShader[] =
"#define SAMPLES 4\n"
"RWStructuredBuffer<float> g_out : register(u0);\n"
"RWTexture2DMS<float, 4> g_texms : register(u1);\n"
"RWTexture2DArray<float> g_texms : register(u1);\n"
"[NumThreads(32, 32, 1)]\n"
"void main(uint3 id : SV_GroupThreadID) {\n"
" for(uint i = 0; i < SAMPLES; i++) {\n"
" g_texms.sample[i][id.xy] = id.x*id.y*(i+1);\n"
" g_texms[uint3(id.xy, i)] = id.x*id.y*(i+1);\n"
" }\n"
// Shader that copies each sample/slice into g_out at
// i*32*32 + id.y*32 + id.x.
static const char pCopyShader[] =
"#define SAMPLES 4\n"
"RWStructuredBuffer<float> g_out : register(u0);\n"
"RWTexture2DMS<float, 4> g_texms : register(u1);\n"
"RWTexture2DArray<float> g_texms : register(u1);\n"
"[NumThreads(32, 32, 1)]\n"
" void main(uint3 id : SV_GroupThreadID) {\n"
" for(uint i = 0; i < SAMPLES; i++) {\n"
" g_out[i*32*32 + id.y*32 + id.x] = g_texms.sample[i][id.xy];\n"
" g_out[i*32*32 + id.y*32 + id.x] = g_texms[uint3(id.xy, i)];\n"
" }"
// Host-side mirror of the shader constants (32x32 threads, 4 samples).
static const int NumThreadsX = 32;
static const int NumThreadsY = 32;
static const int NumSamples = 4;
static const int ArraySize = 4;
static const int NumSamples = 4;
static const int ArraySize = 1;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY;
// One float per sample per thread.
const size_t valueSize = NumSamples * ThreadsPerGroup;
const size_t valueSizeInBytes = valueSize * sizeof(float);
static const int DispatchGroupX = 1;
static const int DispatchGroupY = 1;
static const int DispatchGroupZ = 1;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
FenceObj FO;
CreateComputeCommandQueue(pDevice, L"WriteMSAA Queue", &pCommandQueue);
InitFenceObj(pDevice, &FO);
// Create root signature.
// Two UAV ranges of one descriptor each: base register 0 and base register 1.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0);
ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0);
CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, 2);
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
// Create command list and resources
CComPtr<ID3D12GraphicsCommandList> pCommandList;
pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));
// Set up Output Resource
CComPtr<ID3D12Resource> pOutputResource;
CComPtr<ID3D12Resource> pOutputReadBuffer;
CComPtr<ID3D12Resource> pOutputUploadResource;
// Pre-fill the output buffer with index + 5.
// NOTE(review): presumably chosen to differ from the expected results so a
// shader that writes nothing is detected — confirm.
float outVals[valueSize];
int ix = 0;
for (int i = 0; i < NumSamples; i++)
for (int j = 0; j < NumThreadsY; j++)
for (int k = 0; k < NumThreadsX; k++)
// NOTE(review): ix is read and incremented in one assignment; the result is
// only well-defined under C++17's right-before-left sequencing of '='.
outVals[ix++] = (float)ix + 5;
CreateTestUavs(pDevice, pCommandList, outVals, sizeof(outVals), &pOutputResource,
&pOutputUploadResource, &pOutputReadBuffer);
// Set up texture Resource.
CComPtr<ID3D12Resource> pUavResource;
// Every byte of the initial texture data is set to 0x0c.
float values[valueSize];
memset(values, 0xc, valueSizeInBytes);
int numsamp = 1;
int numsamp = NumSamples;
NumThreadsX, NumThreadsY, ArraySize, 1, numsamp, 0,
CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex2dDesc,
&pUavResource, nullptr);
// Close the command list and execute it to perform the resource uploads
ID3D12CommandList *ppCommandLists[] = { pCommandList };
pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
WaitForSignal(pCommandQueue, FO);
// Create shaders
const wchar_t *target = L"cs_6_6";
const wchar_t *target = L"cs_6_7";
CComPtr<ID3D12PipelineState> pWritePSO;
CreateComputePSO(pDevice, pRootSignature, pWriteShader, target, &pWritePSO);
CComPtr<ID3D12PipelineState> pCopyPSO;
CreateComputePSO(pDevice, pRootSignature, pCopyShader, target, &pCopyPSO);
// Reset commandlist to write PSO
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pWritePSO));
// Describe and create a UAV descriptor heap.
CComPtr<ID3D12DescriptorHeap> pUavHeap;
heapDesc.NumDescriptors = 2;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
CreateStructUAV(pDevice, cpuHandle, valueSize, sizeof(float), pOutputResource);
// NOTE(review): no visible line advances cpuHandle between descriptor
// creations; presumably the helper does it or a line is missing — confirm.
CreateTex2DArrayUAV(pDevice, cpuHandle, NumSamples, DXGI_FORMAT_R32_FLOAT, pUavResource);
CreateTex2DMSUAV(pDevice, cpuHandle, DXGI_FORMAT_R32_FLOAT, pUavResource);
// Set Heaps, Rootsignature and table
ID3D12DescriptorHeap *const pHeaps[1] = { pUavHeap };
pCommandList->SetDescriptorHeaps(1, pHeaps);
pCommandList->SetComputeRootDescriptorTable(0, pUavHeap->GetGPUDescriptorHandleForHeapStart());
// dispatch and close write shader
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
WaitForSignal(pCommandQueue, FO);
// Create copy command list
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pCopyPSO));
// Set Rootsignature and descriptor tables
SetDescriptorHeap(pCommandList, pUavHeap);
pCommandList->SetComputeRootDescriptorTable(0, pUavHeap->GetGPUDescriptorHandleForHeapStart());
// Run Copy shader and copy the results back to readable memory
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource,
pCommandList->ResourceBarrier(1, &barrier);
pCommandList->CopyResource(pOutputReadBuffer, pOutputResource);
pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
WaitForSignal(pCommandQueue, FO);
MappedData mappedData(pOutputReadBuffer, valueSize*sizeof(float));
float *pData = (float *);
// Walk the results in the same sample/row/column order the copy shader wrote.
ix = 0;
for (int i = 0; i < NumSamples; i++)
for (int j = 0; j < NumThreadsY; j++)
for (int k = 0; k < NumThreadsX; k++)
VERIFY_ARE_EQUAL(pData[ix++], j*k*(i+1));
// Used to determine how an out of bounds offset should be converted:
// keeps the low four bits of `offset` and sign-extends them, which wraps the
// value into [-8, 7] (for example -9 -> 7 and 8 -> -8).
// Implemented with mask/xor/subtract rather than a left/right shift pair so it
// never left-shifts a negative value or shifts into the sign bit, and with the
// argument parenthesized so expressions such as CLAMPOFFSET(a + b) expand
// correctly.
#define CLAMPOFFSET(offset) (((static_cast<int>(offset) & 0xF) ^ 0x8) - 0x8)
// Determine if the values in pPixels correspond to the expected locations encoded into a uint
// based on the coordinates and offsets that were provided.
//   pPixels     : read-back data, eight unsigneds per (x, y) pair. Slot 0 is
//                 Sample, 1-3 the SampleCmp variants, 4 Load, 5 SampleBias,
//                 6 SampleGrad, 7 SampleLevel.
//   bCheckDeriv : guards two groups of checks below.
// The expected location is encoded as (y coordinate + y offset) * 1000 +
// (x coordinate + x offset).
// NOTE(review): closing braces are not visible, so which checks each
// `if (bCheckDeriv)` covers cannot be read off this listing. No visible line
// advances ix either; presumably an increment per (x, y) pair is missing.
void VerifyProgOffsetResults(unsigned *pPixels, bool bCheckDeriv) {
// Check that each element matches the expected value given the offset
unsigned ix = 0;
int coords[18] = {100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950};
// The first and last offsets (-9 and 8) are passed through CLAMPOFFSET.
int offsets[18] = {CLAMPOFFSET(-9), -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, CLAMPOFFSET(8)};
for (unsigned y = 0; y < _countof(coords); y++) {
for (unsigned x = 0; x < _countof(coords); x++) {
unsigned cmp = (coords[y] + offsets[y])*1000 + coords[x] + offsets[x];
if (bCheckDeriv) {
VERIFY_ARE_EQUAL(pPixels[2*4*ix+0], cmp); // Sample
VERIFY_ARE_EQUAL(pPixels[2*4*ix+1], 1U); // SampleCmp
VERIFY_ARE_EQUAL(pPixels[2*4*ix+2], 1U); // SampleCmpLevel
VERIFY_ARE_EQUAL(pPixels[2*4*ix+3], 1U); // SampleCmpLevelZero
VERIFY_ARE_EQUAL(pPixels[2*4*ix+4], cmp); // Load
if (bCheckDeriv) {
VERIFY_ARE_EQUAL(pPixels[2*4*ix+5], cmp); // SampleBias
VERIFY_ARE_EQUAL(pPixels[2*4*ix+6], cmp); // SampleGrad
VERIFY_ARE_EQUAL(pPixels[2*4*ix+7], cmp); // SampleLevel
// Fills a 1000x1000 float texture with index values increasing in row-major order
// The shader then uses non-immediate offsets extending from -9 to 8 to access these using
// Load, Sample, SampleCmp and variants thereof.
// The test verifies that the locations accessed correspond to where they should.
// For each shader model in TestShaderModels a device is created, the matching
// CS/PS/MS/AS strings of the "ProgOffset" op are selected, and results are
// checked with VerifyProgOffsetResults: compute from "U0"; with mesh shader
// support PS/MS/AS from "U0"/"U1"/"U2"; and finally PS alone from "U0" with
// the MS disabled.
// NOTE(review): closing braces, the rest of the TestShaderModels initializer,
// the statements that follow the skip messages (presumably `continue`), any
// `break` after each switch case, the operands of the pointer casts, the
// increment of ix in the init callback, and the body of the final
// `if (bTestsSkipped)` are not visible in this listing.
TEST_F(ExecutionTest, ATOProgOffset) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("ProgOffset");
// Init callback: fills the named resource with float(index) per texel.
auto SampleInitFn = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
D3D12_RESOURCE_DESC &texDesc = pShaderOp->GetResourceByName(Name)->Desc;
UINT texWidth = (UINT)texDesc.Width;
UINT texHeight = (UINT)texDesc.Height;
size_t size = sizeof(float) * texWidth * texHeight;
float *pPrimitives = (float *);
int ix = 0;
for (size_t j = 0; j < texHeight; ++j) {
for (size_t i = 0; i < texWidth; ++i) {
pPrimitives[ix] = float(ix);
// Stays true until at least one shader model has been tested.
bool bTestsSkipped = true;
D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_5,
for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
D3D_SHADER_MODEL sm = TestShaderModels[i];
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, sm, /*skipUnsupported*/false)) {
// The low four bits of the enum value are logged as the minor version.
LogCommentFmt(L"Device does not support shader model 6.%1u",
((UINT)sm & 0x0f));
if (sm >= D3D_SHADER_MODEL_6_7 && !DoesDeviceSupportAdvancedTexOps(pDevice)) {
LogCommentFmt(L"Device does not support Advanced Texture Ops");
// Derivative-dependent checks are enabled for compute from 6.6 on, and for
// mesh/amplification only if the device also supports derivatives there.
bool bSupportMSASDeriv = DoesDeviceSupportMeshAmpDerivatives(pDevice);
bool bCheckDerivCS = sm >= D3D_SHADER_MODEL_6_6;
bool bCheckDerivMSAS = bCheckDerivCS && bSupportMSASDeriv;
if (bCheckDerivCS && !bSupportMSASDeriv) {
LogCommentFmt(L"Device does not support derivatives in Mesh and Amplification shaders");
// Pick the shader strings that match the shader model under test.
switch (sm) {
case D3D_SHADER_MODEL_6_5:
pShaderOp->CS = pShaderOp->GetString("CS");
pShaderOp->PS = pShaderOp->GetString("PS");
pShaderOp->MS = pShaderOp->GetString("MS");
pShaderOp->AS = pShaderOp->GetString("AS");
case D3D_SHADER_MODEL_6_6:
pShaderOp->CS = pShaderOp->GetString("CS66");
pShaderOp->PS = pShaderOp->GetString("PS");
if (bCheckDerivMSAS) {
pShaderOp->MS = pShaderOp->GetString("MS66D");
pShaderOp->AS = pShaderOp->GetString("AS66D");
} else {
pShaderOp->MS = pShaderOp->GetString("MS66");
pShaderOp->AS = pShaderOp->GetString("AS66");
case D3D_SHADER_MODEL_6_7:
pShaderOp->CS = pShaderOp->GetString("CS67");
pShaderOp->PS = pShaderOp->GetString("PS67");
if (bCheckDerivMSAS) {
pShaderOp->MS = pShaderOp->GetString("MS67D");
pShaderOp->AS = pShaderOp->GetString("AS67D");
} else {
pShaderOp->MS = pShaderOp->GetString("MS67");
pShaderOp->AS = pShaderOp->GetString("AS67");
// Test compute shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", SampleInitFn, ShaderOpSet);
MappedData data;
test->Test->GetReadBackData("U0", &data);
VerifyProgOffsetResults((UINT*), bCheckDerivCS);
// Disable CS so graphics shaders go forward
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", SampleInitFn, ShaderOpSet);
// PS
test->Test->GetReadBackData("U0", &data);
VerifyProgOffsetResults((UINT*), true);
// MS
test->Test->GetReadBackData("U1", &data);
VerifyProgOffsetResults((UINT*), bCheckDerivMSAS);
// AS
test->Test->GetReadBackData("U2", &data);
VerifyProgOffsetResults((UINT*), bCheckDerivMSAS);
// Disable MS so PS goes forward
pShaderOp->MS = nullptr;
test = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", SampleInitFn, ShaderOpSet);
test->Test->GetReadBackData("U0", &data);
VerifyProgOffsetResults((UINT*), true);
bTestsSkipped = false;
if (bTestsSkipped) {
// A mipmapped texture containing the value of LOD at each location in each
// level is used to sample at each level using SampleCmpLevel and confirm
// that the correct level is used for the comparison.
// The test runs the "SampleCmpLevel" shader op from ShaderOpArith.xml, first
// with the compute shader and then, when mesh shaders are supported, with CS
// cleared, and expects every value read back to equal 1.
TEST_F(ExecutionTest, ATOSampleCmpLevelTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
// A shader model 6.7 device with Advanced Texture Operations is required.
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_7))
if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("SampleCmpLevel");
// Initialize texture with the LOD number in each corresponding mip level
auto SampleInitFn = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
D3D12_RESOURCE_DESC &texDesc = pShaderOp->GetResourceByName(Name)->Desc;
UINT texWidth = (UINT)texDesc.Width;
UINT texHeight = (UINT)texDesc.Height;
// Twice the size of the top level, in bytes.
// presumably this leaves room for the whole mip chain - TODO confirm
size_t size = sizeof(float) * texWidth * texHeight * 2;
float *pPrimitives = (float *);
// Fill value written to the texels; it is advanced by 1.0 as the mip walk proceeds.
float val = 0.5;
int ix = 0;
// Walk the mip chain, halving both dimensions each step.
while (texHeight > 0 && texWidth > 0) {
// NOTE(review): with `&&` in the loop condition both dimensions are already
// non-zero here, so these two clamps can never fire - confirm whether the
// condition was meant to be `||` (non-square textures).
if(!texHeight) texHeight = 1;
if(!texWidth) texWidth = 1;
for (size_t j = 0; j < texHeight; ++j) {
for (size_t i = 0; i < texWidth; ++i) {
pPrimitives[ix++] = val;
val += 1.0;
texHeight >>= 1;
texWidth >>= 1;
// Test compute shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet);
MappedData data;
test->Test->GetReadBackData("U0", &data);
const UINT *pPixels = (UINT *);
// Check that each LOD matches what's expected
// Number of UINT results verified in each read-back buffer.
unsigned count = 2*7;
// Since the results consist of a boolean, which should be true followed by the result of a sampcmplvl,
// the only result expected is 1.
for (unsigned i = 0; i < count; i++)
VERIFY_ARE_EQUAL(pPixels[i], 1U);
if (DoesDeviceSupportMeshShaders(pDevice)) {
// Disable CS so mesh goes forward
pShaderOp->CS = nullptr;
test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet);
// The graphics run is checked through three read-back buffers: U0, U1 and U2.
test->Test->GetReadBackData("U0", &data);
pPixels = (UINT *);
for (unsigned i = 0; i < count; i++)
VERIFY_ARE_EQUAL(pPixels[i], 1U);
test->Test->GetReadBackData("U1", &data);
pPixels = (UINT *);
for (unsigned i = 0; i < count; i++)
VERIFY_ARE_EQUAL(pPixels[i], 1U);
test->Test->GetReadBackData("U2", &data);
pPixels = (UINT *);
for (unsigned i = 0; i < count; i++)
VERIFY_ARE_EQUAL(pPixels[i], 1U);
// Single-channel integer texel: the red value is stored in a bit-field of
// RSize bits. SetChannels takes all four channels so that every texel type
// can be driven through the same call; only R is kept.
template <unsigned RSize>
struct IntR {
  unsigned R : RSize;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(unsigned R, unsigned /*G*/, unsigned /*B*/, unsigned /*A*/) {
    this->R = R;
  }
  // Bit width of each channel; 0 means the channel is absent.
  static unsigned GetRSize() { return RSize; }
  static unsigned GetGSize() { return 0; }
  static unsigned GetBSize() { return 0; }
  static unsigned GetASize() { return 0; }
};
// Two-channel integer texel: R and G are stored in consecutive bit-fields of
// RSize and GSize bits. B and A are accepted by SetChannels but not stored.
template <unsigned RSize, unsigned GSize>
struct IntRG {
  unsigned R : RSize;
  unsigned G : GSize;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(unsigned R, unsigned G, unsigned /*B*/, unsigned /*A*/) {
    this->R = R;
    this->G = G;
  }
  // Bit width of each channel; 0 means the channel is absent.
  static unsigned GetRSize() { return RSize; }
  static unsigned GetGSize() { return GSize; }
  static unsigned GetBSize() { return 0; }
  static unsigned GetASize() { return 0; }
};
// Three-channel integer texel: R, G and B are stored in consecutive
// bit-fields of RSize, GSize and BSize bits. A is accepted but not stored.
template <unsigned RSize, unsigned GSize, unsigned BSize>
struct IntRGB {
  unsigned R : RSize;
  unsigned G : GSize;
  unsigned B : BSize;
  // The ignored channel is left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(unsigned R, unsigned G, unsigned B, unsigned /*A*/) {
    this->R = R;
    this->G = G;
    this->B = B;
  }
  // Bit width of each channel; 0 means the channel is absent.
  static unsigned GetRSize() { return RSize; }
  static unsigned GetGSize() { return GSize; }
  static unsigned GetBSize() { return BSize; }
  static unsigned GetASize() { return 0; }
};
// Four-channel integer texel. Each channel lives in its own bit-field whose
// width is given by the matching template argument; the declaration order
// (R, G, B, A) fixes the packing order.
template <unsigned RSize, unsigned GSize, unsigned BSize, unsigned ASize>
struct IntRGBA {
  unsigned R : RSize;
  unsigned G : GSize;
  unsigned B : BSize;
  unsigned A : ASize;

  // Stores all four channels; each value is narrowed to its field width.
  void SetChannels(unsigned R, unsigned G, unsigned B, unsigned A) {
    this->A = A;
    this->B = B;
    this->G = G;
    this->R = R;
  }

  // Per-channel bit widths.
  static unsigned GetRSize() { return RSize; }
  static unsigned GetGSize() { return GSize; }
  static unsigned GetBSize() { return BSize; }
  static unsigned GetASize() { return ASize; }
};
// Texel packed as three 10-bit XR-biased colour channels plus a 2-bit alpha
// in one 32-bit word: R in bits 0-9, G in bits 10-19, B in bits 20-29 and
// A in bits 30-31.
struct IntRGBA10XRA2UNORM {
  uint32_t RGBA;
  void SetChannels(float R, float G, float B, float A) {
    uint32_t ur, ug, ub, ua;
    // Conversion values taken from XR documentation
    ur = GetMantissa(R*510+385);
    ub = GetMantissa(B*510+385);
    ug = GetMantissa(G*510+385);
    ua = (uint32_t)A;
    // Cast off all but the 10 MSB and shift for packing.
    // Each channel must be masked from its OWN mantissa: masking the
    // already-shifted ur (as before) always produced 0 for G and B.
    ur = (ur&0x7fE000) >> 13;
    ug = (ug&0x7fE000) >> 3;
    ub = (ub&0x7fE000) << 7;
    ua = (ua&0x3) << 30;
    RGBA = ur | ug | ub | ua;
  }
};
// Single-channel 32-bit float texel. G, B and A are accepted so the type
// shares the four-channel SetChannels call shape, but only R is stored.
struct Float32R {
  float R;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(float R, float /*G*/, float /*B*/, float /*A*/) {
    this->R = R;
  }
};
// Two-channel 32-bit float texel. B and A are accepted so the type shares
// the four-channel SetChannels call shape, but only R and G are stored.
struct Float32RG {
  float R, G;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(float R, float G, float /*B*/, float /*A*/) {
    this->R = R;
    this->G = G;
  }
};
// Single-channel half-float texel: R is converted with
// ConvertFloat32ToFloat16 and stored as its 16-bit pattern. G, B and A are
// accepted but not stored.
struct Float16R {
  uint16_t R;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(float R, float /*G*/, float /*B*/, float /*A*/) {
    this->R = ConvertFloat32ToFloat16(R);
  }
};
// Two-channel half-float texel: R and G are converted with
// ConvertFloat32ToFloat16 and stored as 16-bit patterns. B and A are
// accepted but not stored.
struct Float16RG {
  uint16_t R, G;
  // The ignored channels are left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(float R, float G, float /*B*/, float /*A*/) {
    this->R = ConvertFloat32ToFloat16(R);
    this->G = ConvertFloat32ToFloat16(G);
  }
};
// No Float16RGB needed
// Four-channel half-float texel. Every channel is stored as the 16-bit
// pattern produced by ConvertFloat32ToFloat16.
struct Float16RGBA {
  uint16_t R;
  uint16_t G;
  uint16_t B;
  uint16_t A;

  // Converts each 32-bit input to half precision and stores it, R first.
  void SetChannels(float R, float G, float B, float A) {
    const uint16_t halfR = ConvertFloat32ToFloat16(R);
    const uint16_t halfG = ConvertFloat32ToFloat16(G);
    const uint16_t halfB = ConvertFloat32ToFloat16(B);
    const uint16_t halfA = ConvertFloat32ToFloat16(A);
    this->R = halfR;
    this->G = halfG;
    this->B = halfB;
    this->A = halfA;
  }
};
// Packed small-float texel built from half-float bit patterns: R occupies
// bits 0-10, G bits 11-21 and B bits 22-31 of one 32-bit word. A is accepted
// for call-shape compatibility but not stored.
struct FloatR11G11B10 {
  uint32_t RGB;
  // The alpha parameter is left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(float R, float G, float B, float /*A*/) {
    // Widen to 32-bit unsigned BEFORE shifting. A 16-bit value is promoted
    // to signed int, and shifting it left by 17 overflows int as soon as
    // bit 14 is set (any value >= 2.0).
    const uint32_t hr = ConvertFloat32ToFloat16(R);
    const uint32_t hg = ConvertFloat32ToFloat16(G);
    const uint32_t hb = ConvertFloat32ToFloat16(B);
    // Shift and mask so as to place R: 0-10, G: 11-21, B: 22-31
    // Sign and lesser-significant mantissa bits are truncated
    const uint32_t ur = (hr >> 4) & 0x000007FF;
    const uint32_t ug = (hg << 7) & 0x003FF800;
    const uint32_t ub = (hb << 17) & 0xFFC00000;
    RGB = ur | ug | ub;
  }
};
// Shared-exponent texel: three 9-bit mantissas (R bits 0-8, G bits 9-17,
// B bits 18-26) with a common exponent in the top bits of one 32-bit word.
// A is accepted for call-shape compatibility but not stored.
struct FloatRGBE {
  uint32_t RGBE;
  // Conversion logic taken from miniengine PixelPacking header
  // The alpha parameter is left unnamed so the function does not raise
  // C4100 (unreferenced formal parameter), which this file treats as an error.
  void SetChannels(UINT R, UINT G, UINT B, UINT /*A*/) {
    union { uint32_t i; float f; } ur, ug, ub, maxChannel, nextPow2;
    ur.f = (float)R;
    ug.f = (float)G;
    ub.f = (float)B;
    maxChannel.f = std::max(ur.f, std::max(ug.f, ub.f));
    // nextPow2 has to have the biggest exponent plus 1 (and nothing in the mantissa)
    nextPow2.i = (maxChannel.i + 0x800000) & 0x7F800000;
    // By adding nextPow2, all channels have the same exponent, shifting their mantissa bits
    // to the right to accommodate it. This also shifts in the implicit '1' bit of all channels.
    // The largest channel will always have the high bit set.
    ur.f += nextPow2.f;
    ug.f += nextPow2.f;
    ub.f += nextPow2.f;
    // Keep the top 9 mantissa bits of each channel.
    ur.i = (ur.i << 9) >> 23;
    ug.i = (ug.i << 9) >> 23;
    ub.i = (ub.i << 9) >> 23;
    // Widen to 32-bit unsigned BEFORE shifting: a 16-bit half pattern is
    // promoted to signed int, and << 17 overflows int once bit 14 is set.
    uint32_t e = static_cast<uint32_t>(ConvertFloat32ToFloat16(nextPow2.f)) << 17;
    RGBE = ur.i | ug.i << 9 | ub.i << 18 | e;
  }
  // Bit width of each channel; alpha is absent.
  static unsigned GetRSize() { return 9; }
  static unsigned GetGSize() { return 9; }
  static unsigned GetBSize() { return 9; }
  static unsigned GetASize() { return 0; }
};
// Gather-test texture of xdim*ydim float-style texels of type RGBAType,
// tagged with the DXGI format supplied at construction.
template <typename RGBAType, unsigned xdim, unsigned ydim>
struct RawFloatTexture : public ExecutionTest::RawGatherTexture {
  DXGI_FORMAT m_format;
  RGBAType RGBA[xdim*ydim];

  RawFloatTexture(DXGI_FORMAT format) : m_format(format) {}

  // Fills texel i from its coordinates: R = x, G = y, and B / A are derived
  // from x + y purely to give those channels distinct content.
  virtual void SetElement(int i, int x, int y) override {
    const float red = (float)x;
    const float green = (float)y;
    const float blue = (float)(x + y)*0.5f;
    const float alpha = (float)(x + y)*0.1f;
    RGBA[i].SetChannels(red, green, blue, alpha);
  }

  // Raw storage and dimensions.
  virtual void *GetElements() { return (void*)RGBA; }
  virtual unsigned GetXDim() { return xdim; }
  virtual unsigned GetYDim() { return ydim; }
  virtual DXGI_FORMAT GetFormat() override { return m_format; }
};
// Gather-test texture of xdim*ydim FloatR11G11B10 texels; it always reports
// DXGI_FORMAT_R11G11B10_FLOAT.
template <unsigned xdim, unsigned ydim>
struct RawFloatR11G11B10ATexture : public ExecutionTest::RawGatherTexture {
  FloatR11G11B10 RGBA[xdim*ydim];

  // Fills texel i from its coordinates: R = x, G = y, B = (x + y) / 2.
  // The alpha argument is passed as 0.
  virtual void SetElement(int i, int x, int y) override {
    const float red = (float)x;
    const float green = (float)y;
    const float blue = (float)(x + y)*0.5f;
    RGBA[i].SetChannels(red, green, blue, 0);
  }

  // Raw storage and dimensions.
  virtual void *GetElements() { return (void*)RGBA; }
  virtual unsigned GetXDim() { return xdim; }
  virtual unsigned GetYDim() { return ydim; }
  virtual DXGI_FORMAT GetFormat() override { return DXGI_FORMAT_R11G11B10_FLOAT; }
};
// Gather-test texture of xdim*ydim integer texels of type RGBAType.
// isSigned / isNorm select how SetElement shifts and scales the generated
// values; format is the DXGI format reported by GetFormat().
template <typename RGBAType, unsigned xdim, unsigned ydim>
struct RawIntTexture : public ExecutionTest::RawGatherTexture {
bool m_isSigned;
bool m_isNorm;
unsigned m_maxVal;
DXGI_FORMAT m_format;
RGBAType RGBA[xdim*ydim];
// m_maxVal starts as maxVal + 2 (the b/a channels below are generated as
// x + 2 / y + 2) and is halved for signed formats.
RawIntTexture(bool isSigned, bool isNorm, int maxVal, DXGI_FORMAT format)
: m_isSigned(isSigned), m_isNorm(isNorm), m_maxVal(maxVal + 2), m_format(format) {
if (isSigned)
m_maxVal /= 2;
// Set i'th element to values scaled per max dimensions for norms, shifted for signed
// but otherwise just the x and y values themselves
virtual void SetElement(int i, int x, int y) override {
double fr = x;
double fg = y;
// provide some different values just to fill in b and a
double fb = x + 2;
double fa = y + 2;
// If signed, shift by m_maxVal so that some values come out negative
if (m_isSigned) {
fr -= m_maxVal;
fg -= m_maxVal;
fb -= m_maxVal;
fa -= m_maxVal;
// If normalized, scale to given range
if (m_isNorm) {
fr /= m_maxVal;
fg /= m_maxVal;
fb /= m_maxVal;
fa /= m_maxVal;
// NOTE(review): the shift counts below are computed in unsigned arithmetic.
// For a channel whose Get?Size() is 0 (e.g. GetBSize() on IntRG) the count
// wraps to a huge value and the shift is undefined - confirm this is benign.
// NOTE(review): the alpha line does not subtract m_isSigned, unlike R/G/B - confirm.
fr *= (1 << (RGBAType::GetRSize() - m_isSigned - 1));
fg *= (1 << (RGBAType::GetGSize() - m_isSigned - 1));
fb *= (1 << (RGBAType::GetBSize() - m_isSigned - 1));
fa *= (1 << (RGBAType::GetASize() - 1));
// Channels are handed to the texel type as unsigned integers.
RGBA[i].SetChannels((UINT)fr, (UINT)fg, (UINT)fb, (UINT)fa);
// Raw storage, dimensions and format.
virtual void *GetElements() { return (void*)RGBA; }
virtual unsigned GetXDim() { return xdim; }
virtual unsigned GetYDim() { return ydim; }
virtual DXGI_FORMAT GetFormat() override { return m_format; };
// Gather-test texture of xdim*ydim IntRGBA10XRA2UNORM texels.
// m_maxVal is (maxVal + 2) / 2 and is used to bias and normalize the
// coordinate-derived channel values.
template <unsigned xdim, unsigned ydim>
struct RawR10G10B10XRA2Texture : public ExecutionTest::RawGatherTexture {
  unsigned m_maxVal;
  DXGI_FORMAT m_format;
  IntRGBA10XRA2UNORM RGBA[xdim*ydim];

  RawR10G10B10XRA2Texture(int maxVal, DXGI_FORMAT format)
    : m_maxVal((maxVal + 2)/2), m_format(format) {}

  // Set i'th element to values scaled and shifted for available range
  virtual void SetElement(int i, int x, int y) override {
    // R and G come straight from the coordinates; B and A are offset by 2
    // just so they hold different values.
    double red = x;
    double green = y;
    double blue = x + 2;
    double alpha = y + 2;

    // Shift RGB to valid range which will be -0.75 - 1.25
    const double bias = m_maxVal*.75;
    red -= bias;
    green -= bias;
    blue -= bias;

    // normalize to something that will fit in the limited range
    red /= m_maxVal;
    green /= m_maxVal;
    blue /= m_maxVal;

    // Alpha is normalized over twice the range, then scaled to max in range.
    alpha /= m_maxVal*2;
    alpha *= 3;

    RGBA[i].SetChannels((float)red, (float)green, (float)blue, (float)alpha);
  }

  // Raw storage, dimensions and format.
  virtual void *GetElements() { return (void*)RGBA; }
  virtual unsigned GetXDim() { return xdim; }
  virtual unsigned GetYDim() { return ydim; }
  virtual DXGI_FORMAT GetFormat() override { return m_format; }
};
//#define RAWGATHER_FALLBACK // Enable to use pre-6.7 fallback mechanisms to vet raw gather tests
// Create a single resource of <resFormat> and alias it to a view of <viewFormat>
// Then execute a shader that uses raw gather to copy the values into a UAV
// Verify that the UAV has the same values as passed in.
// GatherType is the unsigned integer type matching the texel size; it sizes
// the output buffer and selects the uint<N>_t type used in the shader text.
template<typename GatherType>
void ExecutionTest::DoRawGatherTest(ID3D12Device *pDevice, RawGatherTexture *rawTex, DXGI_FORMAT viewFormat) {
DXGI_FORMAT resFormat = rawTex->GetFormat();
// There is no uint64 version of Gather, so 64-bit fallback needs to use Loads
const char shaderTemplate64[] =
"Texture2D<uint%d_t> g_tex : register(t0);\n"
"RWStructuredBuffer<uint%d_t> g_out : register(u0);\n"
"SamplerState g_samp : register(s0);\n"
"[NumThreads(32, 32, 1)]\n"
"void main(uint3 id : SV_GroupThreadID, uint ix : SV_GroupIndex) {\n"
" //uint%d_t4 res = g_tex.%s(g_samp, (id.xy+0.5)/31.0);\n"
" g_out[4*ix+0] = g_tex.Load(uint3(id.x, id.y+1, 0));\n"
" g_out[4*ix+1] = g_tex.Load(uint3(id.x+1, id.y+1, 0));\n"
" g_out[4*ix+2] = g_tex.Load(uint3(id.x+1, id.y, 0));\n"
" g_out[4*ix+3] = g_tex.Load(uint3(id.x, id.y, 0));\n"
// Regular template: one gather per thread, four results written per thread.
const char shaderTemplate[] =
"Texture2D<uint%d_t> g_tex : register(t0);\n"
"RWStructuredBuffer<uint%d_t> g_out : register(u0);\n"
"SamplerState g_samp : register(s0);\n"
"[NumThreads(32, 32, 1)]\n"
"void main(uint3 id : SV_GroupThreadID, uint ix : SV_GroupIndex) {\n"
" uint%d_t4 res = g_tex.%s(g_samp, (id.xy+0.5)/31.0);\n"
" g_out[4*ix+0] = res.x;\n"
" g_out[4*ix+1] = res.y;\n"
" g_out[4*ix+2] = res.z;\n"
" g_out[4*ix+3] = res.w;\n"
char pShader[sizeof(shaderTemplate) + 200]; // A little padding to account for variations
UINT uintSize = sizeof(GatherType)*8; // bytes to bits
// NOTE(review): as shown, the second assignment always overrides the first and
// both sprintf calls run; presumably these are selected by the
// RAWGATHER_FALLBACK conditional, which is not visible here - confirm.
const char *gatherFuncName = "GatherRaw";
gatherFuncName = "Gather";
if (sizeof(GatherType) == 8)
VERIFY_IS_GREATER_THAN(sprintf(pShader, shaderTemplate64, uintSize, uintSize, uintSize, gatherFuncName), 0);
VERIFY_IS_GREATER_THAN(sprintf(pShader, shaderTemplate, uintSize, uintSize, uintSize, gatherFuncName), 0);
// Element count and byte size of the source texture.
const UINT xDim = rawTex->GetXDim();
const UINT yDim = rawTex->GetYDim();
const UINT valueSize = xDim * yDim;
const UINT valueSizeInBytes = valueSize * sizeof(GatherType);
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
FenceObj FO;
CreateComputeCommandQueue(pDevice, L"RawGather Queue", &pCommandQueue);
InitFenceObj(pDevice, &FO);
// Create root signature.
// Two resource ranges (one SRV, one UAV) plus one sampler range.
CComPtr<ID3D12RootSignature> pRootSignature;
ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0);
ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0);
srange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 1, 0, 0);
CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, 2, srange, 1);
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
// Create command list and resources
CComPtr<ID3D12GraphicsCommandList> pCommandList;
pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));
// Set up castable format list (of one) if possible, or else just alias the
// formats with the expectation that unsupported cases won't be used by the caller
DXGI_FORMAT *castableFmt = nullptr;
if (DoesDeviceSupportEnhancedBarriers(pDevice))
castableFmt = &viewFormat;
resFormat = viewFormat;
// Set up texture to be raw gathered from
CComPtr<ID3D12Resource> pTexResource;
CComPtr<ID3D12Resource> pTexUploadResource;
// Fill the CPU-side texels in row-major order (x fastest).
int ix = 0;
for (UINT y = 0; y < yDim; y++)
for (UINT x = 0; x < xDim; x++)
rawTex->SetElement(ix++, x, y);
D3D12_RESOURCE_DESC tex2dDesc = CD3DX12_RESOURCE_DESC::Tex2D(resFormat, xDim, yDim, 1/* sampCt */, 1/* mipCt */);
CreateTestResources(pDevice, pCommandList, rawTex->GetElements(), valueSizeInBytes, tex2dDesc,
&pTexResource, &pTexUploadResource,
nullptr /*pReadBufer*/, castableFmt);
// Set up Output Resource
CComPtr<ID3D12Resource> pOutputResource;
CComPtr<ID3D12Resource> pOutputReadBuffer;
CComPtr<ID3D12Resource> pOutputUploadResource;
// 4x because gather produces four result values
GatherType *outVals = new GatherType[valueSize*4];
memset(outVals, 0xd, valueSizeInBytes*4); // 0xd to give a sentinel value for failures
CreateTestUavs(pDevice, pCommandList, outVals, valueSizeInBytes*4, &pOutputResource,
&pOutputUploadResource, &pOutputReadBuffer);
delete[] outVals;
// Close the command list and execute it to perform the resource uploads
ID3D12CommandList *ppCommandLists[] = { pCommandList };
pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
WaitForSignal(pCommandQueue, FO);
// Create shaders
// NOTE(review): two declarations of `target` appear back to back; presumably
// one per RAWGATHER_FALLBACK branch - confirm against the full file.
const wchar_t *target = L"cs_6_2";
const wchar_t *target = L"cs_6_7";
LPCWSTR opts[] = {L"-enable-16bit-types"};
CComPtr<ID3D12PipelineState> pPSO;
CreateComputePSO(pDevice, pRootSignature, pShader, target, &pPSO, opts, _countof(opts));
// Reset commandlist to shader PSO
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));
// Describe and create a resource descriptor heap.
CComPtr<ID3D12DescriptorHeap> pResHeap;
CComPtr<ID3D12DescriptorHeap> pSampHeap;
heapDesc.NumDescriptors = 2;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pResHeap)));
// Describe and create a sampler descriptor heap.
heapDesc.NumDescriptors = 1;
VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pSampHeap)));
// Populate the heaps: texture SRV, structured-buffer UAV, then the sampler.
CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(pResHeap->GetCPUDescriptorHandleForHeapStart());
CreateTex2DSRV(pDevice, cpuHandle, viewFormat, pTexResource);
CreateStructUAV(pDevice, cpuHandle, 4*valueSize, sizeof(GatherType), pOutputResource);
CreateDefaultSamplers(pDevice, pSampHeap->GetCPUDescriptorHandleForHeapStart(),
filters, nullptr /*perSampleBorderColors*/, 1);
// Set Heaps, Rootsignature and table
ID3D12DescriptorHeap *const pHeaps[2] = { pResHeap, pSampHeap };
pCommandList->SetDescriptorHeaps(2, pHeaps);
pCommandList->SetComputeRootDescriptorTable(0, pResHeap->GetGPUDescriptorHandleForHeapStart());
pCommandList->SetComputeRootDescriptorTable(1, pSampHeap->GetGPUDescriptorHandleForHeapStart());
// dispatch and close shader
pCommandList->Dispatch(1, 1, 1);
// Copy the results back to readable memory
CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource,
pCommandList->ResourceBarrier(1, &barrier);
pCommandList->CopyResource(pOutputReadBuffer, pOutputResource);
pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
WaitForSignal(pCommandQueue, FO);
MappedData mappedData(pOutputReadBuffer, 4*valueSizeInBytes);
GatherType *pData = (GatherType*);
GatherType *texVals = (GatherType*)rawTex->GetElements();
UINT yCt = yDim;
UINT xCt = xDim;
// 64-bit fallback uses Load, which doesn't support clamp addressing. so don't test it
if (sizeof(GatherType) == 8) {
for (UINT y = 0; y < yCt; y++) {
// Neighbour row/column, clamped at the last row/column.
UINT yp1 = y+1>=yDim?y:y+1;
for (UINT x = 0; x < xCt; x++) {
UINT xp1 = x+1>=xDim?x:x+1;
// Because this order may be unexpected, I'll quote the spec:
// "The four samples that would contribute to filtering are placed into xyzw
// in counter clockwise order starting with the sample to the lower left"
// NOTE(review): the output index uses a hard-coded row stride of 32 (the
// shader's NumThreads x size) while the texture index uses xDim; the two
// only agree when xDim == 32 - confirm callers always pass 32-wide textures.
VERIFY_ARE_EQUAL(pData[4*(32*y + x)+0], texVals[yp1*xDim + x]);
VERIFY_ARE_EQUAL(pData[4*(32*y + x)+1], texVals[yp1*xDim + xp1]);
VERIFY_ARE_EQUAL(pData[4*(32*y + x)+2], texVals[y*xDim + xp1]);
VERIFY_ARE_EQUAL(pData[4*(32*y + x)+3], texVals[y*xDim + x]);
// Create textures of various types and alias them to the unsigned integer format
// that has the same element size and initializes them with various values,
// The shader code copies the results of raw gather to an unsigned integer UAV
// The UAV contents are compared to the values assigned to the texture
// A few levels of support are available:
// pre-6.7 fallback - fakey hand waving to make it look like it's doing the right thing
// 6.7 support only - No casting ability of resources to views beyond native support, but GatherRaw is available
// 6.7 + Enh. Barriers - Same formats can be cast as in native, but use new createcommittedresource3()
// 6.7 + Enh. Barriers + Relaxed Cast - All format casting and raw gathering of all
TEST_F(ExecutionTest, ATORawGather) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, sm))
if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
// Texture dimensions; these match the 32x32 thread group used by the gather shader.
static const int NumThreadsX = 32;
static const int NumThreadsY = 32;
static const int ThreadsPerGroup = NumThreadsX * NumThreadsY;
// Create an array of texture variants with the raw texture base class
// Then plug them into DoRawGather to perform the test and evaluate the results for each
// 64-bit texel formats (gathered as R32G32_UINT below).
RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R32G32_TYPELESS);
RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_UINT(false, false, NumThreadsX, DXGI_FORMAT_R32G32_UINT);
RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_SINT(true, false, NumThreadsX, DXGI_FORMAT_R32G32_SINT);
RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_TYPELESS);
RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_UINT(false, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_UINT);
RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_SINT(true, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_SINT);
RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R16G16B16A16_UNORM);
RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R16G16B16A16_SNORM);
RawFloatTexture<Float16RGBA, NumThreadsX, NumThreadsY> R16G16B16A16_FLOAT(DXGI_FORMAT_R16G16B16A16_FLOAT);
RawFloatTexture<Float32RG, NumThreadsX, NumThreadsY> R32G32_FLOAT(DXGI_FORMAT_R32G32_FLOAT);
RawGatherTexture *Int64Textures[] = {
// 32-bit texel formats (gathered as R32_UINT below).
RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R32_TYPELESS);
RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_SINT(true, false, NumThreadsX, DXGI_FORMAT_R32_SINT);
// NOTE(review): isSigned is true for a UINT format here - confirm whether this is intentional.
RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_UINT(true, false, NumThreadsX, DXGI_FORMAT_R32_UINT);
RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R10G10B10A2_TYPELESS);
RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R10G10B10A2_UNORM);
RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_UINT(false, false, NumThreadsX, DXGI_FORMAT_R10G10B10A2_UINT);
RawR10G10B10XRA2Texture<NumThreadsX, NumThreadsY> R10G10B10A2_XR_BIAS_A2_UNORM(NumThreadsX, DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM);
RawIntTexture<FloatRGBE, NumThreadsX, NumThreadsY> R9G9B9E5_SHAREDEXP(false, false, NumThreadsX, DXGI_FORMAT_R9G9B9E5_SHAREDEXP);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_TYPELESS);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UNORM);
// NOTE(review): the variable is named _UNORM_SRGB but is given the non-sRGB
// DXGI_FORMAT_R8G8B8A8_UNORM, unlike the B8G8R8*_SRGB entries below - confirm.
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UNORM);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UINT(false, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UINT);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_SNORM);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_SINT(true, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_SINT);
RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16G16_TYPELESS);
RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R16G16_UNORM);
RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_UINT(false, false, NumThreadsX, DXGI_FORMAT_R16G16_UINT);
RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R16G16_SNORM);
RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_SINT(true, false, NumThreadsX, DXGI_FORMAT_R16G16_SINT);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_B8G8R8A8_TYPELESS);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8A8_UNORM);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8A8_UNORM_SRGB);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_B8G8R8X8_TYPELESS);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8X8_UNORM);
RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8X8_UNORM_SRGB);
RawFloatTexture<Float32R, NumThreadsX, NumThreadsY> R32_FLOAT(DXGI_FORMAT_R32_FLOAT);
RawFloatR11G11B10ATexture<NumThreadsX, NumThreadsY> R11G11B10_FLOAT;
RawFloatTexture<Float16RG, NumThreadsX, NumThreadsY> R16G16_FLOAT(DXGI_FORMAT_R16G16_FLOAT);
RawGatherTexture *Int32Textures[] = {
// 16-bit texel formats (gathered as R16_UINT below).
RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16_TYPELESS);
RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_SINT(true, false, NumThreadsX, DXGI_FORMAT_R16_SINT);
// NOTE(review): isSigned is true for a UINT format here - confirm whether this is intentional.
RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_UINT(true, false, NumThreadsX, DXGI_FORMAT_R16_UINT);
RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R16_UNORM);
RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R16_SNORM);
RawFloatTexture<Float16R, NumThreadsX, NumThreadsY> R16_FLOAT(DXGI_FORMAT_R16_FLOAT);
RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R8G8_TYPELESS);
RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_UINT(false, false, NumThreadsX, DXGI_FORMAT_R8G8_UINT);
RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_SINT(true, false, NumThreadsX, DXGI_FORMAT_R8G8_SINT);
RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R8G8_UNORM);
RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R8G8_SNORM);
RawIntTexture<IntRGB<5, 6, 5>, NumThreadsX, NumThreadsY> B5G6R5_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B5G6R5_UNORM);
RawIntTexture<IntRGBA<5, 5, 5, 1>, NumThreadsX, NumThreadsY> B5G5R5A1_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B5G5R5A1_UNORM);
RawIntTexture<IntRGBA<4, 4, 4, 4>, NumThreadsX, NumThreadsY> B4G4R4A4_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B4G4R4A4_UNORM);
RawGatherTexture *Int16Textures[] = {
// Without relaxed format casting only the leading entries of each array are run.
bool canCast = DoesDeviceSupportRelaxedFormatCasting(pDevice);
int int32Ct = canCast? _countof(Int32Textures) : 3; // The first three are already castable to UINT32
for (int i = 0; i < int32Ct; i++) {
DoRawGatherTest<uint32_t>(pDevice, Int32Textures[i], DXGI_FORMAT_R32_UINT);
// The 16-bit pass runs only when the device reports native 16-bit ops.
if (DoesDeviceSupportNative16bitOps(pDevice)) {
int int16Ct = canCast? _countof(Int16Textures) : 5; // The first five are already castable to UINT16
for (int i = 0; i < int16Ct; i++) {
DoRawGatherTest<uint16_t>(pDevice, Int16Textures[i], DXGI_FORMAT_R16_UINT);
// The 64-bit pass runs only when the device reports int64 support.
if (DoesDeviceSupportInt64(pDevice)) {
int int64Ct = canCast? _countof(Int64Textures) : 3; // The first three are already castable to UINT64
for (int i = 0; i < int64Ct; i++) {
DoRawGatherTest<uint64_t>(pDevice, Int64Textures[i], DXGI_FORMAT_R32G32_UINT);
// Executing a simple binop to verify shader model 6.1 support; runs with
// ShaderModel61.CoreRequirement
// NOTE(review): the test body is not visible here; presumably it forwards to
// RunBasicShaderModelTest(D3D_SHADER_MODEL_6_1) - confirm.
TEST_F(ExecutionTest, BasicShaderModel61) {
// Executing a simple binop to verify shader model 6.3 support; runs with
// ShaderModel63.CoreRequirement
// NOTE(review): the test body is not visible here; presumably it forwards to
// RunBasicShaderModelTest(D3D_SHADER_MODEL_6_3) - confirm.
TEST_F(ExecutionTest, BasicShaderModel63) {
// Runs the basic "input1 + input2" shader for the given shader model with
// float data, and with double and int64_t data when the device supports them.
// Only D3D_SHADER_MODEL_6_1 and D3D_SHADER_MODEL_6_3 map to a compute target.
void ExecutionTest::RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, shaderModel)) {
// Compute target profile string matching the requested shader model.
// NOTE(review): string literals are bound to non-const char* here and for sTy below.
char *pShaderModelStr;
if (shaderModel == D3D_SHADER_MODEL_6_1) {
pShaderModelStr = "cs_6_1";
} else if (shaderModel == D3D_SHADER_MODEL_6_3) {
pShaderModelStr = "cs_6_3";
} else {
// NOTE(review): the argument is a non-null string literal, so if
// DXASSERT_NOMSG tests its argument as a condition this can never fire - confirm.
DXASSERT_NOMSG("Invalid Shader Model Parameter");
pShaderModelStr = nullptr;
// HLSL source with three %s slots, all filled with the element type name.
const char shaderTemplate[] =
"struct SBinaryOp { %s input1; %s input2; %s output; };"
"RWStructuredBuffer<SBinaryOp> g_buf : register(u0);"
"void main(uint GI : SV_GroupIndex) {"
" SBinaryOp l = g_buf[GI];"
" l.output = l.input1 + l.input2;"
" g_buf[GI] = l;"
// Room for the template plus the three substituted type names.
char shader[sizeof(shaderTemplate) + 50];
// Run simple shader with float data types
char* sTy = "float";
// Inputs are consumed as consecutive (input1, input2) pairs.
float inputFloatPairs[] = { 1.5f, -2.8f, 3.23e-5f, 6.0f, 181.621f, 14.978f };
VERIFY_IS_TRUE(sprintf(shader, shaderTemplate, sTy, sTy, sTy) > 0);
WEX::Logging::Log::Comment(L"BasicShaderModel float");
RunBasicShaderModelTest<float>(pDevice, pShaderModelStr, shader, inputFloatPairs, sizeof(inputFloatPairs) / (2 * sizeof(float)));
// Run simple shader with double data types
if (DoesDeviceSupportDouble(pDevice)) {
sTy = "double";
// NOTE(review): `1 / 3` is integer division and evaluates to 0, not 0.333... - confirm intent.
double inputDoublePairs[] = { 1.5891020, -2.8, 3.23e-5, 1 / 3, 181.91621, 14.654978 };
VERIFY_IS_TRUE(sprintf(shader, shaderTemplate, sTy, sTy, sTy) > 0);
WEX::Logging::Log::Comment(L"BasicShaderModel double");
RunBasicShaderModelTest<double>(pDevice, pShaderModelStr, shader, inputDoublePairs, sizeof(inputDoublePairs) / (2 * sizeof(double)));
else {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support double operations.");
// Run simple shader with int64 types
if (DoesDeviceSupportInt64(pDevice)) {
sTy = "int64_t";
int64_t inputInt64Pairs[] = { 1, -100, 6814684, -9814810, 654, 1021248900 };
VERIFY_IS_TRUE(sprintf(shader, shaderTemplate, sTy, sTy, sTy) > 0);
WEX::Logging::Log::Comment(L"BasicShaderModel int64_t");
RunBasicShaderModelTest<int64_t>(pDevice, pShaderModelStr, shader, inputInt64Pairs, sizeof(inputInt64Pairs) / (2 * sizeof(int64_t)));
else {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
// Fallback for element types that have no dedicated log format. Reaching it
// is a programming error, so it asserts and returns an empty wide string.
template <class Ty>
const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString() {
  // A bare string literal is always non-null and would never trip the
  // assert; negating it yields false while keeping the message visible.
  DXASSERT_NOMSG(!"Unsupported type");
  // The function returns const wchar_t*, so the literal must be wide.
  return L"";
}
// Log format for float elements: element index, both inputs, the shader
// output and the expected value.
template <>
const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<float>() {
  // The second operand is labelled input2 (it was mislabelled input1).
  return L"element #%u: input1 = %6.8f, input2 = %6.8f, output = %6.8f, expected = %6.8f";
}
// Log format for double elements: identical to the float format, so the
// float specialization is reused.
template <>
const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<double>() {
  const wchar_t *const floatFormat = BasicShaderModelTest_GetFormatString<float>();
  return floatFormat;
}
// Log format for int64_t elements: element index, both inputs, the shader
// output and the expected value.
template <>
const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<int64_t>() {
  // %lld is required for 64-bit arguments: `long` (%ld) is only 32 bits on
  // Windows. The second operand is labelled input2 (it was mislabelled input1).
  return L"element #%u: input1 = %lld, input2 = %lld, output = %lld, expected = %lld";
}
// Runs the "BinaryFPOp" shader op from ShaderOpArith.xml with the supplied
// shader source and target, then checks that each output equals
// input1 + input2.
//   pShaderModelStr - target profile string assigned to the shader op
//   pShader         - shader source text assigned to the shader op
//   pInputDataPairs - inputDataCount consecutive (input1, input2) pairs
//   inputDataCount  - number of pairs
template <class Ty>
void ExecutionTest::RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice, const char *pShaderModelStr, const char *pShader,
Ty *pInputDataPairs, unsigned inputDataCount) {
// CPU-side mirror of the SBinaryOp struct declared in the shader source.
struct SBinaryOp {
Ty input1;
Ty input2;
Ty output;
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryFPOp",
// this callback is called when the test is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
pShaderOp-> = pShaderModelStr;
pShaderOp-> = pShader;
size_t size = sizeof(SBinaryOp) * inputDataCount;
SBinaryOp *pPrimitives = (SBinaryOp*);
// Copy each input pair into its record; the output field is not written here.
Ty *pIn = pInputDataPairs;
for (size_t i = 0; i < inputDataCount; i++, pIn += 2) {
SBinaryOp *p = &pPrimitives[i];
p->input1 = pIn[0];
p->input2 = pIn[1];
MappedData data;
test->Test->GetReadBackData("SBinaryFPOp", &data);
SBinaryOp *pPrimitives = (SBinaryOp*);
const wchar_t* formatStr = BasicShaderModelTest_GetFormatString<Ty>();
// Recompute the sum on the CPU and compare exactly with the shader output.
Ty *pIn = pInputDataPairs;
for (unsigned i = 0; i < inputDataCount; i++, pIn += 2) {
Ty expValue = pIn[0] + pIn[1];
SBinaryOp *p = &pPrimitives[i];
LogCommentFmt(formatStr, i, pIn[0], pIn[1], p->output, expValue);
VERIFY_ARE_EQUAL(p->output, expValue);
// Resource structure for data-driven tests.
// Unary float op record: one float input followed by one float output.
struct SUnaryFPOp {
float input;
float output;
// Binary float op record: two float inputs followed by two float outputs.
struct SBinaryFPOp {
float input1;
float input2;
float output1;
float output2;
// Three-operand float op record: three float inputs followed by one float output.
struct STertiaryFPOp {
float input1;
float input2;
float input3;
float output;
// Unary half op record: one 16-bit input followed by one 16-bit output.
// presumably each uint16_t holds a half-float bit pattern - TODO confirm
struct SUnaryHalfOp {
uint16_t input;
uint16_t output;
// Binary half op record: two 16-bit inputs followed by two 16-bit outputs.
// presumably each uint16_t holds a half-float bit pattern - TODO confirm
struct SBinaryHalfOp {
uint16_t input1;
uint16_t input2;
uint16_t output1;
uint16_t output2;
// Three-operand half op record: three 16-bit inputs followed by one 16-bit output.
// presumably each uint16_t holds a half-float bit pattern - TODO confirm
struct STertiaryHalfOp {
uint16_t input1;
uint16_t input2;
uint16_t input3;
uint16_t output;
// Unary signed int op record: one int input followed by one int output.
struct SUnaryIntOp {
int input;
int output;
// Unary unsigned int op record: one unsigned input followed by one unsigned output.
struct SUnaryUintOp {
unsigned int input;
unsigned int output;
// Binary signed int op record: two int inputs followed by two int outputs.
struct SBinaryIntOp {
int input1;
int input2;
int output1;
int output2;
// Three-operand signed int op record: three int inputs followed by one int output.
struct STertiaryIntOp {
int input1;
int input2;
int input3;
int output;
struct SBinaryUintOp {
unsigned int input1;
unsigned int input2;
unsigned int output1;
unsigned int output2;
struct STertiaryUintOp {
unsigned int input1;
unsigned int input2;
unsigned int input3;
unsigned int output;
// 16-bit integer op records, in signed (short) and unsigned flavours.
// Single-operand signed record.
struct SUnaryInt16Op {
short input;
short output;
// Single-operand unsigned record.
struct SUnaryUint16Op {
unsigned short input;
unsigned short output;
// Two-operand signed record with two result slots.
struct SBinaryInt16Op {
short input1;
short input2;
short output1;
short output2;
// Three-operand signed record with one result slot.
struct STertiaryInt16Op {
short input1;
short input2;
short input3;
short output;
// Two-operand unsigned record with two result slots.
struct SBinaryUint16Op {
unsigned short input1;
unsigned short input2;
unsigned short output1;
unsigned short output2;
// Three-operand unsigned record with one result slot.
struct STertiaryUint16Op {
unsigned short input1;
unsigned short input2;
unsigned short input3;
unsigned short output;
// representation for HLSL float vectors
// Dot-product record: two four-component float vectors and three float result
// slots (o_dot2, o_dot3, o_dot4).
// NOTE(review): presumably the results of the 2-, 3- and 4-component dot
// products respectively — confirm against the shader.
struct SDotOp {
XMFLOAT4 input1;
XMFLOAT4 input2;
float o_dot2;
float o_dot3;
float o_dot4;
// Pair of 16-bit values (x, y) held as raw uint16_t.
struct Half2
uint16_t x;
uint16_t y;
// Default construction and all copy/move operations are compiler-defaulted.
Half2() = default;
Half2(const Half2&) = default;
Half2& operator=(const Half2&) = default;
Half2(Half2&&) = default;
Half2& operator=(Half2&&) = default;
// Builds the pair from two explicit components.
constexpr Half2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {}
// Builds the pair from the first two elements of pArray.
explicit Half2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
// Record with two Half2 inputs, a float accumulator and a float result slot.
struct SDot2AddHalfOp {
Half2 input1;
Half2 input2;
float acc;
float result;
// Record with two 32-bit inputs and a signed 32-bit accumulator/result.
// NOTE(review): the name suggests each input packs four signed 8-bit lanes —
// confirm against the shader.
struct SDot4AddI8PackedOp {
uint32_t input1;
uint32_t input2;
int32_t acc;
int32_t result;
// Unsigned counterpart of SDot4AddI8PackedOp: accumulator and result are
// unsigned 32-bit.
struct SDot4AddU8PackedOp {
uint32_t input1;
uint32_t input2;
uint32_t acc;
uint32_t result;
// Record with a 32-bit reference, a two-component source, and four-component
// accumulator and result vectors.
struct SMsad4 {
unsigned int ref;
XMUINT2 src;
XMUINT4 accum;
XMUINT4 result;
// Packed results: one 32-bit word per variant (uint32/int32/uint16/int16
// sources, each in a plain and a "clamped" form).
struct SPackUnpackOpOutPacked
uint32_t packedUint32;
uint32_t packedInt32;
uint32_t packedUint16;
uint32_t packedInt16;
uint32_t packedClampedUint32;
uint32_t packedClampedInt32;
uint32_t packedClampedUint16;
uint32_t packedClampedInt16;
// Unpacked results: four elements per variant, typed to match the variant
// name, again in plain and "clamped" forms.
struct SPackUnpackOpOutUnpacked {
std::array<uint32_t, 4> outputUint32;
std::array<int32_t, 4> outputInt32;
std::array<uint16_t, 4> outputUint16;
std::array<int16_t, 4> outputInt16;
std::array<uint32_t, 4> outputClampedUint32;
std::array<int32_t, 4> outputClampedInt32;
std::array<uint16_t, 4> outputClampedUint16;
std::array<int16_t, 4> outputClampedInt16;
// Parameter representation for taef data-driven tests
// One named parameter of a data-driven test row. The first three members
// (name, type, required) are what the static parameter tables initialize;
// the remaining members receive the parsed value for the declared type.
struct TableParameter {
// Name used to look the parameter up in the test data.
LPCWSTR m_name;
// Kind of value stored under m_name (scalar kinds and *_TABLE array kinds).
enum TableParameterType {
TableParameterType m_type;
bool m_required; // required parameter
// Scalar storage, one slot per supported scalar kind.
int8_t m_int8;
int16_t m_int16;
int m_int32;
unsigned int m_uint;
float m_float;
uint16_t m_half; // C++ has no half type; the value is kept in a uint16_t
double m_double;
bool m_bool;
WEX::Common::String m_str;
// Array storage, one vector per supported *_TABLE kind.
std::vector<int8_t> m_int8Table;
std::vector<int16_t> m_int16Table;
std::vector<int> m_int32Table;
std::vector<uint8_t> m_uint8Table;
std::vector<uint16_t> m_uint16Table;
std::vector<unsigned int> m_uint32Table;
std::vector<float> m_floatTable;
std::vector<uint16_t> m_halfTable; // C++ has no half type; values are kept as uint16_t
std::vector<double> m_doubleTable;
std::vector<bool> m_boolTable;
std::vector<WEX::Common::String> m_StringTable;
// Wraps an array of TableParameter entries and provides lookup by parameter
// name, both for whole entries and for their typed value vectors.
class TableParameterHandler {
// Fills every entry from the current test-data row; defined out of line.
HRESULT ParseTableRow();
// Caller-supplied parameter array and its element count. Nothing visible in
// this class allocates or frees the array.
TableParameter* m_table;
size_t m_tableSize;
// Stores the table pointer and its element count.
TableParameterHandler(TableParameter *pTable, size_t size) : m_table(pTable), m_tableSize(size) {
// Returns the entry whose m_name equals `name`, compared case-insensitively
// with _wcsicmp. Asserts and returns nullptr when no entry matches.
TableParameter* GetTableParamByName(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &m_table[i];
// NOTE(review): `name` is a wide string but is formatted with %s in a narrow
// format literal — confirm DXASSERT_ARGS prints it as intended. The same
// pattern recurs in every GetDataArray specialization below.
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// Resets the m_int32, m_uint, m_double, m_bool and m_str members of every
// entry. The other scalar members and the value vectors are not touched here.
void clearTableParameter() {
for (size_t i = 0; i < m_tableSize; ++i) {
m_table[i].m_int32 = 0;
m_table[i].m_uint = 0;
m_table[i].m_double = 0;
m_table[i].m_bool = false;
m_table[i].m_str = WEX::Common::String();
// Generic fallback for element types without a specialization: always
// returns nullptr.
template <class T1>
std::vector<T1> *GetDataArray(LPCWSTR name) {
return nullptr;
// Each specialization below returns a pointer to the matching value vector
// of the entry named `name` (case-insensitive), or asserts and returns
// nullptr when no entry has that name. The entry's m_type is not checked.
// int -> m_int32Table
template <>
std::vector<int> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_int32Table);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// int8_t -> m_int8Table
template <>
std::vector<int8_t> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_int8Table);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// int16_t -> m_int16Table
template <>
std::vector<int16_t> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_int16Table);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// unsigned int -> m_uint32Table
template <>
std::vector<unsigned int> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_uint32Table);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// float -> m_floatTable
template <>
std::vector<float> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_floatTable);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// uint16_t -> m_halfTable (not m_uint16Table).
// TODO: uint16_t may be used to represent two different types when we introduce uint16
template <>
std::vector<uint16_t> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_halfTable);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// double -> m_doubleTable
template <>
std::vector<double> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_doubleTable);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// bool -> m_boolTable
template <>
std::vector<bool> *GetDataArray(LPCWSTR name) {
for (size_t i = 0; i < m_tableSize; ++i) {
if (_wcsicmp(name, m_table[i].m_name) == 0) {
return &(m_table[i].m_boolTable);
DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
return nullptr;
// Parameter tables for the float op tests. Each entry is
// { parameter name, value kind, required }.
// Unary float ops: one input column and one expected column.
static TableParameter UnaryFPOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
{ L"Warp.Version", TableParameter::UINT, false }
// Binary float ops: two input columns; the second expected column is optional.
static TableParameter BinaryFPOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Input2", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Expected2", TableParameter::FLOAT_TABLE, false },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Tertiary float ops: three input columns and one expected column.
static TableParameter TertiaryFPOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Input2", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Input3", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Parameter tables for the half-precision op tests. Each entry is
// { parameter name, value kind, required }. Unlike the float tables, these
// also require a ShaderOp.Arguments string.
// Unary half ops.
static TableParameter UnaryHalfOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::HALF_TABLE, true },
{ L"Validation.Expected1", TableParameter::HALF_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
{ L"Warp.Version", TableParameter::UINT, false }
// Binary half ops; the second expected column is optional.
static TableParameter BinaryHalfOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::HALF_TABLE, true },
{ L"Validation.Input2", TableParameter::HALF_TABLE, true },
{ L"Validation.Expected1", TableParameter::HALF_TABLE, true },
{ L"Validation.Expected2", TableParameter::HALF_TABLE, false },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Tertiary half ops.
static TableParameter TertiaryHalfOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::HALF_TABLE, true },
{ L"Validation.Input2", TableParameter::HALF_TABLE, true },
{ L"Validation.Input3", TableParameter::HALF_TABLE, true },
{ L"Validation.Expected1", TableParameter::HALF_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Parameter tables for the 32-bit integer op tests. Each entry is
// { parameter name, value kind, required }. The tolerance is an INT32 and
// there is no Validation.Type column.
// Unary signed ops.
static TableParameter UnaryIntOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT32_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Unary unsigned ops.
static TableParameter UnaryUintOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Binary signed ops; the second expected column is optional.
static TableParameter BinaryIntOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT32_TABLE, true },
{ L"Validation.Input2", TableParameter::INT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT32_TABLE, true },
{ L"Validation.Expected2", TableParameter::INT32_TABLE, false },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Tertiary signed ops.
static TableParameter TertiaryIntOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT32_TABLE, true },
{ L"Validation.Input2", TableParameter::INT32_TABLE, true },
{ L"Validation.Input3", TableParameter::INT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT32_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Binary unsigned ops; the second expected column is optional.
static TableParameter BinaryUintOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Expected2", TableParameter::UINT32_TABLE, false },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Tertiary unsigned ops.
static TableParameter TertiaryUintOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input3", TableParameter::UINT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Parameter tables for the 16-bit integer op tests. Each entry is
// { parameter name, value kind, required }. These mirror the 32-bit tables
// but use 16-bit table kinds and additionally require ShaderOp.Arguments.
// Unary signed 16-bit ops.
static TableParameter UnaryInt16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT16_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Unary unsigned 16-bit ops.
static TableParameter UnaryUint16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Binary signed 16-bit ops; the second expected column is optional.
static TableParameter BinaryInt16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT16_TABLE, true },
{ L"Validation.Input2", TableParameter::INT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT16_TABLE, true },
{ L"Validation.Expected2", TableParameter::INT16_TABLE, false },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Tertiary signed 16-bit ops.
static TableParameter TertiaryInt16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::INT16_TABLE, true },
{ L"Validation.Input2", TableParameter::INT16_TABLE, true },
{ L"Validation.Input3", TableParameter::INT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT16_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Binary unsigned 16-bit ops; the second expected column is optional.
static TableParameter BinaryUint16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Expected2", TableParameter::UINT16_TABLE, false },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Tertiary unsigned 16-bit ops.
static TableParameter TertiaryUint16OpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT16_TABLE, true },
{ L"Validation.Input3", TableParameter::UINT16_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
{ L"Validation.Tolerance", TableParameter::INT32, true },
// Parameter tables for the dot-product and msad4 tests. Each entry is
// { parameter name, value kind, required }.
// Dot product: inputs and the three expected columns are string tables.
// NOTE(review): presumably each string holds a comma-separated vector that is
// parsed later — confirm against the test that consumes this table.
static TableParameter DotOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::STRING_TABLE, true },
{ L"Validation.Input2", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected1", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected2", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected3", TableParameter::STRING_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// dot2add on halves: two string-table inputs, a float accumulator column
// (Input3) and a float expected column.
static TableParameter Dot2AddHalfOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::STRING_TABLE, true },
{ L"Validation.Input2", TableParameter::STRING_TABLE, true },
{ L"Validation.Input3", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Packed signed 8-bit dot4add: unsigned inputs, signed Input3 and expected
// columns. No tolerance column.
static TableParameter Dot4AddI8PackedOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input3", TableParameter::INT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::INT32_TABLE, true },
// Packed unsigned 8-bit dot4add: all columns unsigned. No tolerance column.
static TableParameter Dot4AddU8PackedOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input2", TableParameter::UINT32_TABLE, true },
{ L"Validation.Input3", TableParameter::UINT32_TABLE, true },
{ L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
// msad4: a uint32 column (Input1) plus string-table Input2, Input3 and
// expected columns. Has no ShaderOp.Target entry.
static TableParameter Msad4OpParameters[] = {
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
{ L"Validation.Input1", TableParameter::UINT32_TABLE, true},
{ L"Validation.Input2", TableParameter::STRING_TABLE, true },
{ L"Validation.Input3", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected1", TableParameter::STRING_TABLE, true }
// Parameter tables for the wave-intrinsic tests. Each entry is
// { parameter name, value kind, required }. The Active/Prefix tables take a
// set count plus up to four input sets, of which only the first is required.
// Active-lane ops on signed integers.
static TableParameter WaveIntrinsicsActiveIntParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.NumInputSet", TableParameter::UINT, true },
{ L"Validation.InputSet1", TableParameter::INT32_TABLE, true },
{ L"Validation.InputSet2", TableParameter::INT32_TABLE, false },
{ L"Validation.InputSet3", TableParameter::INT32_TABLE, false },
{ L"Validation.InputSet4", TableParameter::INT32_TABLE, false }
// Prefix ops on signed integers.
static TableParameter WaveIntrinsicsPrefixIntParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.NumInputSet", TableParameter::UINT, true },
{ L"Validation.InputSet1", TableParameter::INT32_TABLE, true },
{ L"Validation.InputSet2", TableParameter::INT32_TABLE, false },
{ L"Validation.InputSet3", TableParameter::INT32_TABLE, false },
{ L"Validation.InputSet4", TableParameter::INT32_TABLE, false }
// Active-lane ops on unsigned integers.
static TableParameter WaveIntrinsicsActiveUintParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.NumInputSet", TableParameter::UINT, true },
{ L"Validation.InputSet1", TableParameter::UINT32_TABLE, true },
{ L"Validation.InputSet2", TableParameter::UINT32_TABLE, false },
{ L"Validation.InputSet3", TableParameter::UINT32_TABLE, false },
{ L"Validation.InputSet4", TableParameter::UINT32_TABLE, false }
// Prefix ops on unsigned integers.
static TableParameter WaveIntrinsicsPrefixUintParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.NumInputSet", TableParameter::UINT, true },
{ L"Validation.InputSet1", TableParameter::UINT32_TABLE, true },
{ L"Validation.InputSet2", TableParameter::UINT32_TABLE, false },
{ L"Validation.InputSet3", TableParameter::UINT32_TABLE, false },
{ L"Validation.InputSet4", TableParameter::UINT32_TABLE, false }
// Multi-prefix ops on signed integers: a keys column and a values column.
static TableParameter WaveIntrinsicsMultiPrefixIntParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Keys", TableParameter::INT32_TABLE, true },
{ L"Validation.Values", TableParameter::INT32_TABLE, true },
// Multi-prefix ops on unsigned integers: a keys column and a values column.
static TableParameter WaveIntrinsicsMultiPrefixUintParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Keys", TableParameter::UINT32_TABLE, true },
{ L"Validation.Values", TableParameter::UINT32_TABLE, true },
// Active-lane ops on booleans: up to three input sets, only the first required.
static TableParameter WaveIntrinsicsActiveBoolParameters[] = {
{ L"ShaderOp.Name", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.NumInputSet", TableParameter::UINT, true },
{ L"Validation.InputSet1", TableParameter::BOOL_TABLE, true },
{ L"Validation.InputSet2", TableParameter::BOOL_TABLE, false },
{ L"Validation.InputSet3", TableParameter::BOOL_TABLE, false },
// Each entry below is { parameter name, value kind, required }.
// Constant-buffer half test: a single required half-table input set.
static TableParameter CBufferTestHalfParameters[] = {
{ L"Validation.InputSet", TableParameter::HALF_TABLE, true },
// Denorm binary float ops: inputs and expected values are string tables; the
// second expected column is optional.
static TableParameter DenormBinaryFPOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::STRING_TABLE, true },
{ L"Validation.Input2", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected1", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected2", TableParameter::STRING_TABLE, false },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Denorm tertiary float ops: as above with a third input column.
static TableParameter DenormTertiaryFPOpParameters[] = {
{ L"ShaderOp.Target", TableParameter::STRING, true },
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"ShaderOp.Arguments", TableParameter::STRING, true },
{ L"Validation.Input1", TableParameter::STRING_TABLE, true },
{ L"Validation.Input2", TableParameter::STRING_TABLE, true },
{ L"Validation.Input3", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected1", TableParameter::STRING_TABLE, true },
{ L"Validation.Expected2", TableParameter::STRING_TABLE, false },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::DOUBLE, true },
// Pack/unpack ops: a single uint32 input column and a UINT tolerance.
static TableParameter PackUnpackOpParameters[] = {
{ L"ShaderOp.Text", TableParameter::STRING, true },
{ L"Validation.Type", TableParameter::STRING, true },
{ L"Validation.Tolerance", TableParameter::UINT, true },
{ L"Validation.Input", TableParameter::UINT32_TABLE, true },
// Recognizes a literal bit pattern written as "0x<hex digits>" or
// "0b<binary digits>" (lower-case prefix only, spaces ignored) and stores its
// low 16 bits in *value.
//   str   - null-terminated text to inspect.
//   value - receives the parsed bits; untouched when the function returns false.
// Returns true when str starts with one of the two prefixes, false otherwise.
static bool IsHexString(PCWSTR str, uint16_t *value) {
  std::wstring wString(str);
  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
  LPCWSTR wstr = wString.c_str();
  if (wcsncmp(wstr, L"0x", 2) == 0) {
    // Base 0 lets wcstol consume the "0x" prefix itself.
    *value = (uint16_t)wcstol(wstr, nullptr, 0);
    return true;
  }
  if (wcsncmp(wstr, L"0b", 2) == 0) {
    // wcstol has no notion of a "0b" prefix (base 0 would stop at the 'b' and
    // yield 0), so skip the prefix and parse the digits explicitly as base 2.
    *value = (uint16_t)wcstol(wstr + 2, nullptr, 2);
    return true;
  }
  return false;
}
// Converts one test-data token to a float.
// Spaces are ignored. Recognized spellings, compared case-insensitively:
//   "NaN", "inf", "-inf"   - the corresponding special values
//   "denorm", "-denorm"    - +/- FLT_MIN / 2
//   "0", "0.0", "0.0f" and their '-' forms - positive / negative zero
//   "0x..."                - the hex digits are taken as the raw 32-bit pattern
// Anything else is parsed as a decimal floating-point number.
//   str   - null-terminated token.
//   value - receives the result; untouched on failure.
// Returns S_OK on success, E_FAIL (after logging) when the token does not
// start with a number.
static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
  std::wstring wString(str);
  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
  PCWSTR wstr = wString.c_str();
  if (_wcsicmp(wstr, L"NaN") == 0) {
    value = NAN;
  } else if (_wcsicmp(wstr, L"-inf") == 0) {
    value = -(INFINITY);
  } else if (_wcsicmp(wstr, L"inf") == 0) {
    value = INFINITY;
  } else if (_wcsicmp(wstr, L"-denorm") == 0) {
    value = -(FLT_MIN / 2);
  } else if (_wcsicmp(wstr, L"denorm") == 0) {
    value = FLT_MIN / 2;
  } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
             _wcsicmp(wstr, L"-0") == 0) {
    value = -0.0f;
  } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
             _wcsicmp(wstr, L"0") == 0) {
    value = 0.0f;
  } else if (_wcsnicmp(wstr, L"0x", 2) == 0) { // For hex values, take values literally
    unsigned temp_i = std::stoul(wstr, nullptr, 16);
    value = (float&)temp_i;
  } else {
    // Detect failure through the end pointer rather than by comparing the
    // result with zero: a zero result is also what legitimate spellings such
    // as "0.00" or "0e0" produce, and those must not be rejected.
    wchar_t *end = nullptr;
    const double val = std::wcstod(wstr, &end);
    if (end == wstr) {
      LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
      return E_FAIL;
    }
    value = (float)val;
  }
  return S_OK;
}
// Converts one test-data token to a signed int (decimal, spaces ignored).
// Trailing non-digit text is ignored, so "0.0" still yields 0.
//   str   - null-terminated token.
//   value - receives the result; untouched on failure.
// Returns S_OK on success, E_FAIL (after logging) when the token does not
// start with a decimal integer.
static HRESULT ParseDataToInt(PCWSTR str, int &value) {
  std::wstring wString(str);
  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
  PCWSTR wstr = wString.c_str();
  // Use the end pointer to detect failure. Treating a zero result as an error
  // would reject valid spellings of zero such as "00", "+0" or "-0".
  wchar_t *end = nullptr;
  const long parsed = std::wcstol(wstr, &end, 10);
  if (end == wstr) {
    LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
    return E_FAIL;
  }
  value = static_cast<int>(parsed);
  return S_OK;
}
// Converts one test-data token to an unsigned int (spaces ignored).
// The base is auto-detected by wcstoul, so "0x..." is read as hexadecimal and
// a leading "0" as octal.
//   str   - null-terminated token.
//   value - receives the result; untouched on failure.
// Returns S_OK on success, E_FAIL (after logging) when the token does not
// start with a number.
static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
  std::wstring wString(str);
  wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
  PCWSTR wstr = wString.c_str();
  // Use the end pointer to detect failure. Treating a zero result as an error
  // would reject valid spellings of zero such as "00" or "0x0".
  wchar_t *end = nullptr;
  const unsigned long parsed = std::wcstoul(wstr, &end, 0);
  if (end == wstr) {
    LogErrorFmt(L"Failed to parse parameter %s to unsigned int", wstr);
    return E_FAIL;
  }
  value = static_cast<unsigned int>(parsed);
  return S_OK;
}
// Parses `count` comma-separated float tokens from str into ptr[0..count).
// Each token is converted with ParseDataToFloat.
//   str   - null-terminated list such as "1.0, 2.0, 3.0, 4.0".
//   ptr   - destination array with room for `count` floats.
//   count - number of elements expected.
// Returns S_OK when all `count` elements were parsed; E_FAIL when a token is
// invalid or the list holds fewer than `count` elements.
static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
  const std::wstring wstr(str);
  size_t curPosition = 0;
  for (size_t i = 0; i < count; ++i) {
    if (curPosition == std::wstring::npos) {
      // The previous token was the last one. Without this check the position
      // would wrap to 0 and the list would silently be re-read from the start.
      LogErrorFmt(L"Expected %u comma-separated values in %s",
                  (unsigned int)count, str);
      return E_FAIL;
    }
    const size_t nextPosition = wstr.find(L',', curPosition);
    // When no further comma exists the length wraps past the end, which
    // substr clips to "rest of the string".
    if (FAILED(ParseDataToFloat(
            wstr.substr(curPosition, nextPosition - curPosition).c_str(),
            ptr[i]))) {
      return E_FAIL;
    }
    curPosition = (nextPosition == std::wstring::npos) ? std::wstring::npos
                                                       : nextPosition + 1;
  }
  return S_OK;
}
// Parses `count` comma-separated float tokens from str, converts each one to
// a 16-bit half with ConvertFloat32ToFloat16, and stores it in ptr[0..count).
//   str   - null-terminated list such as "1.0, 2.0".
//   ptr   - destination array with room for `count` values.
//   count - number of elements expected.
// Returns S_OK when all `count` elements were parsed; E_FAIL when a token is
// invalid or the list holds fewer than `count` elements.
static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
  const std::wstring wstr(str);
  size_t curPosition = 0;
  for (size_t i = 0; i < count; ++i) {
    if (curPosition == std::wstring::npos) {
      // The previous token was the last one. Without this check the position
      // would wrap to 0 and the list would silently be re-read from the start.
      LogErrorFmt(L"Expected %u comma-separated values in %s",
                  (unsigned int)count, str);
      return E_FAIL;
    }
    const size_t nextPosition = wstr.find(L',', curPosition);
    float floatValue;
    // When no further comma exists the length wraps past the end, which
    // substr clips to "rest of the string".
    if (FAILED(ParseDataToFloat(
            wstr.substr(curPosition, nextPosition - curPosition).c_str(),
            floatValue))) {
      return E_FAIL;
    }
    ptr[i] = ConvertFloat32ToFloat16(floatValue);
    curPosition = (nextPosition == std::wstring::npos) ? std::wstring::npos
                                                       : nextPosition + 1;
  }
  return S_OK;
}
// Parses `count` comma-separated unsigned tokens from str into ptr[0..count).
// Each token is converted with ParseDataToUint.
//   str   - null-terminated list such as "1, 0x2, 3, 4".
//   ptr   - destination array with room for `count` values.
//   count - number of elements expected.
// Returns S_OK when all `count` elements were parsed; E_FAIL when a token is
// invalid or the list holds fewer than `count` elements.
static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, size_t count) {
  const std::wstring wstr(str);
  size_t curPosition = 0;
  for (size_t i = 0; i < count; ++i) {
    if (curPosition == std::wstring::npos) {
      // The previous token was the last one. Without this check the position
      // would wrap to 0 and the list would silently be re-read from the start.
      LogErrorFmt(L"Expected %u comma-separated values in %s",
                  (unsigned int)count, str);
      return E_FAIL;
    }
    const size_t nextPosition = wstr.find(L',', curPosition);
    // When no further comma exists the length wraps past the end, which
    // substr clips to "rest of the string".
    if (FAILED(ParseDataToUint(
            wstr.substr(curPosition, nextPosition - curPosition).c_str(),
            ptr[i]))) {
      return E_FAIL;
    }
    curPosition = (nextPosition == std::wstring::npos) ? std::wstring::npos
                                                       : nextPosition + 1;
  }
  return S_OK;
}
HRESULT TableParameterHandler::ParseTableRow() {
TableParameter *table = m_table;
for (unsigned int i = 0; i < m_tableSize; ++i) {
switch (table[i].m_type) {
case TableParameter::INT8:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_int32)) && table[i].m_required) {
// TryGetValue does not suppport reading from int16
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
table[i].m_int8 = (int8_t)(table[i].m_int32);
case TableParameter::INT16:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_int32)) && table[i].m_required) {
// TryGetValue does not suppport reading from int16
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
table[i].m_int16 = (short)(table[i].m_int32);
case TableParameter::INT32:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_int32)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
case TableParameter::UINT:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_uint)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
case TableParameter::DOUBLE:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, table[i].m_double)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
case TableParameter::STRING:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_str)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
case TableParameter::BOOL:
if (FAILED(WEX::TestExecution::TestData::TryGetValue(table[i].m_name,
table[i].m_str)) && table[i].m_bool) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
case TableParameter::INT8_TABLE: {
WEX::TestExecution::TestDataArray<int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
// TryGetValue does not suppport reading from int8
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_int8Table[j] = (int8_t)tempTable[j];
case TableParameter::INT16_TABLE: {
WEX::TestExecution::TestDataArray<int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
// TryGetValue does not suppport reading from int8
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_int16Table[j] = (int16_t)tempTable[j];
}case TableParameter::INT32_TABLE: {
WEX::TestExecution::TestDataArray<int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_int32Table[j] = tempTable[j];
case TableParameter::UINT8_TABLE: {
WEX::TestExecution::TestDataArray<int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
// TryGetValue does not suppport reading from int8
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_int8Table[j] = (uint8_t)tempTable[j];
case TableParameter::UINT16_TABLE: {
WEX::TestExecution::TestDataArray<int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
// TryGetValue does not suppport reading from int8
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_uint16Table[j] = (uint16_t)tempTable[j];
case TableParameter::UINT32_TABLE: {
WEX::TestExecution::TestDataArray<unsigned int> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_uint32Table[j] = tempTable[j];
case TableParameter::FLOAT_TABLE: {
WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
ParseDataToFloat(tempTable[j], table[i].m_floatTable[j]);
case TableParameter::HALF_TABLE: {
WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
uint16_t value = 0;
if (IsHexString(tempTable[j], &value)) {
table[i].m_halfTable[j] = value;
else {
float val;
ParseDataToFloat(tempTable[j], val);
if (isdenorm(val))
table[i].m_halfTable[j] = signbit(val) ? Float16NegDenorm : Float16PosDenorm;
table[i].m_halfTable[j] = ConvertFloat32ToFloat16(val);
case TableParameter::DOUBLE_TABLE: {
WEX::TestExecution::TestDataArray<double> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_doubleTable[j] = tempTable[j];
case TableParameter::BOOL_TABLE: {
WEX::TestExecution::TestDataArray<bool> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not suppport reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_boolTable[j] = tempTable[j];
case TableParameter::STRING_TABLE: {
WEX::TestExecution::TestDataArray<WEX::Common::String> tempTable;
if (FAILED(WEX::TestExecution::TestData::TryGetValue(
table[i].m_name, tempTable)) && table[i].m_required) {
// TryGetValue does not support reading from int8
LogErrorFmt(L"Failed to get %s", table[i].m_name);
return E_FAIL;
for (size_t j = 0, end = tempTable.GetSize(); j != end; ++j) {
table[i].m_StringTable[j] = tempTable[j];
DXASSERT_NOMSG("Invalid Parameter Type");
if (errno == ERANGE) {
LogErrorFmt(L"got out of range value for table %s", table[i].m_name);
return E_FAIL;
return S_OK;
// Verifies that a signed integer output lies within `tolerance` of the
// reference value, i.e. |output - ref| <= tolerance.
//   output    - value produced by the shader.
//   ref       - expected value.
//   tolerance - largest accepted absolute difference.
// The difference is computed in 64 bits: evaluating `output - ref` in plain
// int overflows (undefined behavior) for far-apart operands such as INT_MAX
// and INT_MIN, and a wrapped result could look like it is within tolerance.
static void VerifyOutputWithExpectedValueInt(int output, int ref, int tolerance) {
  const long long diff =
      static_cast<long long>(output) - static_cast<long long>(ref);
  VERIFY_IS_TRUE(diff <= tolerance && -diff <= tolerance);
}
// Verifies that an unsigned integer output lies within `tolerance` of the
// reference value, i.e. |output - ref| <= tolerance.
//   output    - value produced by the shader.
//   ref       - expected value.
//   tolerance - largest accepted absolute difference.
// The absolute difference is formed by subtracting the smaller value from the
// larger one. Checking both `output - ref` and `ref - output` does not work
// for unsigned operands: whenever the values differ, one of the subtractions
// wraps around to a huge number, so only exact equality could ever pass.
static void VerifyOutputWithExpectedValueUInt(uint32_t output, uint32_t ref, uint32_t tolerance) {
  const uint32_t diff = (output > ref) ? (output - ref) : (ref - output);
  VERIFY_IS_TRUE(diff <= tolerance);
}
// Verifies that the x, y, z and w components of `output` are equal to the
// corresponding components of `ref`. Each component is checked with its own
// VERIFY_ARE_EQUAL, in x, y, z, w order; no tolerance is applied.
static void VerifyOutputWithExpectedValueUInt4(XMUINT4 output, XMUINT4 ref) {
VERIFY_ARE_EQUAL(output.x, ref.x);
VERIFY_ARE_EQUAL(output.y, ref.y);
VERIFY_ARE_EQUAL(output.z, ref.z);
VERIFY_ARE_EQUAL(output.w, ref.w);
// Verifies a float output against a reference using the comparison selected
// by `type` (matched case-insensitively):
//   "Relative" - CompareFloatRelativeEpsilon, tolerance truncated to int.
//   "Epsilon"  - CompareFloatEpsilon, tolerance converted to float.
//   "ULP"      - CompareFloatULP, tolerance truncated to int.
// Any other value of `type` logs an error and performs no comparison.
//   output    - value produced by the shader.
//   ref       - expected value.
//   tolerance - comparison tolerance; its meaning depends on `type`.
//   mode      - denorm mode forwarded to the comparison helper.
static void VerifyOutputWithExpectedValueFloat(
    float output, float ref, LPCWSTR type, double tolerance,
    hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
  if (_wcsicmp(type, L"Relative") == 0) {
    VERIFY_IS_TRUE(CompareFloatRelativeEpsilon(output, ref, (int)tolerance, mode));
  } else if (_wcsicmp(type, L"Epsilon") == 0) {
    VERIFY_IS_TRUE(CompareFloatEpsilon(output, ref, (float)tolerance, mode));
  } else if (_wcsicmp(type, L"ULP") == 0) {
    VERIFY_IS_TRUE(CompareFloatULP(output, ref, (int)tolerance, mode));
  } else {
    // `type` is a wide string and the format string is wide, so the matching
    // specifier is %s (as in the other LogErrorFmt calls that print wide
    // names); %S would treat the argument as a narrow string.
    LogErrorFmt(L"Failed to read comparison type %s", type);
  }
}
// Compares a float output against a reference using the comparison selected
// by `type` (matched case-insensitively) and returns the result instead of
// asserting on it:
//   "Relative" - CompareFloatRelativeEpsilon, tolerance truncated to int.
//   "Epsilon"  - CompareFloatEpsilon, tolerance converted to float.
//   "ULP"      - CompareFloatULP, tolerance truncated to int.
// Returns the helper's result, or false (after logging an error) when `type`
// is not one of the names above.
//   output    - value produced by the shader.
//   ref       - expected value.
//   tolerance - comparison tolerance; its meaning depends on `type`.
//   mode      - denorm mode forwarded to the comparison helper.
static bool CompareOutputWithExpectedValueFloat(
    float output, float ref, LPCWSTR type, double tolerance,
    hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
  if (_wcsicmp(type, L"Relative") == 0) {
    return CompareFloatRelativeEpsilon(output, ref, (int)tolerance, mode);
  } else if (_wcsicmp(type, L"Epsilon") == 0) {
    return CompareFloatEpsilon(output, ref, (float)tolerance, mode);
  } else if (_wcsicmp(type, L"ULP") == 0) {
    return CompareFloatULP(output, ref, (int)tolerance, mode);
  } else {
    // `type` is a wide string and the format string is wide, so the matching
    // specifier is %s; %S would treat the argument as a narrow string.
    LogErrorFmt(L"Failed to read comparison type %s", type);
    return false;
  }
}
// Verifies a half-precision output (raw 16-bit pattern) against a reference
// using the comparison selected by `type` (matched case-insensitively):
//   "Relative" - CompareHalfRelativeEpsilon, tolerance truncated to int.
//   "Epsilon"  - CompareHalfEpsilon, tolerance converted to float.
//   "ULP"      - CompareHalfULP, tolerance converted to float.
// Any other value of `type` logs an error and performs no comparison.
//   output    - 16-bit value produced by the shader.
//   ref       - expected 16-bit value.
//   tolerance - comparison tolerance; its meaning depends on `type`.
static void VerifyOutputWithExpectedValueHalf(
    uint16_t output, uint16_t ref, LPCWSTR type, double tolerance) {
  if (_wcsicmp(type, L"Relative") == 0) {
    VERIFY_IS_TRUE(CompareHalfRelativeEpsilon(output, ref, (int)tolerance));
  }
  else if (_wcsicmp(type, L"Epsilon") == 0) {
    VERIFY_IS_TRUE(CompareHalfEpsilon(output, ref, (float)tolerance));
  }
  else if (_wcsicmp(type, L"ULP") == 0) {
    VERIFY_IS_TRUE(CompareHalfULP(output, ref, (float)tolerance));
  }
  else {
    // `type` is a wide string and the format string is wide, so the matching
    // specifier is %s; %S would treat the argument as a narrow string.
    LogErrorFmt(L"Failed to read comparison type %s", type);
  }
}
// Table-driven test for unary float operations. Reads the shader target/text,
// the input and expected tables, and the comparison type/tolerance from
// UnaryFPOpParameters, runs the "UnaryFPOp" shader op with the stream read
// from ShaderOpArith.xml, and verifies every output element.
TEST_F(ExecutionTest, UnaryFloatOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
int tableSize = sizeof(UnaryFPOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryFPOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
// The WARP version from the table is only validated when WARP is in use.
unsigned int WarpVersion = handler.GetTableParamByName(L"Warp.Version")->m_uint;
if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(WarpVersion)) {
std::vector<float> *Validation_Input =
std::vector<float> *Validation_Expected =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// One element is processed per entry of the input table.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryFPOp"));
size_t size = sizeof(SUnaryFPOp) * count;
SUnaryFPOp *pPrimitives = (SUnaryFPOp *);
// count == Validation_Input->size(), so the modulo below never wraps.
for (size_t i = 0; i < count; ++i) {
SUnaryFPOp *p = &pPrimitives[i];
p->input = (*Validation_Input)[i % Validation_Input->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryFPOp", &data);
SUnaryFPOp *pPrimitives = (SUnaryFPOp*);
// NOTE(review): presumably lets the loop keep checking after a failed
// VERIFY instead of throwing -- confirm against TAEF documentation.
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryFPOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
float val = (*Validation_Expected)[i % Validation_Expected->size()];
L"element #%u, input = %6.8f, output = %6.8f, expected = %6.8f", i,
p->input, p->output, val);
VerifyOutputWithExpectedValueFloat(p->output, val, Validation_Type, Validation_Tolerance);
// Table-driven test for binary float operations. Reads the shader target/text,
// two input tables, up to two expected tables, and the comparison
// type/tolerance from BinaryFPOpParameters, runs the "BinaryFPOp" shader op
// with the stream read from ShaderOpArith.xml, and verifies one or two
// outputs per element depending on whether the second expected table is empty.
TEST_F(ExecutionTest, BinaryFloatOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
int tableSize = sizeof(BinaryFPOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryFPOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<float> *Validation_Input1 =
std::vector<float> *Validation_Input2 =
std::vector<float> *Validation_Expected1 =
std::vector<float> *Validation_Expected2 =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
size_t size = sizeof(SBinaryFPOp) * count;
SBinaryFPOp *pPrimitives = (SBinaryFPOp *);
// The second input repeats if its table is shorter than the first one.
for (size_t i = 0; i < count; ++i) {
SBinaryFPOp *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryFPOp", &data);
SBinaryFPOp *pPrimitives = (SBinaryFPOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
// An empty second expected table means the operation has a single output.
unsigned numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryFPOp *p = &pPrimitives[i];
float val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
float val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output1 = "
L"%6.8f, expected1 = %6.8f, output2 = %6.8f, expected2 = %6.8f",
i, p->input1, p->input2, p->output1, val1, p->output2,
VerifyOutputWithExpectedValueFloat(p->output1, val1, Validation_Type,
VerifyOutputWithExpectedValueFloat(p->output2, val2, Validation_Type,
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
SBinaryFPOp *p = &pPrimitives[i];
float val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output1 = "
L"%6.8f, expected1 = %6.8f",
i, p->input1, p->input2, p->output1, val1);
VerifyOutputWithExpectedValueFloat(p->output1, val1, Validation_Type,
else {
// numExpected is always 1 or 2 as computed above, so this branch is defensive.
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// Table-driven test for three-operand float operations. Reads the shader
// target/text, three input tables, one expected table, and the comparison
// type/tolerance from TertiaryFPOpParameters, runs the "TertiaryFPOp" shader
// op with the stream read from ShaderOpArith.xml, and verifies every output.
TEST_F(ExecutionTest, TertiaryFloatOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
int tableSize = sizeof(TertiaryFPOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryFPOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<float> *Validation_Input1 =
std::vector<float> *Validation_Input2 =
std::vector<float> *Validation_Input3 =
std::vector<float> *Validation_Expected =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
size_t size = sizeof(STertiaryFPOp) * count;
STertiaryFPOp *pPrimitives = (STertiaryFPOp *);
// Inputs 2 and 3 repeat if their tables are shorter than the first one.
for (size_t i = 0; i < count; ++i) {
STertiaryFPOp *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input3 = (*Validation_Input3)[i % Validation_Input3->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryFPOp", &data);
STertiaryFPOp *pPrimitives = (STertiaryFPOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryFPOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
float val = (*Validation_Expected)[i % Validation_Expected->size()];
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output1 = "
L"%6.8f, expected = %6.8f",
i, p->input1, p->input2, p->input3, p->output, val);
VerifyOutputWithExpectedValueFloat(p->output, val, Validation_Type,
// Table-driven test for unary half-precision operations. Creates a device
// with shader model 6.2 and checks for native 16-bit op support, then reads
// the shader target/text/arguments, the input and expected tables (raw 16-bit
// patterns), and the comparison type/tolerance from UnaryHalfOpParameters.
// Runs the "UnaryFPOp" shader op (resource "SUnaryFPOp") with SUnaryHalfOp
// elements and verifies every output.
TEST_F(ExecutionTest, UnaryHalfOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
int tableSize = sizeof(UnaryHalfOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryHalfOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
// The WARP version from the table is only validated when WARP is in use.
unsigned int WarpVersion = handler.GetTableParamByName(L"Warp.Version")->m_uint;
if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(WarpVersion)) {
std::vector<uint16_t> *Validation_Input =
std::vector<uint16_t> *Validation_Expected =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// One element is processed per entry of the input table.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryFPOp"));
size_t size = sizeof(SUnaryHalfOp) * count;
SUnaryHalfOp *pPrimitives = (SUnaryHalfOp *);
// count == Validation_Input->size(), so the modulo below never wraps.
for (size_t i = 0; i < count; ++i) {
SUnaryHalfOp *p = &pPrimitives[i];
p->input = (*Validation_Input)[i % Validation_Input->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryFPOp", &data);
SUnaryHalfOp *pPrimitives = (SUnaryHalfOp*);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryHalfOp *p = &pPrimitives[i];
// NOTE(review): Validation_Expected is indexed modulo Validation_Input->size(),
// unlike the float/int tests which use the expected table's own size. This
// reads out of range if the expected table is shorter -- confirm intent.
uint16_t expected = (*Validation_Expected)[i % Validation_Input->size()];
// Each value is logged both as a converted float and as its raw 16-bit pattern.
LogCommentFmt(L"element #%u, input = %6.8f(0x%04x), output = "
L"%6.8f(0x%04x), expected = %6.8f(0x%04x)",
i, ConvertFloat16ToFloat32(p->input), p->input,
ConvertFloat16ToFloat32(p->output), p->output,
ConvertFloat16ToFloat32(expected), expected);
VerifyOutputWithExpectedValueHalf(p->output, expected, Validation_Type, Validation_Tolerance);
// Table-driven test for binary half-precision operations. Creates a device
// with shader model 6.2 and checks for native 16-bit op support, then reads
// the shader target/text/arguments, two input tables, up to two expected
// tables (raw 16-bit patterns), and the comparison type/tolerance from
// BinaryHalfOpParameters. Runs the "BinaryFPOp" shader op (resource
// "SBinaryFPOp") with SBinaryHalfOp elements and verifies one or two outputs
// per element depending on whether the second expected table is empty.
TEST_F(ExecutionTest, BinaryHalfOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
int tableSize = sizeof(BinaryHalfOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryHalfOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<uint16_t> *Validation_Input1 =
std::vector<uint16_t> *Validation_Input2 =
std::vector<uint16_t> *Validation_Expected1 =
std::vector<uint16_t> *Validation_Expected2 =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
size_t size = sizeof(SBinaryHalfOp) * count;
SBinaryHalfOp *pPrimitives = (SBinaryHalfOp *);
// The second input repeats if its table is shorter than the first one.
for (size_t i = 0; i < count; ++i) {
SBinaryHalfOp *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryFPOp", &data);
SBinaryHalfOp *pPrimitives = (SBinaryHalfOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
// An empty second expected table means the operation has a single output.
unsigned numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryHalfOp *p = &pPrimitives[i];
// NOTE(review): both expected tables are indexed modulo the *input* table
// sizes rather than their own sizes (the float/int tests use the expected
// table's size). This reads out of range if an expected table is shorter
// than the corresponding input table -- confirm intent.
uint16_t expected1 = (*Validation_Expected1)[i % Validation_Input1->size()];
uint16_t expected2 = (*Validation_Expected2)[i % Validation_Input2->size()];
// NOTE(review): the format string lists output1, expected1, output2,
// expected2, but the arguments are passed as output1, output2, expected1,
// expected2, so the logged labels for output2/expected1 look swapped.
LogCommentFmt(L"element #%u, input1 = %6.8f(0x%04x), input2 = %6.8f(0x%04x), output1 = "
L"%6.8f(0x%04x), expected1 = %6.8f(0x%04x), output2 = %6.8f(0x%04x), expected2 = %6.8f(0x%04x)",
i, ConvertFloat16ToFloat32(p->input1), p->input1,
ConvertFloat16ToFloat32(p->input2), p->input2,
ConvertFloat16ToFloat32(p->output1), p->output1,
ConvertFloat16ToFloat32(p->output2), p->output2,
ConvertFloat16ToFloat32(expected1), expected1,
ConvertFloat16ToFloat32(expected2), expected2);
VerifyOutputWithExpectedValueHalf(p->output1, expected1, Validation_Type, Validation_Tolerance);
VerifyOutputWithExpectedValueHalf(p->output2, expected2, Validation_Type, Validation_Tolerance);
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
// NOTE(review): indexed modulo Validation_Input1->size(), not the expected
// table's own size -- see the note in the two-output branch.
uint16_t expected = (*Validation_Expected1)[i % Validation_Input1->size()];
SBinaryHalfOp *p = &pPrimitives[i];
// Only input1 is logged here; input2 is not part of this message.
LogCommentFmt(L"element #%u, input = %6.8f(0x%04x), output = "
L"%6.8f(0x%04x), expected = %6.8f(0x%04x)",
i, ConvertFloat16ToFloat32(p->input1), p->input1,
ConvertFloat16ToFloat32(p->output1), p->output1,
ConvertFloat16ToFloat32(expected), expected);
VerifyOutputWithExpectedValueHalf(p->output1, expected, Validation_Type, Validation_Tolerance);
else {
// numExpected is always 1 or 2 as computed above, so this branch is defensive.
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// Table-driven test for three-operand half-precision operations. Creates a
// device with shader model 6.2 and checks for native 16-bit op support, then
// reads the shader target/text/arguments, three input tables, one expected
// table (raw 16-bit patterns), and the comparison type/tolerance from
// TertiaryHalfOpParameters. Runs the "TertiaryFPOp" shader op (resource
// "STertiaryFPOp") with STertiaryHalfOp elements and verifies every output.
TEST_F(ExecutionTest, TertiaryHalfOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
int tableSize = sizeof(TertiaryHalfOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryHalfOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<uint16_t> *Validation_Input1 =
std::vector<uint16_t> *Validation_Input2 =
std::vector<uint16_t> *Validation_Input3 =
std::vector<uint16_t> *Validation_Expected =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryFPOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
size_t size = sizeof(STertiaryHalfOp) * count;
STertiaryHalfOp *pPrimitives = (STertiaryHalfOp *);
// Inputs 2 and 3 repeat if their tables are shorter than the first one.
for (size_t i = 0; i < count; ++i) {
STertiaryHalfOp *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input3 = (*Validation_Input3)[i % Validation_Input3->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryFPOp", &data);
STertiaryHalfOp *pPrimitives = (STertiaryHalfOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryHalfOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
uint16_t expected = (*Validation_Expected)[i % Validation_Expected->size()];
// Each value is logged both as a converted float and as its raw 16-bit pattern.
LogCommentFmt(L"element #%u, input1 = %6.8f(0x%04x), input2 = %6.8f(0x%04x), input3 = %6.8f(0x%04x), output = "
L"%6.8f(0x%04x), expected = %6.8f(0x%04x)",
i, ConvertFloat16ToFloat32(p->input1), p->input1,
ConvertFloat16ToFloat32(p->input2), p->input2,
ConvertFloat16ToFloat32(p->input3), p->input3,
ConvertFloat16ToFloat32(p->output), p->output,
ConvertFloat16ToFloat32(expected), expected);
VerifyOutputWithExpectedValueHalf(p->output, expected, Validation_Type, Validation_Tolerance);
// Table-driven test for unary signed-integer operations. Reads the shader
// target/text, the input and expected tables, and an integer tolerance from
// UnaryIntOpParameters, runs the "UnaryIntOp" shader op with the stream read
// from ShaderOpArith.xml, and verifies every output element.
TEST_F(ExecutionTest, UnaryIntOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
int tableSize = sizeof(UnaryIntOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryIntOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<int> *Validation_Input =
std::vector<int> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// One element is processed per entry of the input table.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryIntOp"));
size_t size = sizeof(SUnaryIntOp) * count;
SUnaryIntOp *pPrimitives = (SUnaryIntOp *);
// count == Validation_Input->size(), so the modulo below never wraps.
for (size_t i = 0; i < count; ++i) {
SUnaryIntOp *p = &pPrimitives[i];
int val = (*Validation_Input)[i % Validation_Input->size()];
p->input = val;
// use shader data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryIntOp", &data);
SUnaryIntOp *pPrimitives = (SUnaryIntOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryIntOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
int val = (*Validation_Expected)[i % Validation_Expected->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input = %11i(0x%08x), output = %11i(0x%08x), "
L"expected = %11i(0x%08x)",
i, p->input, p->input, p->output, p->output, val, val);
VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
// Table-driven test for unary unsigned-integer operations. Reads the shader
// target/text, the input and expected tables, and an integer tolerance from
// UnaryUintOpParameters, runs the "UnaryUintOp" shader op with the stream
// read from ShaderOpArith.xml, and verifies every output element.
TEST_F(ExecutionTest, UnaryUintOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
int tableSize = sizeof(UnaryUintOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryUintOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<unsigned int> *Validation_Input =
std::vector<unsigned int> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// One element is processed per entry of the input table.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryUintOp"));
size_t size = sizeof(SUnaryUintOp) * count;
SUnaryUintOp *pPrimitives = (SUnaryUintOp *);
// count == Validation_Input->size(), so the modulo below never wraps.
for (size_t i = 0; i < count; ++i) {
SUnaryUintOp *p = &pPrimitives[i];
unsigned int val = (*Validation_Input)[i % Validation_Input->size()];
p->input = val;
// use shader data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryUintOp", &data);
SUnaryUintOp *pPrimitives = (SUnaryUintOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryUintOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
unsigned int val = (*Validation_Expected)[i % Validation_Expected->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input = %11u(0x%08x), output = %11u(0x%08x), "
L"expected = %11u(0x%08x)",
i, p->input, p->input, p->output, p->output, val, val);
// NOTE(review): unsigned values are compared through the signed-int verifier
// (implicit conversion to int); VerifyOutputWithExpectedValueUInt also exists
// in this file -- confirm which one is intended.
VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
// Table-driven test for binary signed-integer operations. Reads the shader
// target/text, two input tables, up to two expected tables, and an integer
// tolerance from BinaryIntOpParameters, runs the "BinaryIntOp" shader op with
// the stream read from ShaderOpArith.xml, and verifies one or two outputs per
// element depending on whether the second expected table is empty.
TEST_F(ExecutionTest, BinaryIntOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
size_t tableSize = sizeof(BinaryIntOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryIntOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<int> *Validation_Input1 =
std::vector<int> *Validation_Input2 =
std::vector<int> *Validation_Expected1 =
std::vector<int> *Validation_Expected2 =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
// An empty second expected table means the operation has a single output.
size_t numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
size_t size = sizeof(SBinaryIntOp) * count;
SBinaryIntOp *pPrimitives = (SBinaryIntOp *);
// The second input repeats if its table is shorter than the first one.
for (size_t i = 0; i < count; ++i) {
SBinaryIntOp *p = &pPrimitives[i];
int val1 = (*Validation_Input1)[i % Validation_Input1->size()];
int val2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input1 = val1;
p->input2 = val2;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryIntOp", &data);
SBinaryIntOp *pPrimitives = (SBinaryIntOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryIntOp *p = &pPrimitives[i];
int val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
int val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
L"%11i(0x%08x), output1 = "
L"%11i(0x%08x), expected1 = %11i(0x%08x), output2 = "
L"%11i(0x%08x), expected2 = %11i(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2, p->output1,
p->output1, val1, val1, p->output2, p->output2, val2,
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
SBinaryIntOp *p = &pPrimitives[i];
int val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
L"%11i(0x%08x), output = "
L"%11i(0x%08x), expected = %11i(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2,
p->output1, p->output1, val1, val1);
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
else {
// numExpected is always 1 or 2 as computed above, so this branch is defensive.
// NOTE(review): numExpected is a size_t here but is formatted with %i --
// the specifier does not match the argument width on 64-bit builds.
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// Table-driven test for three-operand signed-integer operations. Reads the
// shader target/text, three input tables, one expected table, and an integer
// tolerance from TertiaryIntOpParameters, runs the "TertiaryIntOp" shader op
// with the stream read from ShaderOpArith.xml, and verifies every output.
TEST_F(ExecutionTest, TertiaryIntOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
size_t tableSize = sizeof(TertiaryIntOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryIntOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<int> *Validation_Input1 =
std::vector<int> *Validation_Input2 =
std::vector<int> *Validation_Input3 =
std::vector<int> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
size_t size = sizeof(STertiaryIntOp) * count;
STertiaryIntOp *pPrimitives = (STertiaryIntOp *);
// Inputs 2 and 3 repeat if their tables are shorter than the first one.
for (size_t i = 0; i < count; ++i) {
STertiaryIntOp *p = &pPrimitives[i];
int val1 = (*Validation_Input1)[i % Validation_Input1->size()];
int val2 = (*Validation_Input2)[i % Validation_Input2->size()];
int val3 = (*Validation_Input3)[i % Validation_Input3->size()];
p->input1 = val1;
p->input2 = val2;
p->input3 = val3;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryIntOp", &data);
STertiaryIntOp *pPrimitives = (STertiaryIntOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryIntOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
int val1 = (*Validation_Expected)[i % Validation_Expected->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
L"%11i(0x%08x), input3= %11i(0x%08x), output = "
L"%11i(0x%08x), expected = %11i(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2,
p->input3, p->input3, p->output, p->output, val1,
VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
// Table-driven test for binary unsigned-integer operations. Reads the shader
// target/text, two input tables, up to two expected tables, and an integer
// tolerance from BinaryUintOpParameters, runs the "BinaryUintOp" shader op
// with the stream read from ShaderOpArith.xml, and verifies one or two
// outputs per element depending on whether the second expected table is empty.
TEST_F(ExecutionTest, BinaryUintOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
size_t tableSize = sizeof(BinaryUintOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryUintOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<unsigned int> *Validation_Input1 =
std::vector<unsigned int> *Validation_Input2 =
std::vector<unsigned int> *Validation_Expected1 =
std::vector<unsigned int> *Validation_Expected2 =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
// An empty second expected table means the operation has a single output.
int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
size_t size = sizeof(SBinaryUintOp) * count;
SBinaryUintOp *pPrimitives = (SBinaryUintOp *);
// The second input repeats if its table is shorter than the first one.
for (size_t i = 0; i < count; ++i) {
SBinaryUintOp *p = &pPrimitives[i];
unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->size()];
unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input1 = val1;
p->input2 = val2;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryUintOp", &data);
SBinaryUintOp *pPrimitives = (SBinaryUintOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryUintOp *p = &pPrimitives[i];
unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
unsigned int val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
L"%11u(0x%08x), output1 = "
L"%11u(0x%08x), expected1 = %11u(0x%08x), output2 = "
L"%11u(0x%08x), expected2 = %11u(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2, p->output1,
p->output1, val1, val1, p->output2, p->output2, val2,
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
SBinaryUintOp *p = &pPrimitives[i];
unsigned int val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
L"%11u(0x%08x), output = "
L"%11u(0x%08x), expected = %11u(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2,
p->output1, p->output1, val1, val1);
// NOTE(review): unsigned values are compared through the signed-int verifier
// (implicit conversion to int), here and in the two-output branch;
// VerifyOutputWithExpectedValueUInt also exists in this file -- confirm
// which one is intended.
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
else {
// numExpected is always 1 or 2 as computed above, so this branch is defensive.
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// Table-driven test for three-operand unsigned-integer operations. Reads the
// shader target/text, three input tables, one expected table, and an integer
// tolerance from TertiaryUintOpParameters, runs the "TertiaryUintOp" shader
// op with the stream read from ShaderOpArith.xml, and verifies every output.
TEST_F(ExecutionTest, TertiaryUintOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
// Read data from the table
size_t tableSize = sizeof(TertiaryUintOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryUintOpParameters, tableSize);
// The table stores wide strings; CW2A holds narrow copies for the shader op.
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<unsigned int> *Validation_Input1 =
std::vector<unsigned int> *Validation_Input2 =
std::vector<unsigned int> *Validation_Input3 =
std::vector<unsigned int> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table determines how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
size_t size = sizeof(STertiaryUintOp) * count;
STertiaryUintOp *pPrimitives = (STertiaryUintOp *);
// Inputs 2 and 3 repeat if their tables are shorter than the first one.
for (size_t i = 0; i < count; ++i) {
STertiaryUintOp *p = &pPrimitives[i];
unsigned int val1 = (*Validation_Input1)[i % Validation_Input1->size()];
unsigned int val2 = (*Validation_Input2)[i % Validation_Input2->size()];
unsigned int val3 = (*Validation_Input3)[i % Validation_Input3->size()];
p->input1 = val1;
p->input2 = val2;
p->input3 = val3;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Read the results back and compare them element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryUintOp", &data);
STertiaryUintOp *pPrimitives = (STertiaryUintOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryUintOp *p = &pPrimitives[i];
// Expected values repeat if the expected table is shorter than the input table.
unsigned int val1 = (*Validation_Expected)[i % Validation_Expected->size()];
// Each value is logged in decimal and in hexadecimal.
LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
L"%11u(0x%08x), input3 = %11u(0x%08x), output = "
L"%11u(0x%08x), expected = %11u(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2, p->input3, p->input3,
p->output, p->output, val1, val1);
// NOTE(review): unsigned values are compared through the signed-int verifier
// (implicit conversion to int); VerifyOutputWithExpectedValueUInt also exists
// in this file -- confirm which one is intended.
VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
// 16 bit integer type tests
// 16-bit signed variant of the unary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "UnaryIntOp" shader op (resource "SUnaryIntOp") while laying the buffer
// out as SUnaryInt16Op elements.
TEST_F(ExecutionTest, UnaryInt16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
int tableSize = sizeof(UnaryInt16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryInt16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<short> *Validation_Input =
std::vector<short> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The input table decides how many elements are processed.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryIntOp"));
// The byte size is computed from the 16-bit struct, not from the
// 32-bit struct the resource name suggests.
size_t size = sizeof(SUnaryInt16Op) * count;
SUnaryInt16Op *pPrimitives = (SUnaryInt16Op *);
for (size_t i = 0; i < count; ++i) {
SUnaryInt16Op *p = &pPrimitives[i];
p->input = (*Validation_Input)[i % Validation_Input->size()];
// use shader data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryIntOp", &data);
SUnaryInt16Op *pPrimitives = (SUnaryInt16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryInt16Op *p = &pPrimitives[i];
// The expected table is indexed modulo its own size, so it may be shorter.
short val = (*Validation_Expected)[i % Validation_Expected->size()];
LogCommentFmt(L"element #%u, input = %5hi(0x%08x), output = %5hi(0x%08x), "
L"expected = %5hi(0x%08x)",
i, p->input, p->input, p->output, p->output, val, val);
VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
// 16-bit unsigned variant of the unary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "UnaryUintOp" shader op (resource "SUnaryUintOp") while laying the buffer
// out as SUnaryUint16Op elements.
TEST_F(ExecutionTest, UnaryUint16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
int tableSize = sizeof(UnaryUint16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(UnaryUint16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<unsigned short> *Validation_Input =
std::vector<unsigned short> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The input table decides how many elements are processed.
size_t count = Validation_Input->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "UnaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryUintOp"));
// The byte size is computed from the 16-bit struct.
size_t size = sizeof(SUnaryUint16Op) * count;
SUnaryUint16Op *pPrimitives = (SUnaryUint16Op *);
for (size_t i = 0; i < count; ++i) {
SUnaryUint16Op *p = &pPrimitives[i];
p->input = (*Validation_Input)[i % Validation_Input->size()];
// use shader data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SUnaryUintOp", &data);
SUnaryUint16Op *pPrimitives = (SUnaryUint16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SUnaryUint16Op *p = &pPrimitives[i];
// The expected table is indexed modulo its own size, so it may be shorter.
unsigned short val = (*Validation_Expected)[i % Validation_Expected->size()];
LogCommentFmt(L"element #%u, input = %5hu(0x%08x), output = %5hu(0x%08x), "
L"expected = %5hu(0x%08x)",
i, p->input, p->input, p->output, p->output, val, val);
VerifyOutputWithExpectedValueInt(p->output, val, Validation_Tolerance);
// 16-bit signed variant of the binary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "BinaryIntOp" shader op (resource "SBinaryIntOp") while laying the buffer
// out as SBinaryInt16Op elements. When the second expected table is non-empty
// both output1 and output2 are verified; otherwise only output1 is.
TEST_F(ExecutionTest, BinaryInt16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
size_t tableSize = sizeof(BinaryInt16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryInt16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<short> *Validation_Input1 =
std::vector<short> *Validation_Input2 =
std::vector<short> *Validation_Expected1 =
std::vector<short> *Validation_Expected2 =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table decides how many elements are processed.
size_t count = Validation_Input1->size();
// Kept as int (like BinaryUint16OpTest) because the value is only ever 1 or 2
// and is passed to the "%i" conversion in the error log below; a size_t
// argument would not match that conversion on 64-bit builds.
int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
// The byte size is computed from the 16-bit struct.
size_t size = sizeof(SBinaryInt16Op) * count;
SBinaryInt16Op *pPrimitives = (SBinaryInt16Op *);
// Each table is indexed modulo its own size, so a shorter table wraps.
for (size_t i = 0; i < count; ++i) {
SBinaryInt16Op *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryIntOp", &data);
SBinaryInt16Op *pPrimitives = (SBinaryInt16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loops still reach every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryInt16Op *p = &pPrimitives[i];
short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
short val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
LogCommentFmt(L"element #%u, input1 = %5hi(0x%08x), input2 = "
L"%5hi(0x%08x), output1 = "
L"%5hi(0x%08x), expected1 = %5hi(0x%08x), output2 = "
L"%5hi(0x%08x), expected2 = %5hi(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2, p->output1,
p->output1, val1, val1, p->output2, p->output2, val2,
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
SBinaryInt16Op *p = &pPrimitives[i];
short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LogCommentFmt(L"element #%u, input1 = %5hi(0x%08x), input2 = "
L"%5hi(0x%08x), output = "
L"%5hi(0x%08x), expected = %5hi(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2,
p->output1, p->output1, val1, val1);
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
else {
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// 16-bit signed variant of the tertiary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "TertiaryIntOp" shader op (resource "STertiaryIntOp") while laying the
// buffer out as STertiaryInt16Op elements.
TEST_F(ExecutionTest, TertiaryInt16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
size_t tableSize = sizeof(TertiaryInt16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryInt16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<short> *Validation_Input1 =
std::vector<short> *Validation_Input2 =
std::vector<short> *Validation_Input3 =
std::vector<short> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table decides how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryIntOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
// The byte size is computed from the 16-bit struct.
size_t size = sizeof(STertiaryInt16Op) * count;
STertiaryInt16Op *pPrimitives = (STertiaryInt16Op *);
// Each table is indexed modulo its own size, so a shorter table wraps.
for (size_t i = 0; i < count; ++i) {
STertiaryInt16Op *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input3 = (*Validation_Input3)[i % Validation_Input3->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryIntOp", &data);
STertiaryInt16Op *pPrimitives = (STertiaryInt16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryInt16Op *p = &pPrimitives[i];
short val1 = (*Validation_Expected)[i % Validation_Expected->size()];
// Unlike the other 16-bit tests, which log with %5hi / %5hu, this one
// uses the %11i width.
LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
L"%11i(0x%08x), input3= %11i(0x%08x), output = "
L"%11i(0x%08x), expected = %11i(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2,
p->input3, p->input3, p->output, p->output, val1,
VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
// 16-bit unsigned variant of the binary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "BinaryUintOp" shader op (resource "SBinaryUintOp") while laying the buffer
// out as SBinaryUint16Op elements. When the second expected table is
// non-empty both output1 and output2 are verified; otherwise only output1 is.
TEST_F(ExecutionTest, BinaryUint16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
size_t tableSize = sizeof(BinaryUint16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(BinaryUint16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<unsigned short> *Validation_Input1 =
std::vector<unsigned short> *Validation_Input2 =
std::vector<unsigned short> *Validation_Expected1 =
std::vector<unsigned short> *Validation_Expected2 =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table decides how many elements are processed.
size_t count = Validation_Input1->size();
// 1 when the second expected table is empty, otherwise 2.
int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
// The byte size is computed from the 16-bit struct.
size_t size = sizeof(SBinaryUint16Op) * count;
SBinaryUint16Op *pPrimitives = (SBinaryUint16Op *);
// Each table is indexed modulo its own size, so a shorter table wraps.
for (size_t i = 0; i < count; ++i) {
SBinaryUint16Op *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SBinaryUintOp", &data);
SBinaryUint16Op *pPrimitives = (SBinaryUint16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loops still reach every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
if (numExpected == 2) {
for (unsigned i = 0; i < count; ++i) {
SBinaryUint16Op *p = &pPrimitives[i];
unsigned short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
unsigned short val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
L"%5hu(0x%08x), output1 = "
L"%5hu(0x%08x), expected1 = %5hu(0x%08x), output2 = "
L"%5hu(0x%08x), expected2 = %5hu(0x%08x)",
i, p->input1, p->input1, p->input2, p->input2, p->output1,
p->output1, val1, val1, p->output2, p->output2, val2,
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
else if (numExpected == 1) {
for (unsigned i = 0; i < count; ++i) {
SBinaryUint16Op *p = &pPrimitives[i];
unsigned short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
L"%5hu(0x%08x), output = "
L"%5hu(0x%08x), expected = %5hu(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2,
p->output1, p->output1, val1, val1);
VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
else {
LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
// 16-bit unsigned variant of the tertiary integer test. It requests a shader
// model 6.2 device, checks DoesDeviceSupportNative16bitOps, and runs the
// "TertiaryUintOp" shader op (resource "STertiaryUintOp") while laying the
// buffer out as STertiaryUint16Op elements.
TEST_F(ExecutionTest, TertiaryUint16OpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Read data from the table
size_t tableSize = sizeof(TertiaryUint16OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(TertiaryUint16OpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<unsigned short> *Validation_Input1 =
std::vector<unsigned short> *Validation_Input2 =
std::vector<unsigned short> *Validation_Input3 =
std::vector<unsigned short> *Validation_Expected =
int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
// The first input table decides how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryUintOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
// The byte size is computed from the 16-bit struct.
size_t size = sizeof(STertiaryUint16Op) * count;
STertiaryUint16Op *pPrimitives = (STertiaryUint16Op *);
// Each table is indexed modulo its own size, so a shorter table wraps.
for (size_t i = 0; i < count; ++i) {
STertiaryUint16Op *p = &pPrimitives[i];
p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
p->input3 = (*Validation_Input3)[i % Validation_Input3->size()];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("STertiaryUintOp", &data);
STertiaryUint16Op *pPrimitives = (STertiaryUint16Op *);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryUint16Op *p = &pPrimitives[i];
unsigned short val1 = (*Validation_Expected)[i % Validation_Expected->size()];
LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
L"%5hu(0x%08x), input3 = %5hu(0x%08x), output = "
L"%5hu(0x%08x), expected = %5hu(0x%08x)", i,
p->input1, p->input1, p->input2, p->input2, p->input3, p->input3,
p->output, p->output, val1, val1);
VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
// Runs the "DotOp" shader op from ShaderOpArith.xml on pairs of XMFLOAT4
// inputs, then reads back the "SDotOp" resource and checks the o_dot2,
// o_dot3 and o_dot4 outputs against expected values parsed from the
// Validation_dot2 / Validation_dot3 / Validation_dot4 string tables.
TEST_F(ExecutionTest, DotTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
int tableSize = sizeof(DotOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(DotOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<WEX::Common::String> *Validation_Input1 =
std::vector<WEX::Common::String> *Validation_Input2 =
std::vector<WEX::Common::String> *Validation_dot2 =
std::vector<WEX::Common::String> *Validation_dot3 =
std::vector<WEX::Common::String> *Validation_dot4 =
PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table decides how many elements are processed.
size_t count = Validation_Input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "DotOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SDotOp"));
size_t size = sizeof(SDotOp) * count;
SDotOp *pPrimitives = (SDotOp*);
for (size_t i = 0; i < count; ++i) {
SDotOp *p = &pPrimitives[i];
// val1 / val2 are filled through float pointers with a length of 4.
XMFLOAT4 val1,val2;
(float *)&val1, 4));
(float *)&val2, 4));
p->input1 = val1;
p->input2 = val2;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SDotOp", &data);
SDotOp *pPrimitives = (SDotOp*);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
// The expected tables are indexed with i directly (no modulo), so each one
// needs at least count entries.
// NOTE(review): i is size_t here but is printed with "%u" below - confirm
// this is acceptable for the logging helper on 64-bit builds.
for (size_t i = 0; i < count; ++i) {
SDotOp *p = &pPrimitives[i];
float dot2, dot3, dot4;
VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot2)[i], dot2));
VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot3)[i], dot3));
VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot4)[i], dot4));
L"element #%u, input1 = (%f, %f, %f, %f), input2 = (%f, %f, "
L"%f, %f), \n dot2 = %f, dot2_expected = %f, dot3 = %f, "
L"dot3_expected = %f, dot4 = %f, dot4_expected = %f",
i, p->input1.x, p->input1.y, p->input1.z, p->input1.w, p->input2.x,
p->input2.y, p->input2.z, p->input2.w, p->o_dot2, dot2, p->o_dot3, dot3,
p->o_dot4, dot4);
VerifyOutputWithExpectedValueFloat(p->o_dot2, dot2, Validation_type,
VerifyOutputWithExpectedValueFloat(p->o_dot3, dot3, Validation_type,
VerifyOutputWithExpectedValueFloat(p->o_dot4, dot4, Validation_type,
// Runs the "Dot2AddHalfOp" shader op from ShaderOpArith.xml. It requests a
// shader model 6.4 device and checks DoesDeviceSupportNative16bitOps. Each
// element carries two Half2 inputs and a float accumulator taken from
// Validation.Input3; the result is compared with the float table
// Validation.Expected1 using Validation.Type and Validation.Tolerance.
TEST_F(ExecutionTest, Dot2AddHalfTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
int tableSize = sizeof(Dot2AddHalfOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(Dot2AddHalfOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<WEX::Common::String> *validation_input1 =
std::vector<WEX::Common::String> *validation_input2 =
std::vector<float> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_floatTable;
std::vector<float> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_floatTable;
PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// The first input table decides how many elements are processed.
size_t count = validation_input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "Dot2AddHalfOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot2AddHalfOp"));
size_t size = sizeof(SDot2AddHalfOp) * count;
SDot2AddHalfOp *pPrimitives = (SDot2AddHalfOp*);
for (size_t i = 0; i < count; ++i) {
SDot2AddHalfOp *p = &pPrimitives[i];
// val1 / val2 are filled through uint16_t pointers with a length of 2.
Half2 val1,val2;
(uint16_t *)&val1, 2));
(uint16_t *)&val2, 2));
p->input1 = val1;
p->input2 = val2;
// The accumulator table is indexed with i directly (no modulo).
p->acc = (*validation_acc)[i];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SDot2AddHalfOp", &data);
SDot2AddHalfOp *pPrimitives = (SDot2AddHalfOp*);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
// NOTE(review): i is size_t here but is printed with "%u" below - confirm
// this is acceptable for the logging helper on 64-bit builds.
for (size_t i = 0; i < count; ++i) {
SDot2AddHalfOp *p = &pPrimitives[i];
float expectedResult = (*validation_result)[i];
// The half inputs are widened to float for the log message.
float input1x = ConvertFloat16ToFloat32(p->input1.x);
float input1y = ConvertFloat16ToFloat32(p->input1.y);
float input2x = ConvertFloat16ToFloat32(p->input2.x);
float input2y = ConvertFloat16ToFloat32(p->input2.y);
L"element #%u, input1 = (%f, %f), input2 = (%f, %f), acc = %f\n"
L"result = %f, result_expected = %f",
i, input1x, input1y, input2x, input2y, p->acc, p->result, expectedResult);
VerifyOutputWithExpectedValueFloat(p->result, expectedResult, Validation_type, tolerance);
// Runs the "Dot4AddI8PackedOp" shader op from ShaderOpArith.xml on a shader
// model 6.4 device. The inputs are uint32 tables (Validation.Input1 /
// Validation.Input2), the accumulator and expected result are int32 tables
// (Validation.Input3 / Validation.Expected1), and results are verified with
// a tolerance of 0.
TEST_F(ExecutionTest, Dot4AddI8PackedTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
int tableSize = sizeof(Dot4AddI8PackedOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(Dot4AddI8PackedOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
std::vector<int32_t> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_int32Table;
std::vector<int32_t> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_int32Table;
// The first input table decides how many elements are processed.
size_t count = validation_input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "Dot4AddI8PackedOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddI8PackedOp"));
size_t size = sizeof(SDot4AddI8PackedOp) * count;
SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*);
// All tables are indexed with i directly (no modulo), so each one needs at
// least count entries.
for (size_t i = 0; i < count; ++i) {
SDot4AddI8PackedOp *p = &pPrimitives[i];
p->input1 = (*validation_input1)[i];
p->input2 = (*validation_input2)[i];
p->acc = (*validation_acc)[i];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SDot4AddI8PackedOp", &data);
SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
// NOTE(review): i is size_t here but is printed with "%u" below - confirm
// this is acceptable for the logging helper on 64-bit builds.
for (size_t i = 0; i < count; ++i) {
SDot4AddI8PackedOp *p = &pPrimitives[i];
int32_t expectedResult = (*validation_result)[i];
L"element #%u, input1 = %u, input2 = %u, acc = %d \n"
L"result = %d, result_expected = %d",
i, p->input1, p->input2, p->acc, p->result, expectedResult);
// The tolerance is 0, so the result must match exactly.
VerifyOutputWithExpectedValueInt(p->result, expectedResult, 0);
// Runs the "Dot4AddU8PackedOp" shader op from ShaderOpArith.xml on a shader
// model 6.4 device. Inputs, accumulator and expected result are all uint32
// tables (Validation.Input1 / Input2 / Input3 / Expected1), and results are
// verified through the unsigned helper with a tolerance of 0.
TEST_F(ExecutionTest, Dot4AddU8PackedTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
int tableSize = sizeof(Dot4AddU8PackedOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(Dot4AddU8PackedOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
std::vector<uint32_t> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_uint32Table;
std::vector<uint32_t> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_uint32Table;
// The first input table decides how many elements are processed.
size_t count = validation_input1->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "Dot4AddU8PackedOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddU8PackedOp"));
size_t size = sizeof(SDot4AddU8PackedOp) * count;
SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*);
// All tables are indexed with i directly (no modulo), so each one needs at
// least count entries.
for (size_t i = 0; i < count; ++i) {
SDot4AddU8PackedOp *p = &pPrimitives[i];
p->input1 = (*validation_input1)[i];
p->input2 = (*validation_input2)[i];
p->acc = (*validation_acc)[i];
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SDot4AddU8PackedOp", &data);
SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
// NOTE(review): i is size_t here but is printed with "%u" below - confirm
// this is acceptable for the logging helper on 64-bit builds.
for (size_t i = 0; i < count; ++i) {
SDot4AddU8PackedOp *p = &pPrimitives[i];
uint32_t expectedResult = (*validation_result)[i];
L"element #%u, input1 = %u, input2 = %u, acc = %u \n"
L"result = %u, result_expected = %u, ",
i, p->input1, p->input2, p->acc, p->result, expectedResult);
// The tolerance is 0, so the result must match exactly.
VerifyOutputWithExpectedValueUInt(p->result, expectedResult, 0);
// Runs the "Msad4" shader op from ShaderOpArith.xml. Each element carries a
// reference value, a two-component source and a four-component accumulator;
// the four-component result read back from the "SMsad4" resource is compared
// component by component with the parsed expected value.
TEST_F(ExecutionTest, Msad4Test) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter);
TableParameterHandler handler(Msad4OpParameters, tableSize);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
// The tolerance is read as a double here and truncated to int before use below.
double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
std::vector<unsigned int> *Validation_Reference =
std::vector<WEX::Common::String> *Validation_Source =
std::vector<WEX::Common::String> *Validation_Accum =
std::vector<WEX::Common::String> *Validation_Expected =
// Unlike the other tests in this group, the element count comes from the
// expected table rather than from an input table.
size_t count = Validation_Expected->size();
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "Msad4",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SMsad4"));
size_t size = sizeof(SMsad4) * count;
SMsad4 *pPrimitives = (SMsad4*);
// All tables are indexed with i directly (no modulo), so each one needs at
// least count entries.
for (size_t i = 0; i < count; ++i) {
SMsad4 *p = &pPrimitives[i];
XMUINT2 src;
XMUINT4 accum;
// Source and accumulator are stored as strings and parsed into 2 and 4
// unsigned ints respectively.
VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Source)[i], (unsigned int*)&src, 2));
VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Accum)[i], (unsigned int*)&accum, 4));
p->ref = (*Validation_Reference)[i];
p->src = src;
p->accum = accum;
// use shader from data table
pShaderOp-> = Text.m_psz;
// Fetch the read-back copy of the resource and verify it element by element.
MappedData data;
test->Test->GetReadBackData("SMsad4", &data);
SMsad4 *pPrimitives = (SMsad4*);
// NOTE(review): presumably lets a failed verify be recorded without throwing,
// so the loop still reaches every element - confirm against the TAEF docs.
WEX::TestExecution::DisableVerifyExceptions dve;
// NOTE(review): i is size_t here but is printed with "%u" below - confirm
// this is acceptable for the logging helper on 64-bit builds.
for (size_t i = 0; i < count; ++i) {
SMsad4 *p = &pPrimitives[i];
// Local "result" holds the expected value; p->result is the shader output.
XMUINT4 result;
(unsigned int *)&result, 4));
L"element #%u, ref = %u(0x%08x), src = %u(0x%08x), %u(0x%08x), "
L"accum = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
L"result = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
L"expected = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x)", i,
p->ref, p->ref, p->src.x, p->src.x, p->src.y, p->src.y, p->accum.x,
p->accum.x, p->accum.y, p->accum.y, p->accum.z, p->accum.z,
p->accum.w, p->accum.w, p->result.x, p->result.x, p->result.y,
p->result.y, p->result.z, p->result.z, p->result.w, p->result.w,
result.x, result.x, result.y, result.y, result.z, result.z,
result.w, result.w);
// The C-style cast truncates the double tolerance toward zero.
int toleranceInt = (int)tolerance;
VerifyOutputWithExpectedValueInt(p->result.x, result.x, toleranceInt);
VerifyOutputWithExpectedValueInt(p->result.y, result.y, toleranceInt);
VerifyOutputWithExpectedValueInt(p->result.z, result.z, toleranceInt);
VerifyOutputWithExpectedValueInt(p->result.w, result.w, toleranceInt);
// Runs the "BinaryFPOp" shader op from ShaderOpArith.xml with the shader
// target/text/arguments taken from the DenormBinaryFPOpParameters table, then
// compares each element's output1 with the table's expected value(s) under the
// denorm mode selected by the "-denorm ..." compiler argument.
// NOTE(review): many statements in this copy of the test look truncated (the
// right-hand sides of the Validation_* declarations, the buffer casts, the
// pShaderOp-> assignments, the verify calls in the Any branch and the closing
// braces). They are left untouched here; restore them from the upstream file.
TEST_F(ExecutionTest, DenormBinaryFloatOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
// A device supporting shader model 6.2 is requested; the failure branch body
// is not visible in this copy.
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
// Read data from the table
int tableSize = sizeof(DenormBinaryFPOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(DenormBinaryFPOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<WEX::Common::String> *Validation_Input1 =
std::vector<WEX::Common::String> *Validation_Input2 =
std::vector<WEX::Common::String> *Validation_Expected1 =
// two expected outputs for any mode
std::vector<WEX::Common::String> *Validation_Expected2 =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// One test element is produced per entry of the first input list.
size_t count = Validation_Input1->size();
using namespace hlsl::DXIL;
// The mode defaults to Any unless the arguments are exactly one of the two
// recognized "-denorm" strings.
Float32DenormMode mode = Float32DenormMode::Any;
if (strcmp(Arguments.m_psz, "-denorm preserve") == 0) {
mode = Float32DenormMode::Preserve;
else if (strcmp(Arguments.m_psz, "-denorm ftz") == 0) {
mode = Float32DenormMode::FTZ;
if (mode == Float32DenormMode::Any) {
DXASSERT(Validation_Expected2->size() == Validation_Expected1->size(),
"must have same number of expected values");
#if defined(_M_ARM64) || defined(_M_ARM64EC)
// NOTE(review): presumably the test is skipped in this configuration; the
// statement following the log call is not visible here.
if ((GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) && mode == Float32DenormMode::Preserve) {
WEX::Logging::Log::Comment(L"WARP has an issue with DenormBinaryFloatOpTest with '-denorm preserve' on ARM64.");
#endif // defined(_M_ARM64) || defined(_M_ARM64EC)
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "BinaryFPOp",
// this callback is invoked while the test is creating
// the resource it runs against
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
size_t size = sizeof(SBinaryFPOp) * count;
SBinaryFPOp *pPrimitives = (SBinaryFPOp *);
// Fill each element's two inputs from the parsed table strings.
for (size_t i = 0; i < count; ++i) {
SBinaryFPOp *p = &pPrimitives[i];
PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->size()];
PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->size()];
float val1, val2;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
p->input1 = val1;
p->input2 = val2;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
MappedData data;
test->Test->GetReadBackData("SBinaryFPOp", &data);
SBinaryFPOp *pPrimitives = (SBinaryFPOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
SBinaryFPOp *p = &pPrimitives[i];
if (mode == Float32DenormMode::Any) {
// Any mode: two expected values are parsed and logged for each element.
LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LPCWSTR str2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
float val1;
float val2;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output = "
L"%6.8f, expected = %6.8f(%x) or %6.8f(%x)",
i, p->input1, p->input2, p->output1, val1, *(int *)&val1, val2, *(int *)&val2);
p->output1, val1, Validation_Type, Validation_Tolerance, mode) ||
p->output1, val2, Validation_Type, Validation_Tolerance, mode));
else {
LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
float val1;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
// The last argument is the expected value's raw bits as an int, so it is
// printed with %x (as in the Any-mode branch) rather than %a, which would
// read a double from the argument list.
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output = "
L"%6.8f, expected = %6.8f(%x)",
i, p->input1, p->input2, p->output1, val1, *(int *)&val1);
VerifyOutputWithExpectedValueFloat(p->output1, val1, Validation_Type,
Validation_Tolerance, mode);
// Runs the "TertiaryFPOp" shader op from ShaderOpArith.xml with the shader
// target/text/arguments taken from the DenormTertiaryFPOpParameters table,
// then compares each element's output with the table's expected value(s) under
// the denorm mode selected by the "-denorm ..." compiler argument.
// NOTE(review): many statements in this copy of the test look truncated (the
// right-hand sides of the Validation_* declarations, the buffer casts, the
// pShaderOp-> assignments, the verify calls in the Any branch and the closing
// braces). They are left untouched here; restore them from the upstream file.
TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
// A device supporting shader model 6.2 is requested; the failure branch body
// is not visible in this copy.
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
// Read data from the table
int tableSize = sizeof(DenormTertiaryFPOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(DenormTertiaryFPOpParameters, tableSize);
CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);
std::vector<WEX::Common::String> *Validation_Input1 =
std::vector<WEX::Common::String> *Validation_Input2 =
std::vector<WEX::Common::String> *Validation_Input3 =
std::vector<WEX::Common::String> *Validation_Expected1 =
// two expected outputs for any mode
std::vector<WEX::Common::String> *Validation_Expected2 =
LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
// One test element is produced per entry of the first input list.
size_t count = Validation_Input1->size();
using namespace hlsl::DXIL;
// The mode defaults to Any unless the arguments are exactly one of the two
// recognized "-denorm" strings.
Float32DenormMode mode = Float32DenormMode::Any;
if (strcmp(Arguments.m_psz, "-denorm preserve") == 0) {
mode = Float32DenormMode::Preserve;
else if (strcmp(Arguments.m_psz, "-denorm ftz") == 0) {
mode = Float32DenormMode::FTZ;
if (mode == Float32DenormMode::Any) {
DXASSERT(Validation_Expected2->size() == Validation_Expected1->size(),
"must have same number of expected values");
#if defined(_M_ARM64) || defined(_M_ARM64EC)
// NOTE(review): presumably the test is skipped in this configuration; the
// statement following the log call is not visible here.
if ((GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) && mode == Float32DenormMode::Preserve) {
WEX::Logging::Log::Comment(L"WARP has an issue with DenormTertiaryFloatOpTest with '-denorm preserve' on ARM64.");
#endif // defined(_M_ARM64) || defined(_M_ARM64EC)
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "TertiaryFPOp",
// this callback is invoked while the test is creating
// the resource it runs against
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
size_t size = sizeof(STertiaryFPOp) * count;
STertiaryFPOp *pPrimitives = (STertiaryFPOp *);
// Fill each element's three inputs from the parsed table strings.
for (size_t i = 0; i < count; ++i) {
STertiaryFPOp *p = &pPrimitives[i];
PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->size()];
PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->size()];
PCWSTR str3 = (*Validation_Input3)[i % Validation_Input3->size()];
float val1, val2, val3;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
VERIFY_SUCCEEDED(ParseDataToFloat(str3, val3));
p->input1 = val1;
p->input2 = val2;
p->input3 = val3;
// use shader from data table
pShaderOp-> = Target.m_psz;
pShaderOp-> = Text.m_psz;
pShaderOp-> = Arguments.m_psz;
MappedData data;
test->Test->GetReadBackData("STertiaryFPOp", &data);
STertiaryFPOp *pPrimitives = (STertiaryFPOp *);
WEX::TestExecution::DisableVerifyExceptions dve;
for (unsigned i = 0; i < count; ++i) {
STertiaryFPOp *p = &pPrimitives[i];
if (mode == Float32DenormMode::Any) {
// Any mode: two expected values are parsed and logged for each element.
LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
LPCWSTR str2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
float val1;
float val2;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output = "
L"%6.8f, expected = %6.8f(%x) or %6.8f(%x)",
i, p->input1, p->input2, p->input3, p->output, val1, *(int *)&val1, val2, *(int *)&val2);
p->output, val1, Validation_Type, Validation_Tolerance, mode) ||
p->output, val2, Validation_Type, Validation_Tolerance, mode));
else {
LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
float val1;
VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
// The last argument is the expected value's raw bits as an int, so it is
// printed with %x (as in the Any-mode branch) rather than %a, which would
// read a double from the argument list.
LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output = "
L"%6.8f, expected = %6.8f(%x)",
i, p->input1, p->input2, p->input3, p->output, val1, *(int *)&val1);
VerifyOutputWithExpectedValueFloat(p->output, val1, Validation_Type,
Validation_Tolerance, mode);
// Setup for wave intrinsics tests
// ShaderOpKind selects which reference computation (computeExpected
// specialization) is used to validate a wave intrinsic's output.
// NOTE(review): the enumerator list is not visible in this copy of the file.
// Enumerators referenced elsewhere in this section include WaveSum,
// WaveProduct, WaveActiveMax, WaveActiveMin, WaveCountBits,
// WaveActiveAllEqual, WaveActiveAnyTrue, WaveActiveAllTrue, WaveActiveBitOr,
// WaveActiveBitAnd, WaveActiveBitXor and ShaderOpInvalid.
enum class ShaderOpKind {
// One row of ShaderOpKindTable: associates a shader op name with its kind.
// NOTE(review): only the `kind` member is visible here; GetShaderOpKind also
// reads a `name` member, so that declaration appears to be missing from this
// copy of the file.
struct ShaderOpKindPair {
ShaderOpKind kind;
// Lookup table from wave intrinsic name to the ShaderOpKind used to compute
// its expected result. The signed and unsigned ("U") spellings of an op share
// one kind, and each WavePrefix* op maps to the same kind as the matching
// WaveActive* op.
static ShaderOpKindPair ShaderOpKindTable[] = {
{ L"WaveActiveSum", ShaderOpKind::WaveSum },
{ L"WaveActiveUSum", ShaderOpKind::WaveSum },
{ L"WaveActiveProduct", ShaderOpKind::WaveProduct },
{ L"WaveActiveUProduct", ShaderOpKind::WaveProduct },
{ L"WaveActiveMax", ShaderOpKind::WaveActiveMax },
{ L"WaveActiveUMax", ShaderOpKind::WaveActiveMax },
{ L"WaveActiveMin", ShaderOpKind::WaveActiveMin },
{ L"WaveActiveUMin", ShaderOpKind::WaveActiveMin },
{ L"WaveActiveCountBits", ShaderOpKind::WaveCountBits },
{ L"WaveActiveAllEqual", ShaderOpKind::WaveActiveAllEqual },
{ L"WaveActiveAnyTrue", ShaderOpKind::WaveActiveAnyTrue },
{ L"WaveActiveAllTrue", ShaderOpKind::WaveActiveAllTrue },
{ L"WaveActiveBitOr", ShaderOpKind::WaveActiveBitOr },
{ L"WaveActiveBitAnd", ShaderOpKind::WaveActiveBitAnd },
{ L"WaveActiveBitXor", ShaderOpKind::WaveActiveBitXor },
{ L"WavePrefixSum", ShaderOpKind::WaveSum },
{ L"WavePrefixUSum", ShaderOpKind::WaveSum },
{ L"WavePrefixProduct", ShaderOpKind::WaveProduct },
{ L"WavePrefixUProduct", ShaderOpKind::WaveProduct },
{ L"WavePrefixMax", ShaderOpKind::WaveActiveMax },
{ L"WavePrefixUMax", ShaderOpKind::WaveActiveMax },
{ L"WavePrefixMin", ShaderOpKind::WaveActiveMin },
{ L"WavePrefixUMin", ShaderOpKind::WaveActiveMin },
{ L"WavePrefixCountBits", ShaderOpKind::WaveCountBits }
// Looks up `str` in ShaderOpKindTable using a case-insensitive comparison and
// returns the matching kind. If no row matches, asserts and returns
// ShaderOpKind::ShaderOpInvalid.
// NOTE(review): `str` is a wide string but the assert message formats it with
// %s in a narrow literal — confirm DXASSERT_ARGS prints it as intended.
ShaderOpKind GetShaderOpKind(LPCWSTR str) {
for (size_t i = 0; i < sizeof(ShaderOpKindTable)/sizeof(ShaderOpKindPair); ++i) {
if (_wcsicmp(ShaderOpKindTable[i].name, str) == 0) {
return ShaderOpKindTable[i].kind;
DXASSERT_ARGS(false, "Invalid ShaderOp name: %s", str);
return ShaderOpKind::ShaderOpInvalid;
// Primary template of the reference-result functor; used for any ShaderOpKind
// that has no specialization below. It ignores its arguments and returns 0.
// NOTE(review): none of the parameters are used in the visible body, and the
// file header promotes C4100 (unreferenced parameter) to an error, so
// UNREFERENCED_PARAMETER lines were presumably dropped from this copy — confirm.
template <typename InType, typename OutType, ShaderOpKind kind>
struct computeExpected {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
return 0;
// Reference for the WaveSum kind: starts at 0 and accumulates with += over
// entries [0, index) whose mask test passes, then returns the total.
// NOTE(review): the operands of the mask comparison and of `+=` are missing in
// this copy; presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveSum> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType sum = 0;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
sum +=;
return sum;
// Reference for the WaveProduct kind: starts at 1 and accumulates with *= over
// entries [0, index) whose mask test passes, then returns the product.
// NOTE(review): the operands of the mask comparison and of `*=` are missing in
// this copy; presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveProduct> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType prod = 1;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
prod *=;
return prod;
// Reference for the WaveActiveMax kind: starts from
// std::numeric_limits<OutType>::min() and keeps the largest qualifying value
// seen over entries [0, index).
// NOTE(review): the compared operands are missing in this copy; presumably
// masks[i] and inputs[i] — confirm against upstream.
// NOTE(review): numeric_limits<>::min() is the lowest value only for integer
// types; for a floating-point OutType it is the smallest positive normal
// value, so lowest() would be needed there.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType maximum = std::numeric_limits<OutType>::min();
for (size_t i = 0; i < index; ++i) {
if ( == maskValue && > maximum)
maximum =;
return maximum;
// Reference for the WaveActiveMin kind: starts from
// std::numeric_limits<OutType>::max() and keeps the smallest qualifying value
// seen over entries [0, index).
// NOTE(review): the compared operands are missing in this copy; presumably
// masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType minimum = std::numeric_limits<OutType>::max();
for (size_t i = 0; i < index; ++i) {
if ( == maskValue && < minimum)
minimum =;
return minimum;
// Reference for the WaveCountBits kind: returns a counter that starts at 0 and
// is evaluated over entries [0, index). The visible condition requires a mask
// match and a value greater than 3.
// NOTE(review): the operands of the condition and the statement that updates
// `count` are missing in this copy; presumably count is incremented when
// masks[i] matches and inputs[i] > 3 — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveCountBits> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType count = 0;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue && > 3) {
return count;
// In HLSL, a boolean is represented in a 4-byte (uint32) format, so the C++
// bool type cannot be used to represent an HLSL bool here.
// HLSL returns 0 for false and 1 for true.
//
// Reference for the WaveActiveAnyTrue kind: returns 1 as soon as a qualifying
// entry in [0, index) is non-zero, otherwise 0.
// NOTE(review): the operands of the condition are missing in this copy;
// presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
for (size_t i = 0; i < index; ++i) {
if ( == maskValue && != 0) {
return 1;
return 0;
// Reference for the WaveActiveAllTrue kind: returns 0 as soon as a qualifying
// entry in [0, index) is zero, otherwise 1.
// NOTE(review): the operands of the condition are missing in this copy;
// presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
for (size_t i = 0; i < index; ++i) {
if ( == maskValue && == 0) {
return 0;
return 1;
// Reference for the WaveActiveAllEqual kind: remembers a pointer to a
// previously seen qualifying value and returns 0 when a later qualifying entry
// in [0, index) differs from it; returns 1 if no mismatch is found.
// NOTE(review): the operands of the mask test, of the `!=` comparison and of
// the address-of expression are missing in this copy; presumably masks[i] and
// inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
// nullptr until the first qualifying entry has been recorded.
const InType *val = nullptr;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
if (val && *val != {
return 0;
val = &;
return 1;
// Reference for the WaveActiveBitOr kind: starts with all bits clear and ORs
// in qualifying entries from [0, index).
// NOTE(review): the operands of the mask comparison and of `|=` are missing in
// this copy; presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType bits = 0x00000000;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
bits |=;
return bits;
// Reference for the WaveActiveBitAnd kind: starts from 0xffffffff (all 32 bits
// set) and ANDs in qualifying entries from [0, index).
// NOTE(review): the operands of the mask comparison and of `&=` are missing in
// this copy; presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType bits = 0xffffffff;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
bits &=;
return bits;
// Reference for the WaveActiveBitXor kind: starts with all bits clear and XORs
// in qualifying entries from [0, index).
// NOTE(review): the operands of the mask comparison and of `^=` are missing in
// this copy; presumably masks[i] and inputs[i] — confirm against upstream.
template <typename InType, typename OutType>
struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor> {
OutType operator()(const std::vector<InType> &inputs,
const std::vector<int> &masks, int maskValue,
unsigned int index) {
OutType bits = 0x00000000;
for (size_t i = 0; i < index; ++i) {
if ( == maskValue) {
bits ^=;
return bits;
// Mask functions used to control active lanes
// MaskAll returns 1 for every index.
// NOTE(review): `i` is unused in the visible body and the file header promotes
// C4100 (unreferenced parameter) to an error, so an UNREFERENCED_PARAMETER(i)
// line was presumably dropped from this copy — confirm.
static int MaskAll(int i) {
return 1;
// Returns 1 when i is even and 0 when it is odd.
static int MaskEveryOther(int i) {
return i % 2 == 0 ? 1 : 0;
// Returns 1 when i is a multiple of 3 and 0 otherwise.
static int MaskEveryThird(int i) {
return i % 3 == 0 ? 1 : 0;
// Signature shared by the mask functions: maps an index to a 0/1 mask value.
typedef int(*MaskFunction)(int);
// The mask patterns that the wave intrinsic tests iterate over; each test runs
// once per entry.
static MaskFunction MaskFunctionTable[] = {
MaskAll, MaskEveryOther, MaskEveryThird
// Resolves the shader op name `str` to a ShaderOpKind and forwards
// (inputs, masks, maskValue, index) to the matching computeExpected
// specialization, returning its result. Kinds not listed in the switch fall
// through to the assert below and yield (OutType)0.
// NOTE(review): `str` is a wide string but the assert message formats it with
// %s in a narrow literal — confirm DXASSERT_ARGS prints it as intended.
template <typename InType, typename OutType>
static OutType computeExpectedWithShaderOp(const std::vector<InType> &inputs,
const std::vector<int> &masks,
int maskValue, unsigned int index,
LPCWSTR str) {
ShaderOpKind kind = GetShaderOpKind(str);
switch (kind) {
case ShaderOpKind::WaveSum:
return computeExpected<InType, OutType, ShaderOpKind::WaveSum>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveProduct:
return computeExpected<InType, OutType, ShaderOpKind::WaveProduct>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveMax:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveMin:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveCountBits:
return computeExpected<InType, OutType, ShaderOpKind::WaveCountBits>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveBitOr:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveBitAnd:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveBitXor:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveAnyTrue:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveAllTrue:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue>()(inputs, masks, maskValue, index);
case ShaderOpKind::WaveActiveAllEqual:
return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual>()(inputs, masks, maskValue, index);
DXASSERT_ARGS(false, "Invalid ShaderOp Name: %s", str);
return (OutType) 0;
// A framework for testing individual wave intrinsics.
// This test assumes that 1) WaveIsFirstLane and 2) WaveGetLaneIndex are correct for all lanes.
//
// T1 is the per-lane input type and T2 the per-lane output type.
// pParameterList / numParameter describe the data table for the op under test;
// isPrefix selects WavePrefix* semantics (reduce over lanes before the current
// one) instead of WaveActive* semantics (reduce over the whole wave).
// For every input set and every entry of MaskFunctionTable the
// "WaveIntrinsicsOp" shader op is run, the read-back data is grouped into
// waves by firstLaneId, and each lane's output is compared with the value from
// computeExpectedWithShaderOp.
// NOTE(review): many statements in this copy look truncated (buffer casts,
// container index expressions, the loops that populate firstLaneIds / waves,
// the op-name argument of computeExpectedWithShaderOp, early returns and
// closing braces); they are left untouched and should be restored from the
// upstream file.
template <class T1, class T2>
void ExecutionTest::WaveIntrinsicsActivePrefixTest(
TableParameter *pParameterList, size_t numParameter, bool isPrefix) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
// Resource representation for compute shader
// firstLaneId is used to group different waves
// laneIndex is used to identify lane within the wave.
// Lane ids are not necessarily in same order as thread ids.
struct PerThreadData {
unsigned firstLaneId;
unsigned laneIndex;
int mask;
T1 input;
T2 output;
// Thread-group dimensions used to size the per-thread buffer (8 * 12 * 1
// threads in a single dispatched group).
unsigned int NumThreadsX = 8;
unsigned int NumThreadsY = 12;
unsigned int NumThreadsZ = 1;
static const unsigned int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
static const unsigned int DispatchGroupCount = 1;
static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount;
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice)) {
if (!DoesDeviceSupportWaveOps(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
TableParameterHandler handler(pParameterList, numParameter);
unsigned int numInputSet = handler.GetTableParamByName(L"Validation.NumInputSet")->m_uint;
// Obtain the list of input lists
std::vector<std::vector<T1>*> InputDataList;
for (unsigned int i = 0;
i < numInputSet; ++i) {
// Table parameter names are "Validation.InputSet1", "Validation.InputSet2", ...
std::wstring inputName = L"Validation.InputSet";
inputName.append(std::to_wstring(i + 1));
CW2A Text(handler.GetTableParamByName(L"ShaderOp.text")->m_str);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
// Running compute shader for each input set with different masks
for (size_t setIndex = 0; setIndex < numInputSet; ++setIndex) {
for (size_t maskIndex = 0; maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction); ++maskIndex) {
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
pDevice, m_support, "WaveIntrinsicsOp",
// this callback is invoked while the test is creating
// the resource it runs against
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
size_t size = sizeof(PerThreadData) * ThreadCount;
PerThreadData *pPrimitives = (PerThreadData*);
// 4 different inputs for each operation test
size_t index = 0;
std::vector<T1> *IntList = InputDataList[setIndex];
while (index < ThreadCount) {
PerThreadData *p = &pPrimitives[index];
// 0xFFFFBFFF is written as a placeholder into the fields the shader is
// expected to fill in.
p->firstLaneId = 0xFFFFBFFF;
p->laneIndex = 0xFFFFBFFF;
p->mask = MaskFunctionTable[maskIndex]((int)index);
p->input = (*IntList)[index % IntList->size()];
p->output = 0xFFFFBFFF;
// use shader from data table
pShaderOp-> = Text.m_psz;
}, ShaderOpSet);
// Check the value
MappedData data;
test->Test->GetReadBackData("SWaveIntrinsicsOp", &data);
PerThreadData *pPrimitives = (PerThreadData*);
WEX::TestExecution::DisableVerifyExceptions dve;
// Grouping data by waves
std::vector<int> firstLaneIds;
for (size_t i = 0; i < ThreadCount; ++i) {
PerThreadData *p = &pPrimitives[i];
int firstLaneId = p->firstLaneId;
if (!contains(firstLaneIds, firstLaneId)) {
std::map<int, std::unique_ptr<std::vector<PerThreadData *>>> waves;
for (size_t i = 0; i < firstLaneIds.size(); ++i) {
waves[] = std::make_unique<std::vector<PerThreadData*>>();
for (size_t i = 0; i < ThreadCount; ++i) {
PerThreadData *p = &pPrimitives[i];
// validate for each wave
for (size_t i = 0; i < firstLaneIds.size(); ++i) {
// collect inputs and masks for a given wave
std::vector<PerThreadData *> *waveData = waves[].get();
std::vector<T1> inputList(waveData->size());
// -1 marks a lane slot that has not been filled yet.
std::vector<int> maskList(waveData->size(), -1);
std::vector<T2> outputList(waveData->size());
// sort inputList and masklist by lane id. input for each lane can be computed for its group index
for (size_t j = 0, end = waveData->size(); j < end; ++j) {
unsigned laneID = waveData->at(j)->laneIndex;
// ensure that each lane ID is unique and within the range
// (laneID is unsigned, so only the upper-bound half of this check can fail)
VERIFY_IS_TRUE(0 <= laneID && laneID < waveData->size());
VERIFY_IS_TRUE( == -1); = waveData->at(j)->mask; = waveData->at(j)->input; = waveData->at(j)->output;
std::wstring inputStr = L"Wave Inputs: ";
std::wstring maskStr = L"Wave Masks: ";
std::wstring outputStr = L"Wave Outputs: ";
// append input string and mask string in lane id order
for (size_t j = 0, end = waveData->size(); j < end; ++j) {
maskStr.append(L" ");
inputStr.append(L" ");
outputStr.append(L" ");
// Compute expected output for a given inputs, masks, and index
for (size_t laneIndex = 0, laneEnd = inputList.size(); laneIndex < laneEnd; ++laneIndex) {
T2 expected;
// A WaveActive op is equivalent to the WavePrefix op evaluated with
// index == lane count, i.e. reduced over every lane in the wave.
unsigned index = isPrefix ? (unsigned)laneIndex : (unsigned)inputList.size();
if ( == 1) {
expected = computeExpectedWithShaderOp<T1, T2>(
inputList, maskList, 1, index,
else {
expected = computeExpectedWithShaderOp<T1, T2>(
inputList, maskList, 0, index,
// TODO: use different comparison for floating point inputs
bool equal = == expected;
if (!equal) {
// NOTE(review): laneIndex is a size_t but is formatted with %d — confirm this
// is intended on 64-bit builds.
LogCommentFmt(L"lane%d: %4d, Expected : %4d", laneIndex,, expected);
// Version threshold passed to IsValidWarpDllVersion by the wave intrinsic
// tests below when they are run on WARP.
static const unsigned int MinWarpVersionForWaveIntrinsics = 16202;
// Runs the WaveActive* framework test with int inputs and outputs, using the
// WaveIntrinsicsActiveIntParameters table.
// NOTE(review): the body of the WARP-version check and the first (parameter
// table) argument of the call are not visible in this copy; presumably the
// test is skipped when the WARP DLL is too old — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsActiveIntTest) {
if (GetTestParamUseWARP(true) &&
!IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
WaveIntrinsicsActivePrefixTest<int, int>(
sizeof(WaveIntrinsicsActiveIntParameters) / sizeof(TableParameter),
/*isPrefix*/ false);
// Runs the WaveActive* framework test with unsigned int inputs and outputs,
// using the WaveIntrinsicsActiveUintParameters table.
// NOTE(review): the body of the WARP-version check and the first (parameter
// table) argument of the call are not visible in this copy; presumably the
// test is skipped when the WARP DLL is too old — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsActiveUintTest) {
if (GetTestParamUseWARP(true) &&
!IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
sizeof(WaveIntrinsicsActiveUintParameters) / sizeof(TableParameter),
/*isPrefix*/ false);
// Runs the WavePrefix* framework test with int inputs and outputs, using the
// WaveIntrinsicsPrefixIntParameters table.
// NOTE(review): the body of the WARP-version check and the first (parameter
// table) argument of the call are not visible in this copy; presumably the
// test is skipped when the WARP DLL is too old — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsPrefixIntTest) {
if (GetTestParamUseWARP(true) &&
!IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
WaveIntrinsicsActivePrefixTest<int, int>(
sizeof(WaveIntrinsicsPrefixIntParameters) / sizeof(TableParameter),
/*isPrefix*/ true);
// Runs the WavePrefix* framework test with unsigned int inputs and outputs,
// using the WaveIntrinsicsPrefixUintParameters table.
// NOTE(review): the body of the WARP-version check and the first (parameter
// table) argument of the call are not visible in this copy; presumably the
// test is skipped when the WARP DLL is too old — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsPrefixUintTest) {
if (GetTestParamUseWARP(true) &&
!IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics)) {
WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
sizeof(WaveIntrinsicsPrefixUintParameters) / sizeof(TableParameter),
/*isPrefix*/ true);
// Returns the starting accumulator value for the named WaveMultiPrefix*
// operation (names are compared case-insensitively):
//   Product / UProduct                          -> 1
//   Sum, BitOr, BitXor, CountBits (and U forms) -> 0
//   BitAnd / UBitAnd                            -> -1 converted to T
//   any other name                              -> 0
template <typename T>
static T GetWaveMultiPrefixInitialAccumValue(LPCWSTR testName) {
if (_wcsicmp(testName, L"WaveMultiPrefixProduct") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUProduct") == 0) {
return static_cast<T>(1);
} else if (_wcsicmp(testName, L"WaveMultiPrefixSum") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUSum") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixBitOr") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitOr") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixBitXor") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitXor") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixCountBits") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUCountBits") == 0) {
return static_cast<T>(0);
} else if (_wcsicmp(testName, L"WaveMultiPrefixBitAnd") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitAnd") == 0) {
// -1 converted to an integer T has every bit set, so ANDing with it leaves
// the other operand unchanged.
return static_cast<T>(-1);
} else {
return static_cast<T>(0);
// Returns the binary step function (accumulator, lane value) -> accumulator
// used to compute the reference result for the named WaveMultiPrefix*
// operation (names are compared case-insensitively). Unrecognized names get a
// function that ignores both arguments and returns 0.
template <typename T>
std::function<T(T, T)> GetWaveMultiPrefixReferenceFunction(LPCWSTR testName) {
if (_wcsicmp(testName, L"WaveMultiPrefixProduct") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUProduct") == 0) {
return [] (T lhs, T rhs) -> T { return lhs * rhs; };
} else if (_wcsicmp(testName, L"WaveMultiPrefixSum") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUSum") == 0) {
return [] (T lhs, T rhs) -> T { return lhs + rhs; };
} else if (_wcsicmp(testName, L"WaveMultiPrefixBitAnd") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitAnd") == 0) {
return [] (T lhs, T rhs) -> T { return lhs & rhs; };
} else if (_wcsicmp(testName, L"WaveMultiPrefixBitOr") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitOr") == 0) {
return [] (T lhs, T rhs) -> T { return lhs | rhs; };
} else if (_wcsicmp(testName, L"WaveMultiPrefixBitXor") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUBitXor") == 0) {
return [] (T lhs, T rhs) -> T { return lhs ^ rhs; };
} else if (_wcsicmp(testName, L"WaveMultiPrefixCountBits") == 0 ||
_wcsicmp(testName, L"WaveMultiPrefixUCountBits") == 0) {
// For CountBits, each lane contributes a boolean value. The test input is
// a zero or non-zero integer. If the input is a non-zero value then the
// condition is true, thus we contribute one to the bit count.
return [] (T lhs, T rhs) -> T { return lhs + (rhs ? 1 : 0); };
} else {
return [] (T lhs, T rhs) -> T { UNREFERENCED_PARAMETER(lhs); UNREFERENCED_PARAMETER(rhs); return 0; };
// Framework for the WaveMultiPrefix* tests. For each entry of
// MaskFunctionTable it runs the "WaveIntrinsicsOp" shader op with keys/values
// from the data table, groups the read-back data into waves, sorts each wave
// by lane id, and checks every lane's result against a reference accumulated
// over the preceding lanes that share the same key and mask.
// pParameterList / numParameters describe the data table for the op under test.
// NOTE(review): several statements in this copy look truncated (the return
// type, the ShaderOpSet declaration, buffer casts, the pShaderOp->
// assignments, the statement that inserts into `waves`, early returns and
// closing braces); they are left untouched and should be restored from the
// upstream file.
template <class T>
ExecutionTest::WaveIntrinsicsMultiPrefixOpTest(TableParameter *pParameterList,
size_t numParameters) {
// Layout of one element of the buffer shared with the shader.
struct PerThreadData {
uint32_t key;
uint32_t firstLaneId;
uint32_t laneId;
uint32_t mask;
T value;
T result;
constexpr size_t NumThreadsX = 8;
constexpr size_t NumThreadsY = 12;
constexpr size_t NumThreadsZ = 1;
constexpr size_t ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
constexpr size_t DispatchGroupSize = 1;
constexpr size_t ThreadCount = ThreadsPerGroup * DispatchGroupSize;
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_5)) {
if (!DoesDeviceSupportWaveOps(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
ShaderOpSet = std::make_shared<st::ShaderOpSet>();
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
TableParameterHandler handler(pParameterList, numParameters);
CW2A shaderSource(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
CW2A shaderProfile(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
auto testName = handler.GetTableParamByName(L"ShaderOp.Name")->m_str;
std::vector<T> *keys = handler.GetDataArray<T>(L"Validation.Keys");
std::vector<T> *values = handler.GetDataArray<T>(L"Validation.Values");
for (size_t maskIndex = 0; maskIndex < _countof(MaskFunctionTable); ++maskIndex) {
std::shared_ptr<ShaderOpTestResult> test =
RunShaderOpTestAfterParse(pDevice, m_support, "WaveIntrinsicsOp",
[&] (LPCSTR name, std::vector<BYTE> &data, st::ShaderOp *pShaderOp) {
const size_t dataSize = sizeof(PerThreadData) * ThreadCount;
PerThreadData *pThreadData = reinterpret_cast<PerThreadData *>(;
for (size_t i = 0; i != ThreadCount; ++i) {
// Keys and values wrap around when the table has fewer entries than threads.
pThreadData[i].key = keys->at(i % keys->size());
pThreadData[i].value = values->at(i % values->size());
// 0xdeadbeef is a sentinel; the checks after read-back verify that
// firstLaneId and laneId no longer hold it.
pThreadData[i].firstLaneId = 0xdeadbeef;
pThreadData[i].laneId = 0xdeadbeef;
pThreadData[i].mask = MaskFunctionTable[maskIndex]((int)i);
pThreadData[i].result = 0xdeadbeef;
pShaderOp-> = shaderSource;
pShaderOp-> = shaderProfile;
}, ShaderOpSet);
MappedData mappedData;
test->Test->GetReadBackData("SWaveIntrinsicsOp", &mappedData);
PerThreadData *resultData = reinterpret_cast<PerThreadData *>(;
// Partition our data into waves
std::map<uint32_t, std::vector<PerThreadData *>> waves;
for (size_t i = 0, e = ThreadCount; i != e; ++i) {
PerThreadData *elt = &resultData[i];
// Basic sanity checks
VERIFY_IS_TRUE(elt->firstLaneId != 0xdeadbeef);
VERIFY_IS_TRUE(elt->laneId != 0xdeadbeef);
// Verify each wave
auto refFn = GetWaveMultiPrefixReferenceFunction<T>(testName);
for (auto &w : waves) {
std::vector<PerThreadData *> &waveData = w.second;
// Orders lanes by ascending laneId.
struct {
bool operator()(PerThreadData *a, PerThreadData *b) const {
return (a->laneId < b->laneId);
} compare;
// Need to sort based on the lane id
std::sort(waveData.begin(), waveData.end(), compare);
LogCommentFmt(L"LaneId Mask Key Value Result Expected");
LogCommentFmt(L"-------- -------- -------- -------- -------- --------");
for (size_t i = 0, e = waveData.size(); i != e; ++i) {
PerThreadData *data = waveData[i];
// Compute prefix operation over each previous lane element that has the
// same key value, and is part of the same active thread group
T accum = GetWaveMultiPrefixInitialAccumValue<T>(testName);
for (unsigned j = 0; j < i; ++j) {
if (waveData[j]->key == data->key && waveData[j]->mask == data->mask) {
accum = refFn(accum, waveData[j]->value);
LogCommentFmt(L"%08X %08X %08X %08X %08X %08X", data->laneId, data->mask, data->key, data->value, data->result, accum);
VERIFY_IS_TRUE(accum == data->result);
// NOTE(review): the body of this test is not visible in this copy of the file;
// presumably it drives WaveIntrinsicsMultiPrefixOpTest for a signed integer
// type — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsSM65IntTest) {
// NOTE(review): the body of this test is not visible in this copy of the file;
// presumably it drives WaveIntrinsicsMultiPrefixOpTest for an unsigned integer
// type — confirm against upstream.
TEST_F(ExecutionTest, WaveIntrinsicsSM65UintTest) {
// Writes four 16-bit values into the "CB0" resource of the "CBufferTestHalf"
// shader op, reads back "RTarget", and verifies that each read-back 16-bit
// value converts (via ConvertFloat16ToFloat32) to the same float as the
// corresponding input.
// NOTE(review): the buffer casts, the early-return statements after the device
// checks and the closing braces are not visible in this copy of the file.
TEST_F(ExecutionTest, CBufferTestHalf) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
// Single operation test at the moment.
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_2))
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
// Raw 16-bit patterns; they are interpreted as float16 when logged and
// compared below.
uint16_t InputData[] = { 0x3F80, 0x3F00, 0x3D80, 0x7BFF };
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "CBufferTestHalf",
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "CB0"));
// use shader from data table.
uint16_t *pData = (uint16_t *);
for (size_t i = 0; i < 4; ++i, ++pData) {
*pData = InputData[i];
MappedData data;
test->Test->GetReadBackData("RTarget", &data);
const uint16_t *pPixels = (uint16_t *);
for (int i = 0; i < 4; ++i) {
uint16_t output = *(pPixels + i);
float outputFloat = ConvertFloat16ToFloat32(output);
float inputFloat = ConvertFloat16ToFloat32(InputData[i]);
LogCommentFmt(L"element #%u: input = %6.8f(0x%04x), output = %6.8f(0x%04x)",
i, inputFloat, InputData[i], outputFloat, output);
VERIFY_ARE_EQUAL(inputFloat, outputFloat);
// Runs the "Barycentrics" shader op, reads back the render target and the
// vertex buffer, and for four sets of barycentric weights checks that the
// pixel at the weighted combination of the triangle's first three vertices
// holds those weights in its first three float channels, within a tolerance
// of 0.001.
// NOTE(review): the buffer casts, the early-return statements after the device
// checks and the closing braces are not visible in this copy of the file.
TEST_F(ExecutionTest, BarycentricsTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_1))
if (!DoesDeviceSupportBarycentrics(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support barycentrics.");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Barycentrics", nullptr);
MappedData data;
D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
UINT width = (UINT)D.Width;
UINT height = D.Height;
UINT pixelSize = GetByteSizeForFormat(D.Format);
test->Test->GetReadBackData("RTarget", &data);
//const uint8_t *pPixels = (uint8_t *);
const float *pPixels = (float *);
// Get the vertex of barycentric coordinate using VBuffer
MappedData triangleData;
test->Test->GetReadBackData("VBuffer", &triangleData);
const float *pTriangleData = (float*);
// get the size of the input data
// (sum of the input element sizes, in units of 4 bytes, i.e. floats per vertex)
unsigned triangleVertexSizeInFloat = 0;
for (auto element : test->ShaderOp->InputElements)
triangleVertexSizeInFloat += GetByteSizeForFormat(element.Format) / 4;
// The first two floats of each of the first three vertices are read as x, y.
XMFLOAT2 p0(pTriangleData[0], pTriangleData[1]);
XMFLOAT2 p1(pTriangleData[triangleVertexSizeInFloat], pTriangleData[triangleVertexSizeInFloat + 1]);
XMFLOAT2 p2(pTriangleData[triangleVertexSizeInFloat * 2], pTriangleData[triangleVertexSizeInFloat * 2 + 1]);
XMFLOAT3 barycentricWeights[4] = {
XMFLOAT3(0.3333f, 0.3333f, 0.3333f),
XMFLOAT3(0.5f, 0.25f, 0.25f),
XMFLOAT3(0.25f, 0.5f, 0.25f),
XMFLOAT3(0.25f, 0.25f, 0.50f)
float tolerance = 0.001f;
for (unsigned i = 0; i < sizeof(barycentricWeights) / sizeof(XMFLOAT3); ++i) {
float w0 = barycentricWeights[i].x;
float w1 = barycentricWeights[i].y;
float w2 = barycentricWeights[i].z;
// Point with the given barycentric weights relative to p0, p1, p2.
float x1 = w0 * p0.x + w1 * p1.x + w2 * p2.x;
float y1 = w0 * p0.y + w1 * p1.y + w2 * p2.y;
// map from x1 y1 to rtv pixels
// (x in [-1, 1] maps to [0, width - 1]; y is flipped so +1 maps to row 0)
int pixelX = (int)((x1 + 1) * (width - 1) / 2);
int pixelY = (int)((1 - y1) * (height - 1) / 2);
// Convert the pixel's byte offset into an index into the float array.
int offset = pixelSize * (pixelX + pixelY * width) / sizeof(pPixels[0]);
LogCommentFmt(L"location %u %u, value %f, %f, %f", pixelX, pixelY, pPixels[offset], pPixels[offset + 1], pPixels[offset + 2]);
VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset], w0, tolerance));
VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset + 1], w1, tolerance));
VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset + 2], w2, tolerance));
//SavePixelsToFile(pPixels, DXGI_FORMAT_R32G32B32A32_FLOAT, width, height, L"barycentric.bmp");
// HLSL source fragment with the type and resource declarations for the raw
// buffer tests: scalar/vector typedefs parameterized by COMPONENT_TYPE, the
// TestData and UavData structs, and the SRV (t0-t3) and UAV (u0-u3) bindings,
// alternating between byte-address and structured buffers.
static const char RawBufferTestShaderDeclarations[] =
"// Note: COMPONENT_TYPE and COMPONENT_SIZE will be defined via compiler option -D\r\n"
"typedef COMPONENT_TYPE scalar; \r\n"
"typedef vector<COMPONENT_TYPE, 2> vector2; \r\n"
"typedef vector<COMPONENT_TYPE, 3> vector3; \r\n"
"typedef vector<COMPONENT_TYPE, 4> vector4; \r\n"
"struct TestData { \r\n"
" scalar v1; \r\n"
" vector2 v2; \r\n"
" vector3 v3; \r\n"
" vector4 v4; \r\n"
"}; \r\n"
"struct UavData {\r\n"
" TestData input; \r\n"
" TestData output; \r\n"
" TestData srvOut; \r\n"
"}; \r\n"
"ByteAddressBuffer srv0 : register(t0); \r\n"
"StructuredBuffer<TestData> srv1 : register(t1); \r\n"
"ByteAddressBuffer srv2 : register(t2); \r\n"
"StructuredBuffer<TestData> srv3 : register(t3); \r\n"
"RWByteAddressBuffer uav0 : register(u0); \r\n"
"RWStructuredBuffer<UavData> uav1 : register(u1); \r\n"
"RWByteAddressBuffer uav2 : register(u2); \r\n"
"RWStructuredBuffer<UavData> uav3 : register(u3); \r\n";
// HLSL statements inserted into the entry point by the shader templates.
// For each of the four SRV/UAV pairs the body first copies v1..v4 from the SRV
// into the UAV's srvOut region, then copies the UAV's own input region into
// its output region. Byte-address buffers use explicit byte offsets built from
// COMPONENT_SIZE (TestData holds 1 + 2 + 3 + 4 = 10 components, hence the
// "* 10" strides); structured buffers use member access.
static const char RawBufferTestShaderBody[] =
" // offset of 'out' in 'UavData'\r\n"
" const int out_offset = COMPONENT_SIZE * 10; \r\n"
" // offset of 'srv_out' in 'UavData'\r\n"
" const int srv_out_offset = COMPONENT_SIZE * 10 * 2; \r\n"
" // offsets within the 'Data' struct\r\n"
" const int v1_offset = 0; \r\n"
" const int v2_offset = COMPONENT_SIZE; \r\n"
" const int v3_offset = COMPONENT_SIZE * 3; \r\n"
" const int v4_offset = COMPONENT_SIZE * 6; \r\n"
// SRV -> UAV (srvOut region), buffers 0..3.
" uav0.Store(srv_out_offset + v1_offset, srv0.Load<scalar>(v1_offset)); \r\n"
" uav0.Store(srv_out_offset + v2_offset, srv0.Load<vector2>(v2_offset)); \r\n"
" uav0.Store(srv_out_offset + v3_offset, srv0.Load<vector3>(v3_offset)); \r\n"
" uav0.Store(srv_out_offset + v4_offset, srv0.Load<vector4>(v4_offset)); \r\n"
" uav1[0].srvOut.v1 = srv1[0].v1; \r\n"
" uav1[0].srvOut.v2 = srv1[0].v2; \r\n"
" uav1[0].srvOut.v3 = srv1[0].v3; \r\n"
" uav1[0].srvOut.v4 = srv1[0].v4; \r\n"
" uav2.Store(srv_out_offset + v1_offset, srv2.Load<scalar>(v1_offset)); \r\n"
" uav2.Store(srv_out_offset + v2_offset, srv2.Load<vector2>(v2_offset)); \r\n"
" uav2.Store(srv_out_offset + v3_offset, srv2.Load<vector3>(v3_offset)); \r\n"
" uav2.Store(srv_out_offset + v4_offset, srv2.Load<vector4>(v4_offset)); \r\n"
" uav3[0].srvOut.v1 = srv3[0].v1; \r\n"
" uav3[0].srvOut.v2 = srv3[0].v2; \r\n"
" uav3[0].srvOut.v3 = srv3[0].v3; \r\n"
" uav3[0].srvOut.v4 = srv3[0].v4; \r\n"
// UAV (input region) -> same UAV (output region), buffers 0..3.
" uav0.Store(out_offset + v1_offset, uav0.Load<scalar>(v1_offset)); \r\n"
" uav0.Store(out_offset + v2_offset, uav0.Load<vector2>(v2_offset)); \r\n"
" uav0.Store(out_offset + v3_offset, uav0.Load<vector3>(v3_offset)); \r\n"
" uav0.Store(out_offset + v4_offset, uav0.Load<vector4>(v4_offset)); \r\n"
" uav1[0].output.v1 = uav1[0].input.v1; \r\n"
" uav1[0].output.v2 = uav1[0].input.v2; \r\n"
" uav1[0].output.v3 = uav1[0].input.v3; \r\n"
" uav1[0].output.v4 = uav1[0].input.v4; \r\n"
" uav2.Store(out_offset + v1_offset, uav2.Load<scalar>(v1_offset)); \r\n"
" uav2.Store(out_offset + v2_offset, uav2.Load<vector2>(v2_offset)); \r\n"
" uav2.Store(out_offset + v3_offset, uav2.Load<vector3>(v3_offset)); \r\n"
" uav2.Store(out_offset + v4_offset, uav2.Load<vector4>(v4_offset)); \r\n"
" uav3[0].output.v1 = uav3[0].input.v1; \r\n"
" uav3[0].output.v2 = uav3[0].input.v2; \r\n"
" uav3[0].output.v3 = uav3[0].input.v3; \r\n"
" uav3[0].output.v4 = uav3[0].input.v4; \r\n";
// printf-style template for the compute shader source: the first %s receives
// RawBufferTestShaderDeclarations and the second receives
// RawBufferTestShaderBody, placed inside a single-thread main().
// NOTE(review): the literal that closes main() and terminates this definition
// is not visible after the second %s - confirm against the full file.
static const char RawBufferTestComputeShaderTemplate[] =
"%s\r\n" // <- RawBufferTestShaderDeclarations
"[numthreads(1, 1, 1)]\r\n"
"void main(uint GI : SV_GroupIndex) {\r\n"
"%s\r\n" // <- RawBufferTestShaderBody
// printf-style template for the pixel shader source: the first %s receives
// RawBufferTestShaderDeclarations and the second receives
// RawBufferTestShaderBody. The body is guarded by
// input.pos.x + input.pos.y == 1.0f (annotated in the shader as pixel
// { 0.5, 0.5, 0 }); the shader returns uint4(1, 2, 3, 4) regardless.
// NOTE(review): the literal that closes main() and terminates this definition
// is not visible after the return statement - confirm against the full file.
static const char RawBufferTestGraphicsPixelShaderTemplate[] =
"%s\r\n" // <- RawBufferTestShaderDeclarations
"struct PSInput { \r\n"
" float4 pos : SV_POSITION; \r\n"
"}; \r\n"
"uint4 main(PSInput input) : SV_TARGET{ \r\n"
" if (input.pos.x + input.pos.y == 1.0f) { // pixel { 0.5, 0.5, 0 } \r\n"
"%s\r\n" // <- RawBufferTestShaderBody
" } \r\n"
" return uint4(1, 2, 3, 4); \r\n"
// Runs the compute raw buffer load/store test with int32_t components at
// shader model 6.2, using the "ComputeRawBufferLdSt32Bit" shader op.
TEST_F(ExecutionTest, ComputeRawBufferLdStI32) {
RawBufferLdStTestData<int32_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT32 / 2 } };
RunComputeRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "ComputeRawBufferLdSt32Bit", data);
// Runs the compute raw buffer load/store test with float components at
// shader model 6.2, using the "ComputeRawBufferLdSt32Bit" shader op.
TEST_F(ExecutionTest, ComputeRawBufferLdStFloat) {
RawBufferLdStTestData<float> data = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, -105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
RunComputeRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "ComputeRawBufferLdSt32Bit", data);
// Runs the compute raw buffer load/store test with int64_t components at
// shader model 6.3, using the "ComputeRawBufferLdSt64Bit" shader op.
TEST_F(ExecutionTest, ComputeRawBufferLdStI64) {
RawBufferLdStTestData<int64_t> data = { { 1 }, { 2, -1 }, { 256, -105171532, 980 }, { 465, 13, -89, MAXUINT64 / 2 } };
RunComputeRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "ComputeRawBufferLdSt64Bit", data);
// Runs the compute raw buffer load/store test with double components at
// shader model 6.3, using the "ComputeRawBufferLdSt64Bit" shader op.
// The data type must be RawBufferLdStType::Double: SetupRawBufferLdStTest uses
// it both to check device support for doubles and to pick the HLSL component
// type, so passing I64 here would silently exercise int64_t loads/stores
// instead. This matches GraphicsRawBufferLdStDouble.
TEST_F(ExecutionTest, ComputeRawBufferLdStDouble) {
  RawBufferLdStTestData<double> data = { { 3e-10 }, { 1.5, -1.99988 }, { 256.0, -105.17, 980.0 }, { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 } };
  RunComputeRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "ComputeRawBufferLdSt64Bit", data);
}
// Runs the compute raw buffer load/store test with int16_t components at
// shader model 6.2, using the "ComputeRawBufferLdSt16Bit" shader op.
TEST_F(ExecutionTest, ComputeRawBufferLdStI16) {
RawBufferLdStTestData<int16_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT16 / 2 } };
RunComputeRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "ComputeRawBufferLdSt16Bit", data);
// Runs the compute raw buffer load/store test with half components at shader
// model 6.2, using the "ComputeRawBufferLdSt16Bit" shader op. The test values
// are authored as floats and converted element by element to 16-bit float bit
// patterns stored in uint16_t.
TEST_F(ExecutionTest, ComputeRawBufferLdStHalf) {
  RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
  RawBufferLdStTestData<uint16_t> halfData;
  // Both structs are walked as flat arrays of their component type; the index
  // is size_t so it is compared against the sizeof quotient without a
  // signed/unsigned mismatch.
  const size_t componentCount = sizeof(floatData) / sizeof(float);
  for (size_t i = 0; i < componentCount; i++) {
    ((uint16_t*)&halfData)[i] = ConvertFloat32ToFloat16(((float*)&floatData)[i]);
  }
  RunComputeRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "ComputeRawBufferLdSt16Bit", halfData);
}
// Runs the graphics (pixel shader) raw buffer load/store test with int32_t
// components at shader model 6.2, using the "GraphicsRawBufferLdSt32Bit"
// shader op.
TEST_F(ExecutionTest, GraphicsRawBufferLdStI32) {
RawBufferLdStTestData<int32_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT32 / 2 } };
RunGraphicsRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32, "GraphicsRawBufferLdSt32Bit", data);
// Runs the graphics (pixel shader) raw buffer load/store test with float
// components at shader model 6.2, using the "GraphicsRawBufferLdSt32Bit"
// shader op.
TEST_F(ExecutionTest, GraphicsRawBufferLdStFloat) {
RawBufferLdStTestData<float> data = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, -105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
RunGraphicsRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float, "GraphicsRawBufferLdSt32Bit", data);
// Runs the graphics (pixel shader) raw buffer load/store test with int64_t
// components at shader model 6.3, using the "GraphicsRawBufferLdSt64Bit"
// shader op.
TEST_F(ExecutionTest, GraphicsRawBufferLdStI64) {
RawBufferLdStTestData<int64_t> data = { { 1 }, { 2, -1 }, { 256, -105171532, 980 }, { 465, 13, -89, MAXUINT64 / 2 } };
RunGraphicsRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64, "GraphicsRawBufferLdSt64Bit", data);
// Runs the graphics (pixel shader) raw buffer load/store test with double
// components at shader model 6.3, using the "GraphicsRawBufferLdSt64Bit"
// shader op.
TEST_F(ExecutionTest, GraphicsRawBufferLdStDouble) {
RawBufferLdStTestData<double> data = { { 3e-10 }, { 1.5, -1.99988 }, { 256.0, -105.17, 980.0 }, { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 } };
RunGraphicsRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "GraphicsRawBufferLdSt64Bit", data);
// Runs the graphics (pixel shader) raw buffer load/store test with int16_t
// components at shader model 6.2, using the "GraphicsRawBufferLdSt16Bit"
// shader op.
TEST_F(ExecutionTest, GraphicsRawBufferLdStI16) {
RawBufferLdStTestData<int16_t> data = { { 1 }, { 2, -1 }, { 256, -10517, 980 }, { 465, 13, -89, MAXUINT16 / 2 } };
RunGraphicsRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16, "GraphicsRawBufferLdSt16Bit", data);
// Runs the graphics (pixel shader) raw buffer load/store test with half
// components at shader model 6.2, using the "GraphicsRawBufferLdSt16Bit"
// shader op. The test values are authored as floats and converted element by
// element to 16-bit float bit patterns stored in uint16_t.
TEST_F(ExecutionTest, GraphicsRawBufferLdStHalf) {
  RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 0.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
  RawBufferLdStTestData<uint16_t> halfData;
  // Both structs are walked as flat arrays of their component type; the index
  // is size_t so it is compared against the sizeof quotient without a
  // signed/unsigned mismatch.
  const size_t componentCount = sizeof(floatData) / sizeof(float);
  for (size_t i = 0; i < componentCount; i++) {
    ((uint16_t*)&halfData)[i] = ConvertFloat32ToFloat16(((float*)&floatData)[i]);
  }
  RunGraphicsRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "GraphicsRawBufferLdSt16Bit", halfData);
}
// Creates the device and resolves the per-type settings shared by the compute
// and graphics raw buffer load/store tests.
//   shaderModel       - shader model passed to CreateDevice.
//   dataType          - component type under test.
//   pDevice           - receives the created device.
//   pStream           - receives the ShaderOpArith.xml shader-op description.
//   sTy               - receives the HLSL spelling of the component type.
//   additionalOptions - receives extra compiler options ("" when none needed).
// Returns false if the device cannot be created, if it lacks support for the
// requested component type, or if dataType is not a known RawBufferLdStType.
bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                                           CComPtr<ID3D12Device> &pDevice, CComPtr<IStream> &pStream,
                                           char *&sTy, char *&additionalOptions) {
  if (!CreateDevice(&pDevice, shaderModel)) {
    return false;
  }

  additionalOptions = "";

  // Each case ends with break so the type chosen for one case is never
  // overwritten by the cases that follow it.
  switch (dataType) {
  case RawBufferLdStType::I64:
    if (!DoesDeviceSupportInt64(pDevice)) {
      WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
      return false;
    }
    sTy = "int64_t";
    break;
  case RawBufferLdStType::Double:
    if (!DoesDeviceSupportDouble(pDevice)) {
      WEX::Logging::Log::Comment(L"Device does not support double operations.");
      return false;
    }
    sTy = "double";
    break;
  case RawBufferLdStType::I16:
  case RawBufferLdStType::Half:
    if (!DoesDeviceSupportNative16bitOps(pDevice)) {
      WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
      return false;
    }
    additionalOptions = "-enable-16bit-types";
    sTy = (dataType == RawBufferLdStType::I16 ? "int16_t" : "half");
    break;
  case RawBufferLdStType::I32:
    sTy = "int32_t";
    break;
  case RawBufferLdStType::Float:
    sTy = "float";
    break;
  default:
    // A bare string literal is a non-null pointer, so asserting on it alone can
    // never fire; "false &&" makes the assertion actually trip.
    DXASSERT_NOMSG(false && "Invalid RawBufferLdStType");
    // Also report through the test log so builds without assertions fail
    // visibly instead of continuing with sTy unset.
    WEX::Logging::Log::Error(L"Invalid RawBufferLdStType.");
    return false;
  }

  // read shader config
  ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

  return true;
}
// Reads back UAVBuffer0..UAVBuffer3 from a finished shader-op test and checks,
// component by component, that both the UAV load -> UAV store results
// (output) and the SRV load -> UAV store results (srvOut) equal testData.
//   test     - the shader-op test whose read-back data is inspected.
//   testData - the values that were originally written to the input buffers.
template <class Ty>
void ExecutionTest::VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData) {
// read buffers back & verify expected values
static const int UavBufferCount = 4;
// The trailing 'X' is overwritten with the buffer index on every iteration.
char bufferName[11] = "UAVBufferX";
for (unsigned i = 0; i < UavBufferCount; i++) {
MappedData dataUav;
RawBufferLdStUavData<Ty> *pOutData;
// sizeof(bufferName) - 2 is the last character before the terminating NUL.
bufferName[sizeof(bufferName) - 2] = (char)(i + '0');
test->GetReadBackData(bufferName, &dataUav);
VERIFY_ARE_EQUAL(sizeof(RawBufferLdStUavData<Ty>), dataUav.size());
// NOTE(review): the operand of this cast is not visible - presumably the
// mapped read-back pointer of dataUav; confirm against the full file.
pOutData = (RawBufferLdStUavData<Ty> *);
LogCommentFmt(L"Verifying UAVBuffer%d Load -> UAVBuffer%d Store", i, i);
// scalar
VERIFY_ARE_EQUAL(pOutData->output.v1, testData.v1);
// vector 2
VERIFY_ARE_EQUAL(pOutData->output.v2[0], testData.v2[0]);
VERIFY_ARE_EQUAL(pOutData->output.v2[1], testData.v2[1]);
// vector 3
VERIFY_ARE_EQUAL(pOutData->output.v3[0], testData.v3[0]);
VERIFY_ARE_EQUAL(pOutData->output.v3[1], testData.v3[1]);
VERIFY_ARE_EQUAL(pOutData->output.v3[2], testData.v3[2]);
// vector 4
VERIFY_ARE_EQUAL(pOutData->output.v4[0], testData.v4[0]);
VERIFY_ARE_EQUAL(pOutData->output.v4[1], testData.v4[1]);
VERIFY_ARE_EQUAL(pOutData->output.v4[2], testData.v4[2]);
VERIFY_ARE_EQUAL(pOutData->output.v4[3], testData.v4[3]);
// verify SRV Store
LogCommentFmt(L"Verifying SRVBuffer%d Load -> UAVBuffer%d Store", i, i);
// scalar
VERIFY_ARE_EQUAL(pOutData->srvOut.v1, testData.v1);
// vector 2
VERIFY_ARE_EQUAL(pOutData->srvOut.v2[0], testData.v2[0]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v2[1], testData.v2[1]);
// vector 3
VERIFY_ARE_EQUAL(pOutData->srvOut.v3[0], testData.v3[0]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v3[1], testData.v3[1]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v3[2], testData.v3[2]);
// vector 4
VERIFY_ARE_EQUAL(pOutData->srvOut.v4[0], testData.v4[0]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v4[1], testData.v4[1]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v4[2], testData.v4[2]);
VERIFY_ARE_EQUAL(pOutData->srvOut.v4[3], testData.v4[3]);
// Builds the compute shader source and compiler options for the given
// component type, runs the named shader op with testData copied into each
// SRV/UAV buffer, and verifies the read-back results.
//   shaderModel  - shader model passed on to SetupRawBufferLdStTest.
//   dataType     - component type under test.
//   shaderOpName - name of the shader op to run from ShaderOpArith.xml.
//   testData     - values written to the input buffers and expected back.
template <class Ty>
void ExecutionTest::RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<ID3D12Device> pDevice;
CComPtr<IStream> pStream;
char *sTy = nullptr, *additionalOptions = nullptr;
// NOTE(review): the body of this failure branch is not visible - presumably
// an early return when setup fails; confirm against the full file.
if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
// format shader source
// The buffer is sized from the template plus both inserted strings, which is
// at least as large as the formatted result since each %s is replaced.
char rawBufferTestShaderText[sizeof(RawBufferTestComputeShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
VERIFY_IS_TRUE(sprintf_s(rawBufferTestShaderText, sizeof(rawBufferTestShaderText),
RawBufferTestComputeShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);
// format compiler args
char compilerOptions[256];
VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);
// run the shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
// Only resources named SRVBuffer0..3 or UAVBuffer0..3 are expected here.
VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
(Name[9] >= '0' && Name[9] <= '3'));
// NOTE(review): the members of pShaderOp being assigned on the next two
// lines are not visible - presumably the shader's compiler arguments and
// source text; confirm against the full file.
pShaderOp-> = compilerOptions;
pShaderOp-> = rawBufferTestShaderText;
VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
// NOTE(review): the operand of this cast is not visible - presumably the
// start of Data's storage; confirm against the full file.
RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*);
memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
// verify expected values
VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
// Builds the pixel shader source and compiler options for the given component
// type, runs the named shader op with testData copied into each SRV/UAV
// buffer, and verifies the read-back results.
//   shaderModel  - shader model passed on to SetupRawBufferLdStTest.
//   dataType     - component type under test.
//   shaderOpName - name of the shader op to run from ShaderOpArith.xml.
//   testData     - values written to the input buffers and expected back.
template <class Ty>
void ExecutionTest::RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<ID3D12Device> pDevice;
CComPtr<IStream> pStream;
char *sTy = nullptr, *additionalOptions = nullptr;
// NOTE(review): the body of this failure branch is not visible - presumably
// an early return when setup fails; confirm against the full file.
if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
// format shader source
// The buffer is sized from the template plus both inserted strings, which is
// at least as large as the formatted result since each %s is replaced.
char rawBufferTestPixelShaderText[sizeof(RawBufferTestGraphicsPixelShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
VERIFY_IS_TRUE(sprintf_s(rawBufferTestPixelShaderText, sizeof(rawBufferTestPixelShaderText),
RawBufferTestGraphicsPixelShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);
// format compiler args
char compilerOptions[256];
VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);
// run the shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
// Only resources named SRVBuffer0..3 or UAVBuffer0..3 are expected here.
VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
(Name[9] >= '0' && Name[9] <= '3'));
// pixel shader is at index 1, vertex shader at index 0
// NOTE(review): the members of pShaderOp being assigned on the next two
// lines are not visible - presumably the pixel shader's compiler arguments
// and source text; confirm against the full file.
pShaderOp-> = compilerOptions;
pShaderOp-> = rawBufferTestPixelShaderText;
VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
// NOTE(review): the operand of this cast is not visible - presumably the
// start of Data's storage; confirm against the full file.
RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*);
memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
// verify expected values
VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
// CPU reference for unclamped packing: keeps the low 8 bits of each of the
// four lanes and places lane i in bits [8*i, 8*i + 7] of the result.
template<typename T>
uint32_t pack(std::array<T, 4> unpackedVals)
uint32_t dst = 0;
// Selects the low byte of each lane; higher bits are discarded, not clamped.
constexpr uint32_t bitMask = 0xFF;
for (uint32_t i = 0U; i < 4U; ++i)
dst |= (unpackedVals[i] & bitMask) << (i * 8);
return dst;
// CPU reference for clamped unsigned packing: each of the four lanes is
// converted to int32_t, clamped to the uint8_t range [0, 255], and placed in
// bits [8*i, 8*i + 7] of the result (lane 0 in the low byte).
template <typename T>
uint32_t pack_clamp_u8(std::array<T, 4> unpackedVals)
{
    const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
    const int32_t clamp_max = std::numeric_limits<uint8_t>::max();
    uint32_t dst = 0;
    for (uint32_t i = 0U; i < 4U; ++i)
    {
        const int32_t clamped = std::min(std::max((int32_t)unpackedVals[i], clamp_min), clamp_max);
        // Widen to uint32_t before shifting: a uint8_t operand would be
        // promoted to signed int, and shifting a value >= 0x80 left by 24
        // would move a set bit into the sign bit.
        dst |= static_cast<uint32_t>(static_cast<uint8_t>(clamped)) << (i * 8);
    }
    return dst;
}
// CPU reference for clamped signed packing: each of the four lanes is
// converted to int32_t, clamped to the int8_t range [-128, 127], and its
// two's-complement byte is placed in bits [8*i, 8*i + 7] of the result
// (lane 0 in the low byte).
template <typename T>
uint32_t pack_clamp_s8(std::array<T, 4> unpackedVals)
{
    const int32_t clamp_min = std::numeric_limits<int8_t>::min();
    const int32_t clamp_max = std::numeric_limits<int8_t>::max();
    uint32_t dst = 0;
    for (uint32_t i = 0U; i < 4U; ++i)
    {
        const int32_t clamped = std::min(std::max((int32_t)unpackedVals[i], clamp_min), clamp_max);
        // Widen to uint32_t before shifting: a uint8_t operand would be
        // promoted to signed int, and shifting a negative lane's byte
        // (>= 0x80) left by 24 would move a set bit into the sign bit.
        dst |= static_cast<uint32_t>(static_cast<uint8_t>(clamped)) << (i * 8);
    }
    return dst;
}
// CPU reference for unsigned unpacking: splits packedVal into its four bytes
// (lane 0 from the low byte) and zero-extends each byte into T.
template<typename T>
std::array<T, 4> unpack_u(uint32_t packedVal)
std::array<T, 4> ret;
// The uint8_t cast keeps each lane in [0, 255] before it is widened to T.
ret[0] = (uint8_t)((packedVal & 0x000000FF) >> 0 );
ret[1] = (uint8_t)((packedVal & 0x0000FF00) >> 8 );
ret[2] = (uint8_t)((packedVal & 0x00FF0000) >> 16);
ret[3] = (uint8_t)((packedVal & 0xFF000000) >> 24);
return ret;
// CPU reference for signed unpacking: splits packedVal into its four bytes
// (lane 0 from the low byte), reinterprets each byte as int8_t, and widens
// the result into T.
template<typename T>
std::array<T, 4> unpack_s(uint32_t packedVal)
std::array<T, 4> ret;
// The int8_t cast makes bytes with the top bit set come out negative before
// they are widened to T.
ret[0] = (int8_t)((packedVal & 0x000000FF) >> 0 );
ret[1] = (int8_t)((packedVal & 0x0000FF00) >> 8 );
ret[2] = (int8_t)((packedVal & 0x00FF0000) >> 16);
ret[3] = (int8_t)((packedVal & 0xFF000000) >> 24);
return ret;
// Runs the "PackUnpackOp" shader op from ShaderOpArith.xml. The input buffer
// g_bufIn is filled from the Validation.Input table, expected results are
// computed on the CPU with the pack*/unpack_* helpers, and the
// g_bufOutPacked / g_bufOutPackedUnpacked read-backs are compared against
// them using Validation.Tolerance.
TEST_F(ExecutionTest, PackUnpackTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
// NOTE(review): two alternative args/target/CreateDevice sequences follow back
// to back (cs_6_2 with -DPACKUNPACK_PLACEHOLDER, then cs_6_6). They look like
// the two arms of a conditional-compilation switch whose directives are not
// visible here - confirm against the full file.
string args = "-enable-16bit-types -DPACKUNPACK_PLACEHOLDER";
string target = "cs_6_2";
if (!CreateDevice(&pDevice)) {
string args = "-enable-16bit-types";
string target = "cs_6_6";
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
if (!DoesDeviceSupportNative16bitOps(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
int tableSize = sizeof(PackUnpackOpParameters) / sizeof(TableParameter);
TableParameterHandler handler(PackUnpackOpParameters, tableSize);
CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
std::vector<uint32_t> *validation_input = &handler.GetTableParamByName(L"Validation.Input")->m_uint32Table;
uint32_t validation_tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_uint;
// Inputs are consumed four at a time (x, y, z, w), so count / 4 groups exist.
size_t count = validation_input->size();
std::vector<SPackUnpackOpOutPacked> expectedPacked(count / 4);
std::vector<SPackUnpackOpOutUnpacked> expectedUnpacked(count / 4);
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
pDevice, m_support, pStream, "PackUnpackOp",
// this callback is called when the test
// is creating the resource to run the test
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
if (0 == _stricmp(Name, "g_bufIn"))
size_t size = sizeof(uint32_t) * 4 * count;
// NOTE(review): the operand of this cast is not visible - presumably the
// start of Data's storage; confirm against the full file.
uint32_t *pPrimitives = (uint32_t*);
for (size_t i = 0; i < count / 4; ++i) {
uint32_t *p = &pPrimitives[i * 4];
uint32_t x = (*validation_input)[i * 4 + 0];
uint32_t y = (*validation_input)[i * 4 + 1];
uint32_t z = (*validation_input)[i * 4 + 2];
uint32_t w = (*validation_input)[i * 4 + 3];
p[0] = x;
p[1] = y;
p[2] = z;
p[3] = w;
// The same four inputs are reinterpreted at each width/signedness the
// shader exercises.
std::array<uint32_t, 4> inputUint32 = { x, y, z, w };
std::array<int32_t, 4> inputInt32 = { (int32_t)x, (int32_t)y, (int32_t)z, (int32_t)w };
std::array<uint16_t, 4> inputUint16 = { (uint16_t)x, (uint16_t)y, (uint16_t)z, (uint16_t)w };
std::array<int16_t, 4> inputInt16 = { (int16_t)x, (int16_t)y, (int16_t)z, (int16_t)w };
// Pack unclamped
expectedPacked[i].packedUint32 = pack(inputUint32);
expectedPacked[i].packedInt32 = pack(inputInt32);
expectedPacked[i].packedUint16 = pack(inputUint16);
expectedPacked[i].packedInt16 = pack(inputInt16);
// pack clamped
// Both clamped variants take the signed inputs; only the clamp range differs.
expectedPacked[i].packedClampedUint32 = pack_clamp_u8(inputInt32);
expectedPacked[i].packedClampedInt32 = pack_clamp_s8(inputInt32);
expectedPacked[i].packedClampedUint16 = pack_clamp_u8(inputInt16);
expectedPacked[i].packedClampedInt16 = pack_clamp_s8(inputInt16);
// unpack
expectedUnpacked[i].outputUint32 = unpack_u<uint32_t>(expectedPacked[i].packedUint32);
expectedUnpacked[i].outputInt32 = unpack_s<int32_t >(expectedPacked[i].packedInt32 );
expectedUnpacked[i].outputUint16 = unpack_u<uint16_t>(expectedPacked[i].packedUint16);
expectedUnpacked[i].outputInt16 = unpack_s<int16_t >(expectedPacked[i].packedInt16 );
expectedUnpacked[i].outputClampedUint32 = unpack_u<uint32_t>(expectedPacked[i].packedClampedUint32);
expectedUnpacked[i].outputClampedInt32 = unpack_s<int32_t >(expectedPacked[i].packedClampedInt32 );
expectedUnpacked[i].outputClampedUint16 = unpack_u<uint16_t>(expectedPacked[i].packedClampedUint16);
expectedUnpacked[i].outputClampedInt16 = unpack_s<int16_t >(expectedPacked[i].packedClampedInt16 );
std::fill(Data.begin(), Data.end(), (BYTE)0);
// use shader from data table
// NOTE(review): the members of pShaderOp being assigned on the next three
// lines are not visible - presumably the shader's target, text and
// arguments; confirm against the full file.
pShaderOp-> = target.c_str();
pShaderOp-> = Text.m_psz;
pShaderOp-> = args.c_str();
MappedData packedData;
test->Test->GetReadBackData("g_bufOutPacked", &packedData);
// NOTE(review): cast operand not visible - presumably packedData's mapped
// pointer; confirm.
SPackUnpackOpOutPacked *readBackPacked = (SPackUnpackOpOutPacked *);
MappedData unpackedData;
test->Test->GetReadBackData("g_bufOutPackedUnpacked", &unpackedData);
// NOTE(review): cast operand not visible - presumably unpackedData's mapped
// pointer; confirm.
SPackUnpackOpOutUnpacked *readBackUnpacked = (SPackUnpackOpOutUnpacked *);
for (size_t i = 0; i < count / 4; ++i)
VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedUint32, expectedPacked[i].packedUint32, validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackPacked[i].packedInt32 , expectedPacked[i].packedInt32 , validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedUint16, expectedPacked[i].packedUint16, validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackPacked[i].packedInt16 , expectedPacked[i].packedInt16 , validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedClampedUint32, expectedPacked[i].packedClampedUint32, validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackPacked[i].packedClampedInt32 , expectedPacked[i].packedClampedInt32 , validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedClampedUint16, expectedPacked[i].packedClampedUint16, validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackPacked[i].packedClampedInt16 , expectedPacked[i].packedClampedInt16 , validation_tolerance);
// Each unpacked result has four lanes to compare.
for (uint32_t j = 0; j < 4; ++j)
VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputUint32[j], expectedUnpacked[i].outputUint32[j], validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputInt32 [j], expectedUnpacked[i].outputInt32 [j], validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputUint16[j], expectedUnpacked[i].outputUint16[j], validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputInt16 [j], expectedUnpacked[i].outputInt16 [j], validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputClampedUint32[j], expectedUnpacked[i].outputClampedUint32[j], validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputClampedInt32 [j], expectedUnpacked[i].outputClampedInt32 [j], validation_tolerance);
VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputClampedUint16[j], expectedUnpacked[i].outputClampedUint16[j], validation_tolerance);
VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputClampedInt16 [j], expectedUnpacked[i].outputClampedInt16 [j], validation_tolerance);
// Runs <pShader> as a compute shader that reads one signal value from each of
// the SRVs, UAVs and samplers created here and writes them to a result UAV.
// <sm> is the shader target passed to CreateComputePSO. <isDynamic> selects
// between a root signature with one descriptor range per resource (false) and
// the 6.6 Dynamic Resources path, which needs no descriptor tables (true).
// The result UAV is copied to a read-back buffer and its first eight values
// are compared with the expected signals.
void ExecutionTest::RunResourceTest(ID3D12Device *pDevice, const char *pShader,
const wchar_t *sm, bool isDynamic) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
const int NumSRVs = 3;
const int NumUAVs = 4;
const int NumResources = NumSRVs + NumUAVs;
const int NumSamplers = 2;
// Number of floats stored in each test resource.
const int valueSize = 16;
static const int DispatchGroupX = 1;
static const int DispatchGroupY = 1;
static const int DispatchGroupZ = 1;
CComPtr<ID3D12GraphicsCommandList> pCommandList;
CComPtr<ID3D12CommandQueue> pCommandQueue;
CComPtr<ID3D12CommandAllocator> pCommandAllocator;
FenceObj FO;
UINT valueSizeInBytes = valueSize * sizeof(float);
CreateComputeCommandQueue(pDevice, L"DynamicResourcesTest Command Queue", &pCommandQueue);
InitFenceObj(pDevice, &FO);
// Create root signature.
CComPtr<ID3D12RootSignature> pRootSignature;
if (!isDynamic) {
// Not dynamic, create a range for each resource and from them, the root signature
CD3DX12_DESCRIPTOR_RANGE ranges[NumResources];
CD3DX12_DESCRIPTOR_RANGE srange[NumSamplers];
// SRVs occupy t0..t(NumSRVs-1); UAVs follow in the same array but restart at u0.
for (int i = 0; i < NumSRVs; i++)
ranges[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, i, 0);
for (int i = NumSRVs; i < NumResources; i++)
ranges[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, i - NumSRVs, 0);
for (int i = 0; i < NumSamplers; i++)
srange[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 1, i, 0);
CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, NumResources, srange, NumSamplers);
} else {
// Dynamic just requires the flags indicating that the builtin arrays should be accessible
// NOTE(review): the flags argument that completes this Init call is not
// visible here - confirm against the full file.
CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
rootSignatureDesc.Init(0, nullptr, 0, nullptr,
CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
// Create pipeline state object.
CComPtr<ID3D12PipelineState> pComputeState;
CreateComputePSO(pDevice, pRootSignature, pShader, sm, &pComputeState);
// Create a command allocator and list for compute.
VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
// Set up SRV resources
CComPtr<ID3D12Resource> pSRVResources[NumSRVs];
CComPtr<ID3D12Resource> pUAVResources[NumUAVs];
CComPtr<ID3D12Resource> pUploadResources[NumResources];
D3D12_RESOURCE_DESC bufDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
// Every element of SRV i is filled with the signal value 10 + i.
// NOTE(review): `values` is declared again in the UAV section below; the two
// declarations presumably live in separate scopes whose braces are not
// visible here - confirm.
float values[valueSize];
for (int i = 0; i < NumSRVs - 1; i++) {
for (int j = 0; j < valueSize; j++)
values[j] = 10.0f + i;
CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, bufDesc,
&pSRVResources[i], &pUploadResources[i]);
// The last SRV uses tex2dDesc instead of the buffer description.
// NOTE(review): the declaration of tex2dDesc is not visible in this span.
for (int j = 0; j < valueSize; j++)
values[j] = 10.0 + (NumSRVs - 1);
CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex2dDesc,
&pSRVResources[NumSRVs - 1], &pUploadResources[NumSRVs - 1]);
// Set up UAV resources
CComPtr<ID3D12Resource> pReadBuffer;
float values[valueSize];
// The first NumUAVs - 2 UAVs are filled with the signal value 20 + i.
for (int i = 0; i < NumUAVs - 2; i++) {
for (int j = 0; j < valueSize; j++)
values[j] = 20.0f + i;
CreateTestUavs(pDevice, pCommandList, values, valueSizeInBytes,
&pUAVResources[i], &pUploadResources[NumSRVs + i]);
// UAV NumUAVs - 2 is the one paired with the read-back buffer; it is the
// resource later copied into pReadBuffer.
for (int j = 0; j < valueSize; j++)
values[j] = 20.0 + (NumUAVs - 1);
CreateTestUavs(pDevice, pCommandList, values, valueSizeInBytes,
&pUAVResources[NumUAVs - 2], &pUploadResources[NumResources - 2], &pReadBuffer);
// The last UAV uses tex1dDesc and carries the signal 20 + (NumUAVs - 2).
// NOTE(review): the declaration of tex1dDesc is not visible in this span.
for (int j = 0; j < valueSize; j++)
values[j] = 20.0 + (NumUAVs - 2);
CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex1dDesc,
&pUAVResources[NumUAVs - 1], &pUploadResources[NumResources - 1]);
// Close the command list and execute it to perform the GPU setup.
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));
CComPtr<ID3D12DescriptorHeap> pResHeap;
CComPtr<ID3D12DescriptorHeap> pSampHeap;
CreateDefaultDescHeaps(pDevice, NumSRVs + NumUAVs, NumSamplers, &pResHeap, &pSampHeap);
// Create Rootsignature and descriptor tables
ID3D12DescriptorHeap *descHeaps[2] = {pResHeap, pSampHeap};
pCommandList->SetDescriptorHeaps(2, descHeaps);
if (!isDynamic) {
// Only non-dynamic resources require descriptortables
pCommandList->SetComputeRootDescriptorTable(0, pResHeap->GetGPUDescriptorHandleForHeapStart());
pCommandList->SetComputeRootDescriptorTable(1, pSampHeap->GetGPUDescriptorHandleForHeapStart());
CD3DX12_CPU_DESCRIPTOR_HANDLE baseHandle(pResHeap->GetCPUDescriptorHandleForHeapStart());
// Create SRVs
CreateRawSRV(pDevice, baseHandle, valueSize, pSRVResources[0]);
CreateStructSRV(pDevice, baseHandle, valueSize, sizeof(float), pSRVResources[1]);
CreateTex2DSRV(pDevice, baseHandle, DXGI_FORMAT_R32_FLOAT, pSRVResources[2]);
// Create UAVs
CreateRawUAV(pDevice, baseHandle, valueSize, pUAVResources[0]);
CreateStructUAV(pDevice, baseHandle, valueSize, sizeof(float), pUAVResources[1]);
CreateTypedUAV(pDevice, baseHandle, valueSize, DXGI_FORMAT_R32_FLOAT, pUAVResources[2]);
CreateTex1DUAV(pDevice, baseHandle, DXGI_FORMAT_R32_FLOAT, pUAVResources[3]);
// One border color per sampler; these are the sampler signal values.
// NOTE(review): the declaration of `filters` is not visible in this span.
float perSampleBorderColors[] = {30.0, 31.0};
CreateDefaultSamplers(pDevice, pSampHeap->GetCPUDescriptorHandleForHeapStart(),
filters, perSampleBorderColors, NumSamplers);
// Run the compute shader and copy the results back to readable memory.
pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
RecordTransitionBarrier(pCommandList, pUAVResources[NumUAVs - 2], D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
pCommandList->CopyResource(pReadBuffer, pUAVResources[NumUAVs - 2]);
ExecuteCommandList(pCommandQueue, pCommandList);
WaitForSignal(pCommandQueue, FO);
MappedData data(pReadBuffer, valueSize*sizeof(float));
// NOTE(review): the operand of this cast is not visible - presumably the
// mapped pointer of `data`; confirm against the full file.
const float *pData = (float*);
LogCommentFmt(L"Verify bound resources are properly selected");
// Expected signals: 10..12 from the three SRVs, 20..22 from the three UAVs
// other than the result UAV, 30 from the first sampler's border color.
VERIFY_ARE_EQUAL(pData[0], 10);
VERIFY_ARE_EQUAL(pData[1], 11);
VERIFY_ARE_EQUAL(pData[2], 12);
VERIFY_ARE_EQUAL(pData[3], 20);
VERIFY_ARE_EQUAL(pData[4], 21);
VERIFY_ARE_EQUAL(pData[5], 22);
VERIFY_ARE_EQUAL(pData[6], 30);
VERIFY_ARE_EQUAL(pData[7], 1); // samplecmp 1 means it matched 31
// Runs RunResourceTest on the non-dynamic path: the shader binds its SRVs,
// UAVs and samplers through explicit register() assignments and writes one
// value read from each into g_result[0..7]. Requires shader model 6.6.
TEST_F(ExecutionTest, SignatureResourcesTest) {
// NOTE(review): the literal that closes main() and terminates this string is
// not visible after the last statement - confirm against the full file.
std::string pShader =
"ByteAddressBuffer g_rawBuf : register(t0);\n"
"StructuredBuffer<float> g_structBuf : register(t1);\n"
"Texture2D<float> g_tex : register(t2);\n"
"RWByteAddressBuffer g_rwRawBuf : register(u0);\n"
"RWStructuredBuffer<float> g_rwStructBuf : register(u1);\n"
"RWBuffer<float> g_result : register(u2);\n"
"RWTexture1D<float> g_rwTex : register(u3);\n"
"SamplerState g_samp : register(s0);\n"
"SamplerComparisonState g_sampCmp : register(s1);\n"
"[NumThreads(1, 1, 1)]\n"
"void main(uint ix : SV_GroupIndex) {\n"
" g_result[0] = g_rawBuf.Load<float>(0);\n"
" g_result[1] = g_structBuf.Load(0);\n"
" g_result[2] = g_tex.Load(0);\n"
" g_result[3] = g_rwRawBuf.Load<float>(0);\n"
" g_result[4] = g_rwStructBuf.Load(0);\n"
" g_result[5] = g_rwTex.Load(0);\n"
" g_result[6] = g_tex.SampleLevel(g_samp, -0.5, 0);\n"
" g_result[7] = g_tex.SampleCmpLevelZero(g_sampCmp, -0.5, 31.0);\n"
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement controlled by this check is not visible -
// presumably an early return when no 6.6 device is available; confirm.
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
RunResourceTest(pDevice, pShader.c_str(), L"cs_6_6", /*isDynamic*/false);
// Runs RunResourceTest on the dynamic path: the shader obtains its SRVs, UAVs
// and samplers by indexing ResourceDescriptorHeap / SamplerDescriptorHeap and
// writes one value read from each into g_result[0..7]. Requires shader model
// 6.6 and Resource Binding Tier 3.
TEST_F(ExecutionTest, DynamicResourcesTest) {
// NOTE(review): the literal that closes main() and terminates this string is
// not visible after the last statement - confirm against the full file.
static const char pShader[] =
"static ByteAddressBuffer g_rawBuf = ResourceDescriptorHeap[0];\n"
"static StructuredBuffer<float> g_structBuf = ResourceDescriptorHeap[1];\n"
"static Texture2D<float> g_tex = ResourceDescriptorHeap[2];\n"
"static RWByteAddressBuffer g_rwRawBuf = ResourceDescriptorHeap[3];\n"
"static RWStructuredBuffer<float> g_rwStructBuf = ResourceDescriptorHeap[4];\n"
"static RWBuffer<float> g_result = ResourceDescriptorHeap[5];\n"
"static RWTexture1D<float> g_rwTex = ResourceDescriptorHeap[6];\n"
"static SamplerState g_samp = SamplerDescriptorHeap[0];\n"
"static SamplerComparisonState g_sampCmp = SamplerDescriptorHeap[1];\n"
"[NumThreads(1, 1, 1)]\n"
"void main(uint ix : SV_GroupIndex) {\n"
" g_result[0] = g_rawBuf.Load<float>(0);\n"
" g_result[1] = g_structBuf.Load(0);\n"
" g_result[2] = g_tex.Load(0);\n"
" g_result[3] = g_rwRawBuf.Load<float>(0);\n"
" g_result[4] = g_rwStructBuf.Load(0);\n"
" g_result[5] = g_rwTex.Load(0);\n"
" g_result[6] = g_tex.SampleLevel(g_samp, -0.5, 0);\n"
" g_result[7] = g_tex.SampleCmpLevelZero(g_sampCmp, -0.5, 31.0);\n"
CComPtr<ID3D12Device> pDevice;
// NOTE(review): the statement controlled by this check is not visible -
// presumably an early return when no 6.6 device is available; confirm.
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
// ResourceDescriptorHeap/SamplerDescriptorHeap requires Resource Binding Tier 3
// NOTE(review): the declaration of devOptions is not visible in this span.
VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS, &devOptions, sizeof(devOptions)));
if (devOptions.ResourceBindingTier < D3D12_RESOURCE_BINDING_TIER_3) {
WEX::Logging::Log::Comment(L"Device does not support Resource Binding Tier 3");
RunResourceTest(pDevice, pShader, L"cs_6_6", /*isDynamic*/true);
//void ExecutionTest::TestComputeShaderDynamicResourcesUniformIndexing()
// Presumably turns on D3D12 shader-based validation through the debug
// interfaces - TODO confirm.
// NOTE(review): only the two debug-interface pointer declarations are visible
// here; the calls that would acquire and use them are not shown.
void EnableShaderBasedValidation() {
CComPtr<ID3D12Debug> spDebugController0;
CComPtr<ID3D12Debug1> spDebugController1;
// Verifies that the first expectedResultsSize elements of resultFloats are
// exactly equal (VERIFY_ARE_EQUAL, no tolerance) to the corresponding
// elements of expectedResults.
//   resultFloats        - values produced by the test.
//   expectedResults     - values to compare against.
//   expectedResultsSize - number of elements to compare in both arrays.
void VerifyFloatArraysAreEqual(const float* resultFloats, float *expectedResults, int expectedResultsSize)
for (int j = 0; j < expectedResultsSize; j++)
VERIFY_ARE_EQUAL(resultFloats[j], expectedResults[j]);
// Runs the "DynamicResourcesDynamicIndexing" shader op from ShaderOpArith.xml
// for each shader model under test, once with uniform and once with
// non-uniform resource indexing, and compares the read-back buffers with the
// expected tables declared below. Each configuration is run through the
// compute entry point (CS66 -> g_result) and through the vertex/pixel entry
// points (VS66/PS66 -> g_resultVS / g_resultPS).
TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(
// Load and parse the shader op descriptions.
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp =
// Copy of the root values as parsed, so they can be reassigned to the shader
// op inside the loop after earlier iterations have modified it.
vector<st::ShaderOpRootValue> fallbackRootValues = pShaderOp->RootValues;
// Cleared once a configuration has been run and verified.
bool Skipped = true;
//D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_0}; // FALLBACK
const int expectedResultsSize = 16;
// Expected values with uniform indexing: both entries of each pair are equal.
float expectedResultsUniform[expectedResultsSize] = {
10.0, 10.0,
12.0, 12.0,
14.0, 14.0,
20.0, 20.0,
22.0, 22.0,
24.0, 24.0,
30.0, 30.0,
32.0, 32.0};
// Expected values with non-uniform indexing: the second entry of each pair is
// one higher than the first.
float expectedResultsNonUniform[expectedResultsSize] = {
10.0, 11.0,
12.0, 13.0,
14.0, 15.0,
20.0, 21.0,
22.0, 23.0,
24.0, 25.0,
30.0, 31.0,
32.0, 33.0};
// TestShaderModels will be an array, where the first x models are "non-fallback", and the rest of the models
// are "fallback". If TestShaderModels has length y, and a test loops through all shader models, a convention
// to test based on whether fallback is enabled or not is to limit the loop like this:
// unsigned num_models_to_test = ExecutionTest::IsFallbackPathEnabled() ? y : x;
unsigned num_models_to_test = ExecutionTest::IsFallbackPathEnabled() ? 2 : 1;
for (unsigned i = 0; i < num_models_to_test; i++) {
D3D_SHADER_MODEL sm = TestShaderModels[i];
// The low nibble of the shader model value is logged as the minor version.
LogCommentFmt(L"\r\nVerifying Dynamic Resources Dynamic Indexing in shader "
L"model 6.%1u",
((UINT)sm & 0x0f));
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
&devOptions, sizeof(devOptions)));
// The test needs Resource Binding Tier 3.
if (devOptions.ResourceBindingTier < D3D12_RESOURCE_BINDING_TIER_3) {
L"Device does not support Resource Binding Tier 3");
// non_uniform_bit == 0 tests uniform indexing, 1 tests non-uniform indexing.
for (unsigned int non_uniform_bit = 0; non_uniform_bit < 2; non_uniform_bit++) {
float *expectedResults = non_uniform_bit ? expectedResultsNonUniform : expectedResultsUniform;
LogCommentFmt(L"Testing %s Resource Indexing.", non_uniform_bit ? L"NonUniform" : L"Uniform");
// Add compile options
std::string compilerOptions = "";
if (sm==D3D_SHADER_MODEL_6_0)
compilerOptions += " -D FALLBACK=1";
if (non_uniform_bit)
compilerOptions += " -D NON_UNIFORM=1";
// by default a root value is added.
// remove the root value if this is the non-fallback path
if (sm==D3D_SHADER_MODEL_6_6)
pShaderOp->RootValues = fallbackRootValues;
// Update shader target in xml.
for (st::ShaderOpShader &S : pShaderOp->Shaders){
// Reset the arguments first so options from a previous iteration do not leak
// into this one.
S.Arguments = NULL;
if (!compilerOptions.empty()){
S.Arguments = pShaderOp->GetString(compilerOptions.c_str());
// Set the target correctly. Setting here permanently overwrites
// the Target string even in future iterations.
if (sm==D3D_SHADER_MODEL_6_0){
// Replace the last character of the target profile with the minor version.
std::string Target(S.Target);
Target[Target.length() - 1] = '0';
S.Target = pShaderOp->GetString(Target.c_str());
else if (sm==D3D_SHADER_MODEL_6_6){
std::string Target(S.Target);
Target[Target.length() - 1] = '6';
S.Target = pShaderOp->GetString(Target.c_str());
// Test Compute shader
pShaderOp->CS = pShaderOp->GetString("CS66");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
MappedData resultData;
test->Test->GetReadBackData("g_result", &resultData);
const float *resultCSFloats = (float *);
VerifyFloatArraysAreEqual(resultCSFloats, expectedResults, expectedResultsSize);
// Test Vertex + Pixel shader
// Clear the compute entry point and select the vertex/pixel entry points.
pShaderOp->CS = nullptr;
pShaderOp->VS = pShaderOp->GetString("VS66");
pShaderOp->PS = pShaderOp->GetString("PS66");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
MappedData resultVSData;
MappedData resultPSData;
test->Test->GetReadBackData("g_resultVS", &resultVSData);
test->Test->GetReadBackData("g_resultPS", &resultPSData);
const float *resultVSFloats = (float *);
const float *resultPSFloats = (float *);
// VS
VerifyFloatArraysAreEqual(resultVSFloats, expectedResults, expectedResultsSize);
// PS
VerifyFloatArraysAreEqual(resultPSFloats, expectedResults, expectedResultsSize);
// At least one configuration was run and verified.
Skipped = false;
if (Skipped) {
// Number of WaveSizeTestData entries that WaveSizeTest sizes and checks its
// buffers against; the test shader's thread group is declared as twice this.
#define MAX_WAVESIZE 128
// Two-level stringification: the outer macro expands its argument first, so
// strinfigy(MAX_WAVESIZE) produces "128" rather than "MAX_WAVESIZE".
#define strinfigy2(arg) #arg
#define strinfigy(arg) strinfigy2(arg)
// Runs the "WaveSizeTest" shader op once for every power-of-two wave size
// between the device's reported WaveLaneCountMin and WaveLaneCountMax.
// For each size the shader is compiled with -D WAVESIZE=<size>, every thread
// stores WaveActiveSum(1) into its slot of UAVBuffer0, and the first
// MAX_WAVESIZE slots of the read-back data are expected to equal the wave size.
// Requires a shader model 6.6 device with wave operation support.
void ExecutionTest::WaveSizeTest() {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
// Check Wave support
if (!DoesDeviceSupportWaveOps(pDevice)) {
// Optional feature, so it's correct to not support it if declared as such.
WEX::Logging::Log::Comment(L"Device does not support wave operations.");
// Get supported wave sizes
VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &waveOpts, sizeof(waveOpts)));
UINT minWaveSize = waveOpts.WaveLaneCountMin;
UINT maxWaveSize = waveOpts.WaveLaneCountMax;
// The loop below doubles the wave size each step, so both bounds must be
// powers of two and correctly ordered.
DXASSERT_NOMSG(minWaveSize <= maxWaveSize);
DXASSERT((minWaveSize & (minWaveSize - 1)) == 0, "must be a power of 2");
DXASSERT((maxWaveSize & (maxWaveSize - 1)) == 0, "must be a power of 2");
// read shader config
CComPtr<IStream> pStream;
std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
// format shader source
const char waveSizeTestShader[] =
"struct TestData { \r\n"
" uint count; \r\n"
"}; \r\n"
"RWStructuredBuffer<TestData> data : register(u0); \r\n"
"// Note: WAVESIZE will be defined via compiler option -D\r\n"
"[numthreads(" strinfigy(MAX_WAVESIZE) "*2,1,1)]\r\n"
"void main(uint3 tid : SV_DispatchThreadID ) { \r\n"
" data[tid.x].count = WaveActiveSum(1); \r\n"
// CPU-side mirror of the shader's TestData struct.
struct WaveSizeTestData {
uint32_t count;
for (UINT waveSize = minWaveSize; waveSize <= maxWaveSize; waveSize *= 2) {
// format compiler args
char compilerOptions[32];
// waveSize is unsigned, so format it with %u.
VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D WAVESIZE=%u", waveSize) != -1);
// run the shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "WaveSizeTest",
// Resource-initialization callback: only UAVBuffer0 is expected here.
[&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10)));
pShaderOp-> = compilerOptions;
pShaderOp-> = waveSizeTestShader;
VERIFY_IS_TRUE(sizeof(WaveSizeTestData)*MAX_WAVESIZE <= Data.size());
WaveSizeTestData *pInData = (WaveSizeTestData *);
// Zero the first MAX_WAVESIZE entries of the initial buffer contents.
// memset takes (destination, fill value, byte count); the destination is the
// buffer pInData points to, not the pointer variable itself.
memset(pInData, 0, sizeof(WaveSizeTestData)*MAX_WAVESIZE);
}, ShaderOpSet);
// verify expected values
MappedData dataUav;
WaveSizeTestData *pOutData;
test->Test->GetReadBackData("UAVBuffer0", &dataUav);
VERIFY_ARE_EQUAL(sizeof(WaveSizeTestData)*MAX_WAVESIZE, dataUav.size());
pOutData = (WaveSizeTestData*);
LogCommentFmt(L"Verifying test result for wave size %d", waveSize);
// Every checked slot should hold the number of active lanes, i.e. waveSize.
for (unsigned i = 0; i < MAX_WAVESIZE; i++) {
if (!VERIFY_ARE_EQUAL(pOutData[i].count, waveSize))
// Atomic operation testing
// Atomic tests take a single integer index as input and contort it into some
// kind of interesting contributor to the operation in question.
// So each vertex, pixel, thread, or other will have a unique index that produces
// a contributing value to the calculation which is stored in a small resource
// For arithmetic or bitwise operations, each contributor accumulates to the same
// location in the resource indexed by the operation type. Addition is in index 0
// umin/umax are in 1 and 2 and so on.
// To make sure that the most significant bits are involved in the calculation,
// particularly in the case of 64-bit values, each contributing value is duplicated
// to the lower and upper halves of the value. There is an exception to this when
// addition exceeds the available size and also for compare and exchange explained below.
// For compare and exchange operations, 64 output locations are shared by the various lanes.
// Each lane attempts to write to a location that is shared with several others.
// The first one to write to it determines its contents, which will be the lane index <ix>
// in the upper bits and the output location index in the lower bits.
// This ensures that the compare operations consider the upper bits in the comparison.
// The initial compare store is followed by a compare exchange that compares for the
// value the current lane would have assigned there. Finally, the output of the cmpxchg
// is used to determine if the current lane should perform the final unconditional exchange.
// The values are verified by checking the lower bits for the matching location index
// and ensuring that the upper bits undergoing the same transformation result in the location index.
// For lane index <ix> the location is calculated and final result assigned as if by this code:
// g_outputBuf[(ix/3)%64] = (ix << shBits) | ((ix/3)%64);
// Compares the first <size> bytes at <uResults> with the first <size> bytes of
// <gold> as laid out in memory (so for size 4 the low 32 bits of <gold> are
// compared on a little-endian machine).
// Returns true when they match. On a mismatch the actual and expected values
// are logged as unsigned integers and false is returned.
// <size> is expected to be 4 (32-bit results) or 8 (64-bit results).
bool AtomicResultMatches(const BYTE *uResults, uint64_t gold, size_t size) {
  if (memcmp(uResults, &gold, size) == 0)
    return true;

  // Only read as many bytes as the result actually occupies, and print the
  // values as unsigned so that e.g. an all-ones result is not shown as -1.
  if (size == 4) {
    LogCommentFmt(L" value %u is not %u", ((uint32_t*)uResults)[0], (uint32_t)gold);
  } else {
    LogCommentFmt(L" value %llu is not %llu", ((uint64_t*)uResults)[0], gold);
  }
  return false;
}
// Builds a 64-bit value whose low <bits> bits are the low <bits> bits of <val>
// and whose upper part is <val> shifted left by <bits>. For a <val> that fits
// in <bits> bits this duplicates it into both halves, which lets the tests
// check that the full width of a result is taken into account.
#define SHIFT(val, bits) ((((uint64_t)(val)) << (bits)) | ((val) & ((1ULL << (bits)) - 1ULL)))
// Symbolic constants for the results
// Slots within the unsigned results section; VerifyAtomicResults addresses
// each one as uResults + stride * index.
#define ADD_IDX 0
#define UMIN_IDX 1
#define UMAX_IDX 2
#define AND_IDX 3
#define OR_IDX 4
#define XOR_IDX 5
// Slots within the signed results section (sResults + stride * index). These
// restart at 0 because the signed results have their own base pointer.
#define SMIN_IDX 0
#define SMAX_IDX 1
// Verify results for atomic operations. <uResults> and <sResults> are pointers to
// the readback resource sections containing unsigned and signed integers respectively.
// <pXchg> is a pointer to the readback resource containing the results of the compare
// and exchange operations tests. <stride> is the number of bytes between results for
// all of the results pointers. <maxIdx> is the number of indices that went into the results
// which is used to determine what the results should be. <bitSize> is the size in bits of
// the produced results, either 32 or 64.
void VerifyAtomicResults(const BYTE *uResults, const BYTE *sResults,
const BYTE *pXchg, size_t stride, uint64_t maxIdx, size_t bitSize) {
// Each atomic test performs the test on the value in the lower half
// and also duplicated in the upper half of the value. The SHIFT macros account for this.
// This is to verify that the upper bits are considered
uint64_t shBits = bitSize/2;
// Number of bytes compared for each result.
size_t byteSize = bitSize/8;
// Test ADD Operation
// ADD just sums all the indices. The result should be the sum of the highest and lowest indices
// multiplied by half the number of sums.
// i.e. 0 + 1 + ... + (maxIdx-1).
uint64_t addResult = (maxIdx)*(maxIdx-1)/2;
LogCommentFmt(L"Verifying %d-bit integer atomic add", bitSize);
// For 32-bit values, the sum exceeds the 16 bit limit, so we can't duplicate
// That's fine, the duplication is really for 64-bit values.
if (bitSize < 64)
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*ADD_IDX, addResult, byteSize));
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*ADD_IDX, SHIFT(addResult, shBits), byteSize));
// Test MIN and MAX Operations
// The result of a simple min and max of any sequence of indices would be fairly uninteresting
// and certain erroneous behavior might mistakenly produce the correct results.
// To make it interesting, the contributing values will change depending on the evenness of the index.
// On an even index, min and max operate on the bitflipped index. For signed compares, this is
// interpreted as a negative value and for unsigned, a very high value.
// For unsigned min/max, index 0 will be bitflipped to ~0, which is interpreted as the maximum
// Because zero is manipulated, this leaves 1 as the lowest value.
LogCommentFmt(L"Verifying %d-bit integer atomic umin", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*UMIN_IDX, SHIFT(1ULL, shBits), byteSize)); // UMin
LogCommentFmt(L"Verifying %d-bit integer atomic umax", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*UMAX_IDX, ~0ULL, byteSize)); // UMax
// For signed min/max, the index just before the last will be bitflipped (maxIndex is always even).
// This is interpreted as -(maxIndex-1) and will be the lowest
// The maxIndex will be unaltered and interpreted as the highest.
LogCommentFmt(L"Verifying %d-bit integer atomic smin", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(sResults + stride*SMIN_IDX, SHIFT(-((int64_t)maxIdx-1), shBits), byteSize)); // SMin
LogCommentFmt(L"Verifying %d-bit integer atomic smax", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(sResults + stride*SMAX_IDX, SHIFT(maxIdx-1, shBits), byteSize)); // SMax
// Test AND and OR operations.
// For AND operations, all indices are bitflipped and ANDed to the previous result.
// This means that the highest bits, which are never set by the contributing indices will be set
// for all the indices, so they will be set in the final result.
// For OR operations, the indices are ORed to the previous result unaltered
// This means that any bit that is set in any index will be set in the final OR result.
// In practice, this means that the cumulative result of the AND and OR operations
// are bitflipped versions of each other.
// Finding the most significant set bit by the max index or next power of two (pot)
// gives us the pivot point for these results
// Scan down from the top bit to the most significant set bit of (maxIdx-1),
// then move one bit up: nextPot-1 then has every bit up to and including that
// most significant bit set.
uint64_t nextPot = 1ULL << (bitSize - 1);
for (;nextPot && !((maxIdx-1) & (nextPot)); nextPot >>= 1) {}
nextPot <<= 1;
LogCommentFmt(L"Verifying %d-bit integer atomic and", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*AND_IDX, ~SHIFT(nextPot-1, shBits), byteSize)); // And
LogCommentFmt(L"Verifying %d-bit integer atomic or", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*OR_IDX, SHIFT(nextPot-1, shBits), byteSize)); // Or
// Test XOR operation
// For XOR operations, a 1 is shifted by the number of spaces equal to the index and XORed
// to the previous result. Because this would rapidly shift off the end of the value,
// giving undefined and uninteresting results, the index is moduloed to a value that will
// fit within the type size.
// Because many of the tests use total numbers of lanes that can be evenly divisible by 32 or 64,
// these values aren't used for the modulo since the expected result might be zero,
// which could be encountered through erroneous behavior.
// Instead, one less than the type size in bits is used for the modulo.
// Even though we don't know the actual order these operations are performed,
// indices that make up a contiguous sequence of 31 or 63 values can be thought of as one of a series of "passes".
// Each "pass" sets or clears the bits depending on what's already there.
// if the number of the pass is odd, the bits are being unset and all above the mod position should be set.
// If even, the bits are in the process of being set and bits below the mod position should be set.
// Bits below the position reached in the final, partial pass.
uint64_t xorResult = ((1ULL<<((maxIdx)%(bitSize-1))) -1);
// Odd number of completed passes: invert the mask.
if (((maxIdx/(bitSize-1))&1)) {
xorResult ^= ~0ULL;
// The XOR above may set uninvolved upper bits, messing up the compare. So AND off the uninvolved bits.
xorResult &= ((1ULL<<(bitSize-1)) - 1);
LogCommentFmt(L"Verifying %d-bit integer atomic xor", bitSize);
VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*XOR_IDX, xorResult, byteSize));
// Test CMP/XCHG Operations
// This tests CompareStore, CompareExchange, and Exchange operations.
// Unlike above, every lane isn't contributing to the same resource location
// Instead, every lane competes with a few others to update the same resource location.
// The first lane to find the contents of their location uninitialized will
// update it. To verify that upper bits are considered in the comparison and
// in the assignment, the value stored in the lowest bits is the location index.
// This ensures that part will be the same for each of the competing lanes.
// The uppermost bits are updated with the index of the lane that got there first.
// Subsequent calls to CompareExchange will verify this value matches and alter
// the content slightly. Finally, a simple check of the output value to what
// the current lane would expect and a call to exchange will update the value once more
// To verify this has gone through properly, the upper portion is converted as
// if to calculate the location index and compared with the location index.
// It could be the index of any of several lanes that assign to that location,
// but this ensures that it is not any lane outside of that group.
// The lower bits are compared to the location index as well.
LogCommentFmt(L"Verifying %d-bit integer atomic cmp/xchg results", bitSize);
// 64 output locations are checked, one every <stride> bytes.
for (size_t i = 0; i < 64; i++) {
// NOTE(review): the value is always read as 64 bits, so for 32-bit results
// this also picks up the 4 bytes following the value -- confirm those bytes
// are zero in the readback data.
uint64_t val = *((uint64_t*)(pXchg + i*stride));
// Verify lower bits match location index exactly
VERIFY_ARE_EQUAL(i, val & ((1ULL << shBits) - 1ULL));
// Verify that upper bits contain original index that transforms to location index
VERIFY_ARE_EQUAL(((val >> shBits)/3)%64, i);
// Verifies integer atomic results on the raw (non-typed) buffers of a finished
// shader op run: readback resources "U0"/"U1" are checked as structured
// buffers of AtomicStuff and "U2"/"U3" as byte address buffers.
//   test    - result of the shader op run to read back from.
//   maxIdx  - number of indices that contributed to the results.
//   bitSize - width in bits of the results, 32 or 64.
void VerifyAtomicsRawTest(std::shared_ptr<ShaderOpTestResult> test,
uint64_t maxIdx, size_t bitSize) {
// Distance in bytes between results in the byte address buffers.
size_t stride = 8;
// struct mirroring that in the shader
struct AtomicStuff {
float prepad[2][3];
UINT uintEl[4];
int sintEl[4];
struct useless {
uint32_t unused[3];
} postpad;
float last;
MappedData uintData, xchgData;
test->Test->GetReadBackData("U0", &uintData);
test->Test->GetReadBackData("U1", &xchgData);
const AtomicStuff *pStruct = (AtomicStuff *);
const AtomicStuff *pStrXchg = (AtomicStuff *);
LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWStructuredBuffer resource", bitSize);
// Unsigned results start at uintEl[2] of element 0, signed results at
// sintEl[2] of element 1; consecutive results are one whole struct apart.
VerifyAtomicResults((const BYTE*)&(pStruct[0].uintEl[2]), (const BYTE*)&(pStruct[1].sintEl[2]),
(const BYTE*)&(pStrXchg[0].uintEl[2]), sizeof(AtomicStuff), maxIdx, bitSize);
const BYTE *pUint = nullptr;
const BYTE *pXchg = nullptr;
test->Test->GetReadBackData("U2", &uintData);
test->Test->GetReadBackData("U3", &xchgData);
pUint = (BYTE *);
pXchg = (BYTE *);
LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWByteAddressBuffer resource", bitSize);
// The signed results follow the six unsigned result slots in the same buffer.
VerifyAtomicResults(pUint, pUint + stride*6,
pXchg, stride, maxIdx, bitSize);
// Verifies integer atomic results on typed resources of a finished shader op
// run: first the RWBuffer resources, then the RWTexture resources. 32-bit and
// 64-bit runs read from different readback resources.
//   test    - result of the shader op run to read back from.
//   maxIdx  - number of indices that contributed to the results.
//   bitSize - width in bits of the results, 32 or 64.
void VerifyAtomicsTypedTest(std::shared_ptr<ShaderOpTestResult> test,
uint64_t maxIdx, size_t bitSize) {
// Distance in bytes between consecutive results.
size_t stride = 8;
MappedData uintData, sintData, xchgData;
const BYTE *pUint = nullptr;
const BYTE *pSint = nullptr;
const BYTE *pXchg = nullptr;
// Typed resources can't share between 32 and 64 bits
// RWBuffer resources: U6-U8 for 32-bit, U12-U14 for 64-bit.
if (bitSize == 32) {
test->Test->GetReadBackData("U6", &uintData);
test->Test->GetReadBackData("U7", &sintData);
test->Test->GetReadBackData("U8", &xchgData);
} else {
test->Test->GetReadBackData("U12", &uintData);
test->Test->GetReadBackData("U13", &sintData);
test->Test->GetReadBackData("U14", &xchgData);
pUint = (BYTE *);
pSint = (BYTE *);
pXchg = (BYTE *);
LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWBuffer resource", bitSize);
// The signed results start one stride into the signed resource.
VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize);
// Typed resources can't share between 32 and 64 bits
// RWTexture resources: U9-U11 for 32-bit, U15-U17 for 64-bit.
if (bitSize == 32) {
test->Test->GetReadBackData("U9", &uintData);
test->Test->GetReadBackData("U10", &sintData);
test->Test->GetReadBackData("U11", &xchgData);
} else {
test->Test->GetReadBackData("U15", &uintData);
test->Test->GetReadBackData("U16", &sintData);
test->Test->GetReadBackData("U17", &xchgData);
pUint = (BYTE *);
pSint = (BYTE *);
pXchg = (BYTE *);
LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWTexture resource", bitSize);
VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize);
// Verifies integer atomic results for groupshared variables, read back from
// resources "U4" (arithmetic/bitwise results) and "U5" (cmp/xchg results).
//   test    - result of the shader op run to read back from.
//   maxIdx  - number of indices that contributed to the results.
//   bitSize - width in bits of the results, 32 or 64.
void VerifyAtomicsSharedTest(std::shared_ptr<ShaderOpTestResult> test,
uint64_t maxIdx, size_t bitSize) {
// Distance in bytes between consecutive results.
size_t stride = 8;
MappedData uintData, xchgData;
const BYTE *pUint = nullptr;
const BYTE *pXchg = nullptr;
test->Test->GetReadBackData("U4", &uintData);
test->Test->GetReadBackData("U5", &xchgData);
pUint = (BYTE *);
pXchg = (BYTE *);
LogCommentFmt(L"Verifying %d-bit integer atomic operations on groupshared variables", bitSize);
// The signed results follow the six unsigned result slots in the same buffer.
VerifyAtomicResults(pUint, pUint + stride*6,
pXchg, stride, maxIdx, bitSize);
// Verifies integer atomic results on both raw and typed resources by calling
// VerifyAtomicsRawTest and VerifyAtomicsTypedTest with the same arguments.
// Groupshared results are not covered here; see VerifyAtomicsSharedTest.
void VerifyAtomicsTest(std::shared_ptr<ShaderOpTestResult> test,
uint64_t maxIdx, size_t bitSize) {
VerifyAtomicsRawTest(test, maxIdx, bitSize);
VerifyAtomicsTypedTest(test, maxIdx, bitSize);
// Verifies 32-bit integer atomic operations using the "AtomicsHeap" shader op
// from ShaderOpArith.xml. The op is run with the compute shader, then (when
// the device supports mesh shaders) with amp/mesh/pixel shaders, and finally
// with vertex/pixel shaders. The maxIdx passed to the verifiers is the number
// of contributing indices for the respective run.
TEST_F(ExecutionTest, AtomicsTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice))
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsHeap");
// Test compute shader
LogCommentFmt(L"Verifying 32-bit integer atomic operations in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsTest(test, 32*32, 32);
VerifyAtomicsSharedTest(test, 32*32, 32);
// Test mesh shader if available
// Clear the compute entry point before the remaining runs.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying 32-bit integer atomic operations in amp/mesh/pixel shaders");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
// The groupshared check uses a smaller index count than the resource check
// (the 64*64 term is left out).
VerifyAtomicsTest(test, 8*8*2 + 8*8*2 + 64*64, 32);
VerifyAtomicsSharedTest(test, 8*8*2 + 8*8*2, 32);
// Test Vertex + Pixel shader
// Clear the mesh entry point before the vertex/pixel run.
pShaderOp->MS = nullptr;
LogCommentFmt(L"Verifying 32-bit integer atomic operations in vert/pixel shaders");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsTest(test, 64*64+6, 32);
// Verifies 64-bit integer atomic operations on raw buffers using the
// "AtomicsRoot" shader op from ShaderOpArith.xml. Needs a shader model 6.6
// device with int64 support. The op is run with the compute shader, then
// (when the device supports mesh shaders) with amp/mesh/pixel shaders, and
// finally with vertex/pixel shaders.
TEST_F(ExecutionTest, Atomics64Test) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
if (!DoesDeviceSupportInt64(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsRoot");
// Reassign shader stages to 64-bit versions
// Collect 64-bit shaders
pShaderOp->CS = pShaderOp->GetString("CS");
pShaderOp->VS = pShaderOp->GetString("VS");
pShaderOp->PS = pShaderOp->GetString("PS");
pShaderOp->AS = pShaderOp->GetString("AS");
pShaderOp->MS = pShaderOp->GetString("MS");
// Test compute shader
LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 32*32, 64);
// Test mesh shader if available
// Clear the compute entry point before the remaining runs.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in amp/mesh/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 8*8*2 + 8*8*2 + 64*64, 64);
// Test Vertex + Pixel shader
// Clear the mesh entry point before the vertex/pixel run.
pShaderOp->MS = nullptr;
LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in vert/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 64*64+6, 64);
// Verifies 64-bit integer atomic operations on heap raw buffers using the
// "AtomicsHeap" shader op with its *64 shader entry points. Needs a shader
// model 6.6 device that supports int64 operations and 64-bit atomics on heap
// resources. The op is run with the compute shader, then (when the device
// supports mesh shaders) with amp/mesh/pixel shaders, and finally with
// vertex/pixel shaders.
TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
if (!DoesDeviceSupportInt64(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
if (!DoesDeviceSupportHeap64Atomics(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support 64-bit atomic operations on heap resources.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsHeap");
// Reassign shader stages to 64-bit versions
// Collect 64-bit shaders
pShaderOp->CS = pShaderOp->GetString("CS64");
pShaderOp->VS = pShaderOp->GetString("VS64");
pShaderOp->PS = pShaderOp->GetString("PS64");
pShaderOp->AS = pShaderOp->GetString("AS64");
pShaderOp->MS = pShaderOp->GetString("MS64");
// Test compute shader
LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 32*32, 64);
// Test mesh shader if available
// Clear the compute entry point before the remaining runs.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in amp/mesh/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 8*8*2 + 8*8*2 + 64*64, 64);
// Test Vertex + Pixel shader
// Clear the mesh entry point before the vertex/pixel run.
pShaderOp->MS = nullptr;
LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in vert/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsRawTest(test, 64*64+6, 64);
// Verifies 64-bit integer atomic operations on typed resources using the
// "AtomicsHeap" shader op with its *TY64 shader entry points. Needs a shader
// model 6.6 device that supports int64 operations and int64 atomics on typed
// resources. The op is run with the compute shader, then (when the device
// supports mesh shaders) with amp/mesh/pixel shaders, and finally with
// vertex/pixel shaders.
TEST_F(ExecutionTest, AtomicsTyped64Test) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
if (!DoesDeviceSupportInt64(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
if (!DoesDeviceSupportTyped64Atomics(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 atomic operations on typed resources.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsHeap");
// Reassign shader stages to 64-bit versions
// Collect 64-bit shaders
pShaderOp->CS = pShaderOp->GetString("CSTY64");
pShaderOp->VS = pShaderOp->GetString("VSTY64");
pShaderOp->PS = pShaderOp->GetString("PSTY64");
pShaderOp->AS = pShaderOp->GetString("ASTY64");
pShaderOp->MS = pShaderOp->GetString("MSTY64");
// Test compute shader
LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsTypedTest(test, 32*32, 64);
// Test mesh shader if available
// Clear the compute entry point before the remaining runs.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in amp/mesh/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsTypedTest(test, 8*8*2 + 8*8*2 + 64*64, 64);
// Test Vertex + Pixel shader
// Clear the mesh entry point before the vertex/pixel run.
pShaderOp->MS = nullptr;
LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in vert/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, ShaderOpSet);
VerifyAtomicsTypedTest(test, 64*64+6, 64);
// Verifies 64-bit integer atomic operations on groupshared variables using
// the "AtomicsRoot" shader op with its *SH64 shader entry points. Needs a
// shader model 6.6 device that supports int64 operations and int64 atomics on
// groupshared variables. Only the compute and (when supported) amp/mesh
// entry points are reassigned and run here; there is no vertex/pixel run.
TEST_F(ExecutionTest, AtomicsShared64Test) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
if (!DoesDeviceSupportInt64(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
if (!DoesDeviceSupportShared64Atomics(pDevice)) {
WEX::Logging::Log::Comment(L"Device does not support int64 atomic operations on groupshared variables.");
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsRoot");
// Reassign shader stages to 64-bit versions
// Collect 64-bit shaders
pShaderOp->CS = pShaderOp->GetString("CSSH64");
pShaderOp->AS = pShaderOp->GetString("ASSH64");
pShaderOp->MS = pShaderOp->GetString("MSSH64");
LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared variables in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
VerifyAtomicsSharedTest(test, 32*32, 64);
// Test mesh shader if available
// Clear the compute entry point before the amp/mesh run.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared variables in amp/mesh/pixel shader");
test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
VerifyAtomicsSharedTest(test, 8*8*2 + 8*8*2, 64);
// Float Atomics
// These operations are almost the same as for the 32-bit and 64-bit integer tests
// The difference is that there is no need to verify the upper bits.
// So there is no storing of different parts in upper and lower halves.
// Additionally, the only operations that are supported on floats
// are compare and exchange operations. So that's all that is tested here.
// Just as above, a number of lanes are assigned the same output value.
// Unlike above, one location is needed for the result of the special NaN test
// For this reason, the conversion is reduced by one and shifted by one to leave
// the zero-indexed location available.
// Verify results for a particular set of atomics results
// <results> points to at least 64 floats: entry 0 holds the NaN-test sentinel
// and entries 1..63 hold the cmp/xchg outputs.
void VerifyAtomicFloatResults(const float *results) {
// The first entry is for NaN to ensure that compares between NaNs succeed
// The sentinel value is 0.123, for which this compare is sufficient.
VERIFY_IS_TRUE(results[0] >= 0.120 && results[0] < 0.125);
// Start at 1 because 0 is just for NaN tests
for (int i = 1; i < 64; i++) {
// The stored value, truncated to int, must map back to location i.
VERIFY_ARE_EQUAL((int(results[i])/3)%63 + 1, i);
// Verifies float cmp/xchg atomic results for groupshared variables, read back
// from resource "U4" of the given shader op run.
void VerifyAtomicsFloatSharedTest(std::shared_ptr<ShaderOpTestResult> test) {
MappedData Data;
const float *pData = nullptr;
test->Test->GetReadBackData("U4", &Data);
pData = (float *);
LogCommentFmt(L"Verifying float cmp/xchg atomic operations on groupshared variables");
// Verifies float cmp/xchg atomic results of a finished shader op run on four
// kinds of resource, read back from "U0" (RWStructuredBuffer), "U1"
// (RWByteAddressBuffer), "U2" (RWBuffer) and "U3" (RWTexture).
void VerifyAtomicsFloatTest(std::shared_ptr<ShaderOpTestResult> test) {
// struct mirroring that in the shader
struct AtomicStuff {
float prepad[2][3];
float fltEl[2];
struct useless {
uint32_t unused[3];
} postpad;
// Test Compute Shader
MappedData Data;
const float *pData = nullptr;
test->Test->GetReadBackData("U0", &Data);
const AtomicStuff *pStructData = (AtomicStuff *);
LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWStructuredBuffer resources");
// Same checks as VerifyAtomicFloatResults, but the value lives in fltEl[1] of
// each struct element: element 0 holds the 0.123 sentinel from the NaN test,
// elements 1..63 must map back to their own location index.
VERIFY_IS_TRUE(pStructData[0].fltEl[1] >= 0.120 && pStructData[0].fltEl[1] < 0.125);
for (int i = 1; i < 64; i++) {
VERIFY_ARE_EQUAL((int(pStructData[i].fltEl[1])/3)%63 + 1, i);
test->Test->GetReadBackData("U1", &Data);
pData = (float *);
LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWByteAddressBuffer resources");
test->Test->GetReadBackData("U2", &Data);
pData = (float *);
LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWBuffer resources");
test->Test->GetReadBackData("U3", &Data);
pData = (float *);
LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWTexture resources");
// Verifies float compare/exchange atomic operations using the "FloatAtomics"
// shader op from ShaderOpArith.xml. The op is run with the compute shader,
// then (when the device supports mesh shaders) with amp/mesh/pixel shaders,
// and finally with vertex/pixel shaders.
TEST_F(ExecutionTest, AtomicsFloatTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
CComPtr<ID3D12Device> pDevice;
if (!CreateDevice(&pDevice))
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("FloatAtomics");
// Test compute shader
LogCommentFmt(L"Verifying float cmp/xchg atomic operations in compute shader");
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
// Test mesh shader if available
// Clear the compute entry point before the remaining runs.
pShaderOp->CS = nullptr;
if (DoesDeviceSupportMeshShaders(pDevice)) {
LogCommentFmt(L"Verifying float cmp/xchg atomic operations in amp/mesh/pixel shaders");
test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
// Test Vertex + Pixel shader
// Clear the mesh entry point before the vertex/pixel run.
pShaderOp->MS = nullptr;
LogCommentFmt(L"Verifying float cmp/xchg atomic operations in vert/pixel shaders");
test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
// The IsHelperLane test renders a 3-pixel triangle into a 16x16 render target restricted
// to a 2x2 viewport aligned at (0,0), which guarantees it will run in a single quad.
// Pixels to be rendered*
// (0,0)* (0,1)*
// (1,0) (1,1)*
// Pixel (1,0) is not rendered and is in a helper lane.
// Each thread will use ddx_fine and ddy_fine to read the IsHelperLane() values from other threads.
// The bottom right pixel will write the results into the UAV buffer.
// Then the top left pixel (0,0) is discarded and the process above is repeated.
// Runs with shader models 6.0 and 6.6 to test both the HLSL built-in IsHelperLane fallback
// function (sm <= 6.5) and the IsHelperLane intrinsics (sm >= 6.6).
// For each shader model in TestShaderModels (6.0 and 6.6) this test runs the
// "HelperLaneTestNoWave" shader op and checks two HelperLaneTestResult records
// taken from the "UAVBuffer0" read-back: entry 0 ("before discard") and
// entry 1 ("after discard").
TEST_F(ExecutionTest, HelperLaneTest) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
// Empty argument string; its c_str() is assigned inside the callback below.
string args = "";
D3D_SHADER_MODEL TestShaderModels[] = { D3D_SHADER_MODEL_6_0, D3D_SHADER_MODEL_6_6 };
for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
D3D_SHADER_MODEL sm = TestShaderModels[i];
// The low four bits of the enum value are printed as the minor version.
LogCommentFmt(L"Verifying IsHelperLane in shader model 6.%1u", ((UINT)sm & 0x0f));
CComPtr<ID3D12Device> pDevice;
// NOTE(review): nothing follows this guard in the text as it stands -
// presumably the iteration is skipped when no device is created; confirm.
if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */))
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestNoWave",
// This callback is invoked while the test creates its resources. It expects
// to be called for "UAVBuffer0" only and fills that buffer with 0xCC bytes.
[&](LPCSTR Name, std::vector<BYTE>& Data, st::ShaderOp* pShaderOp) {
VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0"));
std::fill(Data.begin(), Data.end(), (BYTE)0xCC);
// NOTE(review): the member being assigned is missing on the next two lines;
// restore the intended targets before building.
pShaderOp-> = args.c_str();
pShaderOp-> = args.c_str();
}, ShaderOpSet);
// One record of the UAV read-back: four 32-bit IsHelperLane flags.
struct HelperLaneTestResult {
int32_t is_helper_00;
int32_t is_helper_10;
int32_t is_helper_01;
int32_t is_helper_11;
MappedData uavData;
test->Test->GetReadBackData("UAVBuffer0", &uavData);
// NOTE(review): the operand of this cast is missing - presumably the mapped
// uavData contents; confirm.
HelperLaneTestResult* pTestResults = (HelperLaneTestResult*);
MappedData renderData;
test->Test->GetReadBackData("RTarget", &renderData);
// NOTE(review): cast operand missing here as well; pPixels is not used in
// the visible remainder of the test.
const uint32_t* pPixels = (uint32_t*);
// before discard
// NOTE(review): the comment above this test names (1,0) as the helper pixel
// while is_helper_01 is the flag expected to be set - presumably the two use
// different coordinate orderings; confirm.
VERIFY_ARE_EQUAL(pTestResults[0].is_helper_00, 0);
VERIFY_ARE_EQUAL(pTestResults[0].is_helper_10, 0);
VERIFY_ARE_EQUAL(pTestResults[0].is_helper_01, 1);
VERIFY_ARE_EQUAL(pTestResults[0].is_helper_11, 0);
// after discard
// Entry 1 additionally expects is_helper_00 to be set.
VERIFY_ARE_EQUAL(pTestResults[1].is_helper_00, 1);
VERIFY_ARE_EQUAL(pTestResults[1].is_helper_10, 0);
VERIFY_ARE_EQUAL(pTestResults[1].is_helper_01, 1);
VERIFY_ARE_EQUAL(pTestResults[1].is_helper_11, 0);
// Results of the shader model 6.0 wave operations checked by the helper-lane
// wave tests. Member order is significant: the expected-result tables in this
// file are aggregate-initialized positionally. Each member is annotated with
// the label under which VerifyHelperLaneWaveResults reports it.
struct HelperLaneWaveTestResult60 {
  // 6.0 wave ops
  int32_t anyTrue;            // WaveActiveAnyTrue(IsHelperLane())
  int32_t allTrue;            // WaveActiveAllTrue(!IsHelperLane())
  XMUINT4 ballot;             // WaveActiveBallot(true)
  int32_t waterfallLoopCount; // !WaveReadLaneFirst(IsHelperLane()) && WaveIsFirstLane() in a waterfall loop
  int32_t allEqual;           // WaveActiveAllEqual(IsHelperLane())
  int32_t countBits;          // WaveActiveCountBits(true)
  int32_t sum;                // WaveActiveSum(4)
  int32_t product;            // WaveActiveProduct(4)
  int32_t bitAnd;             // WaveActiveBitAnd(!IsHelperLane())
  int32_t bitOr;              // WaveActiveBitOr(IsHelperLane())
  int32_t bitXor;             // WaveActiveBitXor(IsHelperLane())
  int32_t min;                // WaveActiveMin(IsHelperLane() ? 1 : 10)
  int32_t max;                // WaveActiveMax(IsHelperLane() ? 10 : 1)
  int32_t prefixCountBits;    // WavePrefixCountBits(1)
  int32_t prefixProduct;      // WavePrefixProduct(4)
  int32_t prefixSum;          // WavePrefixSum(2)
};
// IsHelperLane() flags gathered across one quad. VerifyHelperLaneWaveResults
// reports the members under "QuadReadAcross*" labels as lane 3 / pixel (1,1),
// lane 2 / pixel (0,1), lane 1 / pixel (1,0) and lane 0 / pixel (0,0), in
// member order. Member order is significant: the expected-result tables are
// aggregate-initialized positionally.
struct HelperLaneQuadTestResult {
  int32_t is_helper_this;        // reported as lane 3 / pixel (1,1)
  int32_t is_helper_across_X;    // reported as lane 2 / pixel (0,1)
  int32_t is_helper_across_Y;    // reported as lane 1 / pixel (1,0)
  int32_t is_helper_across_Diag; // reported as lane 0 / pixel (0,0)
};
// Results of the shader model 6.5 wave operations. They are only compared
// when the shader model under test is at least D3D_SHADER_MODEL_6_5. Member
// order is significant: the expected-result tables are aggregate-initialized
// positionally. Each member is annotated with the label under which
// VerifyHelperLaneWaveResults reports it.
struct HelperLaneWaveTestResult65 {
  // 6.5 wave ops
  XMUINT4 match;       // WaveMatch(true)
  int32_t mpCountBits; // WaveMultiPrefixCountBits(1, no_masked_bits)
  int32_t mpSum;       // WaveMultiPrefixSum(2, no_masked_bits)
  int32_t mpProduct;   // WaveMultiPrefixProduct(4, no_masked_bits)
  int32_t mpBitAnd;    // WaveMultiPrefixAnd(IsHelperLane() ? 0 : 1, no_masked_bits)
  int32_t mpBitOr;     // WaveMultiPrefixOr(IsHelperLane() ? 1 : 0, no_masked_bits)
  int32_t mpBitXor;    // WaveMultiPrefixXor(IsHelperLane() ? 1 : 0, no_masked_bits)
};
// One complete helper-lane wave test record: the 6.0 wave results, the quad
// results and the 6.5 wave results, in that order. HelperLaneTestWave indexes
// an array of these records (one per shader stage checked) and the
// expected-result tables are aggregate-initialized in the same member order.
struct HelperLaneWaveTestResult {
  HelperLaneWaveTestResult60 sm60;
  HelperLaneQuadTestResult sm60_quad;
  HelperLaneWaveTestResult65 sm65;
};
// Two small aggregates and one initialized instance of each.
// NOTE(review): nothing in the visible part of this file refers to foo, bar,
// f or b - presumably leftover scaffolding; confirm before removing.
struct foo {
  int32_t a;
  int32_t b;
  int32_t c;
};

struct bar {
  foo f;
  int32_t d;
  XMUINT4 g;
};

foo f = {1, 2, 3};
bar b = {{1, 2, 3}, 0, {1, 2, 3, 4}};
// Expected HelperLaneWaveTestResult values consumed by HelperLaneTestWave.
// Every table is initialized positionally: first the sm60 wave results, then
// the quad results, then the sm65 wave results. Values must therefore stay in
// the member order of the corresponding structs.

// Compute shader: the ballot/match masks are 0x7 (three bits set).
HelperLaneWaveTestResult HelperLane_CS_ExpectedResults = {
  // HelperLaneWaveTestResult60
  { 0, 1, { 0x7, 0, 0, 0 }, 3, 1, 3, 12, 64, 1, 0, 0, 10, 1, 2, 16, 4 },
  // HelperLaneQuadTestResult
  { 0, 0, 0, 0 },
  // HelperLaneWaveTestResult65
  { {0x7, 0, 0, 0}, 2, 4, 16, 1, 0, 0 }
};

// Vertex shader: a copy of the compute expectations.
// NOTE(review): the visible test code verifies the vertex stage through
// VerifyHelperLaneWaveResultsForVS and does not read this table - confirm it
// is still needed.
HelperLaneWaveTestResult HelperLane_VS_ExpectedResults = HelperLane_CS_ExpectedResults;

// Pixel shader (shader models below 6.7): masks are 0xB and one quad
// neighbour (across X) reports IsHelperLane().
HelperLaneWaveTestResult HelperLane_PS_ExpectedResults = {
  // HelperLaneWaveTestResult60
  { 0, 1, { 0xB, 0, 0, 0 }, 3, 1, 3, 12, 64, 1, 0, 0, 10, 1, 2, 16, 4 },
  // HelperLaneQuadTestResult
  { 0, 1, 0, 0 },
  // HelperLaneWaveTestResult65
  { {0xB, 0, 0, 0}, 2, 4, 16, 1, 0, 0 }
};

// Pixel shader after the discard (shader models below 6.7): masks are 0xA and
// both the across-X and diagonal neighbours report IsHelperLane().
HelperLaneWaveTestResult HelperLane_PSAfterDiscard_ExpectedResults = {
  // HelperLaneWaveTestResult60
  { 0, 1, { 0xA, 0, 0, 0 }, 2, 1, 2, 8, 16, 1, 0, 0, 10, 1, 1, 4, 2 },
  // HelperLaneQuadTestResult
  { 0, 1, 0, 1 },
  // HelperLaneWaveTestResult65
  { {0xA, 0, 0, 0}, 1, 2, 4, 1, 0, 0 }
};

// Pixel shader for shader model 6.7 and above, selected by HelperLaneTestWave
// instead of HelperLane_PS_ExpectedResults: masks are 0xF (all four lanes).
HelperLaneWaveTestResult IncludesHelperLane_PS_ExpectedResults = {
  // HelperLaneWaveTestResult60
  { 1, 0, { 0xF, 0, 0, 0 }, 4, 0, 4, 16, 256, 0, 1, 1, 1, 10, 3, 64, 6 },
  // HelperLaneQuadTestResult
  { 0, 1, 0, 0 },
  // HelperLaneWaveTestResult65
  { {0xF, 0, 0, 0}, 3, 6, 64, 0, 1, 1 }
};

// Pixel shader after the discard for shader model 6.7 and above. It differs
// from the table above in the bitXor / mpBitXor values and in the diagonal
// quad flag.
HelperLaneWaveTestResult IncludesHelperLane_PSAfterDiscard_ExpectedResults = {
  // HelperLaneWaveTestResult60
  { 1, 0, { 0xF, 0, 0, 0 }, 4, 0, 4, 16, 256, 0, 1, 0, 1, 10, 3, 64, 6 },
  // HelperLaneQuadTestResult
  { 0, 1, 0, 1 },
  // HelperLaneWaveTestResult65
  { {0xF, 0, 0, 0}, 3, 6, 64, 0, 1, 0 }
};
// Compares two 32-bit values and logs one line describing the outcome.
//   testDesc      - text identifying the check; echoed in the log line.
//   expectedValue - value the test expects.
//   actualValue   - value produced by the test run.
// The log line is prefixed with " - " on a match and "FAILED: " on a
// mismatch. Returns true when the values are equal. The function only logs;
// callers accumulate the returned flag to decide the final verdict.
bool HelperLaneResultLogAndVerify(const wchar_t* testDesc, uint32_t expectedValue, uint32_t actualValue) {
  const bool matches = (expectedValue == actualValue);
  LogCommentFmt(L"%s%s, expected = %u, actual = %u", matches ? L" - " : L"FAILED: ", testDesc, expectedValue, actualValue);
  return matches;
}
// XMUINT4 overload: compares all four components and logs one line with both
// vectors printed in hexadecimal.
//   testDesc      - text identifying the check; echoed in the log line.
//   expectedValue - vector the test expects.
//   actualValue   - vector produced by the test run.
// The log line is prefixed with " - " on a match and "FAILED: " on a
// mismatch. Returns true only when x, y, z and w are all equal.
bool HelperLaneResultLogAndVerify(const wchar_t* testDesc, XMUINT4 expectedValue, XMUINT4 actualValue) {
  const bool matches = (expectedValue.x == actualValue.x && expectedValue.y == actualValue.y &&
                        expectedValue.z == actualValue.z && expectedValue.w == actualValue.w);
  LogCommentFmt(L"%s%s, expected = (0x%X,0x%X,0x%X,0x%X), actual = (0x%X,0x%X,0x%X,0x%X)", matches ? L" - " : L"FAILED: ", testDesc,
                expectedValue.x, expectedValue.y, expectedValue.z, expectedValue.w, actualValue.x, actualValue.y, actualValue.z, actualValue.w);
  return matches;
}
// Compares every member of testResults against expectedResults, logging one
// line per member through HelperLaneResultLogAndVerify.
//   sm              - shader model under test; the sm65 members are compared
//                     only when it is at least D3D_SHADER_MODEL_6_5.
//   testResults     - values produced by the test run.
//   expectedResults - values the test expects.
//   verifyQuads     - when true the sm60_quad members are compared as well.
// Returns true when every compared member matches. All comparisons are always
// executed (no early exit) so the log lists every mismatch.
bool VerifyHelperLaneWaveResults(ExecutionTest::D3D_SHADER_MODEL sm, HelperLaneWaveTestResult& testResults, HelperLaneWaveTestResult& expectedResults, bool verifyQuads) {
  bool passed = true;

  // Shader model 6.0 wave operations.
  HelperLaneWaveTestResult60& tr60 = testResults.sm60;
  HelperLaneWaveTestResult60& tr60exp = expectedResults.sm60;

  passed &= HelperLaneResultLogAndVerify(L"WaveActiveAnyTrue(IsHelperLane())", tr60exp.anyTrue, tr60.anyTrue);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveAllTrue(!IsHelperLane())", tr60exp.allTrue, tr60.allTrue);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveBallot(true) has exactly 3 bits set", tr60exp.ballot, tr60.ballot);
  passed &= HelperLaneResultLogAndVerify(L"!WaveReadLaneFirst(IsHelperLane()) && WaveIsFirstLane() in a waterfall loop", tr60exp.waterfallLoopCount, tr60.waterfallLoopCount);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveAllEqual(IsHelperLane())", tr60exp.allEqual, tr60.allEqual);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveCountBits(true)", tr60exp.countBits, tr60.countBits);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveSum(4)", tr60exp.sum, tr60.sum);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveProduct(4)", tr60exp.product, tr60.product);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitAnd(!IsHelperLane())", tr60exp.bitAnd, tr60.bitAnd);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitOr(IsHelperLane())", tr60exp.bitOr, tr60.bitOr);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitXor(IsHelperLane())", tr60exp.bitXor, tr60.bitXor);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveMin(IsHelperLane() ? 1 : 10)", tr60exp.min, tr60.min);
  passed &= HelperLaneResultLogAndVerify(L"WaveActiveMax(IsHelperLane() ? 10 : 1)", tr60exp.max, tr60.max);
  passed &= HelperLaneResultLogAndVerify(L"WavePrefixCountBits(1)", tr60exp.prefixCountBits, tr60.prefixCountBits);
  passed &= HelperLaneResultLogAndVerify(L"WavePrefixProduct(4)", tr60exp.prefixProduct, tr60.prefixProduct);
  passed &= HelperLaneResultLogAndVerify(L"WavePrefixSum(2)", tr60exp.prefixSum, tr60.prefixSum);

  // Quad read-across results, compared on request only.
  if (verifyQuads) {
    HelperLaneQuadTestResult& quad_tr = testResults.sm60_quad;
    HelperLaneQuadTestResult& quad_tr_exp = expectedResults.sm60_quad;
    passed &= HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 3 / pixel (1,1) - IsHelperLane()", quad_tr_exp.is_helper_this, quad_tr.is_helper_this);
    passed &= HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 2 / pixel (0,1) - IsHelperLane()", quad_tr_exp.is_helper_across_X, quad_tr.is_helper_across_X);
    passed &= HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 1 / pixel (1,0) - IsHelperLane()", quad_tr_exp.is_helper_across_Y, quad_tr.is_helper_across_Y);
    passed &= HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 0 / pixel (0,0) - IsHelperLane()", quad_tr_exp.is_helper_across_Diag, quad_tr.is_helper_across_Diag);
  }

  // Shader model 6.5 wave operations.
  if (sm >= ExecutionTest::D3D_SHADER_MODEL_6_5) {
    HelperLaneWaveTestResult65& tr65 = testResults.sm65;
    HelperLaneWaveTestResult65& tr65exp = expectedResults.sm65;

    passed &= HelperLaneResultLogAndVerify(L"WaveMatch(true) has exactly 3 bits set", tr65exp.match, tr65.match);
    passed &= HelperLaneResultLogAndVerify(L"WaveMultiPrefixCountBits(1, no_masked_bits)", tr65exp.mpCountBits, tr65.mpCountBits);
    passed &= HelperLaneResultLogAndVerify(L"WaveMultiPrefixSum(2, no_masked_bits)", tr65exp.mpSum, tr65.mpSum);
    passed &= HelperLaneResultLogAndVerify(L"WaveMultiPrefixProduct(4, no_masked_bits)", tr65exp.mpProduct, tr65.mpProduct);
    passed &= HelperLaneResultLogAndVerify(L"WaveMultiPrefixAnd(IsHelperLane() ? 0 : 1, no_masked_bits)", tr65exp.mpBitAnd, tr65.mpBitAnd);
    passed &= HelperLaneResultLogAndVerify(L"WaveMultiPrefixOr(IsHelperLane() ? 1 : 0, no_masked_bits)", tr65exp.mpBitOr, tr65.mpBitOr);
    passed &= HelperLaneResultLogAndVerify(L"verify WaveMultiPrefixXor(IsHelperLane() ? 1 : 0, no_masked_bits)", tr65exp.mpBitXor, tr65.mpBitXor);
  }

  return passed;
}
// Contrary to compute or pixel shaders the layout of lanes in vertex shaders is
// not specified. A conforming implementation could, in the extreme case, decide
// to dispatch three waves that each process only a single vertex.
// So instead of comparing against a fixed expected result, calculate the
// correct result from the ballot.
// Verifies the wave results recorded for the vertex shader stage.
//   sm          - shader model under test; the sm65 members are checked only
//                 when it is at least D3D_SHADER_MODEL_6_5.
//   testResults - values produced by the test run.
// Expected values are derived from the number of bits set in the recorded
// ballot mask rather than taken from a fixed table. Returns the accumulated
// result of the HelperLaneResultLogAndVerify calls.
// NOTE(review): several calls below are missing their final argument in the
// text as it stands (each is marked where it occurs); restore them before
// building.
bool VerifyHelperLaneWaveResultsForVS(ExecutionTest::D3D_SHADER_MODEL sm,
HelperLaneWaveTestResult &testResults) {
bool passed = true;
XMUINT4 mask = testResults.sm60.ballot;
// Population count over all four 32-bit components of the ballot mask.
unsigned countBits = 0;
std::bitset<32> x(mask.x);
std::bitset<32> y(mask.y);
std::bitset<32> z(mask.z);
std::bitset<32> w(mask.w);
countBits += (unsigned)x.count();
countBits += (unsigned)y.count();
countBits += (unsigned)z.count();
countBits += (unsigned)w.count();
// For VS, IsHelperLane always returns false.
HelperLaneWaveTestResult60 &tr60 = testResults.sm60;
passed &= HelperLaneResultLogAndVerify(L"WaveActiveAnyTrue(IsHelperLane())",
0, tr60.anyTrue);
passed &= HelperLaneResultLogAndVerify(
L"WaveActiveAllTrue(!IsHelperLane())", 1, tr60.allTrue);
// Between one and three bits are accepted for the ballot.
// NOTE(review): tr60.ballot is an XMUINT4 but the format string consumes it
// with %u - presumably countBits was meant; confirm. Also, ballotMatch is only
// logged here and is not folded into 'passed' in the visible code.
bool ballotMatch = 1 <= countBits && countBits <= 3;
LogCommentFmt(L"%sWaveActiveBallot(true) expected 1~3 bits set, actual = %u",
ballotMatch ? L" - " : L"FAILED: ", tr60.ballot);
passed &= HelperLaneResultLogAndVerify(
L"!WaveReadLaneFirst(IsHelperLane()) && WaveIsFirstLane() in a "
L"waterfall loop",
countBits, tr60.waterfallLoopCount);
passed &= HelperLaneResultLogAndVerify(
L"WaveActiveAllEqual(IsHelperLane())", 1, tr60.allEqual);
passed &= HelperLaneResultLogAndVerify(L"WaveActiveCountBits(true)",
countBits, tr60.countBits);
// NOTE(review): the actual-value argument is missing from the next two calls -
// presumably tr60.sum and tr60.product; confirm.
passed &= HelperLaneResultLogAndVerify(L"WaveActiveSum(4)", 4 * countBits,
passed &= HelperLaneResultLogAndVerify(L"WaveActiveProduct(4)", (unsigned)std::pow(4, countBits),
passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitAnd(!IsHelperLane())",
1, tr60.bitAnd);
passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitOr(IsHelperLane())",
0, tr60.bitOr);
passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitXor(IsHelperLane())",
0, tr60.bitXor);
passed &= HelperLaneResultLogAndVerify(
L"WaveActiveMin(IsHelperLane() ? 1 : 10)", 10, tr60.min);
passed &= HelperLaneResultLogAndVerify(
L"WaveActiveMax(IsHelperLane() ? 10 : 1)", 1, tr60.max);
// The prefix checks use countBits - 1 where visible.
// NOTE(review): countBits is unsigned, so countBits - 1 wraps if the ballot is
// empty. The next two calls are also missing arguments.
passed &= HelperLaneResultLogAndVerify(L"WavePrefixCountBits(1)",
passed &= HelperLaneResultLogAndVerify(L"WavePrefixProduct(4)",
(unsigned)std::pow(4, countBits - 1),
passed &= HelperLaneResultLogAndVerify(L"WavePrefixSum(2)",
2 * (countBits-1), tr60.prefixSum);
if (sm >= ExecutionTest::D3D_SHADER_MODEL_6_5) {
HelperLaneWaveTestResult65 &tr65 = testResults.sm65;
// WaveMatch is compared against the recorded ballot mask itself.
passed &= HelperLaneResultLogAndVerify(
L"WaveMatch(true) has exactly 3 bits set", mask, tr65.match);
// NOTE(review): the actual-value argument is missing from the next call.
passed &= HelperLaneResultLogAndVerify(
L"WaveMultiPrefixCountBits(1, no_masked_bits)", countBits-1,
passed &= HelperLaneResultLogAndVerify(
L"WaveMultiPrefixSum(2, no_masked_bits)", 2*(countBits-1), tr65.mpSum);
// NOTE(review): the actual-value argument is missing from the next call.
passed &= HelperLaneResultLogAndVerify(
L"WaveMultiPrefixProduct(4, no_masked_bits)",
(unsigned)std::pow(4, countBits - 1),
passed &= HelperLaneResultLogAndVerify(
L"WaveMultiPrefixAnd(IsHelperLane() ? 0 : 1, no_masked_bits)",
1, tr65.mpBitAnd);
passed &= HelperLaneResultLogAndVerify(
L"WaveMultiPrefixOr(IsHelperLane() ? 1 : 0, no_masked_bits)",
0, tr65.mpBitOr);
passed &= HelperLaneResultLogAndVerify(
L"verify WaveMultiPrefixXor(IsHelperLane() ? 1 : 0, no_masked_bits)",
0, tr65.mpBitXor);
return passed;
// Resource-initialization callback passed to RunShaderOpTestAfterParse by the
// helper-lane and quad tests.
//   BufferName - name of the resource being initialized; must be "UAVBuffer0"
//                (compared case-insensitively).
//   Data       - initial contents of the resource; every byte is set to 0xCC.
//   pShaderOp  - not used by this callback.
void CleanUAVBuffer0Buffer(LPCSTR BufferName, std::vector<BYTE>& Data, st::ShaderOp* pShaderOp) {
  // This file promotes warning C4100 (unreferenced formal parameter) to an
  // error, so the unused parameter has to be referenced explicitly.
  (void)pShaderOp;
  VERIFY_IS_TRUE(0 == _stricmp(BufferName, "UAVBuffer0"));
  std::fill(Data.begin(), Data.end(), (BYTE)0xCC);
}
// This IsHelperLane test uses Wave intrinsics to verify IsHelperLane() and Wave operations on active lanes.
// Runs with shader models 6.0, 6.5 and 6.6 to test both the HLSL built-in IsHelperLane fallback
// function (sm <= 6.5) and the IsHelperLane intrinsics (sm >= 6.6) and the shader model 6.5 wave intrinsics (sm >= 6.5).
// For compute and vertex shaders IsHelperLane() always returns false and might be optimized away in the front end.
// However it can be exposed to the driver in CS/VS through an exported function in a library so drivers need
// to be prepared to handle it. For this reason the test is compiled with disabled optimizations (/Od).
// The tests are also validating that wave intrinsics operate correctly with 3 threads in a CS or 3 vertices
// in a VS where the rest of the lanes in the wave are not active (dead lanes).
// For each entry of TestShaderModels this test runs the "HelperLaneTestWave"
// shader op twice - once as a compute pass and once, with the CS entry
// cleared, as a vertex + pixel pass - and compares the records read back from
// "UAVBuffer0" with the expected-result tables. The per-shader-model results
// are accumulated into testPassed, which is verified once at the end.
// NOTE(review): the declaration of TestShaderModels, several scope
// terminators and the operands of the read-back casts are missing from the
// text as it stands.
TEST_F(ExecutionTest, HelperLaneTestWave) {
WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp* pShaderOp = ShaderOpSet->GetShaderOp("HelperLaneTestWave");
// Every shader of the op gets "/Od" as its Arguments (see the comment above
// this test for why optimizations are disabled).
LPCSTR args = "/Od";
if (args[0]) {
for (st::ShaderOpShader& S : pShaderOp->Shaders)
S.Arguments = args;
bool testPassed = true;
for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
D3D_SHADER_MODEL sm = TestShaderModels[i];
// The low four bits of the enum value are printed as the minor version.
LogCommentFmt(L"\r\nVerifying IsHelperLane using Wave intrinsics in shader model 6.%1u", ((UINT)sm & 0x0f));
bool smPassed = true;
CComPtr<ID3D12Device> pDevice;
// Guards: device creation, WARP / basic adapter, and wave-op support.
// NOTE(review): the statements that presumably skip the iteration after each
// guard are not present in the visible text; confirm.
if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
WEX::Logging::Log::Comment(L"WARP has a known issue with HelperLaneTestWave.");
if (!DoesDeviceSupportWaveOps(pDevice)) {
LogCommentFmt(L"Device does not support wave operations in shader model 6.%1u", ((UINT)sm & 0x0f));
// Select the shader sources matching the shader model under test.
if (sm == D3D_SHADER_MODEL_6_5) {
// Reassign shader stages to 6.5 versions
pShaderOp->CS = pShaderOp->GetString("CS65");
pShaderOp->VS = pShaderOp->GetString("VS65");
pShaderOp->PS = pShaderOp->GetString("PS65");
} else if (sm == D3D_SHADER_MODEL_6_6) {
// Reassign shader stages to 6.6 versions
pShaderOp->CS = pShaderOp->GetString("CS66");
pShaderOp->VS = pShaderOp->GetString("VS66");
pShaderOp->PS = pShaderOp->GetString("PS66");
} else if (sm == D3D_SHADER_MODEL_6_7) {
// Reassign shader stages to 6.7 versions (CS and VS reuse the 6.6 sources)
pShaderOp->CS = pShaderOp->GetString("CS66");
pShaderOp->VS = pShaderOp->GetString("VS66");
// Only PS has SM 6.7 version to test new [WaveOpsIncludeHelperLanes] attribute
pShaderOp->PS = pShaderOp->GetString("PS67");
// Indices of the per-stage records within the read-back array. CS and VS
// share index 0 because they are produced by separate runs.
const unsigned CS_INDEX = 0, VS_INDEX = 0, PS_INDEX = 1, PS_INDEX_AFTER_DISCARD = 2;
// Test Compute shader
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave",
CleanUAVBuffer0Buffer, ShaderOpSet);
MappedData uavData;
test->Test->GetReadBackData("UAVBuffer0", &uavData);
// NOTE(review): the operand of this cast is missing - presumably the mapped
// uavData contents; confirm.
HelperLaneWaveTestResult* pTestResults = (HelperLaneWaveTestResult*);
LogCommentFmt(L"\r\nCompute shader");
smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[CS_INDEX], HelperLane_CS_ExpectedResults, true);
// From shader model 6.7 on, the pixel stage is compared against the
// IncludesHelperLane_* tables instead of the HelperLane_* ones.
HelperLaneWaveTestResult &PS_ExpectedResults =
(sm >= D3D_SHADER_MODEL_6_7) ? IncludesHelperLane_PS_ExpectedResults
: HelperLane_PS_ExpectedResults;
HelperLaneWaveTestResult &PSAfterDiscard_ExpectedResults =
(sm >= D3D_SHADER_MODEL_6_7)
? IncludesHelperLane_PSAfterDiscard_ExpectedResults
: HelperLane_PSAfterDiscard_ExpectedResults;
// Test Vertex + Pixel shader
// The CS entry is cleared before the op is run again.
pShaderOp->CS = nullptr;
// NOTE(review): test, uavData and pTestResults are declared a second time
// below; presumably each pass lived in its own scope - confirm.
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave", CleanUAVBuffer0Buffer, ShaderOpSet);
MappedData uavData;
test->Test->GetReadBackData("UAVBuffer0", &uavData);
HelperLaneWaveTestResult* pTestResults = (HelperLaneWaveTestResult*);
LogCommentFmt(L"\r\nVertex shader");
// The vertex stage is checked against values derived from its own ballot.
smPassed &= VerifyHelperLaneWaveResultsForVS(sm, pTestResults[VS_INDEX]);
LogCommentFmt(L"\r\nPixel shader");
smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[PS_INDEX], PS_ExpectedResults, true);
LogCommentFmt(L"\r\nPixel shader with discarded pixel");
smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[PS_INDEX_AFTER_DISCARD], PSAfterDiscard_ExpectedResults, true);
MappedData renderData;
test->Test->GetReadBackData("RTarget", &renderData);
// NOTE(review): cast operand missing; pPixels is not used in the visible
// remainder of the test.
const uint32_t* pPixels = (uint32_t*);
testPassed &= smPassed;
// Single verdict for all shader models.
VERIFY_ARE_EQUAL(testPassed, true);
// Pair of ints matching one element of the QuadAnyAll result buffer.
struct int2 {
  int x;
  int y;
};

// Checks 64 consecutive results starting at Res against the expected pattern:
//   entries  0..3  must be (x, y) == (2, 4)
//   entries  4..59 must be (x, y) == (1, 4)
//   entries 60..63 must be (x, y) == (1, 3)
// Res must point to at least 64 readable elements. Returns false at the first
// element that deviates, true when all 64 match.
bool VerifyQuadAnyAllResults(int2 *Res) {
  // Each row describes a half-open index range ending at End (and starting
  // where the previous row stopped) together with the values expected in it.
  struct ExpectedRange {
    int End;
    int X;
    int Y;
  };
  static const ExpectedRange Ranges[] = {{4, 2, 4}, {60, 1, 4}, {64, 1, 3}};

  int Idx = 0;
  for (const ExpectedRange &Range : Ranges) {
    for (; Idx < Range.End; ++Idx) {
      if (Res[Idx].x != Range.X || Res[Idx].y != Range.Y)
        return false;
    }
  }
  return true;
}
// Runs the "QuadAnyAll" shader op for each entry of TestShaderModels and
// checks the "UAVBuffer0" read-back with VerifyQuadAnyAllResults: first for a
// compute pass and then, with the CS entry cleared, for an AS/MS pass.
// NOTE(review): this test is heavily truncated in the text as it stands (the
// SetVerifyOutput argument, the ShaderOpSet initializer, the remaining
// TestShaderModels entries, cast operands, skip statements and scope
// terminators are missing); restore them before building.
TEST_F(ExecutionTest, QuadAnyAll) {
WEX::TestExecution::SetVerifyOutput verifySettings(
CComPtr<IStream> pStream;
ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("QuadAnyAll");
// Every shader of the op gets "/Od" as its Arguments.
LPCSTR args = "/Od";
if (args[0]) {
for (st::ShaderOpShader &S : pShaderOp->Shaders)
S.Arguments = args;
// Stays true until at least one shader model reaches the point below where
// it is cleared; checked again after the loop.
bool Skipped = true;
D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_0,
for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
D3D_SHADER_MODEL sm = TestShaderModels[i];
LogCommentFmt(L"\r\nVerifying QuadAny/QuadAll using Wave intrinsics in "
L"shader model 6.%1u",
((UINT)sm & 0x0f));
// Select the shader sources matching the shader model under test.
if (sm == D3D_SHADER_MODEL_6_5) {
pShaderOp->MS = pShaderOp->GetString("MS");
pShaderOp->AS = pShaderOp->GetString("AS");
} else if (sm == D3D_SHADER_MODEL_6_7) {
pShaderOp->AS = pShaderOp->GetString("AS67");
pShaderOp->MS = pShaderOp->GetString("MS67");
pShaderOp->CS = pShaderOp->GetString("CS67");
CComPtr<ID3D12Device> pDevice;
// Guards: device creation, basic render driver, and wave-op support.
if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
if (IsDeviceBasicAdapter(pDevice)) {
WEX::Logging::Log::Comment(L"QuadAny/All fails on basic render driver.");
if (!DoesDeviceSupportWaveOps(pDevice)) {
// NOTE(review): the call that logs the following message is missing.
L"Device does not support wave operations in shader model 6.%1u",
((UINT)sm & 0x0f));
Skipped = false;
// test compute
std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
pDevice, m_support, "QuadAnyAll", CleanUAVBuffer0Buffer, ShaderOpSet);
MappedData uavData;
test->Test->GetReadBackData("UAVBuffer0", &uavData);
// NOTE(review): the operand of the cast is missing - presumably the mapped
// uavData contents; confirm.
bool Result = VerifyQuadAnyAllResults((int2 *);
// The AS/MS pass below is only relevant from shader model 6.5 on devices
// with mesh shader support.
if (sm < D3D_SHADER_MODEL_6_5 || !DoesDeviceSupportMeshShaders(pDevice))
pShaderOp->CS = nullptr;
// test AS/MS
test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll",
CleanUAVBuffer0Buffer, ShaderOpSet);
test->Test->GetReadBackData("UAVBuffer0", &uavData);
// Two blocks of 64 results are checked: one at the start of the buffer and
// one starting at element 64.
// NOTE(review): cast operands missing; in the visible code Result is assigned
// but never verified.
Result = VerifyQuadAnyAllResults((int2 *);
Result = VerifyQuadAnyAllResults(&((int2 *)[64]);
if (Skipped)
#ifndef _HLK_CONF
// Builds a textual dump of the read-back resources of a shader op.
//   pShaderOp     - shader op whose Resources and DescriptorHeaps are walked.
//   pTest         - test instance the read-back data is fetched from.
//   pReadBackDump - receives a NUL-terminated buffer allocated through
//                   CComHeapPtr<char> and detached to the caller, who becomes
//                   responsible for releasing it.
// Throws std::bad_alloc when the output buffer cannot be allocated.
// NOTE(review): several statements are missing from the text as it stands
// (marked below); restore them before building.
static void WriteReadBackDump(st::ShaderOp *pShaderOp, st::ShaderOpTest *pTest,
char **pReadBackDump) {
std::stringstream str;
// NOTE(review): count is reported at the end but never incremented in the
// visible code.
unsigned count = 0;
for (auto &R : pShaderOp->Resources) {
// NOTE(review): nothing follows this guard - presumably resources without
// ReadBack are skipped; confirm.
if (!R.ReadBack)
str << "Resource: " << R.Name << "\r\n";
// Find a descriptor that can tell us how to dump this resource.
bool found = false;
for (auto &Heaps : pShaderOp->DescriptorHeaps) {
for (auto &D : Heaps.Descriptors) {
// Descriptors are matched to the resource by case-insensitive name.
if (_stricmp(D.ResName, R.Name) != 0) {
found = true;
// Only UAV descriptors with a buffer view dimension are dumped; anything
// else produces a "not implemented yet" line.
if (_stricmp(D.Kind, "UAV") != 0) {
str << "Resource dump for kind " << D.Kind << " not implemented yet.\r\n";
if (D.UavDesc.ViewDimension != D3D12_UAV_DIMENSION_BUFFER) {
str << "Resource dump for this kind of view dimension not implemented yet.\r\n";
// We could map back to the structure of a structured buffer via the shader,
// but we'll keep this simple and dump out 32-bit uint/float representations.
MappedData data;
pTest->GetReadBackData(R.Name, &data);
// NOTE(review): the operand of this cast is missing - presumably the mapped
// read-back contents; confirm.
uint32_t *pData = (uint32_t *);
// Number of 32-bit words, derived from the resource width.
size_t u32_count = ((size_t)R.Desc.Width) / sizeof(uint32_t);
for (size_t i = 0; i < u32_count; ++i) {
// Each word is printed as decimal, hexadecimal and reinterpreted as float.
// NOTE(review): pData is not advanced in the visible loop body.
float f = *(float *)pData;
str << i << ": 0n" << *pData << " 0x" << std::hex << *pData
<< std::dec << " " << f << "\r\n";
if (found) break;
if (!found) {
str << "Unable to find a view for the resource.\r\n";
str << "Resources read back: " << count << "\r\n";
// Copy the accumulated text into a heap buffer owned by the caller.
std::string s(str.str());
CComHeapPtr<char> pDump;
if (!pDump.Allocate(s.size() + 1))
throw std::bad_alloc();
// NOTE(review): the source argument of memcpy is missing - presumably the
// character data of s; confirm.
memcpy(pDump.m_pData,, s.size());
pDump.m_pData[s.size()] = '\0';
*pReadBackDump = pDump.Detach();
// This is the interface exported for use by HLSLHost.exe.
// It is mutually exclusive with using the DLL as a TAEF target.
extern "C" {
// Exported entry point that calls ExecutionTest::EnableExperimentalShaderModels().
//   pStrCtx      - opaque context handed back to pOutputStrFn.
//   pOutputStrFn - callback that receives diagnostic text.
// A failure to enable experimental shader models is reported through
// pOutputStrFn but is not treated as fatal: the function always returns S_OK.
__declspec(dllexport) HRESULT WINAPI InitializeOpTests(void *pStrCtx, st::OutputStringFn pOutputStrFn) {
  const HRESULT hr = ExecutionTest::EnableExperimentalShaderModels();
  if (FAILED(hr)) {
    pOutputStrFn(pStrCtx, L"Unable to enable experimental shader models.\r\n.");
  }
  return S_OK;
}
// Exported entry point that parses a shader op description from text, runs it
// against caller-provided D3D12 objects and reports progress as text.
//   pStrCtx, pOutputStrFn - output callback and its context; installed through
//                           st::SetOutputFn and also used directly for
//                           status messages.
//   pText                 - NUL-terminated shader op text; wrapped in a memory
//                           stream and parsed with ParseShaderOpSetFromStream.
//   pDevice, pCommandQueue, pRenderTarget
//                         - objects handed to SetupRenderTarget /
//                           PresentRenderTarget.
//   pReadBackDump         - optional; cleared on entry and, when provided,
//                           receives the buffer produced by WriteReadBackDump.
// Returns hr as set by the visible paths: S_OK after the run, E.m_hr for a
// CAtlException, E_FAIL for other std::exception types and for the shader-op
// lookup failures.
// NOTE(review): declarations of hr, filter, denyCategories and stats, the
// statements that run the test, and most scope terminators are missing from
// the text as it stands; restore them before building.
__declspec(dllexport) HRESULT WINAPI
RunOpTest(void *pStrCtx, st::OutputStringFn pOutputStrFn, LPCSTR pText,
ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue,
ID3D12Resource *pRenderTarget, char **pReadBackDump) {
// Make sure the caller never sees a stale pointer, whatever happens below.
if (pReadBackDump) *pReadBackDump = nullptr;
st::SetOutputFn(pStrCtx, pOutputStrFn);
CComPtr<ID3D12InfoQueue> pInfoQueue;
CComHeapPtr<char> pDump;
// Creation-message filtering is compiled in but switched off by this flag.
bool FilterCreation = false;
if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
// Creation is largely driven by inputs, so don't log create/destroy messages.
if (FilterCreation) {
ZeroMemory(&filter, sizeof(filter));
filter.DenyList.NumCategories = _countof(denyCategories);
filter.DenyList.pCategoryList = denyCategories;
else {
pOutputStrFn(pStrCtx, L"Unable to enable info queue for D3D.\r\n.");
try {
dxc::DxcDllSupport m_support;
// pName is never assigned a non-null value in the visible code, so the
// single-operation branch below is the one taken.
const char *pName = nullptr;
CComPtr<IStream> pStream = SHCreateMemStream((BYTE *)pText, (UINT)strlen(pText));
// NOTE(review): the initializer of ShaderOpSet appears to be missing.
std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
st::ShaderOp *pShaderOp;
if (pName == nullptr) {
// Without a name the set must contain exactly one shader op.
if (ShaderOpSet->ShaderOps.size() != 1) {
pOutputStrFn(pStrCtx, L"Expected a single shader operation.\r\n");
return E_FAIL;
pShaderOp = ShaderOpSet->ShaderOps[0].get();
else {
pShaderOp = ShaderOpSet->GetShaderOp(pName);
if (pShaderOp == nullptr) {
// Report the unknown name together with the names that are available.
std::string msg = "Unable to find shader op ";
msg += pName;
msg += "; available ops";
const char sep = ':';
for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
msg += sep;
msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
CA2W msgWide(msg.c_str());
pOutputStrFn(pStrCtx, msgWide);
return E_FAIL;
std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
test->SetupRenderTarget(pShaderOp, pDevice, pCommandQueue, pRenderTarget);
test->PresentRenderTarget(pShaderOp, pCommandQueue, pRenderTarget);
pOutputStrFn(pStrCtx, L"Rendering complete.\r\n");
// Pipeline statistics are only printed for non-compute shader ops.
if (!pShaderOp->IsCompute()) {
wchar_t statsText[400];
StringCchPrintfW(statsText, _countof(statsText),
L"Vertices/primitives read by input assembler: %I64u/%I64u\r\n"
L"Vertex shader invocations: %I64u\r\n"
L"Geometry shader invocations/output primitive: %I64u/%I64u\r\n"
L"Primitives sent to rasterizer/rendered: %I64u/%I64u\r\n"
L"PS/HS/DS/CS invocations: %I64u/%I64u/%I64u/%I64u\r\n",
stats.IAVertices, stats.IAPrimitives, stats.VSInvocations,
stats.GSInvocations, stats.GSPrimitives, stats.CInvocations,
stats.CPrimitives, stats.PSInvocations, stats.HSInvocations,
stats.DSInvocations, stats.CSInvocations);
pOutputStrFn(pStrCtx, statsText);
if (pReadBackDump) {
// The dump is kept in pDump and handed to the caller at the very end.
WriteReadBackDump(pShaderOp, test.get(), &pDump);
hr = S_OK;
// Exceptions are translated into HRESULTs so none crosses the C interface.
catch (const CAtlException &E)
hr = E.m_hr;
// NOTE(review): no handler body is visible for std::bad_alloc.
catch (const std::bad_alloc &)
catch (const std::exception &)
hr = E_FAIL;
// Drain the device message queue if available.
if (pInfoQueue != nullptr) {
wchar_t buf[200];
// NOTE(review): the rest of this call (remaining format text and arguments)
// is missing.
StringCchPrintfW(buf, _countof(buf),
L"NumStoredMessages=%u limit/discarded by limit=%u/%u "
L"allowed/denied by storage filter=%u/%u "
pOutputStrFn(pStrCtx, buf);
WriteInfoQueueMessages(pStrCtx, pOutputStrFn, pInfoQueue);
if (FilterCreation) {
// Ownership of the dump buffer moves to the caller.
if (pReadBackDump) *pReadBackDump = pDump.Detach();
return hr;
// MARKER: ExecutionTest/DxilConf Shared Implementation End
// Do not remove the line above - it is used by