// tools/clang/unittests/HLSL/ExecutionTest.cpp - external/github.com/microsoft/DirectXShaderCompiler - Git at Google

 ///////////////////////////////////////////////////////////////////////////////
 //                                                                           //
 // ExecutionTest.cpp                                                         //
 // Copyright (C) Microsoft Corporation. All rights reserved.                 //
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
 //                                                                           //
 // These tests run by executing compiled programs, and thus involve more     //
 // moving parts, like the runtime and drivers.                               //
 //                                                                           //
 ///////////////////////////////////////////////////////////////////////////////

 // We need to keep & fix these warnings to integrate smoothly with HLK
 #pragma warning(error: 4100 4146 4242 4244 4267 4701 4389)

 #include <algorithm>
 #include <memory>
 #include <array>
 #include <vector>
 #include <string>
 #include <map>
 #include <unordered_set>
 #include <strstream>
 #include <iomanip>
 #include "dxc/Test/CompilationResult.h"
 #include "dxc/Test/HLSLTestData.h"
 #include <Shlwapi.h>
 #include <atlcoll.h>
 #include <locale>
 #include <algorithm>
 #include <bitset>

 #undef _read
 #include "WexTestClass.h"
 #include "dxc/Test/HlslTestUtils.h"
 #include "dxc/Test/DxcTestUtils.h"
 #include "dxc/Support/Global.h"
 #include "dxc/Support/WinIncludes.h"
 #include "dxc/Support/FileIOHelper.h"
 #include "dxc/Support/Unicode.h"

 //
 // d3d12.h and dxgi1_4.h are included in the Windows 10 SDK
 // https://msdn.microsoft.com/en-us/library/windows/desktop/dn899120(v=vs.85).aspx
 // https://developer.microsoft.com/en-US/windows/downloads/windows-10-sdk
 //
 #include <d3d12.h>
 #include <dxgi1_4.h>
 #include <DXGIDebug.h>
 #include "dxc/Support/d3dx12.h"
 #include <DirectXMath.h>
 #include <strsafe.h>
 #include <d3dcompiler.h>
 #include <wincodec.h>
 #include "ShaderOpTest.h"
 #include <libloaderapi.h>

 #pragma comment(lib, "d3dcompiler.lib")
 #pragma comment(lib, "windowscodecs.lib")
 #pragma comment(lib, "dxguid.lib")
 #pragma comment(lib, "version.lib")

 // A more recent Windows SDK than currently required is needed for these.
 // Function-pointer type matching a D3D12 entry point that takes a list of
 // feature IIDs plus optional per-feature configuration structs and sizes.
 // It is declared locally so this file does not depend on a newer SDK header.
 // NOTE(review): presumably resolved from d3d12.dll at runtime - confirm
 // against the code that uses this typedef.
 typedef HRESULT(WINAPI *D3D12EnableExperimentalFeaturesFn)(
   UINT                                    NumFeatures,
   __in_ecount(NumFeatures) const IID*     pIIDs,
   __in_ecount_opt(NumFeatures) void*      pConfigurationStructs,
   __in_ecount_opt(NumFeatures) UINT*      pConfigurationStructSizes);

 // GUID identifying the "experimental shader models" feature; the canonical
 // textual form of the value is given in the inline comment below.
 static const GUID D3D12ExperimentalShaderModelsID = { /* 76f5573e-f13a-40f5-b297-81ce9e18933f */
   0x76f5573e,
   0xf13a,
   0x40f5,
   { 0xb2, 0x97, 0x81, 0xce, 0x9e, 0x18, 0x93, 0x3f }
 };

 // Used to create D3D12SDKConfiguration to enable AgilitySDK programmatically.
 // Function-pointer type taking a class ID, an interface ID and an out pointer
 // that receives the requested interface.
 typedef HRESULT(WINAPI *D3D12GetInterfaceFn)(REFCLSID rclsid, REFIID riid, void   **ppvDebug);

 // The declarations below are only compiled when the SDK headers in use have
 // not already defined the interface (the guard macro doubles as the marker).
 #ifndef __ID3D12SDKConfiguration_INTERFACE_DEFINED__
 // Copied from AgilitySDK D3D12.h to programmatically enable when in developer mode.
 #define __ID3D12SDKConfiguration_INTERFACE_DEFINED__

 // Interface ID and class ID used to obtain the SDK configuration object.
 EXTERN_C const GUID DECLSPEC_SELECTANY IID_ID3D12SDKConfiguration = {0xe9eb5314,0x33aa,0x42b2, {0xa7,0x18,0xd7,0x7f,0x58,0xb1,0xf1,0xc7}};
 EXTERN_C const GUID DECLSPEC_SELECTANY CLSID_D3D12SDKConfiguration = {0x7cda6aca, 0xa03e, 0x49c8, {0x94, 0x58, 0x03, 0x34, 0xd2, 0x0e, 0x07, 0xce}};

 // COM interface with a single method that takes an SDK version number and a
 // null-terminated SDK path.
 MIDL_INTERFACE("e9eb5314-33aa-42b2-a718-d77f58b1f1c7")
 ID3D12SDKConfiguration : public IUnknown
 {
 public:
     virtual HRESULT STDMETHODCALLTYPE SetSDKVersion(
         UINT SDKVersion,
         _In_z_  LPCSTR SDKPath) = 0;
 };
 #endif 	/* __ID3D12SDKConfiguration_INTERFACE_DEFINED__ */

 using namespace DirectX;
 using namespace hlsl_test;


 // Returns true when `val` compares equal to at least one element of `s`.
 //
 // The sequence is taken by const reference: this avoids copying the whole
 // container on every call and also lets built-in arrays be searched (an array
 // passed by value would decay to a pointer, which std::cbegin cannot handle).
 template <typename TSequence, typename T>
 static bool contains(const TSequence &s, const T &val) {
   return std::cend(s) != std::find(std::cbegin(s), std::cend(s), val);
 }

 // Iterator-range overload: reports whether `val` occurs anywhere in [b, e).
 template <typename InputIterator, typename T>
 static bool contains(InputIterator b, InputIterator e, const T &val) {
   const InputIterator hit = std::find(b, e, val);
   return hit != e;
 }

 // Obtains the DXGI debug interface and asks it to report live objects for
 // DXGI_DEBUG_ALL with the DXGI_DEBUG_RLO_ALL detail flags. Returns S_OK when
 // both calls go through; each call is wrapped in IFR for failure handling.
 static HRESULT ReportLiveObjects() {
   CComPtr<IDXGIDebug1> pDxgiDebug;
   IFR(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&pDxgiDebug)));
   IFR(pDxgiDebug->ReportLiveObjects(DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_ALL));
   return S_OK;
 }

 static void WriteInfoQueueMessages(void *pStrCtx, st::OutputStringFn pOutputStrFn, ID3D12InfoQueue *pInfoQueue) {
   bool allMessagesOK = true;
   UINT64 count = pInfoQueue->GetNumStoredMessages();
   CAtlArray<BYTE> message;
   for (UINT64 i = 0; i < count; ++i) {
     // 'GetMessageA' rather than 'GetMessage' is an artifact of user32 headers.
     SIZE_T msgLen = 0;
     if (FAILED(pInfoQueue->GetMessageA(i, nullptr, &msgLen))) {
       allMessagesOK = false;
       continue;
     }
     if (message.GetCount() < msgLen) {
       if (!message.SetCount(msgLen)) {
         allMessagesOK = false;
         continue;
       }
     }
     D3D12_MESSAGE *pMessage = (D3D12_MESSAGE *)message.GetData();
     if (FAILED(pInfoQueue->GetMessageA(i, pMessage, &msgLen))) {
       allMessagesOK = false;
       continue;
     }
     CA2W msgW(pMessage->pDescription, CP_ACP);
     pOutputStrFn(pStrCtx, msgW.m_psz);
     pOutputStrFn(pStrCtx, L"\r\n");
   }
   if (!allMessagesOK) {
     pOutputStrFn(pStrCtx, L"Failed to retrieve some messages.\r\n");
   }
 }

 // Scoped COM initialization helper.
 //
 // Init() calls CoInitializeEx (multithreaded) and remembers whether it
 // succeeded; Dispose() - also run by the destructor - calls CoUninitialize
 // exactly when a successful Init() is still outstanding.
 class CComContext {
 private:
   bool m_init; // true while a successful CoInitializeEx awaits CoUninitialize
 public:
   CComContext() : m_init(false) {}
   ~CComContext() { Dispose(); }

   // Non-copyable: a copy would inherit m_init == true and issue a second,
   // unbalanced CoUninitialize when it is destroyed.
   CComContext(const CComContext &) = delete;
   CComContext &operator=(const CComContext &) = delete;

   // Undoes a successful Init(); does nothing otherwise.
   void Dispose() {
     if (!m_init)
       return;
     m_init = false;
     CoUninitialize();
   }

   // Initializes COM and returns the HRESULT from CoInitializeEx. Any success
   // code (including S_FALSE) is recorded so that Dispose() balances it.
   HRESULT Init() {
     HRESULT hr = CoInitializeEx(0, COINIT_MULTITHREADED);
     if (SUCCEEDED(hr)) {
       m_init = true;
     }
     return hr;
   }
 };

 // Encodes a block of pixels as a BMP image (GUID_ContainerFormatBmp) through
 // WIC into an in-memory stream and writes the stream contents to pFileName.
 //
 //   pPixels    - source pixel data; passed to WIC as m_width * PixelSize
 //                bytes per row and m_width * m_height * PixelSize bytes total.
 //   format     - DXGI format of the pixels; must have an entry in the local
 //                mapping table (currently only DXGI_FORMAT_R8G8B8A8_UNORM).
 //   m_width / m_height - image dimensions in pixels.
 //   pFileName  - destination file path.
 //
 // Each step is checked with a VERIFY_* macro.
 static void SavePixelsToFile(LPCVOID pPixels, DXGI_FORMAT format, UINT32 m_width, UINT32 m_height, LPCWSTR pFileName) {
   // ctx is declared first so it is destroyed last: the COM pointers below are
   // released before ~CComContext calls CoUninitialize.
   CComContext ctx;
   CComPtr<IWICImagingFactory> pFactory;
   CComPtr<IWICBitmap> pBitmap;
   CComPtr<IWICBitmapEncoder> pEncoder;
   CComPtr<IWICBitmapFrameEncode> pFrameEncode;
   CComPtr<hlsl::AbstractMemoryStream> pStream;
   CComPtr<IMalloc> pMalloc;

   // Mapping from a DXGI format to the matching WIC pixel format and the size
   // of one pixel in bytes. operator== lets std::find search by DXGI_FORMAT.
   struct PF {
     DXGI_FORMAT Format;
     GUID PixelFormat;
     UINT32 PixelSize;
     bool operator==(DXGI_FORMAT F) const {
       return F == Format;
     }
   } Vals[] = {
     // Add more pixel format mappings as needed.
     { DXGI_FORMAT_R8G8B8A8_UNORM, GUID_WICPixelFormat32bppRGBA, 4 }
   };
   // pFormat equals the one-past-the-end pointer when the format is unmapped;
   // that case is rejected by VERIFY_ARE_NOT_EQUAL below before any use.
   PF *pFormat = std::find(Vals, Vals + _countof(Vals), format);

   VERIFY_SUCCEEDED(ctx.Init());
   VERIFY_SUCCEEDED(CoCreateInstance(CLSID_WICImagingFactory, NULL, CLSCTX_INPROC_SERVER, IID_IWICImagingFactory, (LPVOID*)&pFactory));
   VERIFY_SUCCEEDED(CoGetMalloc(1, &pMalloc));
   VERIFY_SUCCEEDED(hlsl::CreateMemoryStream(pMalloc, &pStream));
   VERIFY_ARE_NOT_EQUAL(pFormat, Vals + _countof(Vals));
   VERIFY_SUCCEEDED(pFactory->CreateBitmapFromMemory(m_width, m_height, pFormat->PixelFormat, m_width * pFormat->PixelSize, m_width * m_height * pFormat->PixelSize, (BYTE *)pPixels, &pBitmap));
   VERIFY_SUCCEEDED(pFactory->CreateEncoder(GUID_ContainerFormatBmp, nullptr, &pEncoder));
   VERIFY_SUCCEEDED(pEncoder->Initialize(pStream, WICBitmapEncoderNoCache));
   VERIFY_SUCCEEDED(pEncoder->CreateNewFrame(&pFrameEncode, nullptr));
   VERIFY_SUCCEEDED(pFrameEncode->Initialize(nullptr));
   VERIFY_SUCCEEDED(pFrameEncode->WriteSource(pBitmap, nullptr));
   // The frame is committed before the encoder.
   VERIFY_SUCCEEDED(pFrameEncode->Commit());
   VERIFY_SUCCEEDED(pEncoder->Commit());
   // Write the encoded bytes held by the memory stream to the target file.
   hlsl::WriteBinaryFile(pFileName, pStream->GetPtr(), pStream->GetPtrSize());
 }

 // Checks if the given warp version supports the given operation.
 bool IsValidWarpDllVersion(unsigned int minBuildNumber) {
     HMODULE pLibrary = LoadLibrary("D3D10Warp.dll");
     if (pLibrary) {
         char path[MAX_PATH];
         DWORD length = GetModuleFileName(pLibrary, path, MAX_PATH);
         if (length) {
             DWORD dwVerHnd = 0;
             DWORD dwVersionInfoSize = GetFileVersionInfoSize(path, &dwVerHnd);
             std::unique_ptr<int[]> VffInfo(new int[dwVersionInfoSize]);
             if (GetFileVersionInfo(path, NULL, dwVersionInfoSize, VffInfo.get())) {
                 LPVOID versionInfo;
                 UINT size;
                 if (VerQueryValue(VffInfo.get(), "\\", &versionInfo, &size)) {
                     if (size) {
                         VS_FIXEDFILEINFO *verInfo = (VS_FIXEDFILEINFO *)versionInfo;
                         unsigned int warpBuildNumber = verInfo->dwFileVersionLS >> 16 & 0xffff;
                         if (verInfo->dwSignature == 0xFEEF04BD && warpBuildNumber >= minBuildNumber) {
                             return true;
                         }
                     }
                 }
             }
         }
         FreeLibrary(pLibrary);
     }
     return false;
 }

 // Compatibility declarations used when WDK_NTDDI_VERSION is at most
 // NTDDI_WIN10_RS2: the OPTIONS3 feature id, the NTDDI_WIN10_RS3 constant
 // (tested by the next #if in this file), and the enums/struct that describe
 // the OPTIONS3 feature data.
 #if WDK_NTDDI_VERSION <= NTDDI_WIN10_RS2
 #define D3D12_FEATURE_D3D12_OPTIONS3 ((D3D12_FEATURE)21)
 #define NTDDI_WIN10_RS3                     0x0A000004  /* ABRACADABRA_WIN10_RS2 */
 // One bit per command list type; the first four are derived from the
 // D3D12_COMMAND_LIST_TYPE values, the video ones use fixed bit positions.
 typedef
 enum D3D12_COMMAND_LIST_SUPPORT_FLAGS
 {
   D3D12_COMMAND_LIST_SUPPORT_FLAG_NONE = 0,
   D3D12_COMMAND_LIST_SUPPORT_FLAG_DIRECT = (1 << D3D12_COMMAND_LIST_TYPE_DIRECT),
   D3D12_COMMAND_LIST_SUPPORT_FLAG_BUNDLE = (1 << D3D12_COMMAND_LIST_TYPE_BUNDLE),
   D3D12_COMMAND_LIST_SUPPORT_FLAG_COMPUTE = (1 << D3D12_COMMAND_LIST_TYPE_COMPUTE),
   D3D12_COMMAND_LIST_SUPPORT_FLAG_COPY = (1 << D3D12_COMMAND_LIST_TYPE_COPY),
   D3D12_COMMAND_LIST_SUPPORT_FLAG_VIDEO_DECODE = (1 << 4),
   D3D12_COMMAND_LIST_SUPPORT_FLAG_VIDEO_PROCESS = (1 << 5)
 } D3D12_COMMAND_LIST_SUPPORT_FLAGS;

 // View instancing support level, from "not supported" (0) up to tier 3.
 typedef
 enum D3D12_VIEW_INSTANCING_TIER
 {
   D3D12_VIEW_INSTANCING_TIER_NOT_SUPPORTED = 0,
   D3D12_VIEW_INSTANCING_TIER_1 = 1,
   D3D12_VIEW_INSTANCING_TIER_2 = 2,
   D3D12_VIEW_INSTANCING_TIER_3 = 3
 } D3D12_VIEW_INSTANCING_TIER;

 // Output structure associated with D3D12_FEATURE_D3D12_OPTIONS3; every field
 // is annotated _Out_.
 typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS3
 {
   _Out_  BOOL CopyQueueTimestampQueriesSupported;
   _Out_  BOOL CastingFullyTypedFormatSupported;
   _Out_  DWORD WriteBufferImmediateSupportFlags;
   _Out_  D3D12_VIEW_INSTANCING_TIER ViewInstancingTier;
   _Out_  BOOL BarycentricsSupported;
 } D3D12_FEATURE_DATA_D3D12_OPTIONS3;
 #endif

 // Compatibility declarations used when WDK_NTDDI_VERSION is at most
 // NTDDI_WIN10_RS3: the OPTIONS4 feature id and the types describing its
 // feature data.
 #if WDK_NTDDI_VERSION <= NTDDI_WIN10_RS3
 #define D3D12_FEATURE_D3D12_OPTIONS4 ((D3D12_FEATURE)23)
 // Shared resource compatibility level (tier 0 or tier 1).
 typedef enum D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER
 {
     D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER_0,
     D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER_1,
 } D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER;

 // Output structure associated with D3D12_FEATURE_D3D12_OPTIONS4; it includes
 // the Native16BitShaderOpsSupported flag.
 typedef struct D3D12_FEATURE_DATA_D3D12_OPTIONS4
 {
     _Out_ BOOL ReservedBufferPlacementSupported;
     _Out_ D3D12_SHARED_RESOURCE_COMPATIBILITY_TIER SharedResourceCompatibilityTier;
     _Out_ BOOL Native16BitShaderOpsSupported;
 } D3D12_FEATURE_DATA_D3D12_OPTIONS4;

 #endif

 // Virtual class to compute the expected result given a set of inputs
 struct TableParameter;

 class ExecutionTest {
 public:
   // By default, ignore these tests, which require a recent build to run properly.
   // Class-level TAEF metadata: "Parallel" = true, "Ignore" = true, and a
   // "Priority" of 0.
   BEGIN_TEST_CLASS(ExecutionTest)
     TEST_CLASS_PROPERTY(L"Parallel", L"true")
     TEST_CLASS_PROPERTY(L"Ignore", L"true")
     TEST_METHOD_PROPERTY(L"Priority", L"0")
   END_TEST_CLASS()
   // Declares the class-level setup routine for this test class.
   TEST_CLASS_SETUP(ExecutionTestClassSetup)

   // Test methods declared without an attached data table.
   // Basic compute / triangle rendering tests.
   TEST_METHOD(BasicComputeTest);
   TEST_METHOD(BasicTriangleTest);
   TEST_METHOD(BasicTriangleOpTest);

   TEST_METHOD(BasicTriangleOpTestHalf);

   // Individual feature tests (wave ops, derivatives, atomics, resources, ...).
   TEST_METHOD(OutOfBoundsTest);
   TEST_METHOD(SaturateTest);
   TEST_METHOD(SignTest);
   TEST_METHOD(Int64Test);
   TEST_METHOD(LifetimeIntrinsicTest)
   TEST_METHOD(WaveIntrinsicsTest);
   TEST_METHOD(WaveIntrinsicsDDITest);
   TEST_METHOD(WaveIntrinsicsInPSTest);
   TEST_METHOD(WaveSizeTest);
   TEST_METHOD(PartialDerivTest);
   TEST_METHOD(DerivativesTest);
   TEST_METHOD(ComputeSampleTest);
   TEST_METHOD(ATOProgOffset);
   TEST_METHOD(ATOSampleCmpLevelTest);
   TEST_METHOD(ATOWriteMSAATest);
   TEST_METHOD(ATORawGather);
   TEST_METHOD(AtomicsTest);
   TEST_METHOD(Atomics64Test);
   TEST_METHOD(AtomicsRawHeap64Test);
   TEST_METHOD(AtomicsTyped64Test);
   TEST_METHOD(AtomicsShared64Test);
   TEST_METHOD(AtomicsFloatTest);
   TEST_METHOD(HelperLaneTest);
   TEST_METHOD(HelperLaneTestWave);
   TEST_METHOD(SignatureResourcesTest)
   TEST_METHOD(DynamicResourcesTest)
   TEST_METHOD(DynamicResourcesDynamicIndexingTest)

   TEST_METHOD(QuadReadTest)
   TEST_METHOD(QuadAnyAll);

   TEST_METHOD(CBufferTestHalf);

   // Shader-model-specific smoke tests.
   TEST_METHOD(BasicShaderModel61);
   TEST_METHOD(BasicShaderModel63);

   // Wave intrinsic tests; each one names a table in ShaderOpArithTable.xml as
   // its "DataSource" property.
   BEGIN_TEST_METHOD(WaveIntrinsicsActiveIntTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveIntTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(WaveIntrinsicsActiveUintTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsActiveUintTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(WaveIntrinsicsPrefixIntTest)
   TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixIntTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(WaveIntrinsicsPrefixUintTest)
   TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsPrefixUintTable")
   END_TEST_METHOD()

   // The two SM65 tests read the "MultiPrefix" tables.
   BEGIN_TEST_METHOD(WaveIntrinsicsSM65IntTest)
   TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsMultiPrefixIntTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(WaveIntrinsicsSM65UintTest)
   TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#WaveIntrinsicsMultiPrefixUintTable")
   END_TEST_METHOD()

   // TAEF data-driven tests.
   // Each method below names one table of ShaderOpArithTable.xml as its
   // "DataSource" property. They are grouped by operand type, with unary,
   // binary and tertiary variants per type.

   // float operands
   BEGIN_TEST_METHOD(UnaryFloatOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryFloatOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryFloatOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryFloatOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryFloatOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryFloatOpTable")
   END_TEST_METHOD()

   // half operands
   BEGIN_TEST_METHOD(UnaryHalfOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryHalfOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryHalfOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryHalfOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryHalfOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryHalfOpTable")
   END_TEST_METHOD()

   // int operands
   BEGIN_TEST_METHOD(UnaryIntOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryIntOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryIntOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryIntOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryIntOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryIntOpTable")
   END_TEST_METHOD()

   // uint operands
   BEGIN_TEST_METHOD(UnaryUintOpTest)
      TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUintOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryUintOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUintOpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryUintOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUintOpTable")
   END_TEST_METHOD()

   // 16-bit int operands
   BEGIN_TEST_METHOD(UnaryInt16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryInt16OpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryInt16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryInt16OpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryInt16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryInt16OpTable")
   END_TEST_METHOD()

   // 16-bit uint operands
   BEGIN_TEST_METHOD(UnaryUint16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#UnaryUint16OpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(BinaryUint16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#BinaryUint16OpTable")
   END_TEST_METHOD()
   BEGIN_TEST_METHOD(TertiaryUint16OpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#TertiaryUint16OpTable")
   END_TEST_METHOD()

   // Dot-product style operations
   BEGIN_TEST_METHOD(DotTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DotOpTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(Dot2AddHalfTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot2AddHalfOpTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(Dot4AddI8PackedTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot4AddI8PackedOpTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(Dot4AddU8PackedTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Dot4AddU8PackedOpTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(Msad4Test)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#Msad4Table")
   END_TEST_METHOD()

   // Denormal handling for float operations
   BEGIN_TEST_METHOD(DenormBinaryFloatOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DenormBinaryFloatOpTable")
   END_TEST_METHOD()

   BEGIN_TEST_METHOD(DenormTertiaryFloatOpTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#DenormTertiaryFloatOpTable")
   END_TEST_METHOD()

   TEST_METHOD(BarycentricsTest);

   // Raw buffer load/store tests, declared once for compute and once for
   // graphics, each in 32-bit, 64-bit and 16-bit element flavours.
   TEST_METHOD(ComputeRawBufferLdStI32);
   TEST_METHOD(ComputeRawBufferLdStFloat);

   TEST_METHOD(ComputeRawBufferLdStI64);
   TEST_METHOD(ComputeRawBufferLdStDouble);

   TEST_METHOD(ComputeRawBufferLdStI16);
   TEST_METHOD(ComputeRawBufferLdStHalf);

   TEST_METHOD(GraphicsRawBufferLdStI32);
   TEST_METHOD(GraphicsRawBufferLdStFloat);

   TEST_METHOD(GraphicsRawBufferLdStI64);
   TEST_METHOD(GraphicsRawBufferLdStDouble);

   TEST_METHOD(GraphicsRawBufferLdStI16);
   TEST_METHOD(GraphicsRawBufferLdStHalf);

   // Data-driven test reading the PackUnpackOpTable of ShaderOpArithTable.xml.
   BEGIN_TEST_METHOD(PackUnpackTest)
     TEST_METHOD_PROPERTY(L"DataSource", L"Table:ShaderOpArithTable.xml#PackUnpackOpTable")
   END_TEST_METHOD()

   // DXC DLL loader; CompileFromText initializes it and uses it to create the
   // compiler and library instances.
   dxc::DxcDllSupport m_support;
   VersionSupportInfo m_ver;

   // Set by DivergentClassSetup the first time it runs so that the one-time
   // D3D initialization is not repeated.
   bool m_D3DInitCompleted = false;
   // NOTE(review): the two flags below are not written in the visible part of
   // this file; presumably set by EnableExperimentalMode / EnableAgilitySDK -
   // confirm.
   bool m_ExperimentalModeEnabled = false;
   bool m_AgilitySDKEnabled = false;

   // RGBA clear value used as the optimized clear color when the render
   // target is created in CreateRenderTargetAndReadback.
   const float ClearColor[4] = { 0.0f, 0.2f, 0.4f, 1.0f };

   bool DivergentClassSetup() {
     // Run this only once.
     if (!m_D3DInitCompleted) {
       m_D3DInitCompleted = true;

       HMODULE hRuntime = LoadLibraryW(L"d3d12.dll");
       if (hRuntime == NULL)
         return false;
       // Do not: FreeLibrary(hRuntime);
       // If we actually free the library, it defeats the purpose of
       // EnableAgilitySDK and EnableExperimentalMode.

       HRESULT hr;
       hr = EnableAgilitySDK(hRuntime);
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable Agility SDK - 0x%08x.", hr);
       } else if (hr == S_FALSE) {
         LogCommentFmt(L"Agility SDK not enabled.");
       } else {
         LogCommentFmt(L"Agility SDK enabled.");
       }

       hr = EnableExperimentalMode(hRuntime);
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable shader experimental mode - 0x%08x.", hr);
       } else if (hr == S_FALSE) {
         LogCommentFmt(L"Experimental mode not enabled.");
       } else {
         LogCommentFmt(L"Experimental mode enabled.");
       }

       hr = EnableDebugLayer();
       if (FAILED(hr)) {
         LogCommentFmt(L"Unable to enable debug layer - 0x%08x.", hr);
       } else if (hr == S_FALSE) {
         LogCommentFmt(L"Debug layer not enabled.");
       } else {
         LogCommentFmt(L"Debug layer enabled.");
       }
     }

     return true;
   }

 // Do not remove the following line - it is used by TranslateExecutionTest.py
 // MARKER: ExecutionTest/DxilConf Shared Implementation Start

  // This is defined in d3d.h for Windows 10 Anniversary Edition SDK, but we only
   // require the Windows 10 SDK.
   typedef enum D3D_SHADER_MODEL {
     D3D_SHADER_MODEL_5_1 = 0x51,
     D3D_SHADER_MODEL_6_0 = 0x60,
     D3D_SHADER_MODEL_6_1 = 0x61,
     D3D_SHADER_MODEL_6_2 = 0x62,
     D3D_SHADER_MODEL_6_3 = 0x63,
     D3D_SHADER_MODEL_6_4 = 0x64,
     D3D_SHADER_MODEL_6_5 = 0x65,
     D3D_SHADER_MODEL_6_6 = 0x66,
     D3D_SHADER_MODEL_6_7 = 0x67,
 } D3D_SHADER_MODEL;

   static const D3D_SHADER_MODEL HIGHEST_SHADER_MODEL = D3D_SHADER_MODEL_6_7;

   // Whether shaders should be built as DXBC. Always false in _HLK_CONF
   // builds; otherwise taken from the "DXBC" test parameter.
   bool UseDxbc() {
 #ifndef _HLK_CONF
     return GetTestParamBool(L"DXBC");
 #else
     return false;
 #endif
   }

   // Default for WARP selection: true, except in _HLK_CONF builds where it is
   // false.
   bool UseWarpByDefault() {
 #ifndef _HLK_CONF
     return true;
 #else
     return false;
 #endif
   }

   // Debug interfaces are always used.
   bool UseDebugIfaces() {
     const bool debugInterfacesWanted = true;
     return debugInterfacesWanted;
   }

   // Value of the "SaveImages" test parameter.
   bool SaveImages() {
     const bool saveRequested = GetTestParamBool(L"SaveImages");
     return saveRequested;
   }

   // Base class used by raw gather test for polymorphic assignments
   struct RawGatherTexture {
     // Virtual destructor so that a derived texture can be destroyed safely
     // through a RawGatherTexture pointer.
     virtual ~RawGatherTexture() = default;
     // Set Element <i> to a format-appropriate value derived from 2D coords <x,y>
     virtual void SetElement(int i, int x, int y) = 0;
     // Retrieve pointer to the elements
     virtual void *GetElements() = 0;
     // Get dimensions/format
     virtual unsigned GetXDim() = 0;
     virtual unsigned GetYDim() = 0;
     virtual DXGI_FORMAT GetFormat() = 0;
   };

   // Declarations of helper routines shared by several tests; the definitions
   // are not part of this section of the file.

   // Raw gather helper, parameterized on the gathered element type; operates
   // on a RawGatherTexture and a view format.
   template<typename GatherType>
   void DoRawGatherTest(ID3D12Device *pDevice, RawGatherTexture *rawTex, DXGI_FORMAT viewFormat);
   // Resource test helper taking the shader text, a shader model string and a
   // flag selecting the dynamic variant.
   void RunResourceTest(ID3D12Device *pDevice, const char *pShader, const wchar_t *sm, bool isDynamic);

   // Wave intrinsic helpers driven by a TableParameter list.
   template <class T1, class T2>
   void WaveIntrinsicsActivePrefixTest(TableParameter *pParameterList,
                                       size_t numParameter, bool isPrefix);

   template <typename T>
   void WaveIntrinsicsMultiPrefixOpTest(TableParameter *pParameterList,
                                        size_t numParameters);

   void BasicTriangleTestSetup(LPCSTR OpName, LPCWSTR FileName, D3D_SHADER_MODEL testModel);

   void RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel);

   // Element type selector passed to the raw buffer load/store test helpers.
   enum class RawBufferLdStType {
      I32,
      Float,
      I64,
      Double,
      I16,
      Half
   };

   // One scalar followed by 2-, 3- and 4-element arrays of the element type
   // Ty (ten Ty values in total).
   template <class Ty>
   struct RawBufferLdStTestData {
     Ty v1;
     Ty v2[2];
     Ty v3[3];
     Ty v4[4];
   };

   // Three consecutive RawBufferLdStTestData records: input, output and
   // srvOut, in that order.
   template <class Ty>
   struct RawBufferLdStUavData {
     RawBufferLdStTestData<Ty> input;
     RawBufferLdStTestData<Ty> output;
     RawBufferLdStTestData<Ty> srvOut;
   };

   // Declarations of the raw buffer load/store and basic shader model helpers;
   // the definitions are not part of this section of the file. All of the raw
   // buffer helpers are parameterized on the element type Ty.

   // Compute and graphics variants take the same arguments: the shader model,
   // the element type selector, a shader op name and the test data.
   template <class Ty>
   void RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                             const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);

   template <class Ty>
   void RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                             const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData);

   template <class Ty>
   void VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData);

   // pDevice, pStream, sTy and additionalOptions are passed by reference.
   // NOTE(review): presumably all four are outputs - confirm against the
   // definition.
   bool SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType, CComPtr<ID3D12Device> &pDevice,
                               CComPtr<IStream> &pStream, char *&sTy, char *&additionalOptions);

   template <class Ty>
   void RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice, const char *pShaderModelStr, const char *pShader, Ty *pInputDataPairs, unsigned inputDataCount);

   template <class Ty>
   const wchar_t* BasicShaderModelTest_GetFormatString();

   // Compiles HLSL source text with the DXC compiler and returns the result
   // blob through ppBlob.
   //
   //   pText          - null-terminated shader source, wrapped as a UTF-8 blob.
   //   pEntryPoint    - entry point name.
   //   pTargetProfile - target profile string.
   //   ppBlob         - receives the compiled result.
   //   pOptions / numOptions - optional extra compiler arguments.
   //
   // Every step is checked with VERIFY_SUCCEEDED, including the compilation
   // status itself.
   void CompileFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob, LPCWSTR *pOptions = nullptr, int numOptions = 0) {
     VERIFY_SUCCEEDED(m_support.Initialize());
     CComPtr<IDxcCompiler> pCompiler;
     CComPtr<IDxcLibrary> pLibrary;
     CComPtr<IDxcBlobEncoding> pTextBlob;
     CComPtr<IDxcOperationResult> pResult;
     HRESULT resultCode;
     VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcCompiler, &pCompiler));
     VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pLibrary));
     VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pText, (UINT32)strlen(pText), CP_UTF8, &pTextBlob));
     // "hlsl.hlsl" is the source name handed to the compiler for this text.
     VERIFY_SUCCEEDED(pCompiler->Compile(pTextBlob, L"hlsl.hlsl", pEntryPoint, pTargetProfile, pOptions, numOptions, nullptr, 0, nullptr, &pResult));
     // A successful Compile call only means the compiler ran; the status of
     // the compilation itself is read separately.
     VERIFY_SUCCEEDED(pResult->GetStatus(&resultCode));
     if (FAILED(resultCode)) {
       CComPtr<IDxcBlobEncoding> errors;
       VERIFY_SUCCEEDED(pResult->GetErrorBuffer(&errors));
       // The compiler error text is only logged outside _HLK_CONF builds.
 #ifndef _HLK_CONF
       LogCommentFmt(L"Failed to compile shader: %s", BlobToWide(errors).data());
 #endif
     }
     VERIFY_SUCCEEDED(resultCode);
     // NOTE(review): the cast treats ID3DBlob** as IDxcBlob**, which assumes
     // the two blob interfaces are interchangeable - confirm.
     VERIFY_SUCCEEDED(pResult->GetResult((IDxcBlob **)ppBlob));
   }

   // Creates a command queue of the given type with default flags on pDevice,
   // returns it through ppCommandQueue and assigns it the debug name pName.
   // Both steps are checked with VERIFY_SUCCEEDED.
   void CreateCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue, D3D12_COMMAND_LIST_TYPE type) {
     // Value-initialized so that every field not set below is zero.
     D3D12_COMMAND_QUEUE_DESC queueDesc = {};
     queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
     queueDesc.Type = type;
     VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
     VERIFY_SUCCEEDED((*ppCommandQueue)->SetName(pName));
   }

   // Convenience wrapper: creates a named command queue of the compute type.
   void CreateComputeCommandQueue(ID3D12Device *pDevice, LPCWSTR pName, ID3D12CommandQueue **ppCommandQueue) {
     const D3D12_COMMAND_LIST_TYPE computeType = D3D12_COMMAND_LIST_TYPE_COMPUTE;
     CreateCommandQueue(pDevice, pName, ppCommandQueue, computeType);
   }

   // Builds the "main" entry point of pShader for pTargetProfile and creates a
   // compute pipeline state from it with the given root signature.
   //
   // When UseDxbc() is false the shader is compiled through CompileFromText
   // (forwarding pOptions / numOptions); otherwise DXBCFromText is used, which
   // is only compiled in outside _HLK_CONF builds.
   void CreateComputePSO(ID3D12Device *pDevice, ID3D12RootSignature *pRootSignature, LPCSTR pShader, LPCWSTR pTargetProfile, ID3D12PipelineState **ppComputeState, LPCWSTR *pOptions = nullptr, int numOptions = 0) {
     CComPtr<ID3DBlob> pShaderBlob;

     // Load and compile shaders.
     if (!UseDxbc()) {
       CompileFromText(pShader, L"main", pTargetProfile, &pShaderBlob, pOptions, numOptions);
     }
     else {
 #ifndef _HLK_CONF
       DXBCFromText(pShader, L"main", pTargetProfile, &pShaderBlob);
 #endif
     }

     // Describe and create the compute pipeline state object (PSO).
     D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {};
     computePsoDesc.CS = CD3DX12_SHADER_BYTECODE(pShaderBlob);
     computePsoDesc.pRootSignature = pRootSignature;

     VERIFY_SUCCEEDED(pDevice->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(ppComputeState)));
   }

   // Creates the D3D12 device a test runs on.
   //
   //   ppDevice        - receives the device on success; set to nullptr on
   //                     every failure path.
   //   testModel       - shader model the test needs. It must not exceed
   //                     HIGHEST_SHADER_MODEL and, unless UseDxbc() is true,
   //                     must be supported by the created device.
   //   skipUnsupported - when true, an unsupported configuration marks the
   //                     test as Skipped before returning false.
   //
   // WARP is used when GetTestParamUseWARP(UseWarpByDefault()) says so;
   // otherwise a hardware adapter is used, optionally selected through the
   // "Adapter" runtime parameter.
   //
   // Returns true when a usable device was created, false otherwise.
   bool CreateDevice(_COM_Outptr_ ID3D12Device **ppDevice,
                     D3D_SHADER_MODEL testModel = D3D_SHADER_MODEL_6_0, bool skipUnsupported = true) {
     // Null the out parameter first so that it is well defined on every early
     // return below, as _COM_Outptr_ promises.
     *ppDevice = nullptr;

     if (testModel > HIGHEST_SHADER_MODEL) {
       UINT minor = (UINT)testModel & 0x0f;
       LogCommentFmt(L"Installed SDK does not support "
           L"shader model 6.%1u", minor);

       if (skipUnsupported) {
         WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       }

       return false;
     }
     CComPtr<IDXGIFactory4> factory;
     CComPtr<ID3D12Device> pDevice;

     VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
     if (GetTestParamUseWARP(UseWarpByDefault())) {
       CComPtr<IDXGIAdapter> warpAdapter;
       VERIFY_SUCCEEDED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)));
       HRESULT createHR = D3D12CreateDevice(warpAdapter, D3D_FEATURE_LEVEL_11_0,
                                            IID_PPV_ARGS(&pDevice));
       if (FAILED(createHR)) {
         LogCommentFmt(L"The available version of WARP does not support d3d12.");

         if (skipUnsupported) {
           WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
         }

         return false;
       }
     } else {
       CComPtr<IDXGIAdapter1> hardwareAdapter;
       WEX::Common::String AdapterValue;
       HRESULT hr = WEX::TestExecution::RuntimeParameters::TryGetValue(L"Adapter",
                                                              AdapterValue);
       if (SUCCEEDED(hr)) {
           GetHardwareAdapter(factory, AdapterValue, &hardwareAdapter);
       } else {
         // hardwareAdapter stays null here, so device creation below falls
         // back to the default adapter.
         WEX::Logging::Log::Comment(
             L"Using default hardware adapter with D3D12 support.");
       }
       VERIFY_SUCCEEDED(D3D12CreateDevice(hardwareAdapter, D3D_FEATURE_LEVEL_11_0,
                                          IID_PPV_ARGS(&pDevice)));
     }

     // Check for a device before it is dereferenced below.
     if (pDevice == nullptr)
       return false;

     // retrieve adapter information
     LUID adapterID = pDevice->GetAdapterLuid();
     CComPtr<IDXGIAdapter> adapter;
     // Verified so that a failed lookup is reported instead of dereferencing
     // a null adapter on the next line.
     VERIFY_SUCCEEDED(factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter)));
     DXGI_ADAPTER_DESC AdapterDesc;
     VERIFY_SUCCEEDED(adapter->GetDesc(&AdapterDesc));
     LogCommentFmt(L"Using Adapter:%s", AdapterDesc.Description);

     if (!UseDxbc()) {
       // Check for DXIL support.
       // The feature struct and id are declared locally rather than taken
       // from the SDK headers.
       typedef struct D3D12_FEATURE_DATA_SHADER_MODEL {
         _Inout_ D3D_SHADER_MODEL HighestShaderModel;
       } D3D12_FEATURE_DATA_SHADER_MODEL;
       const UINT D3D12_FEATURE_SHADER_MODEL = 7;
       D3D12_FEATURE_DATA_SHADER_MODEL SMData;
       // In: the model the test wants. Out: the highest model the device
       // reports, compared against testModel below.
       SMData.HighestShaderModel = testModel;
       if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_SHADER_MODEL,
                                               &SMData, sizeof(SMData))) ||
           SMData.HighestShaderModel < testModel) {
         UINT minor = (UINT)testModel & 0x0f;
         LogCommentFmt(L"The selected device does not support "
                       L"shader model 6.%1u", minor);

         if (skipUnsupported) {
           WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
         }

         return false;
       }
     }

     if (UseDebugIfaces()) {
       // The info queue is optional; when present, debug output is unmuted.
       CComPtr<ID3D12InfoQueue> pInfoQueue;
       if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
         pInfoQueue->SetMuteDebugOutput(FALSE);
       }
     }

     // Hand ownership of the device to the caller.
     *ppDevice = pDevice.Detach();
     return true;
   }

   // Creates a direct (graphics) command queue with default flags on pDevice
   // and returns it through ppCommandQueue. No debug name is assigned.
   void CreateGraphicsCommandQueue(ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue) {
     D3D12_COMMAND_QUEUE_DESC queueDesc = {};
     queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
     queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
     VERIFY_SUCCEEDED(pDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(ppCommandQueue)));
   }

   // Creates the three objects needed to record and submit direct (graphics)
   // work: a command queue, a command allocator, and a command list created on
   // that allocator with pPSO as its initial pipeline state. The results are
   // returned through ppCommandQueue, ppAllocator and ppCommandList.
   void CreateGraphicsCommandQueueAndList(
       ID3D12Device *pDevice, ID3D12CommandQueue **ppCommandQueue,
       ID3D12CommandAllocator **ppAllocator,
       ID3D12GraphicsCommandList **ppCommandList, ID3D12PipelineState *pPSO) {
     CreateGraphicsCommandQueue(pDevice, ppCommandQueue);
     VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(
         D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(ppAllocator)));
     // Node mask 0; the list type matches the allocator created just above.
     VERIFY_SUCCEEDED(pDevice->CreateCommandList(
         0, D3D12_COMMAND_LIST_TYPE_DIRECT, *ppAllocator, pPSO,
         IID_PPV_ARGS(ppCommandList)));
   }

   // Builds the "VSMain" (vs_6_0) and "PSMain" (ps_6_0) entry points of
   // pShaders and creates a graphics pipeline state from them.
   //
   // The pipeline uses default rasterizer and blend state, no depth or
   // stencil testing, triangle topology, and a single R8G8B8A8_UNORM render
   // target with one sample. The result is returned through ppPSO.
   void CreateGraphicsPSO(ID3D12Device *pDevice,
                          D3D12_INPUT_LAYOUT_DESC *pInputLayout,
                          ID3D12RootSignature *pRootSignature, LPCSTR pShaders,
                          ID3D12PipelineState **ppPSO) {
     CComPtr<ID3DBlob> pVertexBlob;
     CComPtr<ID3DBlob> pPixelBlob;

     if (!UseDxbc()) {
       CompileFromText(pShaders, L"VSMain", L"vs_6_0", &pVertexBlob);
       CompileFromText(pShaders, L"PSMain", L"ps_6_0", &pPixelBlob);
     } else {
 #ifndef _HLK_CONF
       DXBCFromText(pShaders, L"VSMain", L"vs_6_0", &pVertexBlob);
       DXBCFromText(pShaders, L"PSMain", L"ps_6_0", &pPixelBlob);
 #endif
     }

     // Describe and create the graphics pipeline state object (PSO).
     D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};

     // Shader stages and their bindings.
     psoDesc.pRootSignature = pRootSignature;
     psoDesc.InputLayout = *pInputLayout;
     psoDesc.VS = CD3DX12_SHADER_BYTECODE(pVertexBlob);
     psoDesc.PS = CD3DX12_SHADER_BYTECODE(pPixelBlob);

     // Fixed-function state.
     psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
     psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
     psoDesc.DepthStencilState.StencilEnable = FALSE;
     psoDesc.DepthStencilState.DepthEnable = FALSE;
     psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;

     // Output configuration.
     psoDesc.SampleMask = UINT_MAX;
     psoDesc.SampleDesc.Count = 1;
     psoDesc.NumRenderTargets = 1;
     psoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;

     VERIFY_SUCCEEDED(
         pDevice->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(ppPSO)));
   }

   // Creates a <width> x <height> R8G8B8A8_UNORM render target in a default
   // heap (initial state COPY_DEST, clear value ClearColor), writes its RTV at
   // the start of <pHeap>, and creates a readback-heap buffer sized at four
   // bytes per pixel. Ownership of both resources is handed to the caller
   // through <ppRenderTarget> and <ppBuffer>.
   void CreateRenderTargetAndReadback(ID3D12Device *pDevice,
                                      ID3D12DescriptorHeap *pHeap, UINT width,
                                      UINT height,
                                      ID3D12Resource **ppRenderTarget,
                                      ID3D12Resource **ppBuffer) {
     const DXGI_FORMAT rtFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
     const size_t bytesPerPixel = 4;

     // --- Render target and its view ---
     CComPtr<ID3D12Resource> pTarget;
     CD3DX12_HEAP_PROPERTIES defaultHeap(D3D12_HEAP_TYPE_DEFAULT);
     CD3DX12_RESOURCE_DESC targetDesc(
         CD3DX12_RESOURCE_DESC::Tex2D(rtFormat, width, height));
     targetDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
     CD3DX12_CLEAR_VALUE clearValue(rtFormat, ClearColor);
     VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &defaultHeap, D3D12_HEAP_FLAG_NONE, &targetDesc,
         D3D12_RESOURCE_STATE_COPY_DEST, &clearValue, IID_PPV_ARGS(&pTarget)));

     // Only one descriptor is written, so the handle is never advanced.
     CD3DX12_CPU_DESCRIPTOR_HANDLE firstRtv(
         pHeap->GetCPUDescriptorHandleForHeapStart());
     pDevice->CreateRenderTargetView(pTarget, nullptr, firstRtv);

     // --- Readback buffer ---
     CComPtr<ID3D12Resource> pReadback;
     CD3DX12_HEAP_PROPERTIES readbackHeap(D3D12_HEAP_TYPE_READBACK);
     CD3DX12_RESOURCE_DESC readbackDesc(
         CD3DX12_RESOURCE_DESC::Buffer(width * height * bytesPerPixel));
     VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &readbackHeap, D3D12_HEAP_FLAG_NONE, &readbackDesc,
         D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pReadback)));

     *ppRenderTarget = pTarget.Detach();
     *ppBuffer = pReadback.Detach();
   }

   // Serializes <pDesc> as a version 1 root signature and creates the
   // corresponding ID3D12RootSignature on <pDevice>, returned in <pRootSig>.
   // Either step failing trips VERIFY_SUCCEEDED.
   void CreateRootSignatureFromDesc(ID3D12Device *pDevice,
                                    const D3D12_ROOT_SIGNATURE_DESC *pDesc,
                                    ID3D12RootSignature **pRootSig) {
     CComPtr<ID3DBlob> pSerialized;
     CComPtr<ID3DBlob> pErrorBlob;
     VERIFY_SUCCEEDED(D3D12SerializeRootSignature(
         pDesc, D3D_ROOT_SIGNATURE_VERSION_1, &pSerialized, &pErrorBlob));

     const UINT nodeMask = 0;
     VERIFY_SUCCEEDED(pDevice->CreateRootSignature(
         nodeMask, pSerialized->GetBufferPointer(),
         pSerialized->GetBufferSize(), IID_PPV_ARGS(pRootSig)));
   }

   // Creates a root signature made of descriptor tables visible to all shader
   // stages: one table for the <resCt> resource ranges and, only when <sampCt>
   // is non-zero, a second table for the sampler ranges. No static samplers
   // are used; <flags> is passed through to the root signature description.
   void CreateRootSignatureFromRanges(ID3D12Device *pDevice, ID3D12RootSignature **pRootSig,
                                      CD3DX12_DESCRIPTOR_RANGE *resRanges, UINT resCt,
                                      CD3DX12_DESCRIPTOR_RANGE *sampRanges = nullptr, UINT sampCt = 0,
                                      D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) {
     CD3DX12_ROOT_PARAMETER tables[2];
     UINT tableCount = 0;

     // The resource table is always emitted and always comes first.
     tables[tableCount].InitAsDescriptorTable(resCt, resRanges,
                                              D3D12_SHADER_VISIBILITY_ALL);
     ++tableCount;

     if (sampCt != 0) {
       tables[tableCount].InitAsDescriptorTable(sampCt, sampRanges,
                                                D3D12_SHADER_VISIBILITY_ALL);
       ++tableCount;
     }

     CD3DX12_ROOT_SIGNATURE_DESC desc;
     desc.Init(tableCount, tables, 0, nullptr, flags);
     CreateRootSignatureFromDesc(pDevice, &desc, pRootSig);
   }

   // Creates a non-shader-visible RTV descriptor heap holding <numDescriptors>
   // entries. If <rtvDescriptorSize> is non-null it receives the device's RTV
   // descriptor handle increment size.
   void CreateRtvDescriptorHeap(ID3D12Device *pDevice, UINT numDescriptors,
                                ID3D12DescriptorHeap **pRtvHeap, UINT *rtvDescriptorSize) {
     const D3D12_DESCRIPTOR_HEAP_TYPE heapType = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;

     D3D12_DESCRIPTOR_HEAP_DESC desc = {};
     desc.Type = heapType;
     desc.NumDescriptors = numDescriptors;
     desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
     VERIFY_SUCCEEDED(
         pDevice->CreateDescriptorHeap(&desc, IID_PPV_ARGS(pRtvHeap)));

     // The increment size is an optional output.
     if (rtvDescriptorSize == nullptr)
       return;
     *rtvDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapType);
   }

 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
   // Populates <desc1> from <desc0>: every field the two structures share is
   // copied across, and the DESC1-only SamplerFeedbackMipRegion is cleared.
   void CopyDesc0ToDesc1(D3D12_RESOURCE_DESC1 &desc1, const D3D12_RESOURCE_DESC &desc0) {
     // Geometry
     desc1.Dimension = desc0.Dimension;
     desc1.Width = desc0.Width;
     desc1.Height = desc0.Height;
     desc1.DepthOrArraySize = desc0.DepthOrArraySize;
     desc1.MipLevels = desc0.MipLevels;

     // Format and sampling
     desc1.Format = desc0.Format;
     desc1.SampleDesc = desc0.SampleDesc;

     // Placement and usage
     desc1.Alignment = desc0.Alignment;
     desc1.Layout = desc0.Layout;
     desc1.Flags = desc0.Flags;

     // Field that exists only in D3D12_RESOURCE_DESC1
     desc1.SamplerFeedbackMipRegion = {};
   }
 #endif

   // Create resources for the given <resDesc> described main resource
   // creating and returning the resource, the upload resource,
   // and the readback resource if requested, populating with <values> of size
   // <valueSizeInBytes> using <pCommandList> and <pDevice>
   // A pointer to a single <castFormat> target may be specified
   // where CreateCommittedResource3 is available
   //
   // Outputs:
   //   <ppResource>       required; receives the default-heap resource.
   //   <ppUploadResource> optional; when non-null an upload buffer is created,
   //                      the copy of <values> is recorded on <pCommandList>,
   //                      and the main resource is transitioned out of
   //                      COPY_DEST. When null, no data is uploaded and no
   //                      transition is recorded.
   //   <ppReadBuffer>     optional; when non-null receives a readback-heap
   //                      buffer of <valueSizeInBytes> bytes.
   // The caller takes ownership of every resource returned.
   void CreateTestResources(ID3D12Device *pDevice,
                            ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
                            UINT64 valueSizeInBytes, D3D12_RESOURCE_DESC resDesc,
                            ID3D12Resource **ppResource,
                            ID3D12Resource **ppUploadResource,
                            ID3D12Resource **ppReadBuffer = nullptr,
                            DXGI_FORMAT *castFormat = nullptr) {
     CComPtr<ID3D12Resource> pResource;
     CComPtr<ID3D12Resource> pReadBuffer;
     CComPtr<ID3D12Resource> pUploadResource;
     D3D12_SUBRESOURCE_DATA transferData;
     D3D12_HEAP_PROPERTIES defaultHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
     D3D12_HEAP_PROPERTIES uploadHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
     D3D12_RESOURCE_DESC uploadBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
     CD3DX12_HEAP_PROPERTIES readHeap(D3D12_HEAP_TYPE_READBACK);
     CD3DX12_RESOURCE_DESC readDesc(CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes));

     // Replace the upload buffer width with the total byte size the device
     // reports for subresource 0 of <resDesc>.
     pDevice->GetCopyableFootprints(&resDesc, 0, 1/*mip levels*/, 0, nullptr, nullptr, nullptr, &uploadBufferDesc.Width);
     uploadBufferDesc.Height = 1;

     // NOTE: the if/else below is stitched together by the preprocessor. With
     // a new enough SDK, a non-null <castFormat> takes the
     // CreateCommittedResource3 path and the trailing braced block is its
     // `else`. Otherwise the braced block is a plain scope that always runs.
 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
     if (castFormat) {
       CComPtr<ID3D12Device10> pDevice10;
       // Copy resDesc0 to resDesc1 zeroing anything new
       D3D12_RESOURCE_DESC1 resDesc1 = {0};
       CopyDesc0ToDesc1(resDesc1, resDesc);
       VERIFY_SUCCEEDED(pDevice->QueryInterface(IID_PPV_ARGS(&pDevice10)));
       VERIFY_SUCCEEDED(pDevice10->CreateCommittedResource3(
         &defaultHeapProperties,
         D3D12_HEAP_FLAG_NONE,
         &resDesc1,
         D3D12_BARRIER_LAYOUT_COPY_DEST,
         nullptr,
         nullptr,
         1, castFormat,
         IID_PPV_ARGS(&pResource)));
     } else
 #else
     UNREFERENCED_PARAMETER(castFormat);
 #endif
     {
       VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &defaultHeapProperties,
         D3D12_HEAP_FLAG_NONE,
         &resDesc,
         D3D12_RESOURCE_STATE_COPY_DEST,
         nullptr,
         IID_PPV_ARGS(&pResource)));
     }

     // Optional upload-heap staging buffer.
     if (ppUploadResource)
       VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &uploadHeapProperties,
         D3D12_HEAP_FLAG_NONE,
         &uploadBufferDesc,
         D3D12_RESOURCE_STATE_GENERIC_READ,
         nullptr,
         IID_PPV_ARGS(&pUploadResource)));

     // Optional readback buffer.
     if (ppReadBuffer)
       VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &readHeap, D3D12_HEAP_FLAG_NONE, &readDesc,
         D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&pReadBuffer)));

     if (ppUploadResource) {
       // <values> is treated as a single slice of resDesc.Height equal rows.
       transferData.pData = values;
       transferData.RowPitch = (LONG_PTR)(valueSizeInBytes/resDesc.Height);
       transferData.SlicePitch = (LONG_PTR)valueSizeInBytes;

       UpdateSubresources<1>(pCommandList, pResource.p, pUploadResource.p, 0, 0, 1, &transferData);
       // Leave the resource in UAV state when it allows unordered access,
       // otherwise in the COMMON state.
       if (resDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)
         RecordTransitionBarrier(pCommandList, pResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
       else
         RecordTransitionBarrier(pCommandList, pResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
     }

     // Hand ownership of the created resources to the caller.
     *ppResource = pResource.Detach();
     if (ppUploadResource)
       *ppUploadResource = pUploadResource.Detach();
     if (ppReadBuffer)
       *ppReadBuffer = pReadBuffer.Detach();
   }

   // Convenience wrapper over CreateTestResources for a plain buffer of
   // <valueSizeInBytes> bytes that allows unordered access. The upload and
   // readback resources are optional, exactly as in CreateTestResources.
   void CreateTestUavs(ID3D12Device *pDevice,
                       ID3D12GraphicsCommandList *pCommandList, LPCVOID values,
                       UINT64 valueSizeInBytes, ID3D12Resource **ppUavResource,
                       ID3D12Resource **ppUploadResource = nullptr,
                       ID3D12Resource **ppReadBuffer = nullptr) {
     const D3D12_RESOURCE_FLAGS uavFlags =
         D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
     D3D12_RESOURCE_DESC uavBufferDesc =
         CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes, uavFlags);

     CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes,
                         uavBufferDesc, ppUavResource, ppUploadResource,
                         ppReadBuffer);
   }

   // Create and return descriptor heaps for the given device
   // with the given number of resources and samplers,
   // using some reasonable defaults.
   //
   // <ppResHeap> receives a shader-visible CBV/SRV/UAV heap with <NumResources>
   // descriptors; <ppSampHeap> receives a shader-visible sampler heap with
   // <NumSamplers> descriptors. The caller owns both returned heaps.
   void CreateDefaultDescHeaps(ID3D12Device *pDevice,
                               int NumResources, int NumSamplers,
                               ID3D12DescriptorHeap **ppResHeap, ID3D12DescriptorHeap **ppSampHeap) {
     // Hold the heaps in smart pointers until both creations have succeeded so
     // that a failed verification on the second heap cannot leak the first.
     CComPtr<ID3D12DescriptorHeap> pResHeap;
     CComPtr<ID3D12DescriptorHeap> pSampHeap;

     // Describe and create descriptor heaps.
     D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
     heapDesc.NumDescriptors = NumResources;
     heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
     heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
     VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pResHeap)));

     // The sampler heap reuses the same description with a new type and count.
     heapDesc.NumDescriptors = NumSamplers;
     heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER;
     VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pSampHeap)));

     // Transfer ownership to the caller.
     *ppResHeap = pResHeap.Detach();
     *ppSampHeap = pSampHeap.Detach();
   }

   // Writes a shader resource view of <pResource> at <baseHandle>, then moves
   // <baseHandle> forward by one CBV/SRV/UAV descriptor.
   // For buffer views <numElements> and <stride> describe the view, and the raw
   // flag is set when <format> is R32_TYPELESS with a zero stride. Texture1D
   // and Texture2D views cover mip 0 only. Any other dimension leaves the
   // dimension-specific part of the description zeroed.
   void CreateSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &baseHandle,
                  DXGI_FORMAT format, D3D12_SRV_DIMENSION viewDimension, UINT numElements, UINT stride,
                  const CComPtr<ID3D12Resource> pResource) {
     const bool isRawBuffer = (format == DXGI_FORMAT_R32_TYPELESS) && (stride == 0);

     D3D12_SHADER_RESOURCE_VIEW_DESC viewDesc = {};
     viewDesc.ViewDimension = viewDimension;
     viewDesc.Format = format;
     viewDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;

     switch (viewDimension) {
     case D3D12_SRV_DIMENSION_BUFFER:
       viewDesc.Buffer.FirstElement = 0;
       viewDesc.Buffer.NumElements = numElements;
       viewDesc.Buffer.StructureByteStride = stride;
       viewDesc.Buffer.Flags = isRawBuffer ? D3D12_BUFFER_SRV_FLAG_RAW
                                           : D3D12_BUFFER_SRV_FLAG_NONE;
       break;
     case D3D12_SRV_DIMENSION_TEXTURE1D:
       viewDesc.Texture1D.MostDetailedMip = 0;
       viewDesc.Texture1D.MipLevels = 1;
       viewDesc.Texture1D.ResourceMinLODClamp = 0;
       break;
     case D3D12_SRV_DIMENSION_TEXTURE2D:
       viewDesc.Texture2D.MostDetailedMip = 0;
       viewDesc.Texture2D.MipLevels = 1;
       viewDesc.Texture2D.PlaneSlice = 0;
       viewDesc.Texture2D.ResourceMinLODClamp = 0;
       break;
     default:
       break;
     }

     pDevice->CreateShaderResourceView(pResource, &viewDesc, baseHandle);

     // Step the caller's handle to the next descriptor slot.
     const UINT increment = pDevice->GetDescriptorHandleIncrementSize(
         D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
     baseHandle.Offset(increment);
   }


   // Buffer SRV using R32_TYPELESS with no stride, which CreateSRV turns into
   // a raw buffer view. Advances <heapStart> by one descriptor.
   void CreateRawSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                     UINT numElements, const CComPtr<ID3D12Resource> pResource) {
     const UINT noStride = 0;
     CreateSRV(pDevice, heapStart, DXGI_FORMAT_R32_TYPELESS,
               D3D12_SRV_DIMENSION_BUFFER, numElements, noStride, pResource);
   }

   // Buffer SRV with an UNKNOWN format and a per-element <stride>, i.e. a
   // structured buffer view. Advances <heapStart> by one descriptor.
   void CreateStructSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                        UINT numElements, UINT stride, const CComPtr<ID3D12Resource> pResource) {
     const DXGI_FORMAT structuredFormat = DXGI_FORMAT_UNKNOWN;
     CreateSRV(pDevice, heapStart, structuredFormat,
               D3D12_SRV_DIMENSION_BUFFER, numElements, stride, pResource);
   }

   // Buffer SRV whose elements are interpreted using <format>; no structure
   // stride is supplied. Advances <heapStart> by one descriptor.
   void CreateTypedSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT noStride = 0;
     CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_BUFFER,
               numElements, noStride, pResource);
   }

   // Texture1D SRV with the given <format>. <numElements> is forwarded to
   // CreateSRV, whose Texture1D path does not read it. Advances <heapStart> by
   // one descriptor.
   void CreateTex1DSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT noStride = 0;
     CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_TEXTURE1D,
               numElements, noStride, pResource);
   }

   // Texture2D SRV with the given <format>. Element count and stride do not
   // apply to this dimension, so zero is passed for both. Advances <heapStart>
   // by one descriptor.
   void CreateTex2DSRV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT unusedNumElements = 0;
     const UINT unusedStride = 0;
     CreateSRV(pDevice, heapStart, format, D3D12_SRV_DIMENSION_TEXTURE2D,
               unusedNumElements, unusedStride, pResource);
   }

   // Writes an unordered access view of <pResource> (no counter resource) at
   // <baseHandle>, then moves <baseHandle> forward by one CBV/SRV/UAV
   // descriptor.
   // For buffer views <numElements> and <stride> describe the view, and the raw
   // flag is set when <format> is R32_TYPELESS with a zero stride. For
   // Texture2DArray views <numElements> is the array size. Dimensions without
   // a case here keep a zeroed dimension-specific description.
   void CreateUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &baseHandle,
                  DXGI_FORMAT format, D3D12_UAV_DIMENSION viewDimension, UINT numElements, UINT stride,
                  const CComPtr<ID3D12Resource> pResource) {
     const bool isRawBuffer = (format == DXGI_FORMAT_R32_TYPELESS) && (stride == 0);

     D3D12_UNORDERED_ACCESS_VIEW_DESC viewDesc = {};
     viewDesc.ViewDimension = viewDimension;
     viewDesc.Format = format;

     switch (viewDimension) {
     case D3D12_UAV_DIMENSION_BUFFER:
       viewDesc.Buffer.FirstElement = 0;
       viewDesc.Buffer.NumElements = numElements;
       viewDesc.Buffer.StructureByteStride = stride;
       viewDesc.Buffer.Flags = isRawBuffer ? D3D12_BUFFER_UAV_FLAG_RAW
                                           : D3D12_BUFFER_UAV_FLAG_NONE;
       break;
     case D3D12_UAV_DIMENSION_TEXTURE1D:
       viewDesc.Texture1D.MipSlice = 0;
       break;
     case D3D12_UAV_DIMENSION_TEXTURE2D:
       viewDesc.Texture2D.MipSlice = 0;
       viewDesc.Texture2D.PlaneSlice = 0;
       break;
     case D3D12_UAV_DIMENSION_TEXTURE2DARRAY:
       viewDesc.Texture2DArray.MipSlice = 0;
       viewDesc.Texture2DArray.PlaneSlice = 0;
       viewDesc.Texture2DArray.FirstArraySlice = 0;
       viewDesc.Texture2DArray.ArraySize = numElements;
       break;
     default:
       break;
     }

     pDevice->CreateUnorderedAccessView(pResource, nullptr, &viewDesc, baseHandle);

     // Step the caller's handle to the next descriptor slot.
     const UINT increment = pDevice->GetDescriptorHandleIncrementSize(
         D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
     baseHandle.Offset(increment);
   }

   // Buffer UAV using R32_TYPELESS with no stride, which CreateUAV turns into
   // a raw buffer view. Advances <heapStart> by one descriptor.
   void CreateRawUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                     UINT numElements, const CComPtr<ID3D12Resource> pResource) {
     const UINT noStride = 0;
     CreateUAV(pDevice, heapStart, DXGI_FORMAT_R32_TYPELESS,
               D3D12_UAV_DIMENSION_BUFFER, numElements, noStride, pResource);
   }

   // Buffer UAV with an UNKNOWN format and a per-element <stride>, i.e. a
   // structured buffer view. Advances <heapStart> by one descriptor.
   void CreateStructUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                        UINT numElements, UINT stride, const CComPtr<ID3D12Resource> pResource) {
     const DXGI_FORMAT structuredFormat = DXGI_FORMAT_UNKNOWN;
     CreateUAV(pDevice, heapStart, structuredFormat,
               D3D12_UAV_DIMENSION_BUFFER, numElements, stride, pResource);
   }

   // Buffer UAV whose elements are interpreted using <format>; no structure
   // stride is supplied. Advances <heapStart> by one descriptor.
   void CreateTypedUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT noStride = 0;
     CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_BUFFER,
               numElements, noStride, pResource);
   }

   // Texture1D UAV with the given <format>. Element count and stride do not
   // apply to this dimension, so zero is passed for both. Advances <heapStart>
   // by one descriptor.
   void CreateTex1DUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT unusedNumElements = 0;
     const UINT unusedStride = 0;
     CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE1D,
               unusedNumElements, unusedStride, pResource);
   }

   // Texture2D UAV with the given <format>. Element count and stride do not
   // apply to this dimension, so zero is passed for both. Advances <heapStart>
   // by one descriptor.
   void CreateTex2DUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                       DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT unusedNumElements = 0;
     const UINT unusedStride = 0;
     CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE2D,
               unusedNumElements, unusedStride, pResource);
   }

   // Texture2DArray UAV with the given <format>; <numElements> becomes the
   // view's array size in CreateUAV. Advances <heapStart> by one descriptor.
   void CreateTex2DArrayUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                            UINT numElements, DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const UINT unusedStride = 0;
     CreateUAV(pDevice, heapStart, format, D3D12_UAV_DIMENSION_TEXTURE2DARRAY,
               numElements, unusedStride, pResource);
   }

   // Multisampled Texture2D UAV with the given <format>. The dimension is
   // passed as its numeric value (6) rather than by enumerator name. CreateUAV
   // has no dedicated case for it, so only the format and dimension are set
   // in the view description. Advances <heapStart> by one descriptor.
   void CreateTex2DMSUAV(ID3D12Device *pDevice, CD3DX12_CPU_DESCRIPTOR_HANDLE &heapStart,
                         DXGI_FORMAT format, const CComPtr<ID3D12Resource> pResource) {
     const D3D12_UAV_DIMENSION tex2DMS =
         (D3D12_UAV_DIMENSION)6 /*D3D12_UAV_DIMENSION_TEXTURE2DMS*/;
     const UINT unusedNumElements = 0;
     const UINT unusedStride = 0;
     CreateUAV(pDevice, heapStart, format, tex2DMS, unusedNumElements,
               unusedStride, pResource);
   }

   // Writes <NumSamplers> consecutive sampler descriptors starting at
   // <heapStart>. Sampler i uses filters[i]. When <perSamplerBorderColors> is
   // non-null, all samplers use BORDER addressing and sampler i gets
   // perSamplerBorderColors[i] replicated into all four border color channels;
   // otherwise CLAMP addressing is used. The remaining fields are fixed:
   // no LOD bias, MaxAnisotropy 1, EQUAL comparison, LOD range [0, 0].
   void CreateDefaultSamplers(ID3D12Device *pDevice, D3D12_CPU_DESCRIPTOR_HANDLE heapStart,
                              D3D12_FILTER filters[], float *perSamplerBorderColors, int NumSamplers) {
     const bool useBorder = perSamplerBorderColors != nullptr;
     const D3D12_TEXTURE_ADDRESS_MODE addressing =
         useBorder ? D3D12_TEXTURE_ADDRESS_MODE_BORDER
                   : D3D12_TEXTURE_ADDRESS_MODE_CLAMP;

     // Settings shared by every sampler; Filter is replaced per sampler below.
     D3D12_SAMPLER_DESC desc = {};
     desc.Filter = D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT;
     desc.AddressU = addressing;
     desc.AddressV = addressing;
     desc.AddressW = addressing;
     desc.MipLODBias = 0;
     desc.MaxAnisotropy = 1;
     desc.ComparisonFunc = D3D12_COMPARISON_FUNC_EQUAL;
     desc.MinLOD = 0;
     desc.MaxLOD = 0;

     const UINT increment = pDevice->GetDescriptorHandleIncrementSize(
         D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
     CD3DX12_CPU_DESCRIPTOR_HANDLE cursor(heapStart);

     for (int samplerIdx = 0; samplerIdx < NumSamplers; ++samplerIdx) {
       desc.Filter = filters[samplerIdx];
       if (useBorder) {
         // One float per sampler, broadcast to every channel.
         for (float &channel : desc.BorderColor)
           channel = perSamplerBorderColors[samplerIdx];
       }

       pDevice->CreateSampler(&desc, cursor);
       cursor.Offset(increment);
     }
   }

   // Creates an upload-heap buffer sized to hold the <vertices> array, copies
   // the array into it, and fills <pVertexBufferView> with the buffer's GPU
   // address, total size and per-vertex stride. The caller owns the buffer
   // returned in <ppVertexBuffer>.
   template <typename TVertex, int len>
   void CreateVertexBuffer(ID3D12Device *pDevice, TVertex(&vertices)[len],
                           ID3D12Resource **ppVertexBuffer,
                           D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView) {
     const size_t byteCount = sizeof(vertices);

     CD3DX12_HEAP_PROPERTIES uploadHeap(D3D12_HEAP_TYPE_UPLOAD);
     CD3DX12_RESOURCE_DESC desc(CD3DX12_RESOURCE_DESC::Buffer(byteCount));
     CComPtr<ID3D12Resource> pBuffer;
     VERIFY_SUCCEEDED(pDevice->CreateCommittedResource(
         &uploadHeap, D3D12_HEAP_FLAG_NONE, &desc,
         D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&pBuffer)));

     // Map with an empty (0, 0) read range, copy the vertices, and unmap.
     CD3DX12_RANGE emptyReadRange(0, 0);
     void *pMapped = nullptr;
     VERIFY_SUCCEEDED(pBuffer->Map(0, &emptyReadRange, &pMapped));
     memcpy(pMapped, vertices, byteCount);
     pBuffer->Unmap(0, nullptr);

     // Describe the buffer for the input assembler.
     pVertexBufferView->StrideInBytes = sizeof(TVertex);
     pVertexBufferView->SizeInBytes = (UINT)byteCount;
     pVertexBufferView->BufferLocation = pBuffer->GetGPUVirtualAddress();

     *ppVertexBuffer = pBuffer.Detach();
   }

   // Requires Anniversary Edition headers, so simplifying things for current setup.
   // Locally declared feature id and feature-data layout that
   // DoesDeviceSupportInt64 and DoesDeviceSupportWaveOps pass to
   // ID3D12Device::CheckFeatureSupport.
   // NOTE(review): the value 8 and the field order are presumed to mirror the
   // SDK's own D3D12_FEATURE_D3D12_OPTIONS1 definitions - confirm against d3d12.h.
   const UINT D3D12_FEATURE_D3D12_OPTIONS1 = 8;
   struct D3D12_FEATURE_DATA_D3D12_OPTIONS1 {
     BOOL WaveOps;                       // read by DoesDeviceSupportWaveOps
     UINT WaveLaneCountMin;
     UINT WaveLaneCountMax;
     UINT TotalLaneCount;
     BOOL ExpandedComputeResourceStates;
     BOOL Int64ShaderOps;                // read by DoesDeviceSupportInt64
   };

   // Returns true when <pDevice> runs on a software adapter: either the
   // adapter description carries DXGI_ADAPTER_FLAG_SOFTWARE, or its vendor id
   // is 0x1414 with device id 0x8c or 0x8d.
   // The adapter is found by matching the device's LUID through a DXGI factory.
   bool IsDeviceBasicAdapter(ID3D12Device *pDevice) {
     CComPtr<IDXGIFactory4> factory;
     VERIFY_SUCCEEDED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)));
     LUID adapterID = pDevice->GetAdapterLuid();
     CComPtr<IDXGIAdapter1> adapter;
     // Verify the lookup: an unchecked failure here would leave <adapter> null
     // and crash on the GetDesc1 call below.
     VERIFY_SUCCEEDED(
         factory->EnumAdapterByLuid(adapterID, IID_PPV_ARGS(&adapter)));
     DXGI_ADAPTER_DESC1 AdapterDesc;
     VERIFY_SUCCEEDED(adapter->GetDesc1(&AdapterDesc));
     return (AdapterDesc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) ||
            (AdapterDesc.VendorId == 0x1414 &&
             (AdapterDesc.DeviceId == 0x8c || AdapterDesc.DeviceId == 0x8d));
   }

   // True when the OPTIONS1 feature query succeeds and reports Int64ShaderOps.
   bool DoesDeviceSupportInt64(ID3D12Device *pDevice) {
     D3D12_FEATURE_DATA_D3D12_OPTIONS1 options1;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &options1,
         sizeof(options1));
     return SUCCEEDED(hrQuery) && options1.Int64ShaderOps != FALSE;
   }

   // True when the OPTIONS feature query succeeds and reports
   // DoublePrecisionFloatShaderOps.
   bool DoesDeviceSupportDouble(ID3D12Device *pDevice) {
     D3D12_FEATURE_DATA_D3D12_OPTIONS options;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS, &options, sizeof(options));
     return SUCCEEDED(hrQuery) &&
            options.DoublePrecisionFloatShaderOps != FALSE;
   }

   // True when the OPTIONS1 feature query succeeds and reports WaveOps.
   bool DoesDeviceSupportWaveOps(ID3D12Device *pDevice) {
     D3D12_FEATURE_DATA_D3D12_OPTIONS1 options1;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &options1,
         sizeof(options1));
     return SUCCEEDED(hrQuery) && options1.WaveOps != FALSE;
   }

   // True when the OPTIONS3 feature query succeeds and reports
   // BarycentricsSupported.
   bool DoesDeviceSupportBarycentrics(ID3D12Device *pDevice) {
     D3D12_FEATURE_DATA_D3D12_OPTIONS3 options3;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS3, &options3,
         sizeof(options3));
     return SUCCEEDED(hrQuery) && options3.BarycentricsSupported != FALSE;
   }

   // True when the OPTIONS4 feature query succeeds and reports
   // Native16BitShaderOpsSupported.
   bool DoesDeviceSupportNative16bitOps(ID3D12Device *pDevice) {
     D3D12_FEATURE_DATA_D3D12_OPTIONS4 options4;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS4, &options4,
         sizeof(options4));
     return SUCCEEDED(hrQuery) &&
            options4.Native16BitShaderOpsSupported != FALSE;
   }

   // True when the OPTIONS7 feature query succeeds and reports a mesh shader
   // tier other than NOT_SUPPORTED. Always false when built against an SDK
   // older than NTDDI_WIN10_VB.
   bool DoesDeviceSupportMeshShaders(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_VB) && WDK_NTDDI_VERSION >= NTDDI_WIN10_VB
     D3D12_FEATURE_DATA_D3D12_OPTIONS7 options7;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS7, &options7,
         sizeof(options7));
     return SUCCEEDED(hrQuery) &&
            options7.MeshShaderTier != D3D12_MESH_SHADER_TIER_NOT_SUPPORTED;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS5 feature query succeeds and reports a raytracing
   // tier other than NOT_SUPPORTED. Always false when built against an SDK
   // that is not newer than NTDDI_WIN10_RS4.
   bool DoesDeviceSupportRayTracing(ID3D12Device *pDevice) {
     // Check that the version symbol exists before comparing against it, as
     // the sibling feature checks do: an undefined macro evaluates to 0 in
     // #if, which would wrongly enable this path on SDKs that predate it.
 #if defined(NTDDI_WIN10_RS4) && WDK_NTDDI_VERSION > NTDDI_WIN10_RS4
     D3D12_FEATURE_DATA_D3D12_OPTIONS5 O5;
     if (FAILED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS5, &O5, sizeof(O5))))
       return false;
     return O5.RaytracingTier != D3D12_RAYTRACING_TIER_NOT_SUPPORTED;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }


   // True when both the OPTIONS7 and OPTIONS9 feature queries succeed, mesh
   // shaders are supported, and derivatives in mesh/amplification shaders are
   // reported as supported. Always false when built against an SDK older than
   // NTDDI_WIN10_FE.
   bool DoesDeviceSupportMeshAmpDerivatives(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_FE) && WDK_NTDDI_VERSION >= NTDDI_WIN10_FE
     D3D12_FEATURE_DATA_D3D12_OPTIONS7 options7;
     D3D12_FEATURE_DATA_D3D12_OPTIONS9 options9;

     if (FAILED(pDevice->CheckFeatureSupport(
             (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS7, &options7,
             sizeof(options7))))
       return false;
     if (FAILED(pDevice->CheckFeatureSupport(
             (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &options9,
             sizeof(options9))))
       return false;

     const bool hasMeshShaders =
         options7.MeshShaderTier != D3D12_MESH_SHADER_TIER_NOT_SUPPORTED;
     return hasMeshShaders &&
            options9.DerivativesInMeshAndAmplificationShadersSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS9 feature query succeeds and reports
   // AtomicInt64OnTypedResourceSupported. Always false when built against an
   // SDK older than NTDDI_WIN10_FE.
   bool DoesDeviceSupportTyped64Atomics(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_FE) && WDK_NTDDI_VERSION >= NTDDI_WIN10_FE
     D3D12_FEATURE_DATA_D3D12_OPTIONS9 options9;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &options9,
         sizeof(options9));
     return SUCCEEDED(hrQuery) &&
            options9.AtomicInt64OnTypedResourceSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS11 feature query succeeds and reports
   // AtomicInt64OnDescriptorHeapResourceSupported. Always false when built
   // against an SDK older than NTDDI_WIN10_CO.
   bool DoesDeviceSupportHeap64Atomics(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_CO) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CO
     D3D12_FEATURE_DATA_D3D12_OPTIONS11 options11;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS11, &options11,
         sizeof(options11));
     return SUCCEEDED(hrQuery) &&
            options11.AtomicInt64OnDescriptorHeapResourceSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS9 feature query succeeds and reports
   // AtomicInt64OnGroupSharedSupported. Always false when built against an
   // SDK older than NTDDI_WIN10_FE.
   bool DoesDeviceSupportShared64Atomics(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_FE) && WDK_NTDDI_VERSION >= NTDDI_WIN10_FE
     D3D12_FEATURE_DATA_D3D12_OPTIONS9 options9;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS9, &options9,
         sizeof(options9));
     return SUCCEEDED(hrQuery) &&
            options9.AtomicInt64OnGroupSharedSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS14 feature query succeeds and reports
   // AdvancedTextureOpsSupported. Always false when built against an SDK
   // older than NTDDI_WIN10_CU.
   bool DoesDeviceSupportAdvancedTexOps(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
     D3D12_FEATURE_DATA_D3D12_OPTIONS14 options14;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS14, &options14,
         sizeof(options14));
     return SUCCEEDED(hrQuery) &&
            options14.AdvancedTextureOpsSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS14 feature query succeeds and reports
   // WriteableMSAATexturesSupported. Always false when built against an SDK
   // older than NTDDI_WIN10_CU.
   bool DoesDeviceSupportWritableMSAA(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
     D3D12_FEATURE_DATA_D3D12_OPTIONS14 options14;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS14, &options14,
         sizeof(options14));
     return SUCCEEDED(hrQuery) &&
            options14.WriteableMSAATexturesSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the OPTIONS12 feature query succeeds and reports
   // EnhancedBarriersSupported. Always false when built against an SDK older
   // than NTDDI_WIN10_CU.
   bool DoesDeviceSupportEnhancedBarriers(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
     D3D12_FEATURE_DATA_D3D12_OPTIONS12 options12;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS12, &options12,
         sizeof(options12));
     return SUCCEEDED(hrQuery) && options12.EnhancedBarriersSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   // True when the device supports enhanced barriers (checked first through
   // DoesDeviceSupportEnhancedBarriers) and the OPTIONS12 feature query
   // succeeds and reports RelaxedFormatCastingSupported. Always false when
   // built against an SDK older than NTDDI_WIN10_CU.
   bool DoesDeviceSupportRelaxedFormatCasting(ID3D12Device *pDevice) {
 #if defined(NTDDI_WIN10_CU) && WDK_NTDDI_VERSION >= NTDDI_WIN10_CU
     // Enhanced barriers are treated as a prerequisite.
     if (!DoesDeviceSupportEnhancedBarriers(pDevice))
       return false;

     D3D12_FEATURE_DATA_D3D12_OPTIONS12 options12;
     const HRESULT hrQuery = pDevice->CheckFeatureSupport(
         (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS12, &options12,
         sizeof(options12));
     return SUCCEEDED(hrQuery) &&
            options12.RelaxedFormatCastingSupported != FALSE;
 #else
     UNREFERENCED_PARAMETER(pDevice);
     return false;
 #endif
   }

   bool IsFallbackPathEnabled(){
     // Enable fallback paths with: /p:"EnableFallback=1"
     UINT EnableFallbackValue = 0;
     WEX::TestExecution::RuntimeParameters::TryGetValue(L"EnableFallback", EnableFallbackValue);
     return EnableFallbackValue != 0;
   }

 #ifndef _HLK_CONF
   // Compiles <pText> with D3DCompile for the given entry point and target
   // profile, with the macro USING_DXBC defined as "1", and returns the result
   // in <ppBlob>. Whenever the compiler returns an error blob its text is
   // logged; the HRESULT is verified afterwards.
   void DXBCFromText(LPCSTR pText, LPCWSTR pEntryPoint, LPCWSTR pTargetProfile, ID3DBlob **ppBlob) {
     // D3DCompile takes narrow strings for the entry point and profile.
     CW2A entryPointUtf8(pEntryPoint, CP_UTF8);
     CW2A targetProfileUtf8(pTargetProfile, CP_UTF8);

     // One real macro followed by an all-null terminator entry.
     D3D_SHADER_MACRO macros[2] = {};
     macros[0].Name = "USING_DXBC";
     macros[0].Definition = "1";

     CComPtr<ID3DBlob> pMessages;
     const HRESULT hrCompile =
         D3DCompile(pText, strlen(pText), "hlsl.hlsl", macros, nullptr,
                    entryPointUtf8, targetProfileUtf8, 0, 0, ppBlob, &pMessages);

     if (pMessages != nullptr) {
       CA2W wideMessages((char *)pMessages->GetBufferPointer(), CP_ACP);
       LogCommentFmt(L"Compilation failure: %s", wideMessages.m_szBuffer);
     }
     VERIFY_SUCCEEDED(hrCompile);
   }
 #endif

   // Turns on the D3D12 debug layer when UseDebugIfaces() asks for it.
   // Returns S_FALSE when debug interfaces are not requested, the failing
   // HRESULT when the debug interface cannot be obtained, and S_OK once the
   // layer has been enabled.
   // The debug layer does not yet validate DXIL programs that require
   // rewriting, but basic logging should work properly.
   HRESULT EnableDebugLayer() {
     if (!UseDebugIfaces())
       return S_FALSE;

     CComPtr<ID3D12Debug> pDebug;
     const HRESULT hrDebug = D3D12GetDebugInterface(IID_PPV_ARGS(&pDebug));
     if (FAILED(hrDebug))
       return hrDebug;

     pDebug->EnableDebugLayer();
     return S_OK;
   }

   static std::wstring GetModuleName() {
     wchar_t moduleName[MAX_PATH+1] = {0};
     DWORD length = GetModuleFileNameW(NULL, moduleName, MAX_PATH);
     if (length == 0 || length == MAX_PATH) {
       return std::wstring(); // Error condition
     }
     return std::wstring(moduleName, length);
   }

   // Resolves an SDK path that begins with ".\" against the directory of the
   // running module. <SDKPath> is returned unchanged when it does not start
   // with ".\" or when the module path contains no backslash.
   static std::wstring ComputeSDKFullPath(std::wstring SDKPath) {
     const std::wstring modulePath = GetModuleName();
     const size_t lastSeparator = modulePath.rfind(L'\\');

     const bool haveModuleDir = (lastSeparator != std::wstring::npos);
     const bool isDotRelative = (SDKPath.compare(0, 2, L".\\") == 0);
     if (!haveModuleDir || !isDotRelative)
       return SDKPath;

     // Drop the leading '.' and graft the rest onto the module's directory.
     const std::wstring moduleDir = modulePath.substr(0, lastSeparator);
     return moduleDir + SDKPath.substr(1);
   }

   // Tries to read the D3D12SDKVersion export from the D3D12Core.dll found
   // under <SDKPath> (resolved with ComputeSDKFullPath). Returns 0 when the
   // DLL cannot be loaded or does not export the symbol.
   static UINT GetD3D12SDKVersion(std::wstring SDKPath) {
     std::wstring corePath = ComputeSDKFullPath(SDKPath);
     corePath.append(L"D3D12Core.dll");

     HMODULE hCoreModule = LoadLibraryW(corePath.c_str());
     if (!hCoreModule)
       return 0;

     UINT detectedVersion = 0;
     UINT *pExportedVersion =
         (UINT*)GetProcAddress(hCoreModule, "D3D12SDKVersion");
     if (pExportedVersion)
       detectedVersion = *pExportedVersion;

     // The module was loaded only to read the export.
     FreeModule(hCoreModule);
     return detectedVersion;
   }

   // Points the D3D12 runtime in <hRuntime> at the Agility SDK of version
   // <SDKVersion> located at <SDKPath>, then confirms that D3D12Core really
   // loads. Returns a failure HRESULT if a required export is missing from the
   // runtime, if the SDK configuration cannot be obtained or set, or if the
   // follow-up call that forces D3D12Core to load fails.
   static HRESULT EnableAgilitySDK(HMODULE hRuntime, UINT SDKVersion,
                                   LPCWSTR SDKPath) {
     D3D12GetInterfaceFn pD3D12GetInterface =
         (D3D12GetInterfaceFn)GetProcAddress(hRuntime, "D3D12GetInterface");
     if (pD3D12GetInterface == nullptr) {
       // The runtime does not export D3D12GetInterface, so it is too old for
       // the Agility SDK. Report that instead of calling a null pointer.
       return HRESULT_FROM_WIN32(GetLastError());
     }
     CComPtr<ID3D12SDKConfiguration> pD3D12SDKConfiguration;
     IFR(pD3D12GetInterface(CLSID_D3D12SDKConfiguration,
                            IID_PPV_ARGS(&pD3D12SDKConfiguration)));
     IFR(pD3D12SDKConfiguration->SetSDKVersion(SDKVersion, CW2A(SDKPath)));

     // Currently, it appears that the SetSDKVersion will succeed even when
     // D3D12Core is not found, or its version doesn't match.  When that's the
     // case, will cause a failure in the very next thing that actually requires
     // D3D12Core.dll to be loaded instead.  So, we attempt to clear experimental
     // features next, which is a valid use case and a no-op at this point.  This
     // requires D3D12Core to be loaded.  If this fails, we know the AgilitySDK
     // setting actually failed.
     D3D12EnableExperimentalFeaturesFn pD3D12EnableExperimentalFeatures =
         (D3D12EnableExperimentalFeaturesFn)GetProcAddress(
             hRuntime, "D3D12EnableExperimentalFeatures");
     if (pD3D12EnableExperimentalFeatures == nullptr) {
       // If this failed, D3D12 must be too old for AgilitySDK.  But if that's
       // the case, creating D3D12SDKConfiguration should have failed.  So while
       // this case shouldn't be hit, fail if it is.
       return HRESULT_FROM_WIN32(GetLastError());
     }
     return pD3D12EnableExperimentalFeatures(0, nullptr, nullptr, nullptr);
   }

   // Enables the experimental shader models feature through the
   // D3D12EnableExperimentalFeatures export of <hRuntime>. Returns the Win32
   // last-error as an HRESULT when the export is missing, otherwise the
   // result of the call.
   static HRESULT EnableExperimentalShaderModels(HMODULE hRuntime) {
     FARPROC pfnExport =
         GetProcAddress(hRuntime, "D3D12EnableExperimentalFeatures");
     if (pfnExport == nullptr)
       return HRESULT_FROM_WIN32(GetLastError());

     D3D12EnableExperimentalFeaturesFn pfnEnable =
         (D3D12EnableExperimentalFeaturesFn)pfnExport;
     const UINT featureCount = 1;
     return pfnEnable(featureCount, &D3D12ExperimentalShaderModelsID, nullptr,
                      nullptr);
   }

   // Loads d3d12.dll and enables experimental shader models on it. Returns
   // E_FAIL when the DLL cannot be loaded. The module handle is not released
   // by this function.
   static HRESULT EnableExperimentalShaderModels() {
     HMODULE hD3D12 = LoadLibraryW(L"d3d12.dll");
     return (hD3D12 != NULL) ? EnableExperimentalShaderModels(hD3D12)
                             : E_FAIL;
   }

   // Configures the Agility SDK from the test runtime parameters.
   //
   // "D3D12SDKVersion":
   //   0 (or absent) - auto-detect from D3D12Core.dll; not finding it is fine.
   //   1             - auto-detect, but fail if detection or setup fails.
   //   > 1           - use the given version; fail if setup fails.
   // "D3D12SDKPath": path relative to the .exe (TE.exe), starting with ".\\".
   //   A trailing backslash is appended if missing; defaults to ".\\D3D12\\".
   //
   // Returns S_FALSE when the SDK was neither found nor required, or when an
   // optional setup attempt failed; a failure HRESULT when a required setup
   // failed; otherwise the result of the underlying EnableAgilitySDK call.
   // On S_OK, m_AgilitySDKEnabled is set.
   HRESULT EnableAgilitySDK(HMODULE hRuntime) {
     UINT SDKVersion = 0;
     WEX::TestExecution::RuntimeParameters::TryGetValue(
             L"D3D12SDKVersion", SDKVersion);

     // Fetch the path, normalizing a user-supplied value to end in a
     // backslash and falling back to the default when none is given.
     WEX::Common::String SDKPath;
     if (SUCCEEDED(WEX::TestExecution::RuntimeParameters::TryGetValue(
             L"D3D12SDKPath", SDKPath)) &&
         !SDKPath.IsEmpty() && SDKPath.Right(1) != "\\") {
       SDKPath.Append("\\");
     }
     if (SDKPath.IsEmpty()) {
       SDKPath = L".\\D3D12\\";
     }

     // Any non-zero requested version makes the SDK mandatory.
     const bool sdkRequired = SDKVersion > 0;

     if (SDKVersion <= 1) {
       // Look the version up from D3D12Core.dll.
       SDKVersion = GetD3D12SDKVersion((LPCWSTR)SDKPath);
       if (SDKVersion == 0) {
         if (!sdkRequired)
           return S_FALSE; // Not found, not asked for.
         LogErrorFmt(L"Agility SDK not found in relative path: %s", (LPCWSTR)SDKPath);
         return E_FAIL;
       }
     }

     const HRESULT hrEnable =
         EnableAgilitySDK(hRuntime, SDKVersion, (LPCWSTR)SDKPath);
     if (FAILED(hrEnable)) {
       // A failure only matters when the SDK was explicitly requested.
       if (!sdkRequired)
         return S_FALSE;
       LogErrorFmt(L"Failed to set Agility SDK version %d at path: %s", SDKVersion, (LPCWSTR)SDKPath);
       return hrEnable;
     }

     if (hrEnable == S_OK) {
       LogCommentFmt(L"Agility SDK version set to: %d", SDKVersion);
       m_AgilitySDKEnabled = true;
     }
     return hrEnable;
   }

   // Enables experimental shader models once per fixture when the
   // ExperimentalShaders test parameter is set.
   // Returns S_OK if already enabled, S_FALSE if the parameter is not set,
   // otherwise the HRESULT from EnableExperimentalShaderModels.
   HRESULT EnableExperimentalMode(HMODULE hRuntime) {
     if (m_ExperimentalModeEnabled)
       return S_OK;

     const bool wantExperimental = GetTestParamBool(L"ExperimentalShaders");
     if (!wantExperimental)
       return S_FALSE;

     const HRESULT hrEnable = EnableExperimentalShaderModels(hRuntime);
     if (SUCCEEDED(hrEnable)) {
       // Remember success so later calls short-circuit.
       m_ExperimentalModeEnabled = true;
     }
     return hrEnable;
   }

   // Bundles a fence, the event used to wait on it, and the next value to
   // signal. The event handle, if one was created, is closed on destruction.
   // NOTE(review): the struct is copyable by default, and a copy would close
   // the same event handle a second time; it appears to be used only in place.
   struct FenceObj {
     HANDLE m_fenceEvent = NULL;
     CComPtr<ID3D12Fence> m_fence;
     // Given an initializer so the value is never indeterminate, even if the
     // object is read before InitFenceObj assigns the starting value.
     UINT64 m_fenceValue = 0;
     ~FenceObj() {
       if (m_fenceEvent) CloseHandle(m_fenceEvent);
     }
   };

   // Prepares pObj for use: starts the fence value at 1, creates the fence on
   // pDevice and creates the auto-reset, initially non-signaled event used for
   // waiting. Any failure is reported through VERIFY_SUCCEEDED.
   void InitFenceObj(ID3D12Device *pDevice, FenceObj *pObj) {
     pObj->m_fenceValue = 1;

     const HRESULT hrFence = pDevice->CreateFence(
         0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&pObj->m_fence));
     VERIFY_SUCCEEDED(hrFence);

     // Event handle used for frame synchronization.
     const HANDLE hEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
     pObj->m_fenceEvent = hEvent;
     if (hEvent == nullptr) {
       VERIFY_SUCCEEDED(HRESULT_FROM_WIN32(GetLastError()));
     }
   }

   // Loads the HLSL data file at relativePath (resolved through
   // GetPathToHlslDataFile) into a blob and returns a read-only stream over it
   // in *ppStream. The caller receives ownership of the stream reference.
   void ReadHlslDataIntoNewStream(LPCWSTR relativePath, IStream **ppStream) {
     VERIFY_SUCCEEDED(m_support.Initialize());

     const std::wstring fullPath = GetPathToHlslDataFile(relativePath);

     CComPtr<IDxcLibrary> pDxcLibrary;
     VERIFY_SUCCEEDED(m_support.CreateInstance(CLSID_DxcLibrary, &pDxcLibrary));

     CComPtr<IDxcBlobEncoding> pFileBlob;
     VERIFY_SUCCEEDED(pDxcLibrary->CreateBlobFromFile(fullPath.c_str(), nullptr, &pFileBlob));

     CComPtr<IStream> pReadOnlyStream;
     VERIFY_SUCCEEDED(pDxcLibrary->CreateStreamFromBlobReadOnly(pFileBlob, &pReadOnlyStream));

     // Hand the reference over without an extra AddRef/Release pair.
     *ppStream = pReadOnlyStream.Detach();
   }

   // Records onto pList: viewport/scissor covering the whole render target, a
   // clear to ClearColor, one instanced triangle-list draw of 3 vertices, and
   // a copy of the render target into pReadBuffer.
   // The render target is transitioned COPY_DEST -> RENDER_TARGET before
   // drawing and RENDER_TARGET -> COPY_SOURCE before the copy. The root
   // signature is only set when pRootSig is non-null.
   void RecordRenderAndReadback(ID3D12GraphicsCommandList *pList,
                                ID3D12DescriptorHeap *pRtvHeap,
                                UINT rtvDescriptorSize,
                                UINT instanceCount,
                                D3D12_VERTEX_BUFFER_VIEW *pVertexBufferView,
                                ID3D12RootSignature *pRootSig,
                                ID3D12Resource *pRenderTarget,
                                ID3D12Resource *pReadBuffer) {
     const D3D12_RESOURCE_DESC targetDesc = pRenderTarget->GetDesc();

     // Full-target viewport with depth range [0, 1].
     D3D12_VIEWPORT fullViewport = {};
     fullViewport.Width = (float)targetDesc.Width;
     fullViewport.Height = (float)targetDesc.Height;
     fullViewport.MaxDepth = 1.0f;

     // Full-target scissor rectangle anchored at the origin.
     D3D12_RECT fullScissor = {};
     fullScissor.right = (long)targetDesc.Width;
     fullScissor.bottom = targetDesc.Height;

     if (pRootSig != nullptr) {
       pList->SetGraphicsRootSignature(pRootSig);
     }
     pList->RSSetViewports(1, &fullViewport);
     pList->RSSetScissorRects(1, &fullScissor);

     // Make the target renderable.
     RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_RENDER_TARGET);

     // Bind the first descriptor in the RTV heap as the sole render target.
     CD3DX12_CPU_DESCRIPTOR_HANDLE rtv(pRtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, rtvDescriptorSize);
     pList->OMSetRenderTargets(1, &rtv, FALSE, nullptr);

     pList->ClearRenderTargetView(rtv, ClearColor, 0, nullptr);
     pList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
     pList->IASetVertexBuffers(0, 1, pVertexBufferView);
     pList->DrawInstanced(3, instanceCount, 0, 0);

     // Make the target readable for the copy below.
     RecordTransitionBarrier(pList, pRenderTarget, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_COPY_SOURCE);

     // Row pitch is 4 bytes per pixel, rounded up to the required alignment.
     const UINT64 tightPitch = targetDesc.Width * 4;
     const UINT64 pitchRemainder = tightPitch % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT;
     const UINT64 alignedPitch =
         pitchRemainder
             ? tightPitch + (D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - pitchRemainder)
             : tightPitch;

     // Describe the read-back buffer as a placed RGBA8 texture and copy into it.
     D3D12_PLACED_SUBRESOURCE_FOOTPRINT placedFootprint;
     placedFootprint.Offset = 0;
     placedFootprint.Footprint = CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT_R8G8B8A8_UNORM, (UINT)targetDesc.Width, targetDesc.Height, 1, (UINT)alignedPitch);
     CD3DX12_TEXTURE_COPY_LOCATION copyDst(pReadBuffer, placedFootprint);
     CD3DX12_TEXTURE_COPY_LOCATION copySrc(pRenderTarget, 0);
     pList->CopyTextureRegion(&copyDst, 0, 0, 0, &copySrc, nullptr);
   }

   // Runs `shader` as a cs_6_0 compute shader in a single 1x1x1 dispatch over
   // a raw UAV created from `values`, then copies the read-back buffer into
   // `values`.
   void RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR shader, std::vector<uint32_t> &values);
   // Picks the target profile for shaderModel/useLibTarget, creates the UAV
   // heap and root signature, and dispatches to the lib or compute variant
   // declared below.
   void RunLifetimeIntrinsicTest(ID3D12Device *pDevice, LPCSTR shader, D3D_SHADER_MODEL shaderModel, bool useLibTarget, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values);
   // Compute variant: compiles pShader for pTargetProfile with the given
   // options, runs one 1x1x1 dispatch and copies the results into `values`.
   void RunLifetimeIntrinsicComputeTest(ID3D12Device *pDevice, LPCSTR pShader, CComPtr<ID3D12DescriptorHeap>& pUavHeap, CComPtr<ID3D12RootSignature>& pRootSignature,
                                        LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values);
   // Library (DXR) variant: builds a raytracing state object around the
   // "RayGen" export and submits a command list that only sets it; no rays are
   // dispatched.
   void RunLifetimeIntrinsicLibTest(ID3D12Device *pDevice0, LPCSTR pShader, CComPtr<ID3D12RootSignature>& pRootSignature,
                                    LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions);

   // Binds pHeap as the only descriptor heap on pCommandList.
   void SetDescriptorHeap(ID3D12GraphicsCommandList *pCommandList, ID3D12DescriptorHeap *pHeap) {
     ID3D12DescriptorHeap *const heapList[] = { pHeap };
     pCommandList->SetDescriptorHeaps(_countof(heapList), heapList);
   }

   // Forwards to the free ::WaitForSignal with FO's fence, event and current
   // fence value, and advances FO.m_fenceValue for the next wait.
   void WaitForSignal(ID3D12CommandQueue *pCQ, FenceObj &FO) {
     const UINT64 valueToSignal = FO.m_fenceValue;
     FO.m_fenceValue = valueToSignal + 1;
     ::WaitForSignal(pCQ, FO.m_fence, FO.m_fenceEvent, valueToSignal);
   }
 };
 // HLSL source prelude. When USING_DXBC is defined for the shader compile, it
 // supplies stub definitions for the wave and quad intrinsics listed below,
 // presumably so shaders that reference them can also be built on the DXBC
 // path. The stubs do not model real wave behavior: each one returns a
 // constant or echoes its argument.
 // NOTE(review): the WaveActiveAllTrue stub returns false regardless of its
 // argument while WaveActiveAnyTrue echoes it - confirm the asymmetry is
 // intended.
 #define WAVE_INTRINSIC_DXBC_GUARD \
   "#ifdef USING_DXBC\r\n" \
   "uint WaveGetLaneIndex() { return 1; }\r\n" \
   "uint WaveReadLaneFirst(uint u) { return u; }\r\n" \
   "bool WaveIsFirstLane() { return true; }\r\n" \
   "uint WaveGetLaneCount() { return 1; }\r\n" \
   "uint WaveReadLaneAt(uint n, uint u) { return u; }\r\n" \
   "bool WaveActiveAnyTrue(bool b) { return b; }\r\n" \
   "bool WaveActiveAllTrue(bool b) { return false; }\r\n" \
   "uint WaveActiveAllEqual(uint u) { return u; }\r\n" \
   "uint4 WaveActiveBallot(bool b) { return 1; }\r\n" \
   "uint WaveActiveCountBits(uint u) { return 1; }\r\n" \
   "uint WaveActiveSum(uint u) { return 1; }\r\n" \
   "uint WaveActiveProduct(uint u) { return 1; }\r\n" \
   "uint WaveActiveBitAnd(uint u) { return 1; }\r\n" \
   "uint WaveActiveBitOr(uint u) { return 1; }\r\n" \
   "uint WaveActiveBitXor(uint u) { return 1; }\r\n" \
   "uint WaveActiveMin(uint u) { return 1; }\r\n" \
   "uint WaveActiveMax(uint u) { return 1; }\r\n" \
   "uint WavePrefixCountBits(uint u) { return 1; }\r\n" \
   "uint WavePrefixSum(uint u) { return 1; }\r\n" \
   "uint WavePrefixProduct(uint u) { return 1; }\r\n" \
   "uint QuadReadLaneAt(uint a, uint u) { return 1; }\r\n" \
   "uint QuadReadAcrossX(uint u) { return 1; }\r\n" \
   "uint QuadReadAcrossY(uint u) { return 1; }\r\n" \
   "uint QuadReadAcrossDiagonal(uint u) { return 1; }\r\n" \
   "#endif\r\n"

 // Resizes `values` to `count` elements and fills it with the ascending
 // pattern 0, 1, 2, ... so that element i holds (uint32_t)i.
 static void SetupComputeValuePattern(std::vector<uint32_t> &values,
                                      size_t count) {
   values.resize(count);
   uint32_t nextValue = 0;
   for (uint32_t &slot : values) {
     slot = nextValue;
     ++nextValue;
   }
 }

 // Class-level setup hook; all of the work is done by DivergentClassSetup.
 bool ExecutionTest::ExecutionTestClassSetup() {
   const bool setupSucceeded = DivergentClassSetup();
   return setupSucceeded;
 }

 // Compiles pShader as cs_6_0 and runs it in a single 1x1x1 dispatch with one
 // raw (byte address) UAV bound at u0 through a one-entry descriptor table.
 //   pDevice - device used to create every resource, queue and list.
 //   pShader - HLSL source; compiled through CreateComputePSO.
 //   values  - in/out: its contents are handed to CreateTestUavs as the
 //             initial buffer data, and the read-back buffer is copied over
 //             it after the dispatch.
 // Failures are reported through VERIFY_SUCCEEDED, including the result of
 // closing each command list.
 void ExecutionTest::RunRWByteBufferComputeTest(ID3D12Device *pDevice, LPCSTR pShader, std::vector<uint32_t> &values) {
   static const int DispatchGroupX = 1;
   static const int DispatchGroupY = 1;
   static const int DispatchGroupZ = 1;

   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12DescriptorHeap> pUavHeap;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   FenceObj FO;

   const UINT valueSizeInBytes = (UINT)values.size() * sizeof(uint32_t);
   CreateComputeCommandQueue(pDevice, L"RunRWByteBufferComputeTest Command Queue", &pCommandQueue);
   InitFenceObj(pDevice, &FO);

   // Describe and create a shader-visible heap holding the single UAV.
   D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
   heapDesc.NumDescriptors = 1;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));

   // Create root signature: one descriptor table with one UAV range (u0).
   CComPtr<ID3D12RootSignature> pRootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE ranges[1];
     ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);

     CD3DX12_ROOT_PARAMETER rootParameters[1];
     rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);

     CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
     rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);

     CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
   }

   // Create pipeline state object.
   CComPtr<ID3D12PipelineState> pComputeState;
   CreateComputePSO(pDevice, pRootSignature, pShader, L"cs_6_0", &pComputeState);

   // Create a command allocator and list for compute.
   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
   pCommandList->SetName(L"ExecutionTest::RunRWByteButterComputeTest Command List");

   // Set up UAV resource.
   CComPtr<ID3D12Resource> pUavResource;
   CComPtr<ID3D12Resource> pReadBuffer;
   CComPtr<ID3D12Resource> pUploadResource;
   CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);
   VERIFY_SUCCEEDED(pUavResource->SetName(L"RunRWByteBufferComputeText UAV"));
   VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunRWByteBufferComputeText UAV Read Buffer"));
   VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunRWByteBufferComputeText UAV Upload Buffer"));

   // Close the command list and execute it to perform the GPU setup.
   // Close() returns an HRESULT that was previously dropped; verify it so a
   // failure is reported here.
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   VERIFY_SUCCEEDED(pCommandAllocator->Reset());
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));

   // Run the compute shader and copy the results back to readable memory.
   {
     // Raw buffer view: R32_TYPELESS format with the RAW flag, one element per
     // uint32_t in `values`.
     D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
     uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
     uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
     uavDesc.Buffer.FirstElement = 0;
     uavDesc.Buffer.NumElements = (UINT)values.size();
     uavDesc.Buffer.StructureByteStride = 0;
     uavDesc.Buffer.CounterOffsetInBytes = 0;
     uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
     CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
     pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
     SetDescriptorHeap(pCommandList, pUavHeap);
     pCommandList->SetComputeRootSignature(pRootSignature);
     pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
   }
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
   RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->CopyResource(pReadBuffer, pUavResource);
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   {
     // Map the read-back buffer and overwrite `values` with the GPU results.
     MappedData mappedData(pReadBuffer, valueSizeInBytes);
     uint32_t *pData = (uint32_t *)mappedData.data();
     memcpy(values.data(), pData, (size_t)valueSizeInBytes);
   }
   WaitForSignal(pCommandQueue, FO);
 }

 // Compute variant of the lifetime intrinsic test: compiles pShader for
 // pTargetProfile with the given compiler options, runs one 1x1x1 dispatch
 // with a raw UAV bound through pUavHeap / pRootSignature, and copies the
 // read-back buffer into `values`.
 //   pUavHeap       - heap whose first descriptor receives the UAV.
 //   pRootSignature - root signature used for the PSO and for binding.
 //   pOptions       - array of numOptions compiler option strings.
 //   values         - in/out: handed to CreateTestUavs as initial buffer data,
 //                    overwritten with the read-back contents.
 // Failures are reported through VERIFY_SUCCEEDED, including the result of
 // closing each command list.
 void ExecutionTest::RunLifetimeIntrinsicComputeTest(ID3D12Device *pDevice, LPCSTR pShader, CComPtr<ID3D12DescriptorHeap>& pUavHeap, CComPtr<ID3D12RootSignature>& pRootSignature,
                                                     LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values) {
   // Create command queue.
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CreateComputeCommandQueue(pDevice, L"RunLifetimeIntrinsicTest Command Queue", &pCommandQueue);

   FenceObj FO;
   InitFenceObj(pDevice, &FO);

   // Compile shader "main" and create pipeline state object.
   CComPtr<ID3D12PipelineState> pComputeState;
   CreateComputePSO(pDevice, pRootSignature, pShader, pTargetProfile, &pComputeState, pOptions, numOptions);

   // Create a command allocator and list for compute.
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));
   pCommandList->SetName(L"ExecutionTest::RunLifetimeIntrinsicTest Command List");

   // Set up UAV resource.
   const UINT valueSizeInBytes = (UINT)values.size() * sizeof(uint32_t);
   CComPtr<ID3D12Resource> pUavResource;
   CComPtr<ID3D12Resource> pReadBuffer;
   CComPtr<ID3D12Resource> pUploadResource;
   CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);
   VERIFY_SUCCEEDED(pUavResource->SetName(L"RunLifetimeIntrinsicTest UAV"));
   VERIFY_SUCCEEDED(pReadBuffer->SetName(L"RunLifetimeIntrinsicTest UAV Read Buffer"));
   VERIFY_SUCCEEDED(pUploadResource->SetName(L"RunLifetimeIntrinsicTest UAV Upload Buffer"));

   // Close the command list and execute it to perform the GPU setup.
   // Close() returns an HRESULT that was previously dropped; verify it so a
   // failure is reported here.
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   VERIFY_SUCCEEDED(pCommandAllocator->Reset());
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));

   // Run the compute shader and copy the results back to readable memory.
   {
     // Raw buffer view: R32_TYPELESS format with the RAW flag, one element per
     // uint32_t in `values`.
     D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
     uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
     uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
     uavDesc.Buffer.FirstElement = 0;
     uavDesc.Buffer.NumElements = (UINT)values.size();
     uavDesc.Buffer.StructureByteStride = 0;
     uavDesc.Buffer.CounterOffsetInBytes = 0;
     uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
     CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
     pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
     SetDescriptorHeap(pCommandList, pUavHeap);
     pCommandList->SetComputeRootSignature(pRootSignature);
     pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
   }

   static const int DispatchGroupX = 1;
   static const int DispatchGroupY = 1;
   static const int DispatchGroupZ = 1;
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
   RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->CopyResource(pReadBuffer, pUavResource);
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   {
     // Map the read-back buffer and overwrite `values` with the GPU results.
     MappedData mappedData(pReadBuffer, valueSizeInBytes);
     uint32_t *pData = (uint32_t *)mappedData.data();
     memcpy(values.data(), pData, (size_t)valueSizeInBytes);
   }
   WaitForSignal(pCommandQueue, FO);
 }

 // Library (DXR) variant of the lifetime intrinsic test: compiles pShader for
 // pTargetProfile, builds a raytracing pipeline state object exporting
 // "RayGen" with pRootSignature attached as its local root signature, and
 // submits a command list that only sets that state object. No resources are
 // bound and no rays are dispatched; the goal is to push the shader through
 // state object creation in the runtime/driver.
 //   pDevice0 - must support ID3D12Device5 (verified via QueryInterface).
 //   pOptions - array of numOptions compiler option strings.
 void ExecutionTest::RunLifetimeIntrinsicLibTest(ID3D12Device *pDevice0, LPCSTR pShader, CComPtr<ID3D12RootSignature>& pRootSignature,
                                                 LPCWSTR pTargetProfile, LPCWSTR *pOptions, int numOptions) {
   CComPtr<ID3D12Device5> pDevice;
   VERIFY_SUCCEEDED(pDevice0->QueryInterface(IID_PPV_ARGS(&pDevice)));

   // Create command queue.
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CreateCommandQueue(pDevice, L"RunLifetimeIntrinsicTest Command Queue", &pCommandQueue, D3D12_COMMAND_LIST_TYPE_DIRECT);

   FenceObj FO;
   InitFenceObj(pDevice, &FO);

   // Compile raygen shader.
   CComPtr<ID3DBlob> pShaderLib;
   CompileFromText(pShader, L"RayGen", pTargetProfile, &pShaderLib, pOptions, numOptions);

   // Describe and create the RT pipeline state object (RTPSO).
   CD3DX12_STATE_OBJECT_DESC stateObjectDesc(D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE);
   auto lib = stateObjectDesc.CreateSubobject<CD3DX12_DXIL_LIBRARY_SUBOBJECT>();
   CD3DX12_SHADER_BYTECODE byteCode(pShaderLib);
   lib->SetDXILLibrary(&byteCode);
   lib->DefineExport(L"RayGen");

   // Payload/attribute sizes are expressed in floats; recursion depth is fixed.
   const int payloadCount = 4;
   const int attributeCount = 2;
   const int maxRecursion = 2;
   stateObjectDesc.CreateSubobject<CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT>()->Config(payloadCount * sizeof(float), attributeCount * sizeof(float));
   stateObjectDesc.CreateSubobject<CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT>()->Config(maxRecursion);

   // Create (local!) root sig subobject and associate with shader.
   auto localRootSigSubObj = stateObjectDesc.CreateSubobject<CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT>();
   localRootSigSubObj->SetRootSignature(pRootSignature);
   auto rootSigAssociation = stateObjectDesc.CreateSubobject<CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT>();
   rootSigAssociation->SetSubobjectToAssociate(*localRootSigSubObj);
   rootSigAssociation->AddExport(L"RayGen");

   CComPtr<ID3D12StateObject> pStateObject;
   VERIFY_SUCCEEDED(pDevice->CreateStateObject(stateObjectDesc, IID_PPV_ARGS(&pStateObject)));

   // Create a command allocator and list.
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   CComPtr<ID3D12GraphicsCommandList4> pCommandList;
   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&pCommandAllocator)));
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));
   pCommandList->SetPipelineState1(pStateObject);
   pCommandList->SetName(L"ExecutionTest::RunLifetimeIntrinsicTest Command List");

   // Close the command list and execute it to kick-off compilation in the driver.
   // NOTE: We don't care about anything else, so we're not setting up any resources and don't actually execute the shader.
   // Close() returns an HRESULT that was previously dropped; verify it so a
   // failure is reported here.
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
 }

 // Entry point for one lifetime intrinsic configuration. Chooses the target
 // profile from shaderModel and useLibTarget, creates a one-descriptor
 // shader-visible UAV heap and a root signature with a single UAV table
 // (flagged as a local root signature for the library path), then runs either
 // the library (DXR) variant or the compute variant.
 // `values` is only passed on to the compute variant.
 void ExecutionTest::RunLifetimeIntrinsicTest(ID3D12Device *pDevice, LPCSTR pShader, D3D_SHADER_MODEL shaderModel, bool useLibTarget,
                                              LPCWSTR *pOptions, int numOptions, std::vector<uint32_t> &values) {
   // Shader models without an explicit case fall back to lib_6_3 / cs_6_0.
   LPCWSTR pLibProfile = L"lib_6_3";
   LPCWSTR pComputeProfile = L"cs_6_0";
   switch (shaderModel) {
   case D3D_SHADER_MODEL_6_0:
     pLibProfile = L"lib_6_0";
     pComputeProfile = L"cs_6_0";
     break;
   case D3D_SHADER_MODEL_6_3:
     pLibProfile = L"lib_6_3";
     pComputeProfile = L"cs_6_3";
     break;
   case D3D_SHADER_MODEL_6_5:
     pLibProfile = L"lib_6_5";
     pComputeProfile = L"cs_6_5";
     break;
   case D3D_SHADER_MODEL_6_6:
     pLibProfile = L"lib_6_6";
     pComputeProfile = L"cs_6_6";
     break;
   default:
     break;
   }
   LPCWSTR pTargetProfile = useLibTarget ? pLibProfile : pComputeProfile;

   // Shader-visible heap with room for the single UAV descriptor.
   D3D12_DESCRIPTOR_HEAP_DESC uavHeapDesc = {};
   uavHeapDesc.NumDescriptors = 1;
   uavHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   uavHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;

   CComPtr<ID3D12DescriptorHeap> pUavHeap;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&uavHeapDesc, IID_PPV_ARGS(&pUavHeap)));

   // Root signature: one descriptor table containing one UAV range (u0).
   CComPtr<ID3D12RootSignature> pRootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE uavRange[1];
     uavRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);

     CD3DX12_ROOT_PARAMETER rootParams[1];
     rootParams[0].InitAsDescriptorTable(1, &uavRange[0], D3D12_SHADER_VISIBILITY_ALL);

     // The library path attaches this as a local root signature.
     D3D12_ROOT_SIGNATURE_FLAGS sigFlags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
     if (useLibTarget) {
       sigFlags = D3D12_ROOT_SIGNATURE_FLAG_LOCAL_ROOT_SIGNATURE;
     }

     CD3DX12_ROOT_SIGNATURE_DESC sigDesc;
     sigDesc.Init(_countof(rootParams), rootParams, 0, nullptr, sigFlags);

     CreateRootSignatureFromDesc(pDevice, &sigDesc, &pRootSignature);
   }

   if (!useLibTarget) {
     RunLifetimeIntrinsicComputeTest(pDevice, pShader, pUavHeap, pRootSignature, pTargetProfile,
       pOptions, numOptions, values);
     return;
   }
   RunLifetimeIntrinsicLibTest(pDevice, pShader, pRootSignature, pTargetProfile,
     pOptions, numOptions);
 }

 TEST_F(ExecutionTest, LifetimeIntrinsicTest) {
   // The only thing we test here is that existence of lifetime intrinsics or
   // their fallback replacement (store undef or store zeroinitializer) do not
   // cause any issues in the runtime and driver stack.
   // The easiest way to force placement of intrinsics is to create an array in
   // a local scope that is dynamically indexed. It must not be optimized away,
   // so we do some bogus initialization that prevents this. Since all the code
   // is guarded by a conditional that is dynamically always false, the actual
   // effect of the shader is that the same value that was read is written back.
   //
   // RayGen derives its index from DispatchRaysIndex().x. The previous text
   // initialized `g` from itself and left `d` unused; it now clamps `d`.
   // NOTE(review): the clamp uses `> 64`, so an index of exactly 64 passes
   // through unclamped - confirm whether `>= 64` was intended. The DXR path
   // never executes the shader, so this has no effect on this test.
   static const char* pShader = R"(
     RWByteAddressBuffer g_bab : register(u0);

     void fn(uint GI) {
       const uint addr = GI * 4;
       const int val = g_bab.Load(addr);
       int res = val;
       if (val < 0) { // Never true.
         int arr[200];
         for (int i = 0; i < 200; ++i) {
             arr[i] = arr[val - i];
         }
         res += arr[val];
       }
       g_bab.Store(addr, (uint)res);
     }

     [numthreads(8,8,1)]
     void main(uint GI : SV_GroupIndex) {
       fn(GI);
     }

     [shader("raygeneration")]
     void RayGen() {
       const uint d = DispatchRaysIndex().x;
       const uint g = d > 64 ? 63 : d;
       fn(g);
     }
   )";
   static const int NumThreadsX = 8;
   static const int NumThreadsY = 8;
   static const int NumThreadsZ = 1;
   static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
   static const int DispatchGroupCount = 1;

   // Prefer an SM 6.6 device, then SM 6.3 (downlevel DXR), and finally require
   // SM 6.0 for the compute-only case.
   CComPtr<ID3D12Device> pDevice;
   bool bSM_6_6_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6, false);
   bool bSM_6_3_Supported = bSM_6_6_Supported;
   if (!bSM_6_6_Supported) {
     // Try 6.3 for downlevel DXR case
     bSM_6_3_Supported = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_3, false);
   }
   if (!bSM_6_3_Supported) {
     // Otherwise, 6.0 better be supported for compute case
     VERIFY_IS_TRUE(CreateDevice(&pDevice, D3D_SHADER_MODEL_6_0, false));
   }
   bool bDXRSupported = bSM_6_3_Supported && DoesDeviceSupportRayTracing(pDevice);

   if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
     WEX::Logging::Log::Comment(L"WARP has a known issue with LifetimeIntrinsicTest.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!bSM_6_6_Supported) {
     WEX::Logging::Log::Comment(L"Native lifetime markers skipped, device does not support SM 6.6");
   }
   if (!bDXRSupported) {
     WEX::Logging::Log::Comment(L"DXR lifetime tests skipped, device does not support DXR");
   }

   std::vector<uint32_t> values;
   SetupComputeValuePattern(values, ThreadsPerGroup * DispatchGroupCount);

   // Run a number of tests for different configurations that will cause
   // lifetime intrinsics to be:
   // - placed directly
   // - translated to an undef store
   // - translated to a zeroinitializer store
   // against compute and DXR targets, downlevel and SM 6.6:
   // - downlevel: cs_6_0, lib_6_3 (DXR)
   // - cs_6_6, lib_6_6 (DXR)

   VERIFY_ARE_EQUAL(values[1], (uint32_t)1);

   LPCWSTR optsBase[] = {L"-enable-lifetime-markers"};
   LPCWSTR optsZeroStore[] = {L"-enable-lifetime-markers", L"-force-zero-store-lifetimes"};

   // Logs the configuration label, runs it, and checks that element 1 still
   // holds its initial value (the shader writes back what it read).
   const auto runAndVerify = [&](LPCWSTR pLabel, D3D_SHADER_MODEL shaderModel,
                                 bool useLibTarget, LPCWSTR *pOpts, int numOpts) {
     WEX::Logging::Log::Comment(pLabel);
     RunLifetimeIntrinsicTest(pDevice, pShader, shaderModel, useLibTarget,
       pOpts, numOpts, values);
     VERIFY_ARE_EQUAL(values[1], (uint32_t)1);
   };

   runAndVerify(L"==== cs_6_0 with default translation",
     D3D_SHADER_MODEL_6_0, false, optsBase, _countof(optsBase));

   if (bDXRSupported) {
     runAndVerify(L"==== DXR lib_6_3 with default translation",
       D3D_SHADER_MODEL_6_3, true, optsBase, _countof(optsBase));
   }

   runAndVerify(L"==== cs_6_0 with zeroinitializer translation",
     D3D_SHADER_MODEL_6_0, false, optsZeroStore, _countof(optsZeroStore));

   if (bDXRSupported) {
     runAndVerify(L"==== DXR lib_6_3 with zeroinitializer translation",
       D3D_SHADER_MODEL_6_3, true, optsZeroStore, _countof(optsZeroStore));
   }

   if (bSM_6_6_Supported) {
     runAndVerify(L"==== cs_6_6 with zeroinitializer translation",
       D3D_SHADER_MODEL_6_6, false, optsZeroStore, _countof(optsZeroStore));

     if (bDXRSupported) {
       runAndVerify(L"==== DXR lib_6_6 with zeroinitializer translation",
         D3D_SHADER_MODEL_6_6, true, optsZeroStore, _countof(optsZeroStore));
     }

     runAndVerify(L"==== cs_6_6 with native lifetime markers",
       D3D_SHADER_MODEL_6_6, false, optsBase, _countof(optsBase));

     if (bDXRSupported) {
       runAndVerify(L"==== DXR lib_6_6 with native lifetime markers",
         D3D_SHADER_MODEL_6_6, true, optsBase, _countof(optsBase));
     }
   }
 }

 TEST_F(ExecutionTest, BasicComputeTest) {
 #ifndef _HLK_CONF
   // Minimal compute round trip, meant as a starting point for richer compute
   // execution tests: every thread loads its own dword from the raw buffer and
   // stores it back incremented by one.
   // The HLSL is kept compatible with shader models <=5.1 so the DXBC
   // rendering code paths can be used for comparison.
   static const char pShader[] =
     "RWByteAddressBuffer g_bab : register(u0);\r\n"
     "[numthreads(8,8,1)]\r\n"
     "void main(uint GI : SV_GroupIndex) {"
     "  uint addr = GI * 4;\r\n"
     "  uint val = g_bab.Load(addr);\r\n"
     "  DeviceMemoryBarrierWithGroupSync();\r\n"
     "  g_bab.Store(addr, val + 1);\r\n"
     "}";

   // Thread group shape mirrors the [numthreads] attribute above.
   static const int GroupSizeX = 8;
   static const int GroupSizeY = 8;
   static const int GroupSizeZ = 1;
   static const int GroupCount = 1;
   static const int TotalThreads = GroupSizeX * GroupSizeY * GroupSizeZ * GroupCount;

   CComPtr<ID3D12Device> pDevice;
   const bool deviceCreated = CreateDevice(&pDevice);
   if (!deviceCreated)
     return;

   // One element per thread, element i starting out as i.
   std::vector<uint32_t> values;
   SetupComputeValuePattern(values, TotalThreads);
   VERIFY_ARE_EQUAL(values.front(), (uint32_t)0);

   RunRWByteBufferComputeTest(pDevice, pShader, values);

   // Thread 0 must have bumped its element from 0 to 1.
   VERIFY_ARE_EQUAL(values.front(), (uint32_t)1);
 #endif
 }

 TEST_F(ExecutionTest, BasicTriangleTest) {
 #ifndef _HLK_CONF
   static const UINT FrameCount = 2;
   static const UINT m_width = 320;
   static const UINT m_height = 200;
   static const float m_aspectRatio = static_cast<float>(m_width) / static_cast<float>(m_height);

   struct Vertex {
     XMFLOAT3 position;
     XMFLOAT4 color;
   };

   // Pipeline objects.
   CComPtr<ID3D12Device> pDevice;
   CComPtr<ID3D12Resource> pRenderTarget;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12RootSignature> pRootSig;
   CComPtr<ID3D12DescriptorHeap> pRtvHeap;
   CComPtr<ID3D12PipelineState> pPipelineState;
   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   CComPtr<ID3D12Resource> pReadBuffer;
   UINT rtvDescriptorSize;

   CComPtr<ID3D12Resource> pVertexBuffer;
   D3D12_VERTEX_BUFFER_VIEW vertexBufferView;

   // Synchronization objects.
   FenceObj FO;

   // Shaders.
   static const char pShaders[] =
     "struct PSInput {\r\n"
     "  float4 position : SV_POSITION;\r\n"
     "  float4 color : COLOR;\r\n"
     "};\r\n\r\n"
     "PSInput VSMain(float4 position : POSITION, float4 color : COLOR) {\r\n"
     "  PSInput result;\r\n"
     "\r\n"
     "  result.position = position;\r\n"
     "  result.color = color;\r\n"
     "  return result;\r\n"
     "}\r\n\r\n"
     "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
     "  return 1; //input.color;\r\n"
     "};\r\n";

   if (!CreateDevice(&pDevice))
     return;

   struct BasicTestChecker {
     CComPtr<ID3D12Device> m_pDevice;
     CComPtr<ID3D12InfoQueue> m_pInfoQueue;
     bool m_OK = false;
     void SetOK(bool value) { m_OK = value; }
     BasicTestChecker(ID3D12Device *pDevice) : m_pDevice(pDevice) {
       if (FAILED(m_pDevice.QueryInterface(&m_pInfoQueue)))
         return;
       m_pInfoQueue->PushEmptyStorageFilter();
       m_pInfoQueue->PushEmptyRetrievalFilter();
     }
     ~BasicTestChecker() {
       if (!m_OK && m_pInfoQueue != nullptr) {
         UINT64 count = m_pInfoQueue->GetNumStoredMessages();
         bool invalidBytecodeFound = false;
         CAtlArray<BYTE> m_pBytes;
         for (UINT64 i = 0; i < count; ++i) {
           SIZE_T len = 0;
           if (FAILED(m_pInfoQueue->GetMessageA(i, nullptr, &len)))
             continue;
           if (m_pBytes.GetCount() < len && !m_pBytes.SetCount(len))
             continue;
           D3D12_MESSAGE *pMsg = (D3D12_MESSAGE *)m_pBytes.GetData();
           if (FAILED(m_pInfoQueue->GetMessageA(i, pMsg, &len)))
             continue;
           if (pMsg->ID == D3D12_MESSAGE_ID_CREATEVERTEXSHADER_INVALIDSHADERBYTECODE ||
               pMsg->ID == D3D12_MESSAGE_ID_CREATEPIXELSHADER_INVALIDSHADERBYTECODE) {
             invalidBytecodeFound = true;
             break;
           }
         }
         if (invalidBytecodeFound) {
           LogCommentFmt(L"%s", L"Found an invalid bytecode message. This "
             L"typically indicates that experimental mode "
             L"is not set up properly.");
           if (!GetTestParamBool(L"ExperimentalShaders")) {
             LogCommentFmt(L"Note that the ExperimentalShaders test parameter isn't set.");
           }
         }
         else {
           LogCommentFmt(L"Did not find corrupt pixel or vertex shaders in "
                         L"queue - dumping complete queue.");
           WriteInfoQueueMessages(nullptr, OutputFn, m_pInfoQueue);
         }
       }
     }
     static void __stdcall OutputFn(void *pCtx, const wchar_t *pMsg) {
       UNREFERENCED_PARAMETER(pCtx);
       LogCommentFmt(L"%s", pMsg);
     }
   };
   BasicTestChecker BTC(pDevice);
   {
     InitFenceObj(pDevice, &FO);
     CreateRtvDescriptorHeap(pDevice, FrameCount, &pRtvHeap, &rtvDescriptorSize);
     CreateRenderTargetAndReadback(pDevice, pRtvHeap, m_width, m_height, &pRenderTarget, &pReadBuffer);

     // Create an empty root signature.
     CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
     rootSignatureDesc.Init(
       0, nullptr, 0, nullptr,
       D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);
     CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSig);

     // Create the pipeline state, which includes compiling and loading shaders.
     // Define the vertex input layout.
     D3D12_INPUT_ELEMENT_DESC inputElementDescs[] = {
         {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
          D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
         {"COLOR", 0, DXGI_FORMAT_R32G32B32A32_FLOAT, 0, 12,
          D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
     D3D12_INPUT_LAYOUT_DESC InputLayout = { inputElementDescs, _countof(inputElementDescs) };
     CreateGraphicsPSO(pDevice, &InputLayout, pRootSig, pShaders, &pPipelineState);

     CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue,
                                       &pCommandAllocator, &pCommandList,
                                       pPipelineState);

     // Define the geometry for a triangle.
     Vertex triangleVertices[] = {
       { { 0.0f, 0.25f * m_aspectRatio, 0.0f },{ 1.0f, 0.0f, 0.0f, 1.0f } },
       { { 0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 1.0f, 0.0f, 1.0f } },
       { { -0.25f, -0.25f * m_aspectRatio, 0.0f },{ 0.0f, 0.0f, 1.0f, 1.0f } } };

     CreateVertexBuffer(pDevice, triangleVertices, &pVertexBuffer, &vertexBufferView);
     WaitForSignal(pCommandQueue, FO);
   }

   // Render and execute the command list.
   RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, 1,
                           &vertexBufferView, pRootSig, pRenderTarget,
                           pReadBuffer);
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);

   // Wait for previous frame.
   WaitForSignal(pCommandQueue, FO);

   // At this point, we've verified that execution succeeded with DXIL.
   BTC.SetOK(true);

   // Read back to CPU and examine contents.
   {
     MappedData data(pReadBuffer, m_width * m_height * 4);
     const uint32_t *pPixels = (uint32_t *)data.data();
     if (SaveImages()) {
       SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, m_width, m_height, L"basic.bmp");
     }
     uint32_t top = pPixels[m_width / 2]; // Top center.
     uint32_t mid = pPixels[m_width / 2 + m_width * (m_height / 2)]; // Middle center.
     VERIFY_ARE_EQUAL(0xff663300, top); // clear color
     VERIFY_ARE_EQUAL(0xffffffff, mid); // white
   }
 #endif
 }

 // Int64Test: dispatches one 8x8x1 thread group in which every thread loads a
 // 32-bit value from a RWByteAddressBuffer, squares it as a uint64_t and stores
 // the upper 32 bits of the product back to the same address. The test returns
 // early when no device can be created or the device reports no int64 support.
 TEST_F(ExecutionTest, Int64Test) {
   // Thread-group shape; mirrors [numthreads(8,8,1)] in the shader below.
   static const int GroupSizeX = 8;
   static const int GroupSizeY = 8;
   static const int GroupSizeZ = 1;
   static const int GroupCount = 1;
   static const int TotalThreadCount =
       GroupSizeX * GroupSizeY * GroupSizeZ * GroupCount;

   static const char pInt64Shader[] =
     "RWByteAddressBuffer g_bab : register(u0);\r\n"
     "[numthreads(8,8,1)]\r\n"
     "void main(uint GI : SV_GroupIndex) {"
     "  uint addr = GI * 4;\r\n"
     "  uint val = g_bab.Load(addr);\r\n"
     "  uint64_t u64 = val;\r\n"
     "  u64 *= val;\r\n"
     "  g_bab.Store(addr, (uint)(u64 >> 32));\r\n"
     "}";

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice)) {
     return;
   }

   // int64 is an optional capability: a device that does not declare it is
   // still conformant, so there is nothing to check in that case.
   if (!DoesDeviceSupportInt64(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
     return;
   }

   // One 32-bit input per thread; the first element is expected to be zero
   // both before the dispatch and after it.
   std::vector<uint32_t> values;
   SetupComputeValuePattern(values, TotalThreadCount);
   VERIFY_ARE_EQUAL(values[0], (uint32_t)0);

   RunRWByteBufferComputeTest(pDevice, pInt64Shader, values);
   VERIFY_ARE_EQUAL(values[0], (uint32_t)0);
 }

 // SignTest: runs the HLSL sign() intrinsic over eight signed integers stored in
 // a RWByteAddressBuffer (three negative, zero, four positive) and checks that
 // each slot is overwritten with -1, 0 or 1 respectively.
 TEST_F(ExecutionTest, SignTest) {
   // Thread-group shape; mirrors [numthreads(8,1,1)] in the shader below.
   static const int GroupSizeX = 8;
   static const int GroupSizeY = 1;
   static const int GroupSizeZ = 1;
   static const int GroupThreadCount = GroupSizeX * GroupSizeY * GroupSizeZ;
   static const int GroupCount = 1;

   static const char pSignShader[] =
     "RWByteAddressBuffer g_bab : register(u0);\r\n"
     "[numthreads(8,1,1)]\r\n"
     "void main(uint GI : SV_GroupIndex) {"
     "  uint addr = GI * 4;\r\n"
     "  int val = g_bab.Load(addr);\r\n"
     "  g_bab.Store(addr, (uint)(sign(val)));\r\n"
     "}";

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice)) {
     return;
   }

   // The buffer holds raw 32-bit words, so negative inputs (and the expected
   // -1 result) are expressed as their unsigned bit patterns.
   const uint32_t neg1 = (uint32_t)-1;
   uint32_t inputs[] = { (uint32_t)-3, (uint32_t)-2, neg1, 0, 1, 2, 3, 4 };
   std::vector<uint32_t> values(std::begin(inputs), std::end(inputs));

   RunRWByteBufferComputeTest(pDevice, pSignShader, values);

   // Negative inputs.
   VERIFY_ARE_EQUAL(values[0], neg1);
   VERIFY_ARE_EQUAL(values[1], neg1);
   VERIFY_ARE_EQUAL(values[2], neg1);
   // Zero.
   VERIFY_ARE_EQUAL(values[3], (uint32_t)0);
   // Positive inputs.
   VERIFY_ARE_EQUAL(values[4], (uint32_t)1);
   VERIFY_ARE_EQUAL(values[5], (uint32_t)1);
   VERIFY_ARE_EQUAL(values[6], (uint32_t)1);
   VERIFY_ARE_EQUAL(values[7], (uint32_t)1);
 }

 // WaveIntrinsicsDDITest: queries D3D12_FEATURE_D3D12_OPTIONS1 and checks that
 // the reported wave lane counts are consistent with the WaveOps flag: both
 // non-zero when wave ops are supported, both zero otherwise, and min <= max in
 // either case. Compiled out when _HLK_CONF is defined.
 TEST_F(ExecutionTest, WaveIntrinsicsDDITest) {
 #ifndef _HLK_CONF
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice)) {
     return;
   }

   // If the options query itself fails there is nothing to validate.
   D3D12_FEATURE_DATA_D3D12_OPTIONS1 options1;
   const HRESULT hrQuery = pDevice->CheckFeatureSupport(
       (D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &options1, sizeof(options1));
   if (FAILED(hrQuery)) {
     return;
   }

   bool waveSupported = options1.WaveOps;
   UINT laneCountMin = options1.WaveLaneCountMin;
   UINT laneCountMax = options1.WaveLaneCountMax;
   LogCommentFmt(L"WaveOps %i, WaveLaneCountMin %u, WaveLaneCountMax %u", waveSupported, laneCountMin, laneCountMax);

   VERIFY_IS_TRUE(laneCountMin <= laneCountMax);
   if (!waveSupported) {
     // No wave support: both lane counts must be reported as zero.
     VERIFY_IS_TRUE(laneCountMin == 0 && laneCountMax == 0);
   }
   else {
     // Wave support: both lane counts must be non-zero.
     VERIFY_IS_TRUE(laneCountMin > 0 && laneCountMax > 0);
   }
 #endif
 }

 // WaveIntrinsicsTest: runs a compute shader (one 8x8x1 thread group) that
 // records the results of the wave intrinsics into one PerThreadData element per
 // thread, reads the buffer back, groups the threads into waves by the value
 // returned from WaveReadLaneFirst(id), and recomputes on the CPU what each
 // wave-wide and prefix operation should have produced.
 // Returns early when no device can be created or the device reports no wave
 // support. Compiled out when _HLK_CONF is defined.
 TEST_F(ExecutionTest, WaveIntrinsicsTest) {
 #ifndef _HLK_CONF
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   // CPU-side mirror of the shader's PerThreadData struct declared below; the
   // field order and sizes must stay in sync with the HLSL declaration.
   struct PerThreadData {
     uint32_t id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;
     uint32_t allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;
     uint32_t pfBC, pfSum, pfProd;
     uint32_t ballot[4];
     uint32_t diver;   // divergent value, used in calculation
     int32_t i_diver;  // divergent value, used in calculation
     int32_t i_allMax, i_allMin, i_allSum, i_allProd;
     int32_t i_pfSum, i_pfProd;
   };
   static const char pShader[] =
     WAVE_INTRINSIC_DXBC_GUARD
     "struct PerThreadData {\r\n"
     " uint id, flags, laneIndex, laneCount, firstLaneId, preds, firstlaneX, lane1X;\r\n"
     " uint allBC, allSum, allProd, allAND, allOR, allXOR, allMin, allMax;\r\n"
     " uint pfBC, pfSum, pfProd;\r\n"
     " uint4 ballot;\r\n"
     " uint diver;\r\n"
     " int i_diver;\r\n"
     " int i_allMax, i_allMin, i_allSum, i_allProd;\r\n"
     " int i_pfSum, i_pfProd;\r\n"
     "};\r\n"
     "RWStructuredBuffer<PerThreadData> g_sb : register(u0);\r\n"
     "[numthreads(8,8,1)]\r\n"
     "void main(uint GI : SV_GroupIndex, uint3 GTID : SV_GroupThreadID) {"
     "  PerThreadData pts = g_sb[GI];\r\n"
     "  uint diver = GTID.x + 2;\r\n"
     "  pts.diver = diver;\r\n"
     "  pts.flags = 0;\r\n"
     "  pts.preds = 0;\r\n"
     "  if (WaveIsFirstLane()) pts.flags |= 1;\r\n"
     "  pts.laneIndex = WaveGetLaneIndex();\r\n"
     "  pts.laneCount = WaveGetLaneCount();\r\n"
     "  pts.firstLaneId = WaveReadLaneFirst(pts.id);\r\n"
     "  pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);\r\n"
     "  pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);\r\n"
     "  pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);\r\n"
     "  pts.preds |= ((WaveActiveAllEqual(GTID.z) ? 1 : 0) << 3);\r\n"
     "  pts.preds |= ((WaveActiveAllEqual(WaveReadLaneFirst(diver)) ? 1 : 0) << 4);\r\n"
     "  pts.ballot = WaveActiveBallot(diver > 3);\r\n"
     "  pts.firstlaneX = WaveReadLaneFirst(GTID.x);\r\n"
     "  pts.lane1X = WaveReadLaneAt(GTID.x, 1);\r\n"
     "\r\n"
     "  pts.allBC = WaveActiveCountBits(diver > 3);\r\n"
     "  pts.allSum = WaveActiveSum(diver);\r\n"
     "  pts.allProd = WaveActiveProduct(diver);\r\n"
     "  pts.allAND = WaveActiveBitAnd(diver);\r\n"
     "  pts.allOR = WaveActiveBitOr(diver);\r\n"
     "  pts.allXOR = WaveActiveBitXor(diver);\r\n"
     "  pts.allMin = WaveActiveMin(diver);\r\n"
     "  pts.allMax = WaveActiveMax(diver);\r\n"
     "\r\n"
     "  pts.pfBC = WavePrefixCountBits(diver > 3);\r\n"
     "  pts.pfSum = WavePrefixSum(diver);\r\n"
     "  pts.pfProd = WavePrefixProduct(diver);\r\n"
     "\r\n"
     "  int i_diver = pts.i_diver;\r\n"
     "  pts.i_allMax = WaveActiveMax(i_diver);\r\n"
     "  pts.i_allMin = WaveActiveMin(i_diver);\r\n"
     "  pts.i_allSum = WaveActiveSum(i_diver);\r\n"
     "  pts.i_allProd = WaveActiveProduct(i_diver);\r\n"
     "  pts.i_pfSum = WavePrefixSum(i_diver);\r\n"
     "  pts.i_pfProd = WavePrefixProduct(i_diver);\r\n"
     "\r\n"
     "  g_sb[GI] = pts;\r\n"
     "}";
   // Thread-group shape; mirrors [numthreads(8,8,1)] in the shader above.
   static const int NumThreadsX = 8;
   static const int NumThreadsY = 8;
   static const int NumThreadsZ = 1;
   static const int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
   static const int DispatchGroupCount = 1;

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   // Seed the per-thread inputs: 'id' is the element index and 'i_diver' is the
   // index with alternating sign (negative for even indices, positive for odd).
   std::vector<PerThreadData> values;
   values.resize(ThreadsPerGroup * DispatchGroupCount);
   for (size_t i = 0; i < values.size(); ++i) {
     memset(&values[i], 0, sizeof(PerThreadData));
     values[i].id = (uint32_t)i;
     values[i].i_diver = (int)i;
     values[i].i_diver *= (i % 2) ? 1 : -1;
   }

   static const int DispatchGroupX = 1;
   static const int DispatchGroupY = 1;
   static const int DispatchGroupZ = 1;

   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12DescriptorHeap> pUavHeap;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   UINT uavDescriptorSize;
   FenceObj FO;
   bool dxbc = UseDxbc();

   const size_t valueSizeInBytes = values.size() * sizeof(PerThreadData);
   CreateComputeCommandQueue(pDevice, L"WaveIntrinsicsTest Command Queue", &pCommandQueue);
   InitFenceObj(pDevice, &FO);

   // Describe and create a UAV descriptor heap.
   D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
   heapDesc.NumDescriptors = 1;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
   uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);

   // Create root signature: a single descriptor table holding one UAV (u0).
   CComPtr<ID3D12RootSignature> pRootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE ranges[1];
     ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, 0);

     CD3DX12_ROOT_PARAMETER rootParameters[1];
     rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);

     CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
     rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);

     CComPtr<ID3DBlob> signature;
     CComPtr<ID3DBlob> error;
     VERIFY_SUCCEEDED(D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error));
     VERIFY_SUCCEEDED(pDevice->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(), IID_PPV_ARGS(&pRootSignature)));
   }

   // Create pipeline state object.
   CComPtr<ID3D12PipelineState> pComputeState;
   CreateComputePSO(pDevice, pRootSignature, pShader, L"cs_6_0", &pComputeState);

   // Create a command allocator and list for compute.
   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));

   // Set up UAV resource.
   CComPtr<ID3D12Resource> pUavResource;
   CComPtr<ID3D12Resource> pReadBuffer;
   CComPtr<ID3D12Resource> pUploadResource;
   CreateTestUavs(pDevice, pCommandList, values.data(), (UINT)valueSizeInBytes, &pUavResource, &pUploadResource, &pReadBuffer);

   // Close the command list and execute it to perform the GPU setup.
   // Close() reports recording errors through its HRESULT, so check it.
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   VERIFY_SUCCEEDED(pCommandAllocator->Reset());
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));

   // Run the compute shader and copy the results back to readable memory.
   {
     D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
     uavDesc.Format = DXGI_FORMAT_UNKNOWN;
     uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
     uavDesc.Buffer.FirstElement = 0;
     uavDesc.Buffer.NumElements = (UINT)values.size();
     uavDesc.Buffer.StructureByteStride = sizeof(PerThreadData);
     uavDesc.Buffer.CounterOffsetInBytes = 0;
     uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
     CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
     pDevice->CreateUnorderedAccessView(pUavResource, nullptr, &uavDesc, uavHandle);
     SetDescriptorHeap(pCommandList, pUavHeap);
     pCommandList->SetComputeRootSignature(pRootSignature);
     pCommandList->SetComputeRootDescriptorTable(0, uavHandleGpu);
   }
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
   RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->CopyResource(pReadBuffer, pUavResource);
   VERIFY_SUCCEEDED(pCommandList->Close());
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   {
     MappedData mappedData(pReadBuffer, (UINT)valueSizeInBytes);
     PerThreadData *pData = (PerThreadData *)mappedData.data();
     memcpy(values.data(), pData, valueSizeInBytes);

     // Gather some general data.
     // The 'firstLaneId' captures a unique number per first-lane per wave.
     // Counting the number distinct firstLaneIds gives us the number of waves.
     std::vector<uint32_t> firstLaneIds;
     for (size_t i = 0; i < values.size(); ++i) {
       PerThreadData &pts = values[i];
       uint32_t firstLaneId = pts.firstLaneId;
       if (!contains(firstLaneIds, firstLaneId)) {
         firstLaneIds.push_back(firstLaneId);
       }
     }

     // Waves should cover 4 threads or more.
     // The count is cast explicitly so the argument matches the single %u.
     LogCommentFmt(L"Found %u distinct first lane ids", (UINT)firstLaneIds.size());
     if (!dxbc) {
       VERIFY_IS_GREATER_THAN_OR_EQUAL(values.size() / 4, firstLaneIds.size());
     }

     // Now, group threads into waves.
     std::map<uint32_t, std::unique_ptr<std::vector<PerThreadData *> > > waves;
     for (size_t i = 0; i < firstLaneIds.size(); ++i) {
       waves[firstLaneIds[i]] = std::make_unique<std::vector<PerThreadData *> >();
     }
     for (size_t i = 0; i < values.size(); ++i) {
       PerThreadData &pts = values[i];
       std::unique_ptr<std::vector<PerThreadData *> > &wave = waves[pts.firstLaneId];
       wave->push_back(&pts);
     }

     // Orders lanes by increasing lane index within a wave.
     struct LaneIdOrderPred {
       bool operator()(const PerThreadData *a, const PerThreadData *b) const {
         return a->laneIndex < b->laneIndex;
       }
     };

     // 32-bit multiply with wrap-around. Signed overflow is undefined behavior
     // in C++, and the product of the signed divergent values can exceed the
     // int32_t range for larger waves, so multiply as unsigned and convert back.
     auto wrapMul = [](int32_t a, int32_t b) -> int32_t {
       return (int32_t)((uint32_t)a * (uint32_t)b);
     };

     // Verify that all the wave values are coherent across the wave. Every wave
     // holds at least the thread that contributed its firstLaneId, and each
     // wave is checked exactly once.
     for (auto &waveEntry : waves) {
       std::vector<PerThreadData *> &lanes = *waveEntry.second;
       // Sort the lanes by increasing lane ID.
       std::sort(lanes.begin(), lanes.end(), LaneIdOrderPred());

       // Verify some interesting properties of the first lane.
       uint32_t pfBC, pfSum, pfProd;
       int32_t i_pfSum, i_pfProd;
       int32_t i_allMax, i_allMin;
       {
         PerThreadData *ptdFirst = lanes.front();
         VERIFY_IS_TRUE(0 != (ptdFirst->flags & 1)); // FirstLane sets this bit.
         VERIFY_IS_TRUE(0 == ptdFirst->pfBC);
         VERIFY_IS_TRUE(0 == ptdFirst->pfSum);
         VERIFY_IS_TRUE(1 == ptdFirst->pfProd);
         VERIFY_IS_TRUE(0 == ptdFirst->i_pfSum);
         VERIFY_IS_TRUE(1 == ptdFirst->i_pfProd);
         // Seed the running prefix values with the first lane's contribution;
         // they are what the second lane is expected to report.
         pfBC = (ptdFirst->diver > 3) ? 1 : 0;
         pfSum = ptdFirst->diver;
         pfProd = ptdFirst->diver;
         i_pfSum = ptdFirst->i_diver;
         i_pfProd = ptdFirst->i_diver;
         i_allMax = i_allMin = ptdFirst->i_diver;
       }

       // Calculate values which take into consideration all lanes.
       uint32_t preds = 0;
       preds |= 1 << 1; // AllTrue starts true, switches to false if needed.
       preds |= 1 << 2; // AllEqual starts true, switches to false if needed.
       preds |= 1 << 3; // WaveActiveAllEqual(GTID.z) is always true
       preds |= 1 << 4; // (WaveActiveAllEqual(WaveReadLaneFirst(diver)) is always true
       uint32_t ballot[4] = { 0, 0, 0, 0 };
       int32_t i_allSum = 0, i_allProd = 1;
       for (size_t n = 0; n < lanes.size(); ++n) {
         // pts.preds |= ((WaveActiveAnyTrue(diver == 1) ? 1 : 0) << 0);
         if (lanes[n]->diver == 1) preds |= (1 << 0);
         // pts.preds |= ((WaveActiveAllTrue(diver == 1) ? 1 : 0) << 1);
         if (lanes[n]->diver != 1) preds &= ~(1 << 1);
         // pts.preds |= ((WaveActiveAllEqual(diver) ? 1 : 0) << 2);
         if (lanes[0]->diver != lanes[n]->diver) preds &= ~(1 << 2);
         // pts.ballot = WaveActiveBallot(diver > 3);\r\n"
         if (lanes[n]->diver > 3) {
           // This is the uint4 result layout:
           // .x -> bits  0 .. 31
           // .y -> bits 32 .. 63
           // .z -> bits 64 .. 95
           // .w -> bits 96 ..127
           uint32_t component = lanes[n]->laneIndex / 32;
           uint32_t bit = lanes[n]->laneIndex % 32;
           // Unsigned literal: shifting a signed 1 into bit 31 is not well defined.
           ballot[component] |= 1u << bit;
         }
         i_allMax = std::max(lanes[n]->i_diver, i_allMax);
         i_allMin = std::min(lanes[n]->i_diver, i_allMin);
         i_allProd = wrapMul(i_allProd, lanes[n]->i_diver);
         i_allSum += lanes[n]->i_diver;
       }

       for (size_t n = 1; n < lanes.size(); ++n) {
         // 'All' operations are uniform across the wave.
         VERIFY_IS_TRUE(0 == (lanes[n]->flags & 1)); // non-firstlanes do not set this bit
         VERIFY_ARE_EQUAL(lanes[0]->allBC, lanes[n]->allBC);
         VERIFY_ARE_EQUAL(lanes[0]->allSum, lanes[n]->allSum);
         VERIFY_ARE_EQUAL(lanes[0]->allProd, lanes[n]->allProd);
         VERIFY_ARE_EQUAL(lanes[0]->allAND, lanes[n]->allAND);
         VERIFY_ARE_EQUAL(lanes[0]->allOR, lanes[n]->allOR);
         VERIFY_ARE_EQUAL(lanes[0]->allXOR, lanes[n]->allXOR);
         VERIFY_ARE_EQUAL(lanes[0]->allMin, lanes[n]->allMin);
         VERIFY_ARE_EQUAL(lanes[0]->allMax, lanes[n]->allMax);
         VERIFY_ARE_EQUAL(i_allMax, lanes[n]->i_allMax);
         VERIFY_ARE_EQUAL(i_allMin, lanes[n]->i_allMin);
         VERIFY_ARE_EQUAL(i_allProd, lanes[n]->i_allProd);
         VERIFY_ARE_EQUAL(i_allSum, lanes[n]->i_allSum);

         // first-lane reads and uniform reads are uniform across the wave.
         VERIFY_ARE_EQUAL(lanes[0]->firstlaneX, lanes[n]->firstlaneX);
         VERIFY_ARE_EQUAL(lanes[0]->lane1X, lanes[n]->lane1X);

         // the lane count is uniform across the wave.
         VERIFY_ARE_EQUAL(lanes[0]->laneCount, lanes[n]->laneCount);

         // The predicates are uniform across the wave.
         VERIFY_ARE_EQUAL(lanes[n]->preds, preds);

         // the lane index is distinct per thread.
         for (size_t prior = 0; prior < n; ++prior) {
           VERIFY_ARE_NOT_EQUAL(lanes[prior]->laneIndex, lanes[n]->laneIndex);
         }
         // Ballot results are uniform across the wave.
         VERIFY_ARE_EQUAL(0, memcmp(ballot, lanes[n]->ballot, sizeof(ballot)));

         // Keep running total of prefix calculation. Prefix values are exclusive to
         // the executing lane.
         VERIFY_ARE_EQUAL(pfBC, lanes[n]->pfBC);
         VERIFY_ARE_EQUAL(pfSum, lanes[n]->pfSum);
         VERIFY_ARE_EQUAL(pfProd, lanes[n]->pfProd);
         VERIFY_ARE_EQUAL(i_pfSum, lanes[n]->i_pfSum);
         VERIFY_ARE_EQUAL(i_pfProd, lanes[n]->i_pfProd);
         pfBC += (lanes[n]->diver > 3) ? 1 : 0;
         pfSum += lanes[n]->diver;
         pfProd *= lanes[n]->diver;
         i_pfSum += lanes[n]->i_diver;
         i_pfProd = wrapMul(i_pfProd, lanes[n]->i_diver);
       }
       // TODO: add divergent branching and verify that the otherwise uniform values properly diverge
     }

     // Compare each value of each per-thread element.
     for (size_t i = 0; i < values.size(); ++i) {
       PerThreadData &pts = values[i];
       VERIFY_ARE_EQUAL(i, pts.id); // ID is unchanged.
     }
   }
 #endif
 }

 // This test is assuming that the adapter implements WaveReadLaneFirst correctly
 TEST_F(ExecutionTest, WaveIntrinsicsInPSTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   struct Vertex {
     XMFLOAT3 position;
   };

   struct PerPixelData {
     XMFLOAT4 position;
     uint32_t id, flags, laneIndex, laneCount, firstLaneId, sum1;
     uint32_t id0, id1, id2, id3;
     uint32_t acrossX, acrossY, acrossDiag, quadActiveCount;
   };

   const UINT RTWidth = 128;
   const UINT RTHeight = 128;

   // Shaders.
   static const char pShaders[] =
     WAVE_INTRINSIC_DXBC_GUARD
     "struct PSInput {\r\n"
     "  float4 position : SV_POSITION;\r\n"
     "};\r\n\r\n"
     "PSInput VSMain(float4 position : POSITION) {\r\n"
     "  PSInput result;\r\n"
     "\r\n"
     "  result.position = position;\r\n"
     "  return result;\r\n"
     "}\r\n\r\n"
     "uint pos_to_id(float4 pos) { return pos.x * 128 + pos.y; }\r\n"
     "struct PerPixelData {\r\n"
     " float4 position;\r\n"
     " uint id, flags, laneIndex, laneCount, firstLaneId, sum1;\r\n"
     " uint id0, id1, id2, id3;\r\n"
     " uint acrossX, acrossY, acrossDiag, quadActiveCount;\r\n"
     "};\r\n"
     "AppendStructuredBuffer<PerPixelData> g_sb : register(u1);\r\n"
     "float4 PSMain(PSInput input) : SV_TARGET {\r\n"
     "  uint one = 1;\r\n"
     "  PerPixelData d;\r\n"
     "  d.position = input.position;\r\n"
     "  d.id = pos_to_id(input.position);\r\n"
     "  d.flags = 0;\r\n"
     "  if (WaveIsFirstLane()) d.flags |= 1;\r\n"
     "  d.laneIndex = WaveGetLaneIndex();\r\n"
     "  d.laneCount = WaveGetLaneCount();\r\n"
     "  d.firstLaneId = WaveReadLaneFirst(d.id);\r\n"
     "  d.sum1 = WaveActiveSum(one);\r\n"
     "  d.id0 = QuadReadLaneAt(d.id, 0);\r\n"
     "  d.id1 = QuadReadLaneAt(d.id, 1);\r\n"
     "  d.id2 = QuadReadLaneAt(d.id, 2);\r\n"
     "  d.id3 = QuadReadLaneAt(d.id, 3);\r\n"
     "  d.acrossX = QuadReadAcrossX(d.id);\r\n"
     "  d.acrossY = QuadReadAcrossY(d.id);\r\n"
     "  d.acrossDiag = QuadReadAcrossDiagonal(d.id);\r\n"
     "  d.quadActiveCount = one + QuadReadAcrossX(one) + QuadReadAcrossY(one) + QuadReadAcrossDiagonal(one);\r\n"
     "  g_sb.Append(d);\r\n"
     "  return 1;\r\n"
     "};\r\n";

   CComPtr<ID3D12Device> pDevice;
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12DescriptorHeap> pUavHeap, pRtvHeap;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   CComPtr<ID3D12PipelineState> pPSO;
   CComPtr<ID3D12Resource> pRenderTarget, pReadBuffer;
   UINT uavDescriptorSize, rtvDescriptorSize;
   CComPtr<ID3D12Resource> pVertexBuffer;
   D3D12_VERTEX_BUFFER_VIEW vertexBufferView;

   if (!CreateDevice(&pDevice))
     return;
   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   FenceObj FO;
   InitFenceObj(pDevice, &FO);

   // Describe and create a UAV descriptor heap.
   D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
   heapDesc.NumDescriptors = 1;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));
   uavDescriptorSize = pDevice->GetDescriptorHandleIncrementSize(heapDesc.Type);

   CreateRtvDescriptorHeap(pDevice, 1, &pRtvHeap, &rtvDescriptorSize);
   CreateRenderTargetAndReadback(pDevice, pRtvHeap, RTHeight, RTWidth, &pRenderTarget, &pReadBuffer);

   // Create root signature: one UAV.
   CComPtr<ID3D12RootSignature> pRootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE ranges[1];
     ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0, 0);

     CD3DX12_ROOT_PARAMETER rootParameters[1];
     rootParameters[0].InitAsDescriptorTable(1, &ranges[0], D3D12_SHADER_VISIBILITY_ALL);

     CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
     rootSignatureDesc.Init(_countof(rootParameters), rootParameters, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT);

     CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
   }

   D3D12_INPUT_ELEMENT_DESC elementDesc[] = {
       {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0,
        D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}};
   D3D12_INPUT_LAYOUT_DESC InputLayout = {elementDesc, _countof(elementDesc)};
   CreateGraphicsPSO(pDevice, &InputLayout, pRootSignature, pShaders, &pPSO);

   CreateGraphicsCommandQueueAndList(pDevice, &pCommandQueue, &pCommandAllocator,
                                     &pCommandList, pPSO);

   // Single triangle covering half the target.
   Vertex vertices[] = {
     { { -1.0f,  1.0f, 0.0f } },
     { {  1.0f,  1.0f, 0.0f } },
     { { -1.0f, -1.0f, 0.0f } } };
   const UINT TriangleCount = _countof(vertices) / 3;

   CreateVertexBuffer(pDevice, vertices, &pVertexBuffer, &vertexBufferView);

   bool dxbc = UseDxbc();

   // Set up UAV resource.
   std::vector<PerPixelData> values;
   values.resize(RTWidth * RTHeight * 2);
   UINT valueSizeInBytes = (UINT)values.size() * sizeof(PerPixelData);
   memset(values.data(), 0, valueSizeInBytes);
   CComPtr<ID3D12Resource> pUavResource;
   CComPtr<ID3D12Resource> pUavReadBuffer;
   CComPtr<ID3D12Resource> pUploadResource;
   CreateTestUavs(pDevice, pCommandList, values.data(), valueSizeInBytes, &pUavResource, &pUploadResource, &pUavReadBuffer);

   // Set up the append counter resource.
   CComPtr<ID3D12Resource> pUavCounterResource;
   CComPtr<ID3D12Resource> pReadCounterBuffer;
   CComPtr<ID3D12Resource> pUploadCounterResource;
   BYTE zero[sizeof(UINT)] = { 0 };
   CreateTestUavs(pDevice, pCommandList, zero, sizeof(zero), &pUavCounterResource, &pUploadCounterResource, &pReadCounterBuffer);

   // Close the command list and execute it to perform the GPU setup.
   pCommandList->Close();
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   VERIFY_SUCCEEDED(pCommandAllocator->Reset());
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));

   pCommandList->SetGraphicsRootSignature(pRootSignature);
   SetDescriptorHeap(pCommandList, pUavHeap);
   {
     D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
     uavDesc.Format = DXGI_FORMAT_UNKNOWN;
     uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
     uavDesc.Buffer.FirstElement = 0;
     uavDesc.Buffer.NumElements = (UINT)values.size();
     uavDesc.Buffer.StructureByteStride = sizeof(PerPixelData);
     uavDesc.Buffer.CounterOffsetInBytes = 0;
     uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
     CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
     CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandleGpu(pUavHeap->GetGPUDescriptorHandleForHeapStart());
     pDevice->CreateUnorderedAccessView(pUavResource, pUavCounterResource, &uavDesc, uavHandle);
     pCommandList->SetGraphicsRootDescriptorTable(0, uavHandleGpu);
   }
   RecordRenderAndReadback(pCommandList, pRtvHeap, rtvDescriptorSize, TriangleCount, &vertexBufferView, nullptr, pRenderTarget, pReadBuffer);
   RecordTransitionBarrier(pCommandList, pUavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   RecordTransitionBarrier(pCommandList, pUavCounterResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->CopyResource(pUavReadBuffer, pUavResource);
   pCommandList->CopyResource(pReadCounterBuffer, pUavCounterResource);
   VERIFY_SUCCEEDED(pCommandList->Close());
   LogCommentFmt(L"Rendering to %u by %u", RTWidth, RTHeight);
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   {
     MappedData data(pReadBuffer, RTWidth * RTHeight * 4);
     const uint32_t *pPixels = (uint32_t *)data.data();
     if (SaveImages()) {
       SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, RTWidth, RTHeight, L"psintrin.bmp");
     }
   }

   uint32_t appendCount;
   {
     MappedData mappedData(pReadCounterBuffer, sizeof(uint32_t));
     appendCount = *((uint32_t *)mappedData.data());
     LogCommentFmt(L"%u elements in append buffer", appendCount);
   }

   {
     MappedData mappedData(pUavReadBuffer, (UINT32)values.size());
     PerPixelData *pData = (PerPixelData *)mappedData.data();
     memcpy(values.data(), pData, valueSizeInBytes);

     // DXBC is handy to test pipeline setup, but interesting functions are
     // stubbed out, so there is no point in further validation.
     if (dxbc)
       return;

     uint32_t maxActiveLaneCount = 0;
     uint32_t maxLaneCount = 0;
     for (uint32_t i = 0; i < appendCount; ++i) {
       maxActiveLaneCount = std::max(maxActiveLaneCount, values[i].sum1);
       maxLaneCount = std::max(maxLaneCount, values[i].laneCount);
     }

     uint32_t peerOfHelperLanes = 0;
     for (uint32_t i = 0; i < appendCount; ++i) {
       if (values[i].sum1 != maxActiveLaneCount) {
         ++peerOfHelperLanes;
       }
     }

     LogCommentFmt(
         L"Found: %u threads. Waves reported up to %u total lanes, up "
         L"to %u active lanes, and %u threads had helper/inactive lanes.",
         appendCount, maxLaneCount, maxActiveLaneCount, peerOfHelperLanes);

     // Group threads into quad invocations.
     uint32_t singlePixelCount = 0;
     uint32_t multiPixelCount = 0;
     std::unordered_set<uint32_t> ids;
     std::multimap<uint32_t, PerPixelData *> idGroups;
     std::multimap<uint32_t, PerPixelData *> firstIdGroups;
     for (uint32_t i = 0; i < appendCount; ++i) {
       ids.insert(values[i].id);
       idGroups.insert(std::make_pair(values[i].id, &values[i]));
       firstIdGroups.insert(std::make_pair(values[i].firstLaneId, &values[i]));
     }
     for (uint32_t id : ids) {
       if (idGroups.count(id) == 1)
         ++singlePixelCount;
       else
         ++multiPixelCount;
     }
     LogCommentFmt(L"%u pixels were processed by a single thread. %u invocations were for shared pixels.",
       singlePixelCount, multiPixelCount);

     // Multiple threads may have tried to shade the same pixel. (Is this true even if we have only one triangle?)
     // Where every pixel is distinct, it's very straightforward to validate.
     {
       auto cur = firstIdGroups.begin(), end = firstIdGroups.end();
       while (cur != end) {
         bool simpleWave = true;
         uint32_t firstId = (*cur).first;
         auto groupEnd = cur;
         while (groupEnd != end && (*groupEnd).first == firstId) {
           if (idGroups.count((*groupEnd).second->id) > 1)
             simpleWave = false;
           ++groupEnd;
         }
         if (simpleWave) {
           // Break the wave into quads.
           struct QuadData {
             unsigned count;
             PerPixelData *data[4];
           };
           std::map<uint32_t, QuadData> quads;
           for (auto i = cur; i != groupEnd; ++i) {
             // assuming that it is a simple wave, idGroups has a unique id for each entry.
             uint32_t laneId = (*i).second->id;
             uint32_t laneIds[4] = {(*i).second->id0, (*i).second->id1,
                                    (*i).second->id2, (*i).second->id3};
             // Since this is a simple wave, each lane has an unique id and
             // therefore should not have any ids in there.
             VERIFY_IS_TRUE(quads.find(laneId) == quads.end());
             // check if QuadReadLaneAt is returning same values in a single quad.
             bool newQuad = true;
             for (unsigned quadIndex = 0; quadIndex < 4; ++quadIndex) {
               auto match = quads.find(laneIds[quadIndex]);
               if (match != quads.end()) {
                 (*match).second.data[(*match).second.count++] = (*i).second;
                 newQuad = false;
                 break;
               }
               auto quadMemberData = idGroups.find(laneIds[quadIndex]);
               if (quadMemberData != idGroups.end()) {
                 VERIFY_IS_TRUE((*quadMemberData).second->id0 == laneIds[0]);
                 VERIFY_IS_TRUE((*quadMemberData).second->id1 == laneIds[1]);
                 VERIFY_IS_TRUE((*quadMemberData).second->id2 == laneIds[2]);
                 VERIFY_IS_TRUE((*quadMemberData).second->id3 == laneIds[3]);
               }
             }
             if (newQuad) {
               QuadData qdata;
               qdata.count = 1;
               qdata.data[0] = (*i).second;
               quads.insert(std::make_pair(laneId, qdata));
             }
           }
           for (auto quadPair : quads) {
             unsigned count = quadPair.second.count;
             // There could be only one pixel data on the edge of the triangle
             if (count < 2) continue;
             PerPixelData **data = quadPair.second.data;
             bool isTop[4];
             bool isLeft[4];
             PerPixelData helperData;
             memset(&helperData, sizeof(helperData), 0);
             PerPixelData *layout[4]; // tl,tr,bl,br
             memset(layout, sizeof(layout), 0);
             auto fnToLayout = [&](bool top, bool left) -> PerPixelData ** {
               int idx = top ? 0 : 2;
               idx += left ? 0 : 1;
               return &layout[idx];
             };
             auto fnToLayoutData = [&](bool top, bool left) -> PerPixelData * {
               PerPixelData **pResult = fnToLayout(top, left);
               if (*pResult == nullptr) return &helperData;
               return *pResult;
             };
             VERIFY_IS_TRUE(count <= 4);
             if (count == 2) {
               isTop[0] = data[0]->position.y < data[1]->position.y;
               isTop[1] = (data[0]->position.y == data[1]->position.y) ? isTop[0] : !isTop[0];
               isLeft[0] = data[0]->position.x < data[1]->position.x;
               isLeft[1] = (data[0]->position.x == data[1]->position.x) ? isLeft[0] : !isLeft[0];
             }
             else {
               // with at least three samples, we have distinct x and y coordinates.
               float left = std::min(data[0]->position.x, data[1]->position.x);
               left = std::min(data[2]->position.x, left);
               float top = std::min(data[0]->position.y, data[1]->position.y);
               top = std::min(data[2]->position.y, top);
               for (unsigned i = 0; i < count; ++i) {
                 isTop[i] = data[i]->position.y == top;
                 isLeft[i] = data[i]->position.x == left;
               }
             }
             for (unsigned i = 0; i < count; ++i) {
               *(fnToLayout(isTop[i], isLeft[i])) = data[i];
             }

             // Finally, we have a proper quad reconstructed. Validate.
             for (unsigned i = 0; i < count; ++i) {
               PerPixelData *d = data[i];
               VERIFY_ARE_EQUAL(d->id0, fnToLayoutData(true, true)->id);
               VERIFY_ARE_EQUAL(d->id1, fnToLayoutData(true, false)->id);
               VERIFY_ARE_EQUAL(d->id2, fnToLayoutData(false, true)->id);
               VERIFY_ARE_EQUAL(d->id3, fnToLayoutData(false, false)->id);
               VERIFY_ARE_EQUAL(d->acrossX, fnToLayoutData(isTop[i], !isLeft[i])->id);
               VERIFY_ARE_EQUAL(d->acrossY, fnToLayoutData(!isTop[i], isLeft[i])->id);
               VERIFY_ARE_EQUAL(d->acrossDiag, fnToLayoutData(!isTop[i], !isLeft[i])->id);
               VERIFY_ARE_EQUAL(d->quadActiveCount, count);
             }
           }
         }
         cur = groupEnd;
       }
     }

     // TODO: provide validation for quads where the same pixel was shaded multiple times
     //
     // Consider: for pixels that were shaded multiple times, check whether
     // some grouping of threads into quads satisfies all value requirements.
   }
 }

 // Bundles what a shader-op run produces so that callers can read results
 // back while the parsed set and the executed test are still alive.
 struct ShaderOpTestResult {
   // Operation that was run. RunShaderOpTestAfterParse fills this with a
   // pointer obtained from ShaderOpSet, so it is non-owning. Defaulted to null
   // so a freshly created result never carries an indeterminate pointer.
   st::ShaderOp *ShaderOp = nullptr;
   // Parsed set of operations that ShaderOp was selected from.
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet;
   // Test object that executed the operation; used for read-back.
   std::shared_ptr<st::ShaderOpTest> Test;
 };

 // Plain aggregate of four float values. Its users are not visible in this
 // part of the file.
 // NOTE(review): the `_o` suffix presumably marks output slots paired with
 // f_float / f_float2 - confirm against the shader that consumes this layout.
 struct SPrimitives {
   float f_float;
   float f_float2;
   float f_float_o;
   float f_float2_o;
 };

 // Runs one shader operation from an already-parsed set on the given device.
 //
 // pDevice       - device handed to the test via SetDevice.
 // support       - DXC support object handed to the test via SetDxcSupport.
 // pName         - name of the operation to run; when null, the set is
 //                 expected to hold exactly one operation and that one is run.
 // pInitCallback - callback forwarded to the test via SetInitCallback; may be
 //                 null.
 // ShaderOpSet   - parsed operations; a reference is stored in the result so
 //                 the selected operation stays valid.
 //
 // Returns the selected operation, the set and the executed test bundled
 // together. A failed lookup is reported through VERIFY_FAIL; if that macro
 // returns instead of throwing, a null result is returned rather than
 // dereferencing a null operation.
 std::shared_ptr<ShaderOpTestResult>
 RunShaderOpTestAfterParse(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
                           LPCSTR pName,
                           st::ShaderOpTest::TInitCallbackFn pInitCallback,
                           std::shared_ptr<st::ShaderOpSet> ShaderOpSet) {
   st::ShaderOp *pShaderOp = nullptr;
   if (pName == nullptr) {
     if (ShaderOpSet->ShaderOps.size() != 1) {
       VERIFY_FAIL(L"Expected a single shader operation.");
     }
     // Never index into an empty set; leaving pShaderOp null routes the
     // failure through the common "not found" reporting below.
     if (!ShaderOpSet->ShaderOps.empty()) {
       pShaderOp = ShaderOpSet->ShaderOps[0].get();
     }
   }
   else {
     pShaderOp = ShaderOpSet->GetShaderOp(pName);
   }
   if (pShaderOp == nullptr) {
     std::string msg = "Unable to find shader op ";
     // pName is null when the caller asked for "the only operation";
     // appending a null C string to std::string is undefined behavior.
     msg += pName ? pName : "[unnamed]";
     msg += "; available ops";
     const char sep = ':';
     for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
       msg += sep;
       msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
     }
     CA2W msgWide(msg.c_str());
     VERIFY_FAIL(msgWide.m_psz);
     // Only reached when VERIFY_FAIL does not throw.
     return nullptr;
   }

   // This won't actually be used since we're supplying the device,
   // but let's make it consistent.
   pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);

   std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
   test->SetDxcSupport(&support);
   test->SetInitCallback(pInitCallback);
   test->SetDevice(pDevice);
   test->RunShaderOp(pShaderOp);

   std::shared_ptr<ShaderOpTestResult> result =
       std::make_shared<ShaderOpTestResult>();
   result->ShaderOpSet = ShaderOpSet;
   result->Test = test;
   result->ShaderOp = pShaderOp;
   return result;
 }

 // Parses a shader operation set out of pStream, then runs the operation
 // called pName from it through RunShaderOpTestAfterParse. pStream must not
 // be null; the remaining arguments are forwarded unchanged.
 std::shared_ptr<ShaderOpTestResult>
 RunShaderOpTest(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
                 IStream *pStream, LPCSTR pName,
                 st::ShaderOpTest::TInitCallbackFn pInitCallback) {
   DXASSERT_NOMSG(pStream != nullptr);

   auto parsedOps = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, parsedOps.get());

   return RunShaderOpTestAfterParse(pDevice, support, pName, pInitCallback,
                                    parsedOps);
 }

 // Runs the "OOB" operation from ShaderOpArith.xml and checks the first pixel
 // read back from the "RTarget" resource.
 TEST_F(ExecutionTest, OutOfBoundsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "OOB", nullptr);
   // Read back to CPU and examine contents - should get pure red.
   // The mapping is scoped so it is released before `test` goes away.
   {
     MappedData data;
     test->Test->GetReadBackData("RTarget", &data);
     const uint32_t *pPixels = (uint32_t *)data.data();
     uint32_t first = *pPixels;
     VERIFY_ARE_EQUAL(0xff0000ff, first); // pure red - only first component is read
   }
 }

 // Runs the "Saturate" operation from ShaderOpArith.xml and compares the nine
 // floats read back from "U0" with the expected saturate() results.
 TEST_F(ExecutionTest, SaturateTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pShaderOpStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pShaderOpStream);

   // Only one operation is exercised for now.
   CComPtr<ID3D12Device> pD3DDevice;
   if (!CreateDevice(&pD3DDevice)) {
     return;
   }

   auto opResult = RunShaderOpTest(pD3DDevice, m_support, pShaderOpStream, "Saturate", nullptr);
   MappedData readBack;
   opResult->Test->GetReadBackData("U0", &readBack);

   // Inputs, in order: -inf, -1.5, -denorm, -0, 0, denorm, 1.5f, inf, nan.
   // Only 1.5f and +inf saturate to 1.0f; every other input yields zero.
   const float ExpectedCases[9] = {
     0.0f, 0.0f, 0.0f, 0.0f,
     0.0f, 0.0f, 1.0f, 1.0f,
     0.0f
   };

   // Walk the read-back values in step with the expectation table.
   const float *pValues = (float *)readBack.data();
   for (size_t i = 0; i != _countof(ExpectedCases); ++i, ++pValues) {
     VERIFY_IS_TRUE(ifdenorm_flushf_eq(*pValues, ExpectedCases[i]));
   }
 }

 // Runs a triangle-rendering shader operation and checks two pixels of the
 // "RTarget" read-back: top center must be the clear color, the middle must
 // be white.
 //
 // ShaderOpName - operation to run from ShaderOpArith.xml.
 // FileName     - image file written when SaveImages() is true.
 // testModel    - shader model passed to CreateDevice.
 //
 // The whole body is compiled out when _HLK_CONF is defined.
 void ExecutionTest::BasicTriangleTestSetup(LPCSTR ShaderOpName, LPCWSTR FileName, D3D_SHADER_MODEL testModel) {
 #ifdef _HLK_CONF
   UNREFERENCED_PARAMETER(ShaderOpName);
   UNREFERENCED_PARAMETER(FileName);
   UNREFERENCED_PARAMETER(testModel);
 #else
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, testModel))
     return;

   // As this is used, 6.2 requirement always comes with requiring native 16-bit ops
   if (testModel == D3D_SHADER_MODEL_6_2 && !DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, ShaderOpName, nullptr);
   MappedData data;
   D3D12_RESOURCE_DESC &D = test->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT width = (UINT)D.Width;
   UINT height = D.Height;
   test->Test->GetReadBackData("RTarget", &data);
   const uint32_t *pPixels = (uint32_t *)data.data();
   if (SaveImages()) {
     // Use the render target's own dimensions so the saved image always
     // matches the buffer that was read back.
     SavePixelsToFile(pPixels, DXGI_FORMAT_R8G8B8A8_UNORM, width, height, FileName);
   }
   uint32_t top = pPixels[width / 2]; // Top center.
   uint32_t mid = pPixels[width / 2 + width * (height / 2)]; // Middle center.
   VERIFY_ARE_EQUAL(0xff663300, top); // clear color
   VERIFY_ARE_EQUAL(0xffffffff, mid); // white

   // This is the basic validation test for shader operations, so it's good to
   // check this here at least for this one test case.
   data.reset();
   test.reset();
   ReportLiveObjects();
 #endif
 }

 // Basic triangle check using the "Triangle" operation at shader model 6.0.
 TEST_F(ExecutionTest, BasicTriangleOpTest) {
   const LPCSTR opName = "Triangle";
   const LPCWSTR imageFile = L"basic-triangle.bmp";
   BasicTriangleTestSetup(opName, imageFile, D3D_SHADER_MODEL_6_0);
 }

 // Basic triangle check using the "TriangleHalf" operation at shader model 6.2.
 TEST_F(ExecutionTest, BasicTriangleOpTestHalf) {
   const LPCSTR opName = "TriangleHalf";
   const LPCWSTR imageFile = L"basic-triangle-half.bmp";
   BasicTriangleTestSetup(opName, imageFile, D3D_SHADER_MODEL_6_2);
 }

 // Checks the derivative values recorded for the center pixel.
 //
 // pPixels      - float results to inspect.
 // offsetCenter - index into pPixels of the center pixel's first value; the
 //                four values from there are read as ddx_fine, ddy_fine,
 //                ddx_coarse and ddy_coarse.
 //
 // The fine derivatives reveal which position of its 2x2 quad the center
 // pixel occupied; the coarse derivatives are then checked against the values
 // possible for that quad.
 void VerifyDerivResults(const float *pPixels, UINT offsetCenter) {

   // pixel at the center
   const float CenterDDXFine = pPixels[offsetCenter];
   const float CenterDDYFine = pPixels[offsetCenter + 1];
   const float CenterDDXCoarse = pPixels[offsetCenter + 2];
   const float CenterDDYCoarse = pPixels[offsetCenter + 3];

   LogCommentFmt(
       L"center  ddx_fine: %8f, ddy_fine: %8f, ddx_coarse: %8f, ddy_coarse: %8f",
       CenterDDXFine, CenterDDYFine, CenterDDXCoarse, CenterDDYCoarse);

   // The texture for the 9 pixels in the center should look like the following

   // 256   32  64
   // 2048 256 512
   // 1   .125 .25

   // In D3D12 there is no guarantee of how the adapter is grouping 2x2 pixels
   // So for fine derivatives there can be up to two possible results for the center pixel,
   // while for coarse derivatives there can be up to six possible results.
   const int ulpTolerance = 1;
   // 256 - 2048 (center paired with its left neighbor) or
   // 512 - 256 (center paired with its right neighbor)
   const bool left = CompareFloatULP(CenterDDXFine, -1792.0f, ulpTolerance);
   VERIFY_IS_TRUE(left || CompareFloatULP(CenterDDXFine, 256.0f, ulpTolerance));
   // 256 - 32 (center paired with the pixel above) or
   // .125 - 256 (center paired with the pixel below)
   const bool top = CompareFloatULP(CenterDDYFine, 224.0f, ulpTolerance);
   VERIFY_IS_TRUE(top || CompareFloatULP(CenterDDYFine, -255.875f, ulpTolerance));

   if (top && left) { // top left quad
     VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -224.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance)) &&
                    (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDYCoarse, 1792.0f, ulpTolerance)));
   }
   else if (top) { // top right quad
     VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance)  ||
                    CompareFloatULP(CenterDDXCoarse, 32.0f, ulpTolerance))   &&
                    (CompareFloatULP(CenterDDYCoarse, 224.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDYCoarse, 448.0f, ulpTolerance)));
   }
   else if (left) { // bottom left quad
     VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, -1792.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDXCoarse, -.875f, ulpTolerance))   &&
                    (CompareFloatULP(CenterDDYCoarse, -2047.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance)));
   }
   else { // bottom right
     VERIFY_IS_TRUE((CompareFloatULP(CenterDDXCoarse, 256.0f, ulpTolerance) ||
                    CompareFloatULP(CenterDDXCoarse, .125f, ulpTolerance))  &&
                    (CompareFloatULP(CenterDDYCoarse, -255.875f, ulpTolerance) ||
                    CompareFloatULP(CenterDDYCoarse, -511.75f, ulpTolerance)));
   }
 }

 // Rendering two right triangles forming a square and assigning a texture value
 // for each pixel to calculate derivatives.
 TEST_F(ExecutionTest, PartialDerivTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pOpStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pOpStream);

   CComPtr<ID3D12Device> pD3DDevice;
   if (!CreateDevice(&pD3DDevice)) {
     return;
   }

   // Run the "DerivFine" operation, then look up the render target layout.
   auto opResult = RunShaderOpTest(pD3DDevice, m_support, pOpStream, "DerivFine", nullptr);
   MappedData readBack;
   D3D12_RESOURCE_DESC &targetDesc = opResult->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT rtWidth = (UINT)targetDesc.Width;
   UINT rtHeight = targetDesc.Height;
   // Format byte size divided by four, used as the per-pixel stride when
   // indexing the read-back as floats.
   UINT valuesPerPixel = GetByteSizeForFormat(targetDesc.Format) / 4;

   opResult->Test->GetReadBackData("RTarget", &readBack);
   const float *pResults = (float *)readBack.data();

   // Half the pixel count minus half a row selects the pixel to verify.
   UINT centerPixel = (UINT64)rtWidth * rtHeight / 2 - rtWidth / 2;
   UINT centerOffset = centerPixel * valuesPerPixel;

   VerifyDerivResults(pResults, centerOffset);
 }

 // Dimensions of one dispatch. Members are declared in width, height, depth
 // order, which is the order brace initialization fills them in.
 struct Dispatch {
   int width;
   int height;
   int depth;
 };

 // Runs pShaderOp with the dispatch dimensions given in D.
 //
 // The dimensions are passed to every shader of the operation as
 // DISPATCHX/DISPATCHY/DISPATCHZ defines and are also written to the
 // operation's DispatchX/Y/Z fields, so pShaderOp is modified by this call.
 //
 // pDevice   - device handed to the test via SetDevice.
 // support   - DXC support object handed to the test via SetDxcSupport.
 // pShaderOp - operation to configure and run.
 // D         - dispatch dimensions.
 //
 // Returns the test object after RunShaderOp has been called on it.
 std::shared_ptr<st::ShaderOpTest>
 RunDispatch(ID3D12Device *pDevice, dxc::DxcDllSupport &support,
             st::ShaderOp *pShaderOp, const Dispatch D) {
   char compilerOptions[256];

   std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
   test->SetDxcSupport(&support);
   test->SetInitCallback(nullptr);
   test->SetDevice(pDevice);

   // format compiler args
   // sprintf_s reports failure as -1, which is truthy; compare against zero
   // so a formatting failure is actually caught.
   VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions),
                            "-D DISPATCHX=%d -D DISPATCHY=%d -D DISPATCHZ=%d ",
                            D.width, D.height, D.depth) > 0);

   for (st::ShaderOpShader &S : pShaderOp->Shaders)
     S.Arguments = compilerOptions;

   pShaderOp->DispatchX = D.width;
   pShaderOp->DispatchY = D.height;
   pShaderOp->DispatchZ = D.depth;

   test->RunShaderOp(pShaderOp);

   return test;
 }

 // Runs the "Derivatives" operation over several dispatch sizes and verifies
 // the derivative values near the center of each result: first with the
 // compute shader, then (when the device supports mesh/amplification
 // derivatives) with the compute shader disabled. A final pass runs dispatch
 // sizes listed as invalid without checking results.
 TEST_F(ExecutionTest, DerivativesTest) {
   const UINT pixelSize = 4; // always float4

   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("Derivatives");
   // The lookup yields null for an unknown name; fail cleanly instead of
   // dereferencing it below.
   if (pShaderOp == nullptr) {
     VERIFY_FAIL(L"Unable to find shader op Derivatives");
     return;
   }

   std::vector<Dispatch> dispatches =
   {
    {40, 1, 1},
    {1000, 1, 1},
    {32, 32, 1},
    {16, 64, 1},
    {4, 12, 4},
    {4, 64, 1},
    {16, 16, 3},
    {32, 8, 2}
   };

   std::vector<Dispatch> meshDispatches =
   {
    {60, 1, 1},
    {128, 1, 1},
    {8, 8, 1},
    {32, 8, 1},
    {8, 16, 4},
    {8, 64, 1},
    {8, 8, 3},
   };

   std::vector<Dispatch> badDispatches =
   {
    {16, 3, 1},
    {2, 16, 1},
    {33, 1, 1}
   };

   pShaderOp->UseWarpDevice = GetTestParamUseWARP(true);
   // Remember the compute shader so it can be restored after being disabled.
   LPCSTR CS = pShaderOp->CS;

   MappedData data;

   for (Dispatch &D : dispatches) {
     // Test Compute Shader
     std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);

     test->GetReadBackData("U0", &data);

     float *pPixels = (float *)data.data();

     UINT centerIndex = 0;
     if (D.height == 1) {
       centerIndex = (((UINT64)(D.width * D.height * D.depth) / 2) & ~0xF) + 10;
     } else {
       // To find roughly the center for compute, divide the height and width in half,
       // truncate to the previous multiple of 4 to get to the start of the repeating pattern
       // and then add 2 rows to get to the second row of quads and 2 to get to the first texel
       // of the second row of that quad row
       UINT centerRow = ((D.height/2UL) & ~0x3) + 2;
       UINT centerCol = ((D.width/2UL) & ~0x3) + 2;
       centerIndex = centerRow * D.width + centerCol;
     }
     UINT offsetCenter = centerIndex * pixelSize;
     LogCommentFmt(L"Verifying derivatives in compute shader results");
     VerifyDerivResults(pPixels, offsetCenter);
   }

   if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
     // Disable CS so mesh goes forward
     pShaderOp->CS = nullptr;

     for (Dispatch &D : meshDispatches) {
       std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);

       test->GetReadBackData("U1", &data);
       const float *pPixels = (float *)data.data();
       UINT centerIndex = (((UINT64)(D.width * D.height * D.depth)/2) & ~0xF) + 10;
       UINT offsetCenter = centerIndex * pixelSize;
       LogCommentFmt(L"Verifying derivatives in mesh shader results");
       VerifyDerivResults(pPixels, offsetCenter);

       test->GetReadBackData("U2", &data);
       pPixels = (float *)data.data();
       LogCommentFmt(L"Verifying derivatives in amplification shader results");
       VerifyDerivResults(pPixels, offsetCenter);
     }
   }

   // Final tests with invalid dispatch size just to make sure they run
   for (Dispatch &D : badDispatches) {
     // Test Compute Shader
     pShaderOp->CS = CS;
     std::shared_ptr<st::ShaderOpTest> test = RunDispatch(pDevice, m_support, pShaderOp, D);

     if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
       pShaderOp->CS = nullptr;
       test = RunDispatch(pDevice, m_support, pShaderOp, D);
     }
   }
 }

 // Verify the results for the quad starting with the given index
 void VerifyQuadReadResults(const UINT *pPixels, UINT quadIndex) {
   for (UINT i = 0; i < 4; i++) {
     UINT ix = quadIndex + i;
     UINT lix = pPixels[4*ix];
     VERIFY_ARE_EQUAL(pPixels[4*ix + 1], (lix^1));// ReadAcrossX
     VERIFY_ARE_EQUAL(pPixels[4*ix + 2], (lix^2));// ReadAcrossY
     VERIFY_ARE_EQUAL(pPixels[4*ix + 3], (lix^3));// ReadAcrossDiagonal
   }
 }


 // Runs the "QuadRead" operation for several dispatch sizes and verifies the
 // quad read results for the first, second and center quads: first with the
 // compute shader, then (when the device supports mesh/amplification
 // derivatives) with the compute shader disabled.
 TEST_F(ExecutionTest, QuadReadTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
     WEX::Logging::Log::Comment(L"WARP does not support QuadRead in compute shaders.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!DoesDeviceSupportWaveOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("QuadRead");
   // The lookup yields null for an unknown name; fail cleanly instead of
   // dereferencing it below.
   if (pShaderOp == nullptr) {
     VERIFY_FAIL(L"Unable to find shader op QuadRead");
     return;
   }
   // Remember the compute shader so it can be restored for each dispatch.
   LPCSTR CS = pShaderOp->CS;

   // Compute dimensions (x, y, z) paired with mesh dimensions (mx, my, mz).
   // Named distinctly so it does not shadow the file-level Dispatch struct.
   struct QuadReadDispatch {
     int x, y, z;
     int mx, my, mz;
   };
   std::vector<QuadReadDispatch> dispatches =
   {
    {32, 32, 1, 8, 8, 1},
    {64, 4, 1, 64, 2, 1},
    {64, 1, 1, 64, 1, 1},
    {16, 16, 3, 4, 4, 3},
   };

   for (QuadReadDispatch &D : dispatches) {

     UINT width = D.x;
     UINT height = D.y;
     UINT depth = D.z;

     UINT mwidth = D.mx;
     UINT mheight = D.my;
     UINT mdepth = D.mz;
     // format compiler args
     // The arguments are unsigned, so use %u. sprintf_s reports failure as
     // -1, which is truthy; compare against zero so failure is caught.
     char compilerOptions[256];
     VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions),
                              "-D DISPATCHX=%u -D DISPATCHY=%u -D DISPATCHZ=%u "
                              "-D MESHDISPATCHX=%u -D MESHDISPATCHY=%u -D MESHDISPATCHZ=%u",
                              width, height, depth, mwidth, mheight, mdepth) > 0);

     for (st::ShaderOpShader &S : pShaderOp->Shaders)
       S.Arguments = compilerOptions;

     pShaderOp->DispatchX = width;
     pShaderOp->DispatchY = height;
     pShaderOp->DispatchZ = depth;

     // Test Compute Shader
     pShaderOp->CS = CS;
     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, ShaderOpSet);
     MappedData data;

     test->Test->GetReadBackData("U0", &data);
     const UINT *pPixels = (UINT *)data.data();

     // To find roughly the center for compute, divide the pixel count in half
     // and truncate to the previous multiple of 4 to start at a quad
     UINT offsetCenter = ((UINT64)(width * height * depth)/2) & ~0x3;

     // Test first, second and center quads
     LogCommentFmt(L"Verifying QuadRead* in compute shader results");
     VerifyQuadReadResults(pPixels, 0);
     VerifyQuadReadResults(pPixels, 4);
     VerifyQuadReadResults(pPixels, offsetCenter);

     if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
       offsetCenter = ((UINT64)(mwidth * mheight * mdepth)/2) & ~0x3;

       // Disable CS so mesh goes forward
       pShaderOp->CS = nullptr;
       test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadRead", nullptr, ShaderOpSet);
       test->Test->GetReadBackData("U1", &data);
       pPixels = (UINT *)data.data();
       // Test first, second and center quads
       LogCommentFmt(L"Verifying QuadRead* in mesh shader results");
       VerifyQuadReadResults(pPixels, 0);
       VerifyQuadReadResults(pPixels, 4);
       VerifyQuadReadResults(pPixels, offsetCenter);

       test->Test->GetReadBackData("U2", &data);
       pPixels = (UINT *)data.data();
       // Test first, second and center quads
       LogCommentFmt(L"Verifying QuadRead* in amplification shader results");
       VerifyQuadReadResults(pPixels, 0);
       VerifyQuadReadResults(pPixels, 4);
       VerifyQuadReadResults(pPixels, offsetCenter);
     }
   }
 }

 // Validates `width` entries of sample/LOD results, four UINTs per entry.
 //
 // Every pixel performs 4 samples and 4 LOD calculations. Two of them
 // ('left' and 'right') vary in X with a constant Y; the other two ('top'
 // and 'bot') vary in Y with a constant X. Only one X-varying and one
 // Y-varying result is reported per pixel; the remaining two act as helpers
 // for the other pixels of the quad. Pixels on the left side of the quad
 // report the 'left' samples, pixels on the top report 'top', and so on.
 //
 // The varying coordinate alternates between zero and a value whose
 // magnitude grows with the index, so the LOD level should rise steadily.
 // Implementations may pick different levels for identical derivatives at
 // different quad locations, so the only reliable checks are that sample
 // results agree with LOD calculations, that the LOD never decreases, and
 // that it reaches the maximum.
 void VerifySampleResults(const UINT *pPixels, UINT width) {
   // Highest LOD observed so far on the X-varying and Y-varying tracks.
   UINT xlod = 0, ylod = 0;

   for (unsigned i = 0; i != width; ++i) {
     // The calculated LOD and the value sampled from a texture whose mip
     // levels hold their own LOD index must agree.
     VERIFY_ARE_EQUAL(pPixels[4*i + 0], pPixels[4*i + 1]);
     VERIFY_ARE_EQUAL(pPixels[4*i + 2], pPixels[4*i + 3]);

     // LODs must never drop as magnitudes increase.
     VERIFY_IS_TRUE(pPixels[4*i] >= xlod);
     VERIFY_IS_TRUE(pPixels[4*i + 2] >= ylod);
     xlod = pPixels[4*i];
     ylod = pPixels[4*i + 2];
   }

   // Both tracks must have ended on the maximum LOD level.
   VERIFY_ARE_EQUAL(xlod, 6u);
   VERIFY_ARE_EQUAL(ylod, 6u);
 }

 // Runs the "ComputeSample" operation with its default compute shader and
 // with the shader named "CS2", then (when the device supports
 // mesh/amplification derivatives) with the compute shader disabled, using
 // the default shaders and then the ones named "AS2"/"MS2". Each run's
 // read-back is checked with VerifySampleResults.
 TEST_F(ExecutionTest, ComputeSampleTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
       return;

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("ComputeSample");
   // The lookup yields null for an unknown name; fail cleanly instead of
   // dereferencing it below.
   if (pShaderOp == nullptr) {
     VERIFY_FAIL(L"Unable to find shader op ComputeSample");
     return;
   }

   // Initialize texture with the LOD number in each corresponding mip level
   auto SampleInitFn = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
                         VERIFY_ARE_EQUAL(0, _stricmp(Name, "T0"));
                         D3D12_RESOURCE_DESC &texDesc = pShaderOp->GetResourceByName("T0")->Desc;
                         UINT texWidth = (UINT)texDesc.Width;
                         UINT texHeight = (UINT)texDesc.Height;
                         // Twice the top level is enough for the whole chain:
                         // every level is at most half the size of the one
                         // before it.
                         size_t size = sizeof(float) * texWidth * texHeight * 2;
                         Data.resize(size);
                         float *pPrimitives = (float *)Data.data();
                         float lod = 0.0;
                         int ix = 0;
                         // Walk the mip chain down to 1x1. A dimension that
                         // reaches 1 stays at 1 while the other keeps
                         // halving, so non-square textures get every level
                         // filled. Nothing is written for an empty texture.
                         bool moreLevels = texHeight > 0 && texWidth > 0;
                         while (moreLevels) {
                           for (size_t j = 0; j < texHeight; ++j) {
                             for (size_t i = 0; i < texWidth; ++i) {
                               pPrimitives[ix++] = lod;
                             }
                           }
                           lod += 1.0;
                           moreLevels = texHeight > 1 || texWidth > 1;
                           texHeight = texHeight > 1 ? texHeight >> 1 : 1;
                           texWidth = texWidth > 1 ? texWidth >> 1 : 1;
                         }
                       };
   // Look up the alternate shaders by name; each stays null when absent.
   LPCSTR CS2 = nullptr, AS2 = nullptr, MS2 = nullptr;
   for (st::ShaderOpShader &S : pShaderOp->Shaders) {
     if (!strcmp(S.Name, "CS2")) CS2 = S.Name;
     if (!strcmp(S.Name, "AS2")) AS2 = S.Name;
     if (!strcmp(S.Name, "MS2")) MS2 = S.Name;
   }

   // Test 1D compute shader
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
   MappedData data;

   test->Test->GetReadBackData("U0", &data);
   const UINT *pPixels = (UINT *)data.data();

   VerifySampleResults(pPixels, 84*4);

   // Test 2D compute shader
   pShaderOp->CS = CS2;

   test.reset();
   test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);

   test->Test->GetReadBackData("U0", &data);
   pPixels = (UINT *)data.data();

   VerifySampleResults(pPixels, 84*4);


   if (DoesDeviceSupportMeshAmpDerivatives(pDevice)) {
     // Disable CS so mesh goes forward
     pShaderOp->CS = nullptr;
     test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
     test->Test->GetReadBackData("U1", &data);
     pPixels = (UINT *)data.data();

     VerifySampleResults(pPixels, 116);

     test->Test->GetReadBackData("U2", &data);
     pPixels = (UINT *)data.data();

     VerifySampleResults(pPixels, 84);

     // Repeat with the alternate amplification and mesh shaders.
     pShaderOp->AS = AS2;
     pShaderOp->MS = MS2;
     test = RunShaderOpTestAfterParse(pDevice, m_support, "ComputeSample", SampleInitFn, ShaderOpSet);
     test->Test->GetReadBackData("U1", &data);
     pPixels = (UINT *)data.data();

     VerifySampleResults(pPixels, 116);

     test->Test->GetReadBackData("U2", &data);
     pPixels = (UINT *)data.data();

     VerifySampleResults(pPixels, 84);
   }
 }

 // Writes to every sample of a 32x32, 4-sample UAV texture from a compute
 // shader (sample i of texel (x,y) receives x*y*(i+1)), copies the samples into
 // a structured buffer with a second compute shader, and verifies the values
 // read back on the CPU.
 // Defining WRITEMSAA_FALLBACK below switches to shader model 6.6 with a
 // RWTexture2DArray (one slice per sample) in place of the RWTexture2DMS.
 TEST_F(ExecutionTest, ATOWriteMSAATest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   //  #define WRITEMSAA_FALLBACK

   CComPtr<ID3D12Device> pDevice;
 #ifdef WRITEMSAA_FALLBACK
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_6;
 #else
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7;
 #endif
   if (!CreateDevice(&pDevice, sm))
       return;

 #ifndef WRITEMSAA_FALLBACK
   if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!DoesDeviceSupportWritableMSAA(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support Writable MSAA.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }
 #endif

   // Shader that writes x*y*(sample+1) into each sample of each texel
   static const char pWriteShader[] =
     "#define SAMPLES 4\n"
     "RWStructuredBuffer<float> g_out : register(u0);\n"
     "#if  __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 7)\n"
     "RWTexture2DMS<float, 4> g_texms : register(u1);\n"
     "#else\n"
     "RWTexture2DArray<float> g_texms : register(u1);\n"
     "#endif\n"
     "[NumThreads(32, 32, 1)]\n"
     "void main(uint3 id : SV_GroupThreadID) {\n"
     "  for(uint i = 0; i < SAMPLES; i++) {\n"
     "#if  __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 7)\n"
     "    g_texms.sample[i][id.xy] = id.x*id.y*(i+1);\n"
     "#else\n"
     "    g_texms[uint3(id.xy, i)] = id.x*id.y*(i+1);\n"
     "#endif\n"
     "  }\n"
     "}";

   // Shader that copies every sample into g_out, laid out sample-major,
   // then row-major within a sample
   static const char pCopyShader[] =
     "#define SAMPLES 4\n"
     "RWStructuredBuffer<float> g_out : register(u0);\n"
     "#if  __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 7)\n"
     "RWTexture2DMS<float, 4> g_texms : register(u1);\n"
     "#else\n"
     "RWTexture2DArray<float> g_texms : register(u1);\n"
     "#endif\n"
     "[NumThreads(32, 32, 1)]\n"
     "  void main(uint3 id : SV_GroupThreadID) {\n"
     "  for(uint i = 0; i < SAMPLES; i++) {\n"
     "#if  __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 7)\n"
     "    g_out[i*32*32 + id.y*32 + id.x] = g_texms.sample[i][id.xy];\n"
     "#else\n"
     "    g_out[i*32*32 + id.y*32 + id.x] = g_texms[uint3(id.xy, i)];\n"
     "#endif\n"
     "  }"
     "}";

   static const int NumThreadsX = 32;
   static const int NumThreadsY = 32;

 #ifdef WRITEMSAA_FALLBACK
   static const int NumSamples = 4;
   static const int ArraySize = 4;
 #else
   static const int NumSamples = 4;
   static const int ArraySize = 1;
 #endif
   static const int ThreadsPerGroup = NumThreadsX * NumThreadsY;
   const size_t valueSize = NumSamples * ThreadsPerGroup;
   const size_t valueSizeInBytes =  valueSize * sizeof(float);

   static const int DispatchGroupX = 1;
   static const int DispatchGroupY = 1;
   static const int DispatchGroupZ = 1;

   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   FenceObj FO;

   CreateComputeCommandQueue(pDevice, L"WriteMSAA Queue", &pCommandQueue);
   InitFenceObj(pDevice, &FO);

   // Create root signature.
   CComPtr<ID3D12RootSignature> pRootSignature;
   CD3DX12_DESCRIPTOR_RANGE ranges[2];
   ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0);
   ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 1, 0);

   CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, 2);

   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));

   // Create command list and resources
   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE,
                                               pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));

   // Set up Output Resource
   CComPtr<ID3D12Resource> pOutputResource;
   CComPtr<ID3D12Resource> pOutputReadBuffer;
   CComPtr<ID3D12Resource> pOutputUploadResource;

   // Seed the output buffer with values (index + 5) that differ from what the
   // shaders are expected to produce.
   float outVals[valueSize];
   int ix = 0;
   for (int i = 0; i < NumSamples; i++)
     for (int j = 0; j < NumThreadsY; j++)
       for (int k = 0; k < NumThreadsX; k++) {
         // Read and increment ix in separate statements: doing both within a
         // single assignment is unsequenced before C++17.
         outVals[ix] = (float)ix + 5;
         ix++;
       }
   CreateTestUavs(pDevice, pCommandList, outVals, sizeof(outVals), &pOutputResource,
                  &pOutputUploadResource, &pOutputReadBuffer);

   // Set up texture Resource.
   CComPtr<ID3D12Resource> pUavResource;
   float values[valueSize];
   // Initial texture contents: every byte set to 0xc
   memset(values, 0xc, valueSizeInBytes);


 #ifdef WRITEMSAA_FALLBACK
   int numsamp = 1;
 #else
   int numsamp = NumSamples;
 #endif

   D3D12_RESOURCE_DESC tex2dDesc = CD3DX12_RESOURCE_DESC::Tex2D(DXGI_FORMAT_R32_FLOAT,
                                    NumThreadsX, NumThreadsY, ArraySize, 1, numsamp, 0,
                                    D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS | D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET);
   CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex2dDesc,
                       &pUavResource, nullptr);

   // Close the command list and execute it to perform the resource uploads
   pCommandList->Close();
   ID3D12CommandList *ppCommandLists[] = { pCommandList };
   pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
   WaitForSignal(pCommandQueue, FO);

   // Create shaders
 #ifdef WRITEMSAA_FALLBACK
   const wchar_t *target = L"cs_6_6";
 #else
   const wchar_t *target = L"cs_6_7";
 #endif

   CComPtr<ID3D12PipelineState> pWritePSO;
   CreateComputePSO(pDevice, pRootSignature, pWriteShader, target, &pWritePSO);
   CComPtr<ID3D12PipelineState> pCopyPSO;
   CreateComputePSO(pDevice, pRootSignature, pCopyShader, target, &pCopyPSO);

   // Reset commandlist to write PSO
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pWritePSO));

   // Describe and create a UAV descriptor heap.
   CComPtr<ID3D12DescriptorHeap> pUavHeap;
   D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
   heapDesc.NumDescriptors = 2;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pUavHeap)));

   // NOTE(review): both helpers below receive the same cpuHandle; presumably
   // CreateStructUAV advances it to the second descriptor slot - confirm.
   CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(pUavHeap->GetCPUDescriptorHandleForHeapStart());
   CreateStructUAV(pDevice, cpuHandle, valueSize, sizeof(float), pOutputResource);
 #ifdef WRITEMSAA_FALLBACK
   CreateTex2DArrayUAV(pDevice, cpuHandle, NumSamples, DXGI_FORMAT_R32_FLOAT, pUavResource);
 #else
   CreateTex2DMSUAV(pDevice, cpuHandle, DXGI_FORMAT_R32_FLOAT, pUavResource);
 #endif

   // Set Heaps, Rootsignature and table
   ID3D12DescriptorHeap *const pHeaps[1] = { pUavHeap };
   pCommandList->SetDescriptorHeaps(1, pHeaps);
   pCommandList->SetComputeRootSignature(pRootSignature);
   pCommandList->SetComputeRootDescriptorTable(0, pUavHeap->GetGPUDescriptorHandleForHeapStart());

   // dispatch and close write shader
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);
   pCommandList->Close();

   pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
   WaitForSignal(pCommandQueue, FO);

   // Create copy command list
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pCopyPSO));

   // Set Rootsignature and descriptor tables
   SetDescriptorHeap(pCommandList, pUavHeap);
   pCommandList->SetComputeRootSignature(pRootSignature);

   pCommandList->SetComputeRootDescriptorTable(0, pUavHeap->GetGPUDescriptorHandleForHeapStart());

   // Run Copy shader and copy the results back to readable memory
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);

   CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource,
                                         D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->ResourceBarrier(1, &barrier);
   pCommandList->CopyResource(pOutputReadBuffer, pOutputResource);

   pCommandList->Close();

   pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
   WaitForSignal(pCommandQueue, FO);

   // Each entry must hold x*y*(sample+1), in the order the copy shader wrote it
   MappedData mappedData(pOutputReadBuffer, valueSize*sizeof(float));
   float *pData = (float *)mappedData.data();
   ix = 0;
   for (int i = 0; i < NumSamples; i++)
     for (int j = 0; j < NumThreadsY; j++)
       for (int k = 0; k < NumThreadsX; k++)
         VERIFY_ARE_EQUAL(pData[ix++], j*k*(i+1));
 }

 // Used to determine how an out of bounds offset should be converted:
 // the low 4 bits of the offset are kept and sign-extended, so the result is
 // always in [-8, 7] (for example -9 becomes 7 and 8 becomes -8).
 // Implemented with unsigned masking instead of shifting a signed value,
 // since left-shifting a negative int is not well defined before C++20.
 #define CLAMPOFFSET(offset) ((int)((((unsigned)(offset)) & 0xFu) ^ 0x8u) - 8)

 // Verifies the output of the ProgOffset shaders.
 // pPixels holds eight unsigned results for each of the 18x18 (row, column)
 // coordinate pairs, in row-major order. Results that fetch a texel are
 // expected to equal (rowCoord + rowOffset)*1000 + colCoord + colOffset;
 // the comparison results are expected to be 1.
 // bCheckDeriv selects whether the Sample, SampleCmp and SampleBias results
 // are checked as well.
 void VerifyProgOffsetResults(unsigned *pPixels, bool bCheckDeriv) {
   // Coordinates used along each axis and the offset paired with each one.
   // The two out-of-range offsets (-9 and 8) go through CLAMPOFFSET.
   int texCoords[18] = {100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950};
   int texOffsets[18] = {CLAMPOFFSET(-9), -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, CLAMPOFFSET(8)};

   // ix counts (row, column) pairs; each pair owns 2*4 consecutive results
   unsigned ix = 0;
   for (unsigned row = 0; row < _countof(texCoords); row++) {
     const int rowTexel = texCoords[row] + texOffsets[row];
     for (unsigned col = 0; col < _countof(texCoords); col++, ix++) {
       const int colTexel = texCoords[col] + texOffsets[col];
       unsigned cmp = rowTexel*1000 + colTexel;

       if (bCheckDeriv) {
         VERIFY_ARE_EQUAL(pPixels[2*4*ix+0], cmp); // Sample
         VERIFY_ARE_EQUAL(pPixels[2*4*ix+1], 1U); // SampleCmp
       }
       VERIFY_ARE_EQUAL(pPixels[2*4*ix+2], 1U); // SampleCmpLevel
       VERIFY_ARE_EQUAL(pPixels[2*4*ix+3], 1U); // SampleCmpLevelZero
       VERIFY_ARE_EQUAL(pPixels[2*4*ix+4], cmp); // Load
       if (bCheckDeriv) {
         VERIFY_ARE_EQUAL(pPixels[2*4*ix+5], cmp); // SampleBias
       }
       VERIFY_ARE_EQUAL(pPixels[2*4*ix+6], cmp); // SampleGrad
       VERIFY_ARE_EQUAL(pPixels[2*4*ix+7], cmp); // SampleLevel
     }
   }
 }

 // Programmable (non-immediate) texel offset test.
 // A 1000x1000 float texture is filled so that every texel holds its own
 // row-major index. The "ProgOffset" shaders then access it with offsets
 // running from -9 to 8 through Load, Sample, SampleCmp and their variants,
 // and the results are checked against the locations that should be hit.
 // Shader models 6.5, 6.6 and 6.7 are tried in turn, stopping at the first one
 // the device cannot run.
 TEST_F(ExecutionTest, ATOProgOffset) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   std::shared_ptr<st::ShaderOpSet> opSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, opSet.get());

   st::ShaderOp *pShaderOp = opSet->GetShaderOp("ProgOffset");

   // Resource initializer: texel n (row-major) receives the value n
   auto FillWithTexelIndex = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pOp) {
     D3D12_RESOURCE_DESC &texDesc = pOp->GetResourceByName(Name)->Desc;
     const UINT texWidth = (UINT)texDesc.Width;
     const UINT texHeight = (UINT)texDesc.Height;
     const size_t texelCount = (size_t)texWidth * texHeight;
     Data.resize(sizeof(float) * texelCount);
     float *pTexels = (float *)Data.data();
     for (size_t texel = 0; texel < texelCount; ++texel)
       pTexels[texel] = float(texel);
   };

   const D3D_SHADER_MODEL shaderModels[] = {D3D_SHADER_MODEL_6_5,
                                            D3D_SHADER_MODEL_6_6,
                                            D3D_SHADER_MODEL_6_7};
   bool bAnyModelTested = false;
   for (D3D_SHADER_MODEL sm : shaderModels) {
     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, sm, /*skipUnsupported*/false)) {
       LogCommentFmt(L"Device does not support shader model 6.%1u",
                     ((UINT)sm & 0x0f));
       break;
     }
     if (sm >= D3D_SHADER_MODEL_6_7 && !DoesDeviceSupportAdvancedTexOps(pDevice)) {
       LogCommentFmt(L"Device does not support Advanced Texture Ops");
       break;
     }

     // Derivative-dependent results are only checked from 6.6 on, and in
     // mesh/amplification shaders only when the device supports them there.
     const bool bSupportMSASDeriv = DoesDeviceSupportMeshAmpDerivatives(pDevice);
     const bool bCheckDerivCS = sm >= D3D_SHADER_MODEL_6_6;
     const bool bCheckDerivMSAS = bCheckDerivCS && bSupportMSASDeriv;

     if (bCheckDerivCS && !bSupportMSASDeriv) {
       LogCommentFmt(L"Device does not support derivatives in Mesh and Amplification shaders");
     }

     // Pick the shader variants written for this shader model
     switch (sm) {
     case D3D_SHADER_MODEL_6_5:
       pShaderOp->CS = pShaderOp->GetString("CS");
       pShaderOp->PS = pShaderOp->GetString("PS");
       pShaderOp->MS = pShaderOp->GetString("MS");
       pShaderOp->AS = pShaderOp->GetString("AS");
       break;
     case D3D_SHADER_MODEL_6_6:
       pShaderOp->CS = pShaderOp->GetString("CS66");
       pShaderOp->PS = pShaderOp->GetString("PS");
       pShaderOp->MS = pShaderOp->GetString(bCheckDerivMSAS ? "MS66D" : "MS66");
       pShaderOp->AS = pShaderOp->GetString(bCheckDerivMSAS ? "AS66D" : "AS66");
       break;
     case D3D_SHADER_MODEL_6_7:
       pShaderOp->CS = pShaderOp->GetString("CS67");
       pShaderOp->PS = pShaderOp->GetString("PS67");
       pShaderOp->MS = pShaderOp->GetString(bCheckDerivMSAS ? "MS67D" : "MS67");
       pShaderOp->AS = pShaderOp->GetString(bCheckDerivMSAS ? "AS67D" : "AS67");
       break;
     }

     // Pass 1: compute shader
     std::shared_ptr<ShaderOpTestResult> runResult =
       RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", FillWithTexelIndex, opSet);
     MappedData readback;

     runResult->Test->GetReadBackData("U0", &readback);
     VerifyProgOffsetResults((UINT*)readback.data(), bCheckDerivCS);

     // Clearing CS lets the graphics shaders run on the following passes
     pShaderOp->CS = nullptr;

     // Pass 2: amplification + mesh + pixel shaders, when mesh shaders exist
     if (DoesDeviceSupportMeshShaders(pDevice)) {
       runResult = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", FillWithTexelIndex, opSet);

       // PS
       runResult->Test->GetReadBackData("U0", &readback);
       VerifyProgOffsetResults((UINT*)readback.data(), true);

       // MS
       runResult->Test->GetReadBackData("U1", &readback);
       VerifyProgOffsetResults((UINT*)readback.data(), bCheckDerivMSAS);

       // AS
       runResult->Test->GetReadBackData("U2", &readback);
       VerifyProgOffsetResults((UINT*)readback.data(), bCheckDerivMSAS);
     }

     // Pass 3: clearing MS as well lets the pixel shader run on its own
     pShaderOp->MS = nullptr;
     runResult = RunShaderOpTestAfterParse(pDevice, m_support, "ProgOffset", FillWithTexelIndex, opSet);

     runResult->Test->GetReadBackData("U0", &readback);
     VerifyProgOffsetResults((UINT*)readback.data(), true);

     bAnyModelTested = true;
   }

   // Nothing ran at all: report the test as skipped
   if (!bAnyModelTested) {
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
   }

 }

 // A mipmapped texture containing the value of LOD at each location in each
 // level is used to sample at each level using SampleCmpLevel and confirm
 // that the correct level is used for the comparison.
 // Requires shader model 6.7 and Advanced Texture Operations; the test is
 // skipped otherwise. The compute shader is always run; the graphics shaders
 // are run as well when the device supports mesh shaders.
 TEST_F(ExecutionTest, ATOSampleCmpLevelTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_7))
       return;

   if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("SampleCmpLevel");

   // Initialize texture with the LOD number in each corresponding mip level
   // (level N is filled with N + 0.5), levels stored back to back.
   auto SampleInitFn = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pOp) {
     D3D12_RESOURCE_DESC &texDesc = pOp->GetResourceByName(Name)->Desc;
     UINT texWidth = (UINT)texDesc.Width;
     UINT texHeight = (UINT)texDesc.Height;
     // Each level is at most half the size of the previous one, so twice the
     // top level is always enough room for the whole chain.
     size_t size = sizeof(float) * texWidth * texHeight * 2;
     Data.resize(size);
     if (texWidth == 0 || texHeight == 0)
       return; // empty resource: nothing to fill
     float *pPrimitives = (float *)Data.data();
     float val = 0.5;
     int ix = 0;
     // Walk down the mip chain until both dimensions have been exhausted.
     // A dimension that reaches zero before the other one is clamped to 1, so
     // the chain of a non-square texture still ends with a 1x1 level.
     while (texHeight > 0 || texWidth > 0) {
       if(!texHeight) texHeight = 1;
       if(!texWidth) texWidth = 1;
       for (size_t j = 0; j < texHeight; ++j) {
         for (size_t i = 0; i < texWidth; ++i) {
           pPrimitives[ix++] = val;
         }
       }
       val += 1.0;
       texHeight >>= 1;
       texWidth >>= 1;
     }
   };

   // Test compute shader
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet);
   MappedData data;

   // Check that each LOD matches what's expected
   unsigned count = 2*7;

   // Reads back the named resource from the most recent run and checks it.
   // Since the results consist of a boolean, which should be true followed by the result of a sampcmplvl,
   // the only result expected is 1.
   auto VerifyAllOnes = [&](LPCSTR resName) {
     test->Test->GetReadBackData(resName, &data);
     const UINT *pPixels = (UINT *)data.data();
     for (unsigned i = 0; i < count; i++)
       VERIFY_ARE_EQUAL(pPixels[i], 1U);
   };

   VerifyAllOnes("U0");

   if (DoesDeviceSupportMeshShaders(pDevice)) {
     // Disable CS so mesh goes forward
     pShaderOp->CS = nullptr;
     test = RunShaderOpTestAfterParse(pDevice, m_support, "SampleCmpLevel", SampleInitFn, ShaderOpSet);

     VerifyAllOnes("U0");
     VerifyAllOnes("U1");
     VerifyAllOnes("U2");
   }
 }

 // Integer texel with a single channel, R, stored in a bitfield RSize bits wide.
 template <unsigned RSize>
 struct IntR {
   unsigned R : RSize;

   // Channel widths in bits; the channels this type does not store report 0.
   static unsigned GetRSize() { return RSize; }
   static unsigned GetGSize() { return 0; }
   static unsigned GetBSize() { return 0; }
   static unsigned GetASize() { return 0; }

   // Stores R. G, B and A are taken only to keep the four-argument
   // SetChannels signature shared by the texel types, and are discarded.
   void SetChannels(unsigned R, unsigned G, unsigned B, unsigned A) {
     UNREFERENCED_PARAMETER(G);
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = R;
   }
 };

 // Integer texel with two channels: R in RSize bits followed by G in GSize bits.
 template <unsigned RSize, unsigned GSize>
 struct IntRG {
   unsigned R : RSize;
   unsigned G : GSize;

   // Channel widths in bits; the channels this type does not store report 0.
   static unsigned GetRSize() { return RSize; }
   static unsigned GetGSize() { return GSize; }
   static unsigned GetBSize() { return 0; }
   static unsigned GetASize() { return 0; }

   // Stores R and G. B and A are taken only to keep the four-argument
   // SetChannels signature shared by the texel types, and are discarded.
   void SetChannels(unsigned R, unsigned G, unsigned B, unsigned A) {
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = R;
     this->G = G;
   }
 };

 // Integer texel with three channels, R, G and B, stored in consecutive
 // bitfields of RSize, GSize and BSize bits.
 template <unsigned RSize, unsigned GSize, unsigned BSize>
 struct IntRGB {
   unsigned R : RSize;
   unsigned G : GSize;
   unsigned B : BSize;

   // Channel widths in bits; the alpha channel is not stored and reports 0.
   static unsigned GetRSize() { return RSize; }
   static unsigned GetGSize() { return GSize; }
   static unsigned GetBSize() { return BSize; }
   static unsigned GetASize() { return 0; }

   // Stores R, G and B. A is taken only to keep the four-argument
   // SetChannels signature shared by the texel types, and is discarded.
   void SetChannels(unsigned R, unsigned G, unsigned B, unsigned A) {
     UNREFERENCED_PARAMETER(A);
     this->R = R;
     this->G = G;
     this->B = B;
   }
 };


 // Integer texel with four channels, R, G, B and A, stored in consecutive
 // bitfields of RSize, GSize, BSize and ASize bits.
 template <unsigned RSize, unsigned GSize, unsigned BSize, unsigned ASize>
 struct IntRGBA {
   unsigned R : RSize;
   unsigned G : GSize;
   unsigned B : BSize;
   unsigned A : ASize;

   // Channel widths in bits.
   static unsigned GetRSize() { return RSize; }
   static unsigned GetGSize() { return GSize; }
   static unsigned GetBSize() { return BSize; }
   static unsigned GetASize() { return ASize; }

   // Stores all four channels; each value is truncated to its bitfield width.
   void SetChannels(unsigned R, unsigned G, unsigned B, unsigned A) {
     this->R = R;
     this->G = G;
     this->B = B;
     this->A = A;
   }
 };

 // Texel packing three 10-bit channels (R in bits 0-9, G in bits 10-19,
 // B in bits 20-29) and a 2-bit alpha (bits 30-31) into one 32-bit word.
 struct IntRGBA10XRA2UNORM {
   uint32_t RGBA;

   // Converts the float channels to their packed form and stores the result.
   // R, G and B are scaled and biased, then the upper ten bits of the value
   // GetMantissa returns are kept; A is truncated to an integer and reduced
   // to its two low bits.
   void SetChannels(float R, float G, float B, float A) {
     uint32_t ur, ug, ub, ua;
     // Conversion values taken from XR documentation
     ur = GetMantissa(R*510+385);
     ub = GetMantissa(B*510+385);
     ug = GetMantissa(G*510+385);
     ua = (uint32_t)A;

     // Cast off all but the 10 MSB and shift for packing.
     // Each channel must be derived from its own mantissa; packing G and B
     // from the already-shifted R would leave them both zero.
     ur = (ur&0x7fE000) >> 13;
     ug = (ug&0x7fE000) >> 3;
     ub = (ub&0x7fE000) << 7;
     ua = (ua&0x3) << 30;

     RGBA = ur | ug | ub | ua;
   }
 };

 // Texel holding a single 32-bit float channel, R.
 struct Float32R {
   float R;

   // Stores R as given. G, B and A are taken only to keep the four-argument
   // SetChannels signature shared by the texel types, and are discarded.
   void SetChannels(float R, float G, float B, float A) {
     UNREFERENCED_PARAMETER(G);
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = R;
   }
 };

 // Texel holding two 32-bit float channels, R then G.
 struct Float32RG {
   float R, G;

   // Stores R and G as given. B and A are taken only to keep the
   // four-argument SetChannels signature shared by the texel types, and are
   // discarded.
   void SetChannels(float R, float G, float B, float A) {
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = R;
     this->G = G;
   }
 };

 // Texel holding a single channel, R, as the 16 bits produced by
 // ConvertFloat32ToFloat16.
 struct Float16R {
   uint16_t R;

   // Converts R to 16 bits and stores it. G, B and A are taken only to keep
   // the four-argument SetChannels signature shared by the texel types, and
   // are discarded.
   void SetChannels(float R, float G, float B, float A) {
     UNREFERENCED_PARAMETER(G);
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = ConvertFloat32ToFloat16(R);
   }
 };

 // Texel holding two channels, R then G, each as the 16 bits produced by
 // ConvertFloat32ToFloat16.
 struct Float16RG {
   uint16_t R, G;

   // Converts R and G to 16 bits each and stores them. B and A are taken only
   // to keep the four-argument SetChannels signature shared by the texel
   // types, and are discarded.
   void SetChannels(float R, float G, float B, float A) {
     UNREFERENCED_PARAMETER(B);
     UNREFERENCED_PARAMETER(A);
     this->R = ConvertFloat32ToFloat16(R);
     this->G = ConvertFloat32ToFloat16(G);
   }
 };

 // No Float16RGB needed

 // Texel holding four channels, R, G, B and A in that order, each as the
 // 16 bits produced by ConvertFloat32ToFloat16.
 struct Float16RGBA {
   uint16_t R, G, B, A;

   // Converts every channel to 16 bits and stores it.
   void SetChannels(float R, float G, float B, float A) {
     const uint16_t halfR = ConvertFloat32ToFloat16(R);
     const uint16_t halfG = ConvertFloat32ToFloat16(G);
     const uint16_t halfB = ConvertFloat32ToFloat16(B);
     const uint16_t halfA = ConvertFloat32ToFloat16(A);
     this->R = halfR;
     this->G = halfG;
     this->B = halfB;
     this->A = halfA;
   }
 };

 // Texel packing three reduced-precision float channels into 32 bits:
 // R in bits 0-10, G in bits 11-21 and B in bits 22-31.
 struct FloatR11G11B10 {
   uint32_t RGB;

   // Converts R, G and B through ConvertFloat32ToFloat16 and packs them.
   // A is not stored.
   void SetChannels(float R, float G, float B, float A) {
     UNREFERENCED_PARAMETER(A);
     // Widen to 32 bits unsigned before shifting, so the left shifts can
     // never overflow a promoted signed int.
     const uint32_t halfR = ConvertFloat32ToFloat16(R);
     const uint32_t halfG = ConvertFloat32ToFloat16(G);
     const uint32_t halfB = ConvertFloat32ToFloat16(B);
     // Shift and mask so as to place R: 0-10, G: 11-21, B: 22-31
     // Sign and lesser-significant mantissa bits are truncated
     const uint32_t ur = (halfR >> 4) & 0x000007FF;
     const uint32_t ug = (halfG << 7) & 0x003FF800;
     const uint32_t ub = (halfB << 17) & 0xFFC00000;
     RGB = ur | ug | ub;
   }
 };

 // Texel packing R, G and B as three 9-bit fields (bits 0-8, 9-17 and 18-26)
 // plus a value derived from their common power of two in the bits above.
 struct FloatRGBE {
   uint32_t RGBE;
   // Conversion logic taken from miniengine PixelPacking header
   // R, G and B are integer inputs that are converted to float first;
   // A is not stored.
   void SetChannels(UINT R, UINT G, UINT B, UINT A) {
     // Each union gives access to the bit pattern of a float
     union { uint32_t i; float f; } ur, ug, ub, maxChannel, nextPow2;
     ur.f = (float)R;
     ug.f = (float)G;
     ub.f = (float)B;
     maxChannel.f = std::max(ur.f, std::max(ug.f, ub.f));
     // nextPow2 has to have the biggest exponent plus 1 (and nothing in the mantissa)
     nextPow2.i = (maxChannel.i + 0x800000) & 0x7F800000;

     // By adding nextPow2, all channels have the same exponent, shifting their mantissa bits
     // to the right to accommodate it.  This also shifts in the implicit '1' bit of all channels.
     // The largest channel will always have the high bit set.
     ur.f += nextPow2.f;
     ug.f += nextPow2.f;
     ub.f += nextPow2.f;
     UNREFERENCED_PARAMETER(A);

     // Drop the top 9 bits of each pattern, then keep the 9 bits that follow
     ur.i = (ur.i << 9) >> 23;
     ug.i = (ug.i << 9) >> 23;
     ub.i = (ub.i << 9) >> 23;

     // The shared power of two, converted to 16 bits and moved above the
     // three 9-bit channel fields
     uint32_t e = ConvertFloat32ToFloat16(nextPow2.f) << 17;
     RGBE = ur.i | ug.i << 9 | ub.i << 18 | e;
   }

   // Channel widths in bits; there is no alpha channel.
   static unsigned GetRSize() { return 9; }
   static unsigned GetGSize() { return 9; }
   static unsigned GetBSize() { return 9; }
   static unsigned GetASize() { return 0; }
 };

 // Raw gather source texture of xdim*ydim texels of RGBAType, filled with
 // float data derived from each texel's coordinates.
 // RGBAType must provide SetChannels(float, float, float, float).
 template <typename RGBAType, unsigned xdim, unsigned ydim>
 struct RawFloatTexture : public ExecutionTest::RawGatherTexture {
   DXGI_FORMAT m_format;
   RGBAType RGBA[xdim*ydim];
   RawFloatTexture(DXGI_FORMAT format) : m_format(format) {}
   // Set i'th element to floatified x,y and some derived values
   virtual void SetElement(int i, int x, int y) override {
     float r = (float)x;
     float g = (float)y;
     // provide some different values just to fill in b and a
     float b = (float)(x + y)*0.5f;
     float a = (float)(x + y)*0.1f;
     RGBA[i].SetChannels(r, g, b, a);
   }
   // Raw texel storage
   virtual void *GetElements() override { return (void*)RGBA; }
   // Texture dimensions in texels
   virtual unsigned GetXDim() override { return xdim; }
   virtual unsigned GetYDim() override { return ydim; }
   // Format supplied at construction
   virtual DXGI_FORMAT GetFormat() override { return m_format; }
 };

 // Raw gather source texture of xdim*ydim FloatR11G11B10 texels, filled with
 // float data derived from each texel's coordinates. Always reports
 // DXGI_FORMAT_R11G11B10_FLOAT as its format.
 template <unsigned xdim, unsigned ydim>
 struct RawFloatR11G11B10ATexture : public ExecutionTest::RawGatherTexture {
   FloatR11G11B10 RGBA[xdim*ydim];
   // Set i'th element to floatified x,y and some derived values
   virtual void SetElement(int i, int x, int y) override {
     float r = (float)x;
     float g = (float)y;
     float b = (float)(x + y)*0.5f;
     // The texel type stores no alpha, so zero is passed for it
     RGBA[i].SetChannels(r, g, b, 0);
   }
   // Raw texel storage
   virtual void *GetElements() override { return (void*)RGBA; }
   // Texture dimensions in texels
   virtual unsigned GetXDim() override { return xdim; }
   virtual unsigned GetYDim() override { return ydim; }
   virtual DXGI_FORMAT GetFormat() override { return DXGI_FORMAT_R11G11B10_FLOAT; }
 };

 // Raw gather source texture of xdim*ydim texels of RGBAType, filled with
 // integer data derived from each texel's coordinates.
 // RGBAType must provide SetChannels taking four UINT-convertible values and
 // the static GetRSize/GetGSize/GetBSize/GetASize channel widths.
 template <typename RGBAType, unsigned xdim, unsigned ydim>
 struct RawIntTexture : public ExecutionTest::RawGatherTexture {
   bool m_isSigned;   // subtract m_maxVal from every channel value
   bool m_isNorm;     // rescale channel values by m_maxVal and the channel width
   unsigned m_maxVal; // maxVal + 2, halved when signed
   DXGI_FORMAT m_format;
   RGBAType RGBA[xdim*ydim];
   // isSigned/isNorm select how element values are transformed, maxVal is the
   // largest coordinate value expected and format is what GetFormat reports.
   RawIntTexture(bool isSigned, bool isNorm, int maxVal, DXGI_FORMAT format)
     : m_isSigned(isSigned), m_isNorm(isNorm), m_maxVal(maxVal + 2), m_format(format) {
     if (isSigned)
       m_maxVal /= 2;
   }
   // Set i'th element to values scaled per max dimensions for norms, shifted for signed
   // but otherwise just the x and y values themselves
   virtual void SetElement(int i, int x, int y) override {
     double fr = x;
     double fg = y;
     // provide some different values just to fill in b and a
     double fb = x + 2;
     double fa = y + 2;
     // If signed, shift the values down by m_maxVal so that some of them
     // become negative
     if (m_isSigned) {
       fr -= m_maxVal;
       fg -= m_maxVal;
       fb -= m_maxVal;
       fa -= m_maxVal;
     }
     // If normalized, scale to given range
     if (m_isNorm) {
       fr /= m_maxVal;
       fg /= m_maxVal;
       fb /= m_maxVal;
       fa /= m_maxVal;

       // NOTE(review): for channels whose size is 0 the shift count below
       // wraps around; the result is discarded by SetChannels for those
       // channels, but the shift itself is not well defined - confirm.
       fr *= (1 << (RGBAType::GetRSize() - m_isSigned - 1));
       fg *= (1 << (RGBAType::GetGSize() - m_isSigned - 1));
       fb *= (1 << (RGBAType::GetBSize() - m_isSigned - 1));
       // NOTE(review): unlike R, G and B, the alpha scale does not subtract
       // m_isSigned - confirm this is intended.
       fa *= (1 << (RGBAType::GetASize() - 1));
     }
     // NOTE(review): negative values are converted straight from double to
     // UINT here; presumably this relies on the compiler wrapping them - verify.
     RGBA[i].SetChannels((UINT)fr, (UINT)fg, (UINT)fb, (UINT)fa);
   }
   // Raw texel storage
   virtual void *GetElements() { return (void*)RGBA; }
   // Texture dimensions in texels
   virtual unsigned GetXDim() { return xdim; }
   virtual unsigned GetYDim() { return ydim; }
   // Format supplied at construction
   virtual DXGI_FORMAT GetFormat() override { return m_format; };
 };

 // Raw gather source texture of xdim*ydim IntRGBA10XRA2UNORM texels, filled
 // with data derived from each texel's coordinates.
 template <unsigned xdim, unsigned ydim>
 struct RawR10G10B10XRA2Texture : public ExecutionTest::RawGatherTexture {
   unsigned m_maxVal; // (maxVal + 2) / 2: half of the expected value range
   DXGI_FORMAT m_format;
   IntRGBA10XRA2UNORM RGBA[xdim*ydim];
   // maxVal is the largest coordinate value expected and format is what
   // GetFormat reports.
   RawR10G10B10XRA2Texture(int maxVal, DXGI_FORMAT format)
     : m_maxVal((maxVal + 2)/2), m_format(format) {}
   // Set i'th element to values scaled and shifted for available range
   virtual void SetElement(int i, int x, int y) override {
     double fr = x;
     double fg = y;
     // provide some different values just to fill in b and a
     double fb = x + 2;
     double fa = y + 2;

     // Shift RGB to valid range which will be -0.75 - 1.25
     fr -= m_maxVal*.75;
     fg -= m_maxVal*.75;
     fb -= m_maxVal*.75;

     // normalize to something that will fit in the limited range
     fr /= m_maxVal;
     fg /= m_maxVal;
     fb /= m_maxVal;
     fa /= m_maxVal*2;

     fa *= 3; // scale to max in range

     RGBA[i].SetChannels((float)fr, (float)fg, (float)fb, (float)fa);
   }
   // Raw texel storage
   virtual void *GetElements() override { return (void*)RGBA; }
   // Texture dimensions in texels
   virtual unsigned GetXDim() override { return xdim; }
   virtual unsigned GetYDim() override { return ydim; }
   // Format supplied at construction
   virtual DXGI_FORMAT GetFormat() override { return m_format; }
 };

 //#define RAWGATHER_FALLBACK // Enable to use pre-6.7 fallback mechanisms to vet raw gather tests

 // Create a single resource of <resFormat> and alias it to a view of <viewFormat>
 // Then execute a shader that uses raw gather to copy the values into a UAV
 // Verify that the UAV has the same values as passed in.
 template<typename GatherType>
 void ExecutionTest::DoRawGatherTest(ID3D12Device *pDevice, RawGatherTexture *rawTex, DXGI_FORMAT viewFormat) {

   DXGI_FORMAT resFormat = rawTex->GetFormat();
 #ifdef RAWGATHER_FALLBACK
   // There is no uint64 version of Gather, so 64-bit fallback needs to use Loads
   const char shaderTemplate64[] =
     "Texture2D<uint%d_t> g_tex : register(t0);\n"
     "RWStructuredBuffer<uint%d_t> g_out : register(u0);\n"
     "SamplerState g_samp : register(s0);\n"
     "[NumThreads(32, 32, 1)]\n"
     "void main(uint3 id : SV_GroupThreadID, uint ix : SV_GroupIndex) {\n"
     "  //uint%d_t4 res = g_tex.%s(g_samp, (id.xy+0.5)/31.0);\n"
     "  g_out[4*ix+0] = g_tex.Load(uint3(id.x, id.y+1, 0));\n"
     "  g_out[4*ix+1] = g_tex.Load(uint3(id.x+1, id.y+1, 0));\n"
     "  g_out[4*ix+2] = g_tex.Load(uint3(id.x+1, id.y, 0));\n"
     "  g_out[4*ix+3] = g_tex.Load(uint3(id.x, id.y, 0));\n"
     "}";
 #endif
   const char shaderTemplate[] =
     "Texture2D<uint%d_t> g_tex : register(t0);\n"
     "RWStructuredBuffer<uint%d_t> g_out : register(u0);\n"
     "SamplerState g_samp : register(s0);\n"
     "[NumThreads(32, 32, 1)]\n"
     "void main(uint3 id : SV_GroupThreadID, uint ix : SV_GroupIndex) {\n"
     "  uint%d_t4 res = g_tex.%s(g_samp, (id.xy+0.5)/31.0);\n"
     "  g_out[4*ix+0] = res.x;\n"
     "  g_out[4*ix+1] = res.y;\n"
     "  g_out[4*ix+2] = res.z;\n"
     "  g_out[4*ix+3] = res.w;\n"
     "}";

   char pShader[sizeof(shaderTemplate) + 200]; // A little padding to account for variations
   UINT uintSize = sizeof(GatherType)*8; // bytes to bits

   const char *gatherFuncName = "GatherRaw";
 #ifdef RAWGATHER_FALLBACK
   gatherFuncName = "Gather";
   if (sizeof(GatherType) == 8)
     VERIFY_IS_GREATER_THAN(sprintf(pShader, shaderTemplate64, uintSize, uintSize, uintSize, gatherFuncName), 0);
   else
 #endif
     VERIFY_IS_GREATER_THAN(sprintf(pShader, shaderTemplate, uintSize, uintSize, uintSize, gatherFuncName), 0);

   const UINT xDim = rawTex->GetXDim();
   const UINT yDim = rawTex->GetYDim();
   const UINT valueSize = xDim * yDim;
   const UINT valueSizeInBytes =  valueSize * sizeof(GatherType);

   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   FenceObj FO;

   CreateComputeCommandQueue(pDevice, L"RawGather Queue", &pCommandQueue);
   InitFenceObj(pDevice, &FO);

   // Create root signature.
   CComPtr<ID3D12RootSignature> pRootSignature;
   CD3DX12_DESCRIPTOR_RANGE ranges[2];
   CD3DX12_DESCRIPTOR_RANGE srange[1];
   ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0);
   ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0);
   srange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 1, 0, 0);

   CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, 2, srange, 1);

   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));

   // Create command list and resources
   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE,
                                               pCommandAllocator, nullptr, IID_PPV_ARGS(&pCommandList)));

   // Set up castable format list (of one) if possible, or else just alias the
   // formats with the expectation that unsupported cases won't be used by the caller
   DXGI_FORMAT *castableFmt = nullptr;
   if (DoesDeviceSupportEnhancedBarriers(pDevice))
     castableFmt = &viewFormat;
   else
     resFormat = viewFormat;

   // Set up texture to be raw gathered from
   CComPtr<ID3D12Resource> pTexResource;
   CComPtr<ID3D12Resource> pTexUploadResource;
   int ix = 0;
   for (UINT y = 0; y < yDim; y++)
     for (UINT x = 0; x < xDim; x++)
       rawTex->SetElement(ix++, x, y);
   D3D12_RESOURCE_DESC tex2dDesc = CD3DX12_RESOURCE_DESC::Tex2D(resFormat, xDim, yDim, 1/* sampCt */, 1/* mipCt */);

   CreateTestResources(pDevice, pCommandList, rawTex->GetElements(), valueSizeInBytes, tex2dDesc,
                       &pTexResource, &pTexUploadResource,
                       nullptr /*pReadBufer*/, castableFmt);

   // Set up Output Resource
   CComPtr<ID3D12Resource> pOutputResource;
   CComPtr<ID3D12Resource> pOutputReadBuffer;
   CComPtr<ID3D12Resource> pOutputUploadResource;

   // 4x because gather produces four result values
   GatherType *outVals = new GatherType[valueSize*4];
   memset(outVals, 0xd, valueSizeInBytes*4); // 0xd to give a sentinal value for failures
   CreateTestUavs(pDevice, pCommandList, outVals, valueSizeInBytes*4, &pOutputResource,
                  &pOutputUploadResource, &pOutputReadBuffer);
   delete[] outVals;

   // Close the command list and execute it to perform the resource uploads
   pCommandList->Close();
   ID3D12CommandList *ppCommandLists[] = { pCommandList };
   pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
   WaitForSignal(pCommandQueue, FO);

   // Create shaders
 #ifdef RAWGATHER_FALLBACK
   const wchar_t *target = L"cs_6_2";
 #else
   const wchar_t *target = L"cs_6_7";
 #endif

   LPCWSTR opts[] = {L"-enable-16bit-types"};

   CComPtr<ID3D12PipelineState> pPSO;
   CreateComputePSO(pDevice, pRootSignature, pShader, target, &pPSO, opts, _countof(opts));

   // Reset commandlist to shader PSO
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pPSO));

   // Describe and create a resource descriptor heap.
   CComPtr<ID3D12DescriptorHeap> pResHeap;
   CComPtr<ID3D12DescriptorHeap> pSampHeap;
   D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
   heapDesc.NumDescriptors = 2;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
   heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pResHeap)));

   // Describe and create a sampler descriptor heap.
   heapDesc.NumDescriptors = 1;
   heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER;
   VERIFY_SUCCEEDED(pDevice->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&pSampHeap)));

   CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(pResHeap->GetCPUDescriptorHandleForHeapStart());
   CreateTex2DSRV(pDevice, cpuHandle, viewFormat, pTexResource);
   CreateStructUAV(pDevice, cpuHandle, 4*valueSize, sizeof(GatherType), pOutputResource);

   D3D12_FILTER filters[] = {D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT,
                             D3D12_FILTER_COMPARISON_MIN_MAG_LINEAR_MIP_POINT};
   CreateDefaultSamplers(pDevice, pSampHeap->GetCPUDescriptorHandleForHeapStart(),
                         filters, nullptr /*perSampleBorderColors*/, 1);

   // Set Heaps, Rootsignature and table
   ID3D12DescriptorHeap *const pHeaps[2] = { pResHeap, pSampHeap };
   pCommandList->SetDescriptorHeaps(2, pHeaps);
   pCommandList->SetComputeRootSignature(pRootSignature);
   pCommandList->SetComputeRootDescriptorTable(0, pResHeap->GetGPUDescriptorHandleForHeapStart());
   pCommandList->SetComputeRootDescriptorTable(1, pSampHeap->GetGPUDescriptorHandleForHeapStart());

   // dispatch and close shader
   pCommandList->Dispatch(1, 1, 1);

   // Copy the results back to readable memory
   CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource,
                                         D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->ResourceBarrier(1, &barrier);
   pCommandList->CopyResource(pOutputReadBuffer, pOutputResource);

   pCommandList->Close();

   pCommandQueue->ExecuteCommandLists(1, ppCommandLists);
   WaitForSignal(pCommandQueue, FO);

   MappedData mappedData(pOutputReadBuffer, 4*valueSizeInBytes);
   GatherType *pData = (GatherType*)mappedData.data();
   GatherType *texVals = (GatherType*)rawTex->GetElements();
   UINT yCt = yDim;
   UINT xCt = xDim;
 #ifdef RAWGATHER_FALLBACK
   // 64-bit fallback uses Load, which doesn't support clamp addressing. so don't test it
   if (sizeof(GatherType) == 8) {
     yCt--;
     xCt--;
   }
 #endif
   for (UINT y = 0; y < yCt; y++) {
     UINT yp1 = y+1>=yDim?y:y+1;
     for (UINT x = 0; x < xCt; x++) {
       UINT xp1 = x+1>=xDim?x:x+1;
       // Because this order may be unexpected, I'll quote the spec:
       // "The four samples that would contribute to filtering are placed into xyzw
       //  in counter clockwise order starting with the sample to the lower left"
       VERIFY_ARE_EQUAL(pData[4*(32*y + x)+0], texVals[yp1*xDim + x]);
       VERIFY_ARE_EQUAL(pData[4*(32*y + x)+1], texVals[yp1*xDim + xp1]);
       VERIFY_ARE_EQUAL(pData[4*(32*y + x)+2], texVals[y*xDim   + xp1]);
       VERIFY_ARE_EQUAL(pData[4*(32*y + x)+3], texVals[y*xDim   + x]);
     }
   }
 }

 // Creates textures of various formats, aliases each one to the unsigned
 // integer format that has the same element size, and initializes them with
 // various values.
 // The shader code copies the results of raw gather to an unsigned integer UAV.
 // The UAV contents are compared to the values assigned to the texture.
 // A few levels of support are available:
 // pre-6.7 fallback - fakey hand waving to make it look like it's doing the right thing
 // 6.7 support only - No casting ability of resources to views beyond native support, but GatherRaw is available
 // 6.7 + Enh. Barriers - Same formats can be cast as in native, but use new createcommittedresource3()
 // 6.7 + Enh. Barriers + Relaxed Cast - All format casting and raw gathering of
 //                                      every format listed in the arrays below
 TEST_F(ExecutionTest, ATORawGather) {

   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

 #ifdef RAWGATHER_FALLBACK
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_6;
 #else
   D3D_SHADER_MODEL sm = D3D_SHADER_MODEL_6_7;
 #endif
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, sm))
       return;

 #ifndef RAWGATHER_FALLBACK
   if (!DoesDeviceSupportAdvancedTexOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support Advanced Texture Operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }
 #endif

   // Texture dimensions; also passed to the integer texture constructors.
   static const int NumThreadsX = 32;
   static const int NumThreadsY = 32;

   // Create an array of texture variants with the raw texture base class
   // Then plug them into DoRawGatherTest to perform the test and evaluate the results for each

   // ---- Formats with 64-bit elements, gathered through R32G32_UINT ----
   RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R32G32_TYPELESS);
   RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_UINT(false, false, NumThreadsX, DXGI_FORMAT_R32G32_UINT);
   RawIntTexture<IntRG<32, 32>, NumThreadsX, NumThreadsY> R32G32_SINT(true, false, NumThreadsX, DXGI_FORMAT_R32G32_SINT);

   RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_TYPELESS);
   RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_UINT(false, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_UINT);
   RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_SINT(true, false, NumThreadsX, DXGI_FORMAT_R16G16B16A16_SINT);
   RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R16G16B16A16_UNORM);
   RawIntTexture<IntRGBA<16, 16, 16, 16>, NumThreadsX, NumThreadsY> R16G16B16A16_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R16G16B16A16_SNORM);
   RawFloatTexture<Float16RGBA, NumThreadsX, NumThreadsY> R16G16B16A16_FLOAT(DXGI_FORMAT_R16G16B16A16_FLOAT);
   RawFloatTexture<Float32RG, NumThreadsX, NumThreadsY> R32G32_FLOAT(DXGI_FORMAT_R32G32_FLOAT);

   RawGatherTexture *Int64Textures[] = {
                               &R32G32_TYPELESS,
                               &R32G32_UINT,
                               &R32G32_SINT,
                               &R16G16B16A16_TYPELESS,
                               &R16G16B16A16_UINT,
                               &R16G16B16A16_SINT,
                               &R16G16B16A16_UNORM,
                               &R16G16B16A16_SNORM,
                               &R16G16B16A16_FLOAT,
                               &R32G32_FLOAT};

   // ---- Formats with 32-bit elements, gathered through R32_UINT ----
   RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R32_TYPELESS);
   RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_SINT(true, false, NumThreadsX, DXGI_FORMAT_R32_SINT);
   // NOTE(review): every other *_UINT texture in this test passes false as the
   // first constructor argument; confirm whether true is intended here.
   RawIntTexture<IntR<32>, NumThreadsX, NumThreadsY> R32_UINT(true, false, NumThreadsX, DXGI_FORMAT_R32_UINT);

   RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R10G10B10A2_TYPELESS);
   RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R10G10B10A2_UNORM);
   RawIntTexture<IntRGBA<10, 10, 10, 2>, NumThreadsX, NumThreadsY> R10G10B10A2_UINT(false, false, NumThreadsX, DXGI_FORMAT_R10G10B10A2_UINT);
   RawR10G10B10XRA2Texture<NumThreadsX, NumThreadsY> R10G10B10A2_XR_BIAS_A2_UNORM(NumThreadsX, DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM);
   RawIntTexture<FloatRGBE, NumThreadsX, NumThreadsY> R9G9B9E5_SHAREDEXP(false, false, NumThreadsX, DXGI_FORMAT_R9G9B9E5_SHAREDEXP);

   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_TYPELESS);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UNORM);
   // Use the sRGB format here; this entry previously duplicated
   // DXGI_FORMAT_R8G8B8A8_UNORM, so the sRGB variant was never exercised.
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UNORM_SRGB);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_UINT(false, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_UINT);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R8G8B8A8_SNORM);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> R8G8B8A8_SINT(true, false, NumThreadsX, DXGI_FORMAT_R8G8B8A8_SINT);

   RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16G16_TYPELESS);
   RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_UNORM(false, true, NumThreadsX, DXGI_FORMAT_R16G16_UNORM);
   RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_UINT(false, false, NumThreadsX, DXGI_FORMAT_R16G16_UINT);
   RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_SNORM(true, true, NumThreadsX, DXGI_FORMAT_R16G16_SNORM);
   RawIntTexture<IntRG<16, 16>, NumThreadsX, NumThreadsY> R16G16_SINT(true, false, NumThreadsX, DXGI_FORMAT_R16G16_SINT);

   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_B8G8R8A8_TYPELESS);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8A8_UNORM);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8A8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8A8_UNORM_SRGB);

   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_B8G8R8X8_TYPELESS);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8X8_UNORM);
   RawIntTexture<IntRGBA<8, 8, 8, 8>, NumThreadsX, NumThreadsY> B8G8R8X8_UNORM_SRGB(false, true, NumThreadsX, DXGI_FORMAT_B8G8R8X8_UNORM_SRGB);

   RawFloatTexture<Float32R, NumThreadsX, NumThreadsY> R32_FLOAT(DXGI_FORMAT_R32_FLOAT);
   RawFloatR11G11B10ATexture<NumThreadsX, NumThreadsY> R11G11B10_FLOAT;
   RawFloatTexture<Float16RG, NumThreadsX, NumThreadsY> R16G16_FLOAT(DXGI_FORMAT_R16G16_FLOAT);

   RawGatherTexture *Int32Textures[] = {
                                 &R32_TYPELESS,
                                 &R32_UINT,
                                 &R32_SINT,
                                 &R10G10B10A2_TYPELESS,
                                 &R10G10B10A2_UNORM,
                                 &R10G10B10A2_UINT,
                                 &R10G10B10A2_XR_BIAS_A2_UNORM,
                                 &R9G9B9E5_SHAREDEXP,
                                 &R8G8B8A8_TYPELESS,
                                 &R8G8B8A8_UNORM,
                                 &R8G8B8A8_UNORM_SRGB,
                                 &R8G8B8A8_UINT,
                                 &R8G8B8A8_SNORM,
                                 &R8G8B8A8_SINT,
                                 &R16G16_TYPELESS,
                                 &R16G16_UNORM,
                                 &R16G16_UINT,
                                 &R16G16_SNORM,
                                 &R16G16_SINT,
                                 &B8G8R8A8_TYPELESS,
                                 &B8G8R8A8_UNORM,
                                 &B8G8R8A8_UNORM_SRGB,
                                 &B8G8R8X8_TYPELESS,
                                 &B8G8R8X8_UNORM,
                                 &B8G8R8X8_UNORM_SRGB,
                                 &R32_FLOAT,
                                 &R11G11B10_FLOAT,
                                 &R16G16_FLOAT};

   // ---- Formats with 16-bit elements, gathered through R16_UINT ----
   RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R16_TYPELESS);
   RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_SINT(true,  false, NumThreadsX, DXGI_FORMAT_R16_SINT);
   // NOTE(review): every other *_UINT texture in this test passes false as the
   // first constructor argument; confirm whether true is intended here.
   RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_UINT(true,  false, NumThreadsX, DXGI_FORMAT_R16_UINT);
   RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_UNORM(false, true,  NumThreadsX, DXGI_FORMAT_R16_UNORM);
   RawIntTexture<IntR<16>, NumThreadsX, NumThreadsY> R16_SNORM(true,  true,  NumThreadsX, DXGI_FORMAT_R16_SNORM);
   RawFloatTexture<Float16R, NumThreadsX, NumThreadsY> R16_FLOAT(DXGI_FORMAT_R16_FLOAT);

   RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_TYPELESS(false, false, NumThreadsX, DXGI_FORMAT_R8G8_TYPELESS);
   RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_UINT(false, false, NumThreadsX, DXGI_FORMAT_R8G8_UINT);
   RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_SINT(true,  false, NumThreadsX, DXGI_FORMAT_R8G8_SINT);
   RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_UNORM(false, true,  NumThreadsX, DXGI_FORMAT_R8G8_UNORM);
   RawIntTexture<IntRG<8, 8>, NumThreadsX, NumThreadsY> R8G8_SNORM(true,  true,  NumThreadsX, DXGI_FORMAT_R8G8_SNORM);
   RawIntTexture<IntRGB<5, 6, 5>, NumThreadsX, NumThreadsY> B5G6R5_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B5G6R5_UNORM);
   RawIntTexture<IntRGBA<5, 5, 5, 1>, NumThreadsX, NumThreadsY> B5G5R5A1_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B5G5R5A1_UNORM);
   RawIntTexture<IntRGBA<4, 4, 4, 4>, NumThreadsX, NumThreadsY> B4G4R4A4_UNORM(false, true, NumThreadsX, DXGI_FORMAT_B4G4R4A4_UNORM);

   RawGatherTexture *Int16Textures[] = {
                                &R16_TYPELESS,
                                &R16_UINT,
                                &R16_SINT,
                                &R16_UNORM,
                                &R16_SNORM,
                                &R8G8_TYPELESS,
                                &R8G8_UINT,
                                &R8G8_SINT,
                                &R8G8_UNORM,
                                &R8G8_SNORM,
                                &B5G6R5_UNORM,
                                &B5G5R5A1_UNORM,
                                &B4G4R4A4_UNORM,
                                &R16_FLOAT};

   // Without relaxed format casting, only the leading entries of each array are
   // run. The counts are kept as size_t so no size_t -> int narrowing occurs.
   const bool canCast = DoesDeviceSupportRelaxedFormatCasting(pDevice);
   const size_t int32Ct = canCast ? _countof(Int32Textures) : 3; // The first three are already castable to UINT32

   for (size_t i = 0; i < int32Ct; i++) {
     DoRawGatherTest<uint32_t>(pDevice, Int32Textures[i], DXGI_FORMAT_R32_UINT);
   }

   // 16-bit and 64-bit gathers are optional and only run when the device reports support
   if (DoesDeviceSupportNative16bitOps(pDevice)) {
     const size_t int16Ct = canCast ? _countof(Int16Textures) : 5; // The first five are already castable to UINT16
     for (size_t i = 0; i < int16Ct; i++) {
       DoRawGatherTest<uint16_t>(pDevice, Int16Textures[i], DXGI_FORMAT_R16_UINT);
     }
   }
   if (DoesDeviceSupportInt64(pDevice)) {
     const size_t int64Ct = canCast ? _countof(Int64Textures) : 3; // The first three are already castable to UINT64
     for (size_t i = 0; i < int64Ct; i++) {
       DoRawGatherTest<uint64_t>(pDevice, Int64Textures[i], DXGI_FORMAT_R32G32_UINT);
     }
   }
 }

 // Executes a simple binop to verify shader model 6.1 support; runs with
 // ShaderModel61.CoreRequirement
 TEST_F(ExecutionTest, BasicShaderModel61) {
   const D3D_SHADER_MODEL targetModel = D3D_SHADER_MODEL_6_1;
   RunBasicShaderModelTest(targetModel);
 }

 // Executes a simple binop to verify shader model 6.3 support; runs with
 // ShaderModel63.CoreRequirement
 TEST_F(ExecutionTest, BasicShaderModel63) {
   const D3D_SHADER_MODEL targetModel = D3D_SHADER_MODEL_6_3;
   RunBasicShaderModelTest(targetModel);
 }

 // Runs a small "output = input1 + input2" compute shader for the given shader
 // model (6.1 or 6.3 only) with float inputs, and additionally with double and
 // int64_t inputs when the device reports support for them.
 // Returns early without testing when CreateDevice fails for shaderModel.
 void ExecutionTest::RunBasicShaderModelTest(D3D_SHADER_MODEL shaderModel) {

   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, shaderModel)) {
     return;
   }

   const char *pShaderModelStr = nullptr;
   if (shaderModel == D3D_SHADER_MODEL_6_1) {
     pShaderModelStr = "cs_6_1";
   } else if (shaderModel == D3D_SHADER_MODEL_6_3) {
     pShaderModelStr = "cs_6_3";
   } else {
     // Assert on false: asserting on a string literal can never fire. Bail out
     // as well so that a null target string is never handed to the shader op.
     DXASSERT_ARGS(false, "Invalid Shader Model Parameter %d", static_cast<int>(shaderModel));
     WEX::Logging::Log::Error(L"Invalid Shader Model Parameter");
     return;
   }

   const char shaderTemplate[] =
       "struct SBinaryOp { %s input1; %s input2; %s output; };"
       "RWStructuredBuffer<SBinaryOp> g_buf : register(u0);"
       "[numthreads(8,8,1)]"
       "void main(uint GI : SV_GroupIndex) {"
       "    SBinaryOp l = g_buf[GI];"
       "    l.output = l.input1 + l.input2;"
       "    g_buf[GI] = l;"
       "}";
   // Each of the three %s (2 chars) is replaced by a type name; the 50 bytes of
   // slack cover the longest name used below ("int64_t").
   char shader[sizeof(shaderTemplate) + 50];

   // Expands the template for one element type; false on error or truncation.
   auto formatShader = [&](const char *typeName) -> bool {
     const int written = snprintf(shader, sizeof(shader), shaderTemplate, typeName, typeName, typeName);
     return written > 0 && static_cast<size_t>(written) < sizeof(shader);
   };

   // Run simple shader with float data types
   float inputFloatPairs[] = { 1.5f, -2.8f, 3.23e-5f, 6.0f, 181.621f, 14.978f };
   VERIFY_IS_TRUE(formatShader("float"));
   WEX::Logging::Log::Comment(L"BasicShaderModel float");
   RunBasicShaderModelTest<float>(pDevice, pShaderModelStr, shader, inputFloatPairs,
                                  static_cast<unsigned>(_countof(inputFloatPairs) / 2));

   // Run simple shader with double data types
   if (DoesDeviceSupportDouble(pDevice)) {
     // 1.0 / 3.0, not 1 / 3: the integer division evaluated to 0
     double inputDoublePairs[] = { 1.5891020, -2.8, 3.23e-5, 1.0 / 3.0, 181.91621, 14.654978 };
     VERIFY_IS_TRUE(formatShader("double"));
     WEX::Logging::Log::Comment(L"BasicShaderModel double");
     RunBasicShaderModelTest<double>(pDevice, pShaderModelStr, shader, inputDoublePairs,
                                     static_cast<unsigned>(_countof(inputDoublePairs) / 2));
   }
   else {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support double operations.");
   }

   // Run simple shader with int64 types
   if (DoesDeviceSupportInt64(pDevice)) {
     int64_t inputInt64Pairs[] = { 1, -100, 6814684, -9814810, 654, 1021248900 };
     VERIFY_IS_TRUE(formatShader("int64_t"));
     WEX::Logging::Log::Comment(L"BasicShaderModel int64_t");
     RunBasicShaderModelTest<int64_t>(pDevice, pShaderModelStr, shader, inputInt64Pairs,
                                      static_cast<unsigned>(_countof(inputInt64Pairs) / 2));
   }
   else {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
   }
 }

 template <class Ty>
 const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString() {
   DXASSERT_NOMSG("Unsupported type");
   return "";
 }

 // Log format for float elements: index, both inputs, the output read back and
 // the expected value. The second operand is labeled input2 (it was
 // mislabeled input1).
 template <>
 const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<float>() {
   return L"element #%u: input1 = %6.8f, input2 = %6.8f, output = %6.8f, expected = %6.8f";
 }

 // Doubles are logged with the same format string as floats.
 template <>
 const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<double>() {
   const wchar_t *floatFormat = BasicShaderModelTest_GetFormatString<float>();
   return floatFormat;
 }

 // Log format for int64_t elements: index, both inputs, the output read back
 // and the expected value. Uses %lld because the arguments are 64-bit; %ld
 // only consumes a 32-bit long on Windows. The second operand is labeled
 // input2 (it was mislabeled input1).
 template <>
 const wchar_t* ExecutionTest::BasicShaderModelTest_GetFormatString<int64_t>() {
   return L"element #%u: input1 = %lld, input2 = %lld, output = %lld, expected = %lld";
 }

 // Runs the "BinaryFPOp" shader op loaded from ShaderOpArith.xml, with its
 // first shader's target and text replaced by pShaderModelStr and pShader.
 // pInputDataPairs holds inputDataCount consecutive (input1, input2) pairs.
 // After the run, each output read back from "SBinaryFPOp" is logged and
 // verified to equal input1 + input2 computed here.
 template <class Ty>
 void ExecutionTest::RunBasicShaderModelTest(CComPtr<ID3D12Device> pDevice, const char *pShaderModelStr, const char *pShader,
                                            Ty *pInputDataPairs, unsigned inputDataCount) {
   // One element of the test resource: two inputs followed by the output.
   struct SBinaryOp {
     Ty input1;
     Ty input2;
     Ty output;
   };

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   // Called while the test creates its resources: overrides the shader and
   // fills the resource data with the input pairs.
   auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     UNREFERENCED_PARAMETER(Name);
     auto &firstShader = pShaderOp->Shaders.at(0);
     firstShader.Target = pShaderModelStr;
     firstShader.Text = pShader;

     const size_t byteCount = sizeof(SBinaryOp) * inputDataCount;
     Data.resize(byteCount);
     SBinaryOp *pElements = (SBinaryOp*)Data.data();
     for (size_t idx = 0; idx < inputDataCount; idx++) {
       SBinaryOp &element = pElements[idx];
       element.input1 = pInputDataPairs[2 * idx];
       element.input2 = pInputDataPairs[2 * idx + 1];
     }
   };

   std::shared_ptr<ShaderOpTestResult> test =
     RunShaderOpTest(pDevice, m_support, pStream, "BinaryFPOp", initResource);

   VERIFY_SUCCEEDED(S_OK);

   MappedData data;
   test->Test->GetReadBackData("SBinaryFPOp", &data);
   SBinaryOp *pResults = (SBinaryOp*)data.data();

   const wchar_t* formatStr = BasicShaderModelTest_GetFormatString<Ty>();

   for (unsigned idx = 0; idx < inputDataCount; idx++) {
     Ty lhs = pInputDataPairs[2 * idx];
     Ty rhs = pInputDataPairs[2 * idx + 1];
     Ty expValue = lhs + rhs;
     SBinaryOp *pResult = &pResults[idx];

     LogCommentFmt(formatStr, idx, lhs, rhs, pResult->output, expValue);
     VERIFY_ARE_EQUAL(pResult->output, expValue);
   }
 }


 // Resource structure for data-driven tests.

 // Unary float op element: one input value followed by one output value.
 struct SUnaryFPOp {
     float input, output;
 };

 // Binary float op element: two input values followed by two output values.
 struct SBinaryFPOp {
     float input1, input2;
     float output1, output2;
 };

 // Tertiary float op element: three input values followed by one output value.
 struct STertiaryFPOp {
     float input1, input2, input3;
     float output;
 };

 // Unary half op element: input and output, each stored in a uint16_t.
 struct SUnaryHalfOp {
   uint16_t input, output;
 };

 // Binary half op element: two inputs then two outputs, each stored in a uint16_t.
 struct SBinaryHalfOp {
   uint16_t input1, input2;
   uint16_t output1, output2;
 };

 // Tertiary half op element: three inputs then one output, each stored in a uint16_t.
 struct STertiaryHalfOp {
   uint16_t input1, input2, input3;
   uint16_t output;
 };

 // Unary int op element: one input value followed by one output value.
 struct SUnaryIntOp {
     int input, output;
 };

 // Unary unsigned int op element: one input value followed by one output value.
 struct SUnaryUintOp {
     unsigned input, output;
 };

 // Binary int op element: two input values followed by two output values.
 struct SBinaryIntOp {
     int input1, input2;
     int output1, output2;
 };

 // Tertiary int op element: three input values followed by one output value.
 struct STertiaryIntOp {
     int input1, input2, input3;
     int output;
 };

 // Binary unsigned int op element: two input values followed by two output values.
 struct SBinaryUintOp {
     unsigned input1, input2;
     unsigned output1, output2;
 };

 // Tertiary unsigned int op element: three input values followed by one output value.
 struct STertiaryUintOp {
     unsigned input1, input2, input3;
     unsigned output;
 };

 // Unary 16-bit signed op element: one input value followed by one output value.
 struct SUnaryInt16Op {
   short input, output;
 };

 // Unary 16-bit unsigned op element: one input value followed by one output value.
 struct SUnaryUint16Op {
   unsigned short input, output;
 };

 // Binary 16-bit signed op element: two input values followed by two output values.
 struct SBinaryInt16Op {
   short input1, input2;
   short output1, output2;
 };

 // Tertiary 16-bit signed op element: three input values followed by one output value.
 struct STertiaryInt16Op {
   short input1, input2, input3;
   short output;
 };

 // Binary 16-bit unsigned op element: two input values followed by two output values.
 struct SBinaryUint16Op {
   unsigned short input1, input2;
   unsigned short output1, output2;
 };

 // Tertiary 16-bit unsigned op element: three input values followed by one output value.
 struct STertiaryUint16Op {
   unsigned short input1, input2, input3;
   unsigned short output;
 };
 // Dot product element. The two inputs are XMFLOAT4 values representing HLSL
 // float vectors; they are followed by three float outputs named for dot2,
 // dot3 and dot4.
 struct SDotOp {
     XMFLOAT4 input1, input2;
     float o_dot2, o_dot3, o_dot4;
 };

 // Pair of 16-bit values x and y (presumably half-precision bit patterns, going
 // by the name). Copyable and movable; can be built from two values or from a
 // pointer to two consecutive uint16_t values.
 struct Half2
 {
     uint16_t x, y;

     Half2() = default;

     // Defaulted copy and move construction
     Half2(const Half2&) = default;
     Half2(Half2&&) = default;

     // Defaulted copy and move assignment
     Half2& operator=(const Half2&) = default;
     Half2& operator=(Half2&&) = default;

     constexpr Half2(uint16_t xBits, uint16_t yBits) : x(xBits), y(yBits) {}
     explicit Half2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {}
 };

 // Dot2Add half op element: two Half2 inputs, a float accumulator and a float result.
 struct SDot2AddHalfOp {
     Half2 input1, input2;
     float acc, result;
 };

 // Dot4Add (I8 packed) op element: two 32-bit inputs, then a signed 32-bit
 // accumulator and a signed 32-bit result.
 struct SDot4AddI8PackedOp {
     uint32_t input1, input2;
     int32_t acc, result;
 };

 // Dot4Add (U8 packed) op element: two 32-bit inputs, then an unsigned 32-bit
 // accumulator and an unsigned 32-bit result.
 struct SDot4AddU8PackedOp {
     uint32_t input1, input2;
     uint32_t acc, result;
 };

 // Msad4 element: reference value, XMUINT2 source, XMUINT4 accumulator and XMUINT4 result.
 struct SMsad4 {
     unsigned ref;
     XMUINT2 src;
     XMUINT4 accum, result;
 };

 // Packed outputs of the pack/unpack op: four uint32_t values followed by
 // their four "clamped" counterparts.
 struct SPackUnpackOpOutPacked
 {
     uint32_t packedUint32, packedInt32, packedUint16, packedInt16;

     uint32_t packedClampedUint32, packedClampedInt32, packedClampedUint16, packedClampedInt16;
 };

 // Unpacked outputs of the pack/unpack op: four 4-element arrays followed by
 // their four "clamped" counterparts.
 struct SPackUnpackOpOutUnpacked {
     // Every output is an array of four elements
     template <class T> using Vec4 = std::array<T, 4>;

     Vec4<uint32_t> outputUint32;
     Vec4<int32_t>  outputInt32;
     Vec4<uint16_t> outputUint16;
     Vec4<int16_t>  outputInt16;

     Vec4<uint32_t> outputClampedUint32;
     Vec4<int32_t>  outputClampedInt32;
     Vec4<uint16_t> outputClampedUint16;
     Vec4<int16_t>  outputClampedInt16;
 };


 // Parameter representation for TAEF data-driven tests.
 // Instances are aggregate-initialized positionally as {m_name, m_type,
 // m_required} by the static parameter tables in this file, so the order of
 // the leading members must not change.
 struct TableParameter {
     LPCWSTR m_name; // parameter name; looked up case-insensitively by TableParameterHandler
     // Kind of value the parameter carries: a single scalar/string, or a
     // *_TABLE holding a list of values.
     enum TableParameterType {
         INT8,
         INT16,
         INT32,
         UINT,
         FLOAT,
         HALF,
         DOUBLE,
         STRING,
         BOOL,
         INT8_TABLE,
         INT16_TABLE,
         INT32_TABLE,
         FLOAT_TABLE,
         HALF_TABLE,
         DOUBLE_TABLE,
         STRING_TABLE,
         UINT8_TABLE,
         UINT16_TABLE,
         UINT32_TABLE,
         BOOL_TABLE
     };
     TableParameterType m_type;
     bool m_required; // required parameter
     // Scalar value storage, one member per scalar type.
     // NOTE(review): presumably only the member matching m_type is filled in by
     // the row parser; confirm against ParseTableRow.
     int8_t m_int8;
     int16_t m_int16;
     int m_int32;
     unsigned int m_uint;
     float m_float;
     uint16_t m_half; // C++ has no half type, so the 16-bit value is stored in a uint16_t
     double m_double;
     bool m_bool;
     WEX::Common::String m_str;
     // List value storage, one vector per *_TABLE type.
     std::vector<int8_t> m_int8Table;
     std::vector<int16_t> m_int16Table;
     std::vector<int> m_int32Table;
     std::vector<uint8_t> m_uint8Table;
     std::vector<uint16_t> m_uint16Table;
     std::vector<unsigned int> m_uint32Table;
     std::vector<float> m_floatTable;
     std::vector<uint16_t> m_halfTable; // C++ has no half type; elements are stored as uint16_t
     std::vector<double> m_doubleTable;
     std::vector<bool> m_boolTable;
     std::vector<WEX::Common::String> m_StringTable;
 };

 class TableParameterHandler {
 private:
   HRESULT ParseTableRow();
 public:
   TableParameter* m_table;
   size_t m_tableSize;
   TableParameterHandler(TableParameter *pTable, size_t size) : m_table(pTable), m_tableSize(size) {
     clearTableParameter();
     VERIFY_SUCCEEDED(ParseTableRow());
   }

   TableParameter* GetTableParamByName(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &m_table[i];
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   void clearTableParameter() {
     for (size_t i = 0; i < m_tableSize; ++i) {
       m_table[i].m_int32 = 0;
       m_table[i].m_uint = 0;
       m_table[i].m_double = 0;
       m_table[i].m_bool = false;
       m_table[i].m_str = WEX::Common::String();
     }
   }

   template <class T1>
   std::vector<T1> *GetDataArray(LPCWSTR name) {
     return nullptr;
   }

   template <>
   std::vector<int> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_int32Table);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<int8_t> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_int8Table);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<int16_t> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_int16Table);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<unsigned int> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_uint32Table);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<float> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_floatTable);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   // TODO: uin16_t may be used to represent two different types when we introduce uint16
   template <>
   std::vector<uint16_t> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_halfTable);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<double> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_doubleTable);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

   template <>
   std::vector<bool> *GetDataArray(LPCWSTR name) {
     for (size_t i = 0; i < m_tableSize; ++i) {
       if (_wcsicmp(name, m_table[i].m_name) == 0) {
         return &(m_table[i].m_boolTable);
       }
     }
     DXASSERT_ARGS(false, "Invalid Table Parameter Name %s", name);
     return nullptr;
   }

 };

 // Parameter list for unary float ops. Each entry initializes
 // {m_name, m_type, m_required}; the remaining TableParameter members are
 // value-initialized.
 static TableParameter UnaryFPOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
     // Not required (m_required == false)
     { L"Warp.Version", TableParameter::UINT, false }
 };

 // Parameter list for binary float ops. Each entry initializes
 // {m_name, m_type, m_required}; the remaining TableParameter members are
 // value-initialized.
 static TableParameter BinaryFPOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Input2", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
     // The second expected-value list is not required (m_required == false)
     { L"Validation.Expected2", TableParameter::FLOAT_TABLE, false },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Parameter list for tertiary float ops. Each entry initializes
 // {m_name, m_type, m_required}; the remaining TableParameter members are
 // value-initialized. Every parameter here is required.
 static TableParameter TertiaryFPOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Input2", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Input3", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Parameter list for unary half ops. Each entry initializes
 // {m_name, m_type, m_required}; the remaining TableParameter members are
 // value-initialized. Unlike the float lists, this one also has a required
 // ShaderOp.Arguments string.
 static TableParameter UnaryHalfOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::HALF_TABLE, true },
     { L"Validation.Expected1", TableParameter::HALF_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
     // Not required (m_required == false)
     { L"Warp.Version", TableParameter::UINT, false }
 };

 // Parameter list for binary half ops. Each entry initializes
 // {m_name, m_type, m_required}; the remaining TableParameter members are
 // value-initialized.
 static TableParameter BinaryHalfOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::HALF_TABLE, true },
     { L"Validation.Input2", TableParameter::HALF_TABLE, true },
     { L"Validation.Expected1", TableParameter::HALF_TABLE, true },
     // The second expected-value list is not required (m_required == false)
     { L"Validation.Expected2", TableParameter::HALF_TABLE, false },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the three-operand half-precision op tests (going by
 // the name): shader target/text/arguments, three HALF_TABLE inputs, one
 // HALF_TABLE expected table, comparison type and tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter TertiaryHalfOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::HALF_TABLE, true },
     { L"Validation.Input2", TableParameter::HALF_TABLE, true },
     { L"Validation.Input3", TableParameter::HALF_TABLE, true },
     { L"Validation.Expected1", TableParameter::HALF_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the unary signed 32-bit integer op tests (going by
 // the name): shader target/text, one INT32_TABLE input, one INT32_TABLE
 // expected table and an integer tolerance. There is no Validation.Type
 // column. Entries are {column name, value type, bool flag (presumably
 // "required")}.
 static TableParameter UnaryIntOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::INT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::INT32_TABLE, true },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the unary unsigned 32-bit integer op tests (going by
 // the name): same layout as the signed variant but with UINT32_TABLE values.
 // Note the tolerance column is still typed INT32. Entries are {column name,
 // value type, bool flag (presumably "required")}.
 static TableParameter UnaryUintOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the binary signed 32-bit integer op tests (going by
 // the name): shader target/text, two INT32_TABLE inputs, INT32_TABLE expected
 // tables and an integer tolerance. Entries are {column name, value type, bool
 // flag (presumably "required")}; only Validation.Expected2 is flagged false.
 static TableParameter BinaryIntOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::INT32_TABLE, true },
     { L"Validation.Input2", TableParameter::INT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::INT32_TABLE, true },
     { L"Validation.Expected2", TableParameter::INT32_TABLE, false },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the three-operand signed 32-bit integer op tests
 // (going by the name): shader target/text, three INT32_TABLE inputs, one
 // INT32_TABLE expected table and an integer tolerance. Entries are {column
 // name, value type, bool flag (presumably "required")}; all true.
 static TableParameter TertiaryIntOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::INT32_TABLE, true },
     { L"Validation.Input2", TableParameter::INT32_TABLE, true },
     { L"Validation.Input3", TableParameter::INT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::INT32_TABLE, true },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the binary unsigned 32-bit integer op tests (going
 // by the name): shader target/text, two UINT32_TABLE inputs, UINT32_TABLE
 // expected tables and an INT32-typed tolerance. Entries are {column name,
 // value type, bool flag (presumably "required")}; only Validation.Expected2
 // is flagged false.
 static TableParameter BinaryUintOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input2", TableParameter::UINT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Expected2", TableParameter::UINT32_TABLE, false },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the three-operand unsigned 32-bit integer op tests
 // (going by the name): shader target/text, three UINT32_TABLE inputs, one
 // UINT32_TABLE expected table and an INT32-typed tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter TertiaryUintOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input2", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input3", TableParameter::UINT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the unary signed 16-bit integer op tests (going by
 // the name): shader target/text/arguments, one INT16_TABLE input, one
 // INT16_TABLE expected table and an INT32 tolerance. Entries are {column
 // name, value type, bool flag (presumably "required")}; all true.
 static TableParameter UnaryInt16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::INT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::INT16_TABLE, true },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the unary unsigned 16-bit integer op tests (going by
 // the name): shader target/text/arguments, one UINT16_TABLE input, one
 // UINT16_TABLE expected table and an INT32 tolerance. Entries are {column
 // name, value type, bool flag (presumably "required")}; all true.
 static TableParameter UnaryUint16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the binary signed 16-bit integer op tests (going by
 // the name): shader target/text/arguments, two INT16_TABLE inputs,
 // INT16_TABLE expected tables and an INT32 tolerance. Entries are {column
 // name, value type, bool flag (presumably "required")}; only
 // Validation.Expected2 is flagged false.
 static TableParameter BinaryInt16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::INT16_TABLE, true },
   { L"Validation.Input2", TableParameter::INT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::INT16_TABLE, true },
   { L"Validation.Expected2", TableParameter::INT16_TABLE, false },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the three-operand signed 16-bit integer op tests
 // (going by the name): shader target/text/arguments, three INT16_TABLE
 // inputs, one INT16_TABLE expected table and an INT32 tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter TertiaryInt16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::INT16_TABLE, true },
   { L"Validation.Input2", TableParameter::INT16_TABLE, true },
   { L"Validation.Input3", TableParameter::INT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::INT16_TABLE, true },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the binary unsigned 16-bit integer op tests (going
 // by the name): shader target/text/arguments, two UINT16_TABLE inputs,
 // UINT16_TABLE expected tables and an INT32 tolerance. Entries are {column
 // name, value type, bool flag (presumably "required")}; only
 // Validation.Expected2 is flagged false.
 static TableParameter BinaryUint16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Input2", TableParameter::UINT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Expected2", TableParameter::UINT16_TABLE, false },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the three-operand unsigned 16-bit integer op tests
 // (going by the name): shader target/text/arguments, three UINT16_TABLE
 // inputs, one UINT16_TABLE expected table and an INT32 tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter TertiaryUint16OpParameters[] = {
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"ShaderOp.Arguments", TableParameter::STRING, true },
   { L"Validation.Input1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Input2", TableParameter::UINT16_TABLE, true },
   { L"Validation.Input3", TableParameter::UINT16_TABLE, true },
   { L"Validation.Expected1", TableParameter::UINT16_TABLE, true },
   { L"Validation.Tolerance", TableParameter::INT32, true },
 };

 // Data-table columns for the dot-product tests (going by the name). Inputs
 // and the three expected columns are STRING_TABLEs rather than numeric
 // tables -- presumably each cell holds a comma-separated vector that the test
 // parses itself (TODO confirm against the test body). Entries are {column
 // name, value type, bool flag (presumably "required")}; all true.
 static TableParameter DotOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::STRING_TABLE, true },
     { L"Validation.Input2", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected2", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected3", TableParameter::STRING_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the dot2add half-precision tests (going by the
 // name): shader target/text/arguments, two STRING_TABLE inputs (presumably
 // comma-separated vectors -- TODO confirm), a FLOAT_TABLE third input, a
 // FLOAT_TABLE expected table, comparison type and tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter Dot2AddHalfOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::STRING_TABLE, true },
     { L"Validation.Input2", TableParameter::STRING_TABLE, true },
     { L"Validation.Input3", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Expected1", TableParameter::FLOAT_TABLE, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the dot4add signed-8-bit packed tests (going by the
 // name): shader target/text, two UINT32_TABLE inputs, an INT32_TABLE third
 // input and an INT32_TABLE expected table. No type/tolerance columns are
 // declared. Entries are {column name, value type, bool flag (presumably
 // "required")}; all true.
 static TableParameter Dot4AddI8PackedOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input2", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input3", TableParameter::INT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::INT32_TABLE, true },
 };

 // Data-table columns for the dot4add unsigned-8-bit packed tests (going by
 // the name): shader target/text, three UINT32_TABLE inputs and a UINT32_TABLE
 // expected table. No type/tolerance columns are declared. Entries are
 // {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter Dot4AddU8PackedOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input2", TableParameter::UINT32_TABLE, true },
     { L"Validation.Input3", TableParameter::UINT32_TABLE, true },
     { L"Validation.Expected1", TableParameter::UINT32_TABLE, true },
 };

 // Data-table columns for the msad4 tests (going by the name): shader text, a
 // DOUBLE tolerance, one UINT32_TABLE input, and STRING_TABLE columns for the
 // remaining two inputs and the expected values (presumably comma-separated
 // vectors -- TODO confirm). No ShaderOp.Target column is declared. Entries
 // are {column name, value type, bool flag (presumably "required")}; all true.
 static TableParameter Msad4OpParameters[] = {
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
     { L"Validation.Input1", TableParameter::UINT32_TABLE, true},
     { L"Validation.Input2", TableParameter::STRING_TABLE, true },
     { L"Validation.Input3", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected1", TableParameter::STRING_TABLE, true }
 };

 // Data-table columns for the wave-intrinsic "active" tests on signed ints
 // (going by the name): shader op name and text, the number of input sets,
 // and up to four INT32_TABLE input sets. Entries are {column name, value
 // type, bool flag (presumably "required")}; InputSet2..4 are flagged false.
 static TableParameter WaveIntrinsicsActiveIntParameters[] = {
     { L"ShaderOp.Name", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.NumInputSet", TableParameter::UINT, true },
     { L"Validation.InputSet1", TableParameter::INT32_TABLE, true },
     { L"Validation.InputSet2", TableParameter::INT32_TABLE, false },
     { L"Validation.InputSet3", TableParameter::INT32_TABLE, false },
     { L"Validation.InputSet4", TableParameter::INT32_TABLE, false }
 };

 // Data-table columns for the wave-intrinsic "prefix" tests on signed ints
 // (going by the name): shader op name and text, the number of input sets,
 // and up to four INT32_TABLE input sets. Entries are {column name, value
 // type, bool flag (presumably "required")}; InputSet2..4 are flagged false.
 static TableParameter WaveIntrinsicsPrefixIntParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.NumInputSet", TableParameter::UINT, true },
   { L"Validation.InputSet1", TableParameter::INT32_TABLE, true },
   { L"Validation.InputSet2", TableParameter::INT32_TABLE, false },
   { L"Validation.InputSet3", TableParameter::INT32_TABLE, false },
   { L"Validation.InputSet4", TableParameter::INT32_TABLE, false }
 };

 // Data-table columns for the wave-intrinsic "active" tests on unsigned ints
 // (going by the name): shader op name and text, the number of input sets,
 // and up to four UINT32_TABLE input sets. Entries are {column name, value
 // type, bool flag (presumably "required")}; InputSet2..4 are flagged false.
 static TableParameter WaveIntrinsicsActiveUintParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.NumInputSet", TableParameter::UINT, true },
   { L"Validation.InputSet1", TableParameter::UINT32_TABLE, true },
   { L"Validation.InputSet2", TableParameter::UINT32_TABLE, false },
   { L"Validation.InputSet3", TableParameter::UINT32_TABLE, false },
   { L"Validation.InputSet4", TableParameter::UINT32_TABLE, false }
 };

 // Data-table columns for the wave-intrinsic "prefix" tests on unsigned ints
 // (going by the name): shader op name and text, the number of input sets,
 // and up to four UINT32_TABLE input sets. Entries are {column name, value
 // type, bool flag (presumably "required")}; InputSet2..4 are flagged false.
 static TableParameter WaveIntrinsicsPrefixUintParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.NumInputSet", TableParameter::UINT, true },
   { L"Validation.InputSet1", TableParameter::UINT32_TABLE, true },
   { L"Validation.InputSet2", TableParameter::UINT32_TABLE, false },
   { L"Validation.InputSet3", TableParameter::UINT32_TABLE, false },
   { L"Validation.InputSet4", TableParameter::UINT32_TABLE, false }
 };

 // Data-table columns for the wave-intrinsic "multi prefix" tests on signed
 // ints (going by the name): shader op name, target and text, plus
 // INT32_TABLE "Keys" and "Values" columns. Entries are {column name, value
 // type, bool flag (presumably "required")}; all true.
 static TableParameter WaveIntrinsicsMultiPrefixIntParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.Keys", TableParameter::INT32_TABLE, true },
   { L"Validation.Values", TableParameter::INT32_TABLE, true },
 };

 // Data-table columns for the wave-intrinsic "multi prefix" tests on unsigned
 // ints (going by the name): shader op name, target and text, plus
 // UINT32_TABLE "Keys" and "Values" columns. Entries are {column name, value
 // type, bool flag (presumably "required")}; all true.
 static TableParameter WaveIntrinsicsMultiPrefixUintParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Target", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.Keys", TableParameter::UINT32_TABLE, true },
   { L"Validation.Values", TableParameter::UINT32_TABLE, true },
 };

 // Data-table columns for the wave-intrinsic "active" tests on booleans
 // (going by the name): shader op name and text, the number of input sets,
 // and up to three BOOL_TABLE input sets (one fewer than the int variants).
 // Entries are {column name, value type, bool flag (presumably "required")};
 // InputSet2..3 are flagged false.
 static TableParameter WaveIntrinsicsActiveBoolParameters[] = {
   { L"ShaderOp.Name", TableParameter::STRING, true },
   { L"ShaderOp.Text", TableParameter::STRING, true },
   { L"Validation.NumInputSet", TableParameter::UINT, true },
   { L"Validation.InputSet1", TableParameter::BOOL_TABLE, true },
   { L"Validation.InputSet2", TableParameter::BOOL_TABLE, false },
   { L"Validation.InputSet3", TableParameter::BOOL_TABLE, false },
 };

 // Data-table columns for the half-precision constant-buffer test (going by
 // the name): a single HALF_TABLE input set, flagged true (presumably
 // "required").
 static TableParameter CBufferTestHalfParameters[] = {
   { L"Validation.InputSet", TableParameter::HALF_TABLE, true },
 };

 // Data-table columns for the binary float op denorm tests (going by the
 // name). Unlike BinaryFPOpParameters, inputs and expected values are
 // STRING_TABLEs (presumably so the test can parse the cells itself -- TODO
 // confirm) and a ShaderOp.Arguments column is present. Entries are {column
 // name, value type, bool flag (presumably "required")}; only
 // Validation.Expected2 is flagged false.
 static TableParameter DenormBinaryFPOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::STRING_TABLE, true },
     { L"Validation.Input2", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected2", TableParameter::STRING_TABLE, false },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the three-operand float op denorm tests (going by
 // the name): shader target/text/arguments, three STRING_TABLE inputs,
 // STRING_TABLE expected values, comparison type and tolerance. Entries are
 // {column name, value type, bool flag (presumably "required")}; only
 // Validation.Expected2 is flagged false.
 static TableParameter DenormTertiaryFPOpParameters[] = {
     { L"ShaderOp.Target", TableParameter::STRING, true },
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"ShaderOp.Arguments", TableParameter::STRING, true },
     { L"Validation.Input1", TableParameter::STRING_TABLE, true },
     { L"Validation.Input2", TableParameter::STRING_TABLE, true },
     { L"Validation.Input3", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected1", TableParameter::STRING_TABLE, true },
     { L"Validation.Expected2", TableParameter::STRING_TABLE, false },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::DOUBLE, true },
 };

 // Data-table columns for the pack/unpack tests (going by the name): shader
 // text, a validation type string, a UINT tolerance and a single UINT32_TABLE
 // input. Entries are {column name, value type, bool flag (presumably
 // "required")}; all true.
 static TableParameter PackUnpackOpParameters[] = {
     { L"ShaderOp.Text", TableParameter::STRING, true },
     { L"Validation.Type", TableParameter::STRING, true },
     { L"Validation.Tolerance", TableParameter::UINT, true },
     { L"Validation.Input", TableParameter::UINT32_TABLE, true },
 };

 // Recognizes raw-bit literals in a table cell.
 //
 // Spaces are stripped first. A cell starting with "0x" is parsed as a
 // hexadecimal literal and a cell starting with "0b" as a binary literal; the
 // parsed number is truncated to 16 bits and stored in *value.
 //
 // Returns true when one of the two prefixes matched (the match is
 // case-sensitive), false otherwise; *value is left untouched on false.
 static bool IsHexString(PCWSTR str, uint16_t *value) {
   std::wstring wString(str);
   wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
   LPCWSTR wstr = wString.c_str();
   if (wcsncmp(wstr, L"0x", 2) == 0) {
     // Base 0 lets wcstol consume the "0x" prefix itself.
     *value = (uint16_t)wcstol(wstr, NULL, 0);
     return true;
   }
   if (wcsncmp(wstr, L"0b", 2) == 0) {
     // wcstol has no notion of a "0b" prefix: with base 0 it would stop at the
     // 'b' and always yield 0. Skip the prefix and parse the digits in base 2.
     *value = (uint16_t)wcstol(wstr + 2, NULL, 2);
     return true;
   }
   return false;
 }

 // Parses one table cell into a float.
 //
 // Spaces are stripped first. Recognized spellings (case-insensitive):
 //   NaN, inf, -inf         -> the corresponding special value
 //   denorm, -denorm        -> +/- FLT_MIN / 2
 //   0, 0.0, 0.0f (and -)   -> signed zero
 //   0x...                  -> the 32-bit pattern is reinterpreted as a float
 //   anything else          -> parsed as a decimal floating-point number;
 //                             trailing characters after the number are ignored
 //
 // Returns S_OK on success. Returns E_FAIL (after logging) when no number can
 // be read from the cell; `value` is left untouched in that case.
 static HRESULT ParseDataToFloat(PCWSTR str, float &value) {
   std::wstring wString(str);
   wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
   PCWSTR wstr = wString.data();
   if (_wcsicmp(wstr, L"NaN") == 0) {
     value = NAN;
   } else if (_wcsicmp(wstr, L"-inf") == 0) {
     value = -(INFINITY);
   } else if (_wcsicmp(wstr, L"inf") == 0) {
     value = INFINITY;
   } else if (_wcsicmp(wstr, L"-denorm") == 0) {
     value = -(FLT_MIN / 2);
   } else if (_wcsicmp(wstr, L"denorm") == 0) {
     value = FLT_MIN / 2;
   } else if (_wcsicmp(wstr, L"-0.0f") == 0 || _wcsicmp(wstr, L"-0.0") == 0 ||
              _wcsicmp(wstr, L"-0") == 0) {
     value = -0.0f;
   } else if (_wcsicmp(wstr, L"0.0f") == 0 || _wcsicmp(wstr, L"0.0") == 0 ||
              _wcsicmp(wstr, L"0") == 0) {
     value = 0.0f;
   } else if (_wcsnicmp(wstr, L"0x", 2) == 0) { // For hex values, take values literally
     unsigned temp_i = std::stoul(wstr, nullptr, 16);
     value = (float&)temp_i;
   }
   else {
     // Detect failure through the end pointer rather than a zero result, so
     // that other spellings of zero ("0.00", "0e0", "-0.00", ...) are accepted
     // instead of being reported as parse errors.
     wchar_t *end = nullptr;
     double val = std::wcstod(wstr, &end);
     if (end == wstr) {
       LogErrorFmt(L"Failed to parse parameter %s to float", wstr);
       return E_FAIL;
     }
     value = (float)val;
   }
   return S_OK;
 }

 // Parses one table cell into a signed int.
 //
 // Spaces are stripped first, then the leading decimal integer is read;
 // characters after the number are ignored (so "0.0" yields 0).
 //
 // Returns S_OK on success. Returns E_FAIL (after logging) when the cell does
 // not start with an integer; `value` is left untouched in that case.
 static HRESULT ParseDataToInt(PCWSTR str, int &value) {
   std::wstring wString(str);
   wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
   PCWSTR wstr = wString.data();
   // Detect failure through the end pointer rather than a zero result, so
   // every spelling of zero ("0", "00", "-0", "+0", "0.0") parses correctly.
   wchar_t *end = nullptr;
   const long parsed = std::wcstol(wstr, &end, 10);
   if (end == wstr) {
       LogErrorFmt(L"Failed to parse parameter %s to int", wstr);
       return E_FAIL;
   }
   value = (int)parsed;
   return S_OK;
 }

 // Parses one table cell into an unsigned int.
 //
 // Spaces are stripped first, then the leading integer is read with automatic
 // base detection ("0x" prefix = hexadecimal, leading "0" = octal, otherwise
 // decimal); characters after the number are ignored.
 //
 // Returns S_OK on success. Returns E_FAIL (after logging) when the cell does
 // not start with an integer; `value` is left untouched in that case.
 static HRESULT ParseDataToUint(PCWSTR str, unsigned int &value) {
     std::wstring wString(str);
     wString.erase(std::remove(wString.begin(), wString.end(), L' '), wString.end());
     PCWSTR wstr = wString.data();
     // Detect failure through the end pointer rather than a zero result, so
     // every spelling of zero ("0", "00", "0x0", "0x00000000") parses correctly.
     wchar_t *end = nullptr;
     const unsigned long parsed = std::wcstoul(wstr, &end, 0);
     if (end == wstr) {
         LogErrorFmt(L"Failed to parse parameter %s to uint", wstr);
         return E_FAIL;
     }
     value = (unsigned int)parsed;
     return S_OK;
 }

 // Parses a comma-separated list of `count` floats from `str` into
 // ptr[0..count-1], using ParseDataToFloat for each element.
 //
 // Returns S_OK when all `count` elements were parsed. Returns E_FAIL when an
 // element fails to parse or when the list holds fewer than `count` elements.
 // Elements beyond `count` are ignored.
 static HRESULT ParseDataToVectorFloat(PCWSTR str, float *ptr, size_t count) {
     std::wstring wstr(str);
     size_t curPosition = 0;
     for (size_t i = 0; i < count; ++i) {
         if (curPosition == std::wstring::npos) {
             // The previous element was the last one. Without this check the
             // position would wrap around to 0 and silently re-read the list
             // from its first element.
             LogErrorFmt(L"Expected %u comma-separated values in %s",
                         (unsigned)count, str);
             return E_FAIL;
         }
         size_t nextPosition = wstr.find(L",", curPosition);
         // When no further comma exists, nextPosition is npos and substr
         // simply takes the rest of the string.
         if (FAILED(ParseDataToFloat(
             wstr.substr(curPosition, nextPosition - curPosition).data(),
             *(ptr + i)))) {
             return E_FAIL;
         }
         curPosition = (nextPosition == std::wstring::npos)
                           ? std::wstring::npos
                           : nextPosition + 1;
     }
     return S_OK;
 }

 // Parses a comma-separated list of `count` numbers from `str`, converts each
 // one from float to its 16-bit half representation via
 // ConvertFloat32ToFloat16, and stores the results in ptr[0..count-1].
 //
 // Returns S_OK when all `count` elements were parsed. Returns E_FAIL when an
 // element fails to parse or when the list holds fewer than `count` elements.
 // Elements beyond `count` are ignored.
 static HRESULT ParseDataToVectorHalf(PCWSTR str, uint16_t *ptr, size_t count) {
     std::wstring wstr(str);
     size_t curPosition = 0;
     for (size_t i = 0; i < count; ++i) {
         if (curPosition == std::wstring::npos) {
             // The previous element was the last one. Without this check the
             // position would wrap around to 0 and silently re-read the list
             // from its first element.
             LogErrorFmt(L"Expected %u comma-separated values in %s",
                         (unsigned)count, str);
             return E_FAIL;
         }
         size_t nextPosition = wstr.find(L",", curPosition);
         float floatValue = 0.0f;
         // When no further comma exists, nextPosition is npos and substr
         // simply takes the rest of the string.
         if (FAILED(ParseDataToFloat(
             wstr.substr(curPosition, nextPosition - curPosition).data(), floatValue))) {
             return E_FAIL;
         }
         *(ptr + i) = ConvertFloat32ToFloat16(floatValue);
         curPosition = (nextPosition == std::wstring::npos)
                           ? std::wstring::npos
                           : nextPosition + 1;
     }
     return S_OK;
 }

 // Parses a comma-separated list of `count` unsigned integers from `str` into
 // ptr[0..count-1], using ParseDataToUint for each element.
 //
 // Returns S_OK when all `count` elements were parsed. Returns E_FAIL when an
 // element fails to parse or when the list holds fewer than `count` elements.
 // Elements beyond `count` are ignored.
 static HRESULT ParseDataToVectorUint(PCWSTR str, unsigned int *ptr, size_t count) {
     std::wstring wstr(str);
     size_t curPosition = 0;
     for (size_t i = 0; i < count; ++i) {
         if (curPosition == std::wstring::npos) {
             // The previous element was the last one. Without this check the
             // position would wrap around to 0 and silently re-read the list
             // from its first element.
             LogErrorFmt(L"Expected %u comma-separated values in %s",
                         (unsigned)count, str);
             return E_FAIL;
         }
         size_t nextPosition = wstr.find(L",", curPosition);
         // When no further comma exists, nextPosition is npos and substr
         // simply takes the rest of the string.
         if (FAILED(ParseDataToUint(
             wstr.substr(curPosition, nextPosition - curPosition).data(),
             *(ptr + i)))) {
             return E_FAIL;
         }
         curPosition = (nextPosition == std::wstring::npos)
                           ? std::wstring::npos
                           : nextPosition + 1;
     }
     return S_OK;
 }

 // Reads the scalar column described by `param` from the current data row
 // straight into `dest`. A missing column is only an error (logged, E_FAIL)
 // when the parameter is marked required; otherwise `dest` keeps whatever
 // TryGetValue left in it and S_OK is returned.
 template <typename TValue>
 static HRESULT ReadTableScalar(const TableParameter &param, TValue &dest) {
   if (FAILED(WEX::TestExecution::TestData::TryGetValue(param.m_name, dest)) &&
       param.m_required) {
     LogErrorFmt(L"Failed to get %s", param.m_name);
     return E_FAIL;
   }
   return S_OK;
 }

 // Reads the array column described by `param` from the current data row as a
 // TestDataArray<TSource> and copies it element by element into `dest`,
 // converting every element to dest's value_type. TSource may be wider than
 // the destination element because TryGetValue cannot read 8- or 16-bit
 // integers directly. A missing column is only an error (logged, E_FAIL) when
 // the parameter is marked required; otherwise `dest` ends up with whatever
 // size the (unfilled) source array reports.
 template <typename TSource, typename TDest>
 static HRESULT ReadTableArray(const TableParameter &param, TDest &dest) {
   WEX::TestExecution::TestDataArray<TSource> source;
   if (FAILED(WEX::TestExecution::TestData::TryGetValue(param.m_name, source)) &&
       param.m_required) {
     LogErrorFmt(L"Failed to get %s", param.m_name);
     return E_FAIL;
   }
   dest.resize(source.GetSize());
   for (size_t j = 0, end = source.GetSize(); j != end; ++j) {
     dest[j] = static_cast<typename TDest::value_type>(source[j]);
   }
   return S_OK;
 }

 // Fills every entry of m_table from the current row of the test data source,
 // storing each value in the TableParameter member that matches its m_type.
 //
 // Returns S_OK when all parameters were read. Returns E_FAIL when
 //   - a required parameter is missing from the row,
 //   - a FLOAT_TABLE / HALF_TABLE cell cannot be parsed as a number, or
 //   - reading a parameter left errno set to ERANGE (value out of range).
 HRESULT TableParameterHandler::ParseTableRow() {
   TableParameter *table = m_table;
   for (unsigned int i = 0; i < m_tableSize; ++i) {
     TableParameter &param = table[i];
     // Start every parameter with a clean errno so the ERANGE check below
     // reflects only this parameter and not a stale value left by earlier code.
     errno = 0;
     HRESULT hr = S_OK;
     switch (param.m_type) {
     case TableParameter::INT8:
       // TryGetValue cannot read 8-bit integers: read an int, then narrow.
       hr = ReadTableScalar(param, param.m_int32);
       if (SUCCEEDED(hr))
         param.m_int8 = (int8_t)(param.m_int32);
       break;
     case TableParameter::INT16:
       // TryGetValue cannot read 16-bit integers: read an int, then narrow.
       hr = ReadTableScalar(param, param.m_int32);
       if (SUCCEEDED(hr))
         param.m_int16 = (short)(param.m_int32);
       break;
     case TableParameter::INT32:
       hr = ReadTableScalar(param, param.m_int32);
       break;
     case TableParameter::UINT:
       hr = ReadTableScalar(param, param.m_uint);
       break;
     case TableParameter::DOUBLE:
       hr = ReadTableScalar(param, param.m_double);
       break;
     case TableParameter::STRING:
       hr = ReadTableScalar(param, param.m_str);
       break;
     case TableParameter::BOOL:
       // Store the value in m_bool and gate the error on m_required like every
       // other type (previously the value went to m_str and the check used
       // m_bool, so a missing required BOOL could go unreported).
       hr = ReadTableScalar(param, param.m_bool);
       break;
     case TableParameter::INT8_TABLE:
       hr = ReadTableArray<int>(param, param.m_int8Table);
       break;
     case TableParameter::INT16_TABLE:
       hr = ReadTableArray<int>(param, param.m_int16Table);
       break;
     case TableParameter::INT32_TABLE:
       hr = ReadTableArray<int>(param, param.m_int32Table);
       break;
     case TableParameter::UINT8_TABLE:
       // NOTE(review): unsigned 8-bit values are stored in m_int8Table, as
       // before. This looks like it was meant to target a dedicated unsigned
       // table -- confirm against the TableParameter declaration.
       hr = ReadTableArray<int>(param, param.m_int8Table);
       break;
     case TableParameter::UINT16_TABLE:
       hr = ReadTableArray<int>(param, param.m_uint16Table);
       break;
     case TableParameter::UINT32_TABLE:
       hr = ReadTableArray<unsigned int>(param, param.m_uint32Table);
       break;
     case TableParameter::FLOAT_TABLE: {
       // Floats are read as strings so that special spellings understood by
       // ParseDataToFloat can be used in the data table.
       WEX::TestExecution::TestDataArray<WEX::Common::String> cells;
       if (FAILED(WEX::TestExecution::TestData::TryGetValue(
         param.m_name, cells)) && param.m_required) {
         LogErrorFmt(L"Failed to get %s", param.m_name);
         return E_FAIL;
       }
       param.m_floatTable.resize(cells.GetSize());
       for (size_t j = 0, end = cells.GetSize(); j != end; ++j) {
         // ParseDataToFloat has already logged the offending cell on failure.
         if (FAILED(ParseDataToFloat(cells[j], param.m_floatTable[j]))) {
           return E_FAIL;
         }
       }
       break;
     }
     case TableParameter::HALF_TABLE: {
       WEX::TestExecution::TestDataArray<WEX::Common::String> cells;
       if (FAILED(WEX::TestExecution::TestData::TryGetValue(
         param.m_name, cells)) && param.m_required) {
         LogErrorFmt(L"Failed to get %s", param.m_name);
         return E_FAIL;
       }
       param.m_halfTable.resize(cells.GetSize());
       for (size_t j = 0, end = cells.GetSize(); j != end; ++j) {
         uint16_t value = 0;
         if (IsHexString(cells[j], &value)) {
           // Raw bit pattern given in the table: take it literally.
           param.m_halfTable[j] = value;
         }
         else {
           // Initialized so a failed parse can never feed an indeterminate
           // value into the denorm check below.
           float val = 0.0f;
           if (FAILED(ParseDataToFloat(cells[j], val))) {
             return E_FAIL;
           }
           if (isdenorm(val))
             param.m_halfTable[j] = signbit(val) ? Float16NegDenorm : Float16PosDenorm;
           else
             param.m_halfTable[j] = ConvertFloat32ToFloat16(val);
         }
       }
       break;
     }
     case TableParameter::DOUBLE_TABLE:
       hr = ReadTableArray<double>(param, param.m_doubleTable);
       break;
     case TableParameter::BOOL_TABLE:
       hr = ReadTableArray<bool>(param, param.m_boolTable);
       break;
     case TableParameter::STRING_TABLE:
       hr = ReadTableArray<WEX::Common::String>(param, param.m_StringTable);
       break;
     default:
       // A bare string literal is always "true" and would never trip the
       // assertion; `false && "..."` keeps the message and actually fires.
       DXASSERT_NOMSG(false && "Invalid Parameter Type");
     }
     if (FAILED(hr)) {
       return hr;
     }
     if (errno == ERANGE) {
       LogErrorFmt(L"got out of range value for table %s", param.m_name);
       return E_FAIL;
     }
   }
   return S_OK;
 }

 // Verifies that `output` is within `tolerance` of `ref`, i.e. that
 // |output - ref| <= tolerance. A negative tolerance can never be satisfied.
 static void VerifyOutputWithExpectedValueInt(int output, int ref, int tolerance) {
     // Widen before subtracting: in int arithmetic the difference can overflow
     // (e.g. INT_MIN vs INT_MAX), which is undefined behavior and can make two
     // values at opposite ends of the range look equal.
     const long long diff = (long long)output - (long long)ref;
     VERIFY_IS_TRUE(diff <= tolerance && -diff <= tolerance);
 }

 // Verifies that `output` is within `tolerance` of `ref`, i.e. that the
 // absolute difference between the two is at most `tolerance`.
 static void VerifyOutputWithExpectedValueUInt(uint32_t output, uint32_t ref, uint32_t tolerance) {
     // Subtract the smaller value from the larger one. Checking both
     // `output - ref` and `ref - output` does not work for unsigned values:
     // whenever they differ, one of the two wraps around to a huge number, so
     // the check would fail for any nonzero difference regardless of tolerance.
     const uint32_t diff = output > ref ? output - ref : ref - output;
     VERIFY_IS_TRUE(diff <= tolerance);
 }

 // Verifies that `output` and `ref` match exactly, one component at a time
 // (x, y, z, w). Each component is its own VERIFY_ARE_EQUAL, so a mismatch is
 // presumably reported per component -- depends on the macro's definition.
 static void VerifyOutputWithExpectedValueUInt4(XMUINT4 output, XMUINT4 ref) {
   VERIFY_ARE_EQUAL(output.x, ref.x);
   VERIFY_ARE_EQUAL(output.y, ref.y);
   VERIFY_ARE_EQUAL(output.z, ref.z);
   VERIFY_ARE_EQUAL(output.w, ref.w);
 }

 // Verifies a float result against its reference using the comparison named
 // by `type` (case-insensitive):
 //   "Relative" -> CompareFloatRelativeEpsilon, tolerance truncated to int
 //   "Epsilon"  -> CompareFloatEpsilon, tolerance converted to float
 //   "ULP"      -> CompareFloatULP, tolerance truncated to int
 // `mode` is forwarded to the comparison helper. An unrecognized `type` is
 // logged as an error and no comparison is performed.
 static void VerifyOutputWithExpectedValueFloat(
     float output, float ref, LPCWSTR type, double tolerance,
     hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
   if (_wcsicmp(type, L"Relative") == 0) {
     VERIFY_IS_TRUE(CompareFloatRelativeEpsilon(output, ref, (int)tolerance, mode));
   } else if (_wcsicmp(type, L"Epsilon") == 0) {
     VERIFY_IS_TRUE(CompareFloatEpsilon(output, ref, (float)tolerance, mode));
   } else if (_wcsicmp(type, L"ULP") == 0) {
     VERIFY_IS_TRUE(CompareFloatULP(output, ref, (int)tolerance, mode));
   } else {
     // `type` is a wide string and the format string is wide, so the matching
     // specifier is %s; %S would treat the argument as a narrow string.
     LogErrorFmt(L"Failed to read comparison type %s", type);
   }
 }

 // Compares a float result against its reference using the comparison named
 // by `type` (case-insensitive) and returns the outcome instead of asserting:
 //   "Relative" -> CompareFloatRelativeEpsilon, tolerance truncated to int
 //   "Epsilon"  -> CompareFloatEpsilon, tolerance converted to float
 //   "ULP"      -> CompareFloatULP, tolerance truncated to int
 // `mode` is forwarded to the comparison helper. An unrecognized `type` is
 // logged as an error and reported as a mismatch (false).
 static bool CompareOutputWithExpectedValueFloat(
     float output, float ref, LPCWSTR type, double tolerance,
     hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {
   if (_wcsicmp(type, L"Relative") == 0) {
     return CompareFloatRelativeEpsilon(output, ref, (int)tolerance, mode);
   } else if (_wcsicmp(type, L"Epsilon") == 0) {
     return CompareFloatEpsilon(output, ref, (float)tolerance, mode);
   } else if (_wcsicmp(type, L"ULP") == 0) {
     return CompareFloatULP(output, ref, (int)tolerance, mode);
   } else {
     // `type` is a wide string and the format string is wide, so the matching
     // specifier is %s; %S would treat the argument as a narrow string.
     LogErrorFmt(L"Failed to read comparison type %s", type);
     return false;
   }
 }

 // Verifies a half-precision result (given as its 16-bit pattern) against its
 // reference using the comparison named by `type` (case-insensitive):
 //   "Relative" -> CompareHalfRelativeEpsilon, tolerance truncated to int
 //   "Epsilon"  -> CompareHalfEpsilon, tolerance converted to float
 //   "ULP"      -> CompareHalfULP, tolerance converted to float
 // An unrecognized `type` is logged as an error and no comparison is
 // performed.
 static void VerifyOutputWithExpectedValueHalf(
   uint16_t output, uint16_t ref, LPCWSTR type, double tolerance) {
   if (_wcsicmp(type, L"Relative") == 0) {
     VERIFY_IS_TRUE(CompareHalfRelativeEpsilon(output, ref, (int)tolerance));
   }
   else if (_wcsicmp(type, L"Epsilon") == 0) {
     VERIFY_IS_TRUE(CompareHalfEpsilon(output, ref, (float)tolerance));
   }
   else if (_wcsicmp(type, L"ULP") == 0) {
     VERIFY_IS_TRUE(CompareHalfULP(output, ref, (float)tolerance));
   }
   else {
     // `type` is a wide string and the format string is wide, so the matching
     // specifier is %s; %S would treat the argument as a narrow string.
     LogErrorFmt(L"Failed to read comparison type %s", type);
   }
 }

 // Table-driven test for unary float shader ops. For the current data row it
 // uploads Validation.Input1, runs the "UnaryFPOp" shader op with the row's
 // target/text, and compares each read-back output against
 // Validation.Expected1 using the row's comparison type and tolerance.
 TEST_F(ExecutionTest, UnaryFloatOpTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice))
         return;

     // Load this row's parameters through the table handler.
     int paramCount = sizeof(UnaryFPOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(UnaryFPOpParameters, paramCount);

     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     // When running on WARP, bail out if the row's Warp.Version check fails.
     unsigned int rowWarpVersion = handler.GetTableParamByName(L"Warp.Version")->m_uint;
     if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(rowWarpVersion))
         return;

     const std::vector<float> &inputs =
         handler.GetTableParamByName(L"Validation.Input1")->m_floatTable;
     const std::vector<float> &expected =
         handler.GetTableParamByName(L"Validation.Expected1")->m_floatTable;

     LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
     double compareTolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;

     // One shader element per input value.
     size_t count = inputs.size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "UnaryFPOp",
         // Invoked while the test creates its resources: fill the input
         // buffer and swap in the shader taken from the data table.
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
           VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryFPOp"));
           Data.resize(sizeof(SUnaryFPOp) * count);
           SUnaryFPOp *elements = (SUnaryFPOp *)Data.data();
           for (size_t idx = 0; idx < count; ++idx) {
             elements[idx].input = inputs[idx % inputs.size()];
           }
           pShaderOp->Shaders.at(0).Target = Target.m_psz;
           pShaderOp->Shaders.at(0).Text = Text.m_psz;
         });

     MappedData data;
     test->Test->GetReadBackData("SUnaryFPOp", &data);

     SUnaryFPOp *results = (SUnaryFPOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (unsigned i = 0; i < count; ++i) {
         SUnaryFPOp &element = results[i];
         // The expected table is indexed modulo its own size.
         float want = expected[i % expected.size()];
         LogCommentFmt(
             L"element #%u, input = %6.8f, output = %6.8f, expected = %6.8f", i,
             element.input, element.output, want);
         VerifyOutputWithExpectedValueFloat(element.output, want, compareType, compareTolerance);
     }
 }

 // Executes the "BinaryFPOp" shader op from ShaderOpArith.xml with data from
 // the BinaryFPOpParameters table. The op yields one or two outputs per
 // element; the second output is checked only when the table provides a
 // non-empty "Validation.Expected2" list.
 TEST_F(ExecutionTest, BinaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   int paramCount = sizeof(BinaryFPOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(BinaryFPOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<float> &lhsInputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_floatTable;
   std::vector<float> &rhsInputs =
       handler.GetTableParamByName(L"Validation.Input2")->m_floatTable;
   std::vector<float> &expectedFirst =
       handler.GetTableParamByName(L"Validation.Expected1")->m_floatTable;
   std::vector<float> &expectedSecond =
       handler.GetTableParamByName(L"Validation.Expected2")->m_floatTable;

   LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
   double compareTolerance =
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   // The first input table determines how many elements are processed.
   size_t count = lhsInputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "SBinaryFPOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryFPOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
         Data.resize(sizeof(SBinaryFPOp) * count);
         SBinaryFPOp *pElements = (SBinaryFPOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx) {
           SBinaryFPOp &element = pElements[idx];
           element.input1 = lhsInputs[idx % lhsInputs.size()];
           element.input2 = rhsInputs[idx % rhsInputs.size()];
         }
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("SBinaryFPOp", &data);

   SBinaryFPOp *pResults = (SBinaryFPOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   unsigned numExpected = expectedSecond.size() == 0 ? 1 : 2;
   switch (numExpected) {
   case 2: {
     // Two outputs per element: check both against their expected tables.
     for (unsigned idx = 0; idx < count; ++idx) {
       SBinaryFPOp &result = pResults[idx];
       float want1 = expectedFirst[idx % expectedFirst.size()];
       float want2 = expectedSecond[idx % expectedSecond.size()];
       LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output1 = "
                     L"%6.8f, expected1 = %6.8f, output2 = %6.8f, expected2 = %6.8f",
                     idx, result.input1, result.input2, result.output1, want1,
                     result.output2, want2);
       VerifyOutputWithExpectedValueFloat(result.output1, want1, compareType,
                                          compareTolerance);
       VerifyOutputWithExpectedValueFloat(result.output2, want2, compareType,
                                          compareTolerance);
     }
     break;
   }
   case 1: {
     // Single output per element.
     for (unsigned idx = 0; idx < count; ++idx) {
       SBinaryFPOp &result = pResults[idx];
       float want1 = expectedFirst[idx % expectedFirst.size()];
       LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output1 = "
                     L"%6.8f, expected1 = %6.8f",
                     idx, result.input1, result.input2, result.output1, want1);
       VerifyOutputWithExpectedValueFloat(result.output1, want1, compareType,
                                          compareTolerance);
     }
     break;
   }
   default:
     LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
     break;
   }
 }

 // Executes the "TertiaryFPOp" shader op from ShaderOpArith.xml with the three
 // input tables from TertiaryFPOpParameters and compares each read-back
 // output with the table's expected values.
 TEST_F(ExecutionTest, TertiaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   int paramCount = sizeof(TertiaryFPOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(TertiaryFPOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<float> &firstInputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_floatTable;
   std::vector<float> &secondInputs =
       handler.GetTableParamByName(L"Validation.Input2")->m_floatTable;
   std::vector<float> &thirdInputs =
       handler.GetTableParamByName(L"Validation.Input3")->m_floatTable;
   std::vector<float> &expectedValues =
       handler.GetTableParamByName(L"Validation.Expected1")->m_floatTable;

   LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
   double compareTolerance =
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   // The first input table determines how many elements are processed.
   size_t count = firstInputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "STertiaryFPOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryFPOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
         Data.resize(sizeof(STertiaryFPOp) * count);
         STertiaryFPOp *pElements = (STertiaryFPOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx) {
           STertiaryFPOp &element = pElements[idx];
           element.input1 = firstInputs[idx % firstInputs.size()];
           element.input2 = secondInputs[idx % secondInputs.size()];
           element.input3 = thirdInputs[idx % thirdInputs.size()];
         }
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("STertiaryFPOp", &data);

   STertiaryFPOp *pResults = (STertiaryFPOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   for (unsigned idx = 0; idx < count; ++idx) {
     STertiaryFPOp &result = pResults[idx];
     float expected = expectedValues[idx % expectedValues.size()];
     LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output1 = "
                   L"%6.8f, expected = %6.8f",
                   idx, result.input1, result.input2, result.input3,
                   result.output, expected);
     VerifyOutputWithExpectedValueFloat(result.output, expected, compareType,
                                        compareTolerance);
   }
 }

 // Half-precision variant of the unary floating-point test. It reuses the
 // "UnaryFPOp" shader op and its "SUnaryFPOp" resource from ShaderOpArith.xml,
 // but lays the buffer out as SUnaryHalfOp elements and takes shader text,
 // target, compiler arguments and validation data from UnaryHalfOpParameters.
 // The test is skipped when the device lacks native 16-bit operation support.
 TEST_F(ExecutionTest, UnaryHalfOpTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
       return;
     }

     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
       WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return;
     }

     // Read data from the table
     int tableSize = sizeof(UnaryHalfOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(UnaryHalfOpParameters, tableSize);

     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
     CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

     // When WARP is in use, stop here if the WARP dll version check fails.
     unsigned int WarpVersion = handler.GetTableParamByName(L"Warp.Version")->m_uint;
     if (GetTestParamUseWARP(true) && !IsValidWarpDllVersion(WarpVersion)) {
         return;
     }

     std::vector<uint16_t> *Validation_Input =
         &(handler.GetTableParamByName(L"Validation.Input1")->m_halfTable);
     std::vector<uint16_t> *Validation_Expected =
         &(handler.GetTableParamByName(L"Validation.Expected1")->m_halfTable);

     LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
     double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;

     // One element is processed per input value.
     size_t count = Validation_Input->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "UnaryFPOp",
         // This callback is called while the test is creating the resources
         // it needs to run.
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
           VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryFPOp"));
           size_t size = sizeof(SUnaryHalfOp) * count;
           Data.resize(size);
           SUnaryHalfOp *pPrimitives = (SUnaryHalfOp *)Data.data();
           for (size_t i = 0; i < count; ++i) {
             SUnaryHalfOp *p = &pPrimitives[i];
             p->input = (*Validation_Input)[i % Validation_Input->size()];
           }
           // use shader from data table
           pShaderOp->Shaders.at(0).Target = Target.m_psz;
           pShaderOp->Shaders.at(0).Text = Text.m_psz;
           pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
         });

     MappedData data;
     test->Test->GetReadBackData("SUnaryFPOp", &data);

     SUnaryHalfOp *pPrimitives = (SUnaryHalfOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (unsigned i = 0; i < count; ++i) {
         SUnaryHalfOp *p = &pPrimitives[i];
         // Wrap by the expected table's own size. Wrapping by the input
         // table's size would index past the end of a shorter expected table.
         uint16_t expected = (*Validation_Expected)[i % Validation_Expected->size()];
         LogCommentFmt(L"element #%u, input = %6.8f(0x%04x), output = "
                       L"%6.8f(0x%04x), expected = %6.8f(0x%04x)",
                       i, ConvertFloat16ToFloat32(p->input), p->input,
                       ConvertFloat16ToFloat32(p->output), p->output,
                       ConvertFloat16ToFloat32(expected), expected);
         VerifyOutputWithExpectedValueHalf(p->output, expected, Validation_Type, Validation_Tolerance);
     }
 }

 // Half-precision variant of the binary floating-point test. It reuses the
 // "BinaryFPOp" shader op and its "SBinaryFPOp" resource from
 // ShaderOpArith.xml, but lays the buffer out as SBinaryHalfOp elements and
 // takes shader text, target, compiler arguments and validation data from
 // BinaryHalfOpParameters. The second output is checked only when the table
 // provides a non-empty "Validation.Expected2" list. The test is skipped when
 // the device lacks native 16-bit operation support.
 TEST_F(ExecutionTest, BinaryHalfOpTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
         return;
     }

     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
       WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return;
     }

     // Read data from the table
     int tableSize = sizeof(BinaryHalfOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(BinaryHalfOpParameters, tableSize);

     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
     CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

     std::vector<uint16_t> *Validation_Input1 =
         &(handler.GetTableParamByName(L"Validation.Input1")->m_halfTable);
     std::vector<uint16_t> *Validation_Input2 =
         &(handler.GetTableParamByName(L"Validation.Input2")->m_halfTable);

     std::vector<uint16_t> *Validation_Expected1 =
         &(handler.GetTableParamByName(L"Validation.Expected1")->m_halfTable);

     std::vector<uint16_t> *Validation_Expected2 =
         &(handler.GetTableParamByName(L"Validation.Expected2")->m_halfTable);

     LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
     double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
     // The first input table determines how many elements are processed.
     size_t count = Validation_Input1->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "BinaryFPOp",
         // This callback is called while the test is creating the resources
         // it needs to run.
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
         size_t size = sizeof(SBinaryHalfOp) * count;
         Data.resize(size);
         SBinaryHalfOp *pPrimitives = (SBinaryHalfOp *)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SBinaryHalfOp *p = &pPrimitives[i];
             p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
             p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
         }

         // use shader from data table
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
         pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SBinaryFPOp", &data);

     SBinaryHalfOp *pPrimitives = (SBinaryHalfOp *)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     unsigned numExpected = Validation_Expected2->size() == 0 ? 1 : 2;
     if (numExpected == 2) {
       for (unsigned i = 0; i < count; ++i) {
         SBinaryHalfOp *p = &pPrimitives[i];
         // Wrap each expected table by its own size. Wrapping by an input
         // table's size would index past the end of a shorter expected table.
         uint16_t expected1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
         uint16_t expected2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
         // Argument order must follow the labels in the format string:
         // input1, input2, output1, expected1, output2, expected2.
         LogCommentFmt(L"element #%u, input1 = %6.8f(0x%04x), input2 = %6.8f(0x%04x), output1 = "
           L"%6.8f(0x%04x), expected1 = %6.8f(0x%04x), output2 = %6.8f(0x%04x), expected2 = %6.8f(0x%04x)",
           i, ConvertFloat16ToFloat32(p->input1), p->input1,
           ConvertFloat16ToFloat32(p->input2), p->input2,
           ConvertFloat16ToFloat32(p->output1), p->output1,
           ConvertFloat16ToFloat32(expected1), expected1,
           ConvertFloat16ToFloat32(p->output2), p->output2,
           ConvertFloat16ToFloat32(expected2), expected2);
         VerifyOutputWithExpectedValueHalf(p->output1, expected1, Validation_Type, Validation_Tolerance);
         VerifyOutputWithExpectedValueHalf(p->output2, expected2, Validation_Type, Validation_Tolerance);
       }
     }
     else if (numExpected == 1) {
       for (unsigned i = 0; i < count; ++i) {
         uint16_t expected = (*Validation_Expected1)[i % Validation_Expected1->size()];
         SBinaryHalfOp *p = &pPrimitives[i];
         // Log both operands so a failing element can be reproduced.
         LogCommentFmt(L"element #%u, input1 = %6.8f(0x%04x), input2 = %6.8f(0x%04x), output1 = "
           L"%6.8f(0x%04x), expected1 = %6.8f(0x%04x)",
           i, ConvertFloat16ToFloat32(p->input1), p->input1,
           ConvertFloat16ToFloat32(p->input2), p->input2,
           ConvertFloat16ToFloat32(p->output1), p->output1,
           ConvertFloat16ToFloat32(expected), expected);
         VerifyOutputWithExpectedValueHalf(p->output1, expected, Validation_Type, Validation_Tolerance);
       }
     }
     else {
       LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
     }
 }

 // Half-precision variant of the tertiary floating-point test. It reuses the
 // "TertiaryFPOp" shader op and its "STertiaryFPOp" resource from
 // ShaderOpArith.xml, lays the buffer out as STertiaryHalfOp elements, and
 // takes its shader and validation data from TertiaryHalfOpParameters. The
 // test is skipped when the device lacks native 16-bit operation support.
 TEST_F(ExecutionTest, TertiaryHalfOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2))
     return;

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Load the parameter table that drives this test.
   int paramCount = sizeof(TertiaryHalfOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(TertiaryHalfOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   std::vector<uint16_t> &firstInputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_halfTable;
   std::vector<uint16_t> &secondInputs =
       handler.GetTableParamByName(L"Validation.Input2")->m_halfTable;
   std::vector<uint16_t> &thirdInputs =
       handler.GetTableParamByName(L"Validation.Input3")->m_halfTable;
   std::vector<uint16_t> &expectedValues =
       handler.GetTableParamByName(L"Validation.Expected1")->m_halfTable;

   LPCWSTR compareType = handler.GetTableParamByName(L"Validation.Type")->m_str;
   double compareTolerance =
       handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   // The first input table determines how many elements are processed.
   size_t count = firstInputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "STertiaryFPOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryFPOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
         Data.resize(sizeof(STertiaryHalfOp) * count);
         STertiaryHalfOp *pElements = (STertiaryHalfOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx) {
           STertiaryHalfOp &element = pElements[idx];
           element.input1 = firstInputs[idx % firstInputs.size()];
           element.input2 = secondInputs[idx % secondInputs.size()];
           element.input3 = thirdInputs[idx % thirdInputs.size()];
         }
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
         pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("STertiaryFPOp", &data);

   STertiaryHalfOp *pResults = (STertiaryHalfOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   for (unsigned idx = 0; idx < count; ++idx) {
     STertiaryHalfOp &result = pResults[idx];
     uint16_t expected = expectedValues[idx % expectedValues.size()];
     // Each value is logged both converted to float and as its 16-bit pattern.
     LogCommentFmt(L"element #%u,  input1 = %6.8f(0x%04x), input2 = %6.8f(0x%04x), input3 = %6.8f(0x%04x), output = "
       L"%6.8f(0x%04x), expected = %6.8f(0x%04x)",
       idx, ConvertFloat16ToFloat32(result.input1), result.input1,
       ConvertFloat16ToFloat32(result.input2), result.input2,
       ConvertFloat16ToFloat32(result.input3), result.input3,
       ConvertFloat16ToFloat32(result.output), result.output,
       ConvertFloat16ToFloat32(expected), expected);
     VerifyOutputWithExpectedValueHalf(result.output, expected, compareType,
                                       compareTolerance);
   }
 }

 // Executes the "UnaryIntOp" shader op from ShaderOpArith.xml with the shader
 // and input values from the UnaryIntOpParameters table and compares each
 // read-back output with the table's expected values.
 TEST_F(ExecutionTest, UnaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   int paramCount = sizeof(UnaryIntOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(UnaryIntOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<int> &inputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_int32Table;
   std::vector<int> &expectedValues =
       handler.GetTableParamByName(L"Validation.Expected1")->m_int32Table;
   int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // One element is processed per input value.
   size_t count = inputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "SUnaryIntOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryIntOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryIntOp"));
         Data.resize(sizeof(SUnaryIntOp) * count);
         SUnaryIntOp *pElements = (SUnaryIntOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx)
           pElements[idx].input = inputs[idx % inputs.size()];
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("SUnaryIntOp", &data);

   SUnaryIntOp *pResults = (SUnaryIntOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned idx = 0; idx < count; ++idx) {
     SUnaryIntOp &result = pResults[idx];
     int expected = expectedValues[idx % expectedValues.size()];
     // Every value is logged twice: once in decimal and once in hex.
     LogCommentFmt(L"element #%u, input = %11i(0x%08x), output = %11i(0x%08x), "
                   L"expected = %11i(0x%08x)",
                   idx, result.input, result.input, result.output,
                   result.output, expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Executes the "UnaryUintOp" shader op from ShaderOpArith.xml with the shader
 // and unsigned input values from the UnaryUintOpParameters table and
 // compares each read-back output with the table's expected values.
 TEST_F(ExecutionTest, UnaryUintOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   int paramCount = sizeof(UnaryUintOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(UnaryUintOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<unsigned int> &inputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
   std::vector<unsigned int> &expectedValues =
       handler.GetTableParamByName(L"Validation.Expected1")->m_uint32Table;
   int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // One element is processed per input value.
   size_t count = inputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "SUnaryUintOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "UnaryUintOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryUintOp"));
         Data.resize(sizeof(SUnaryUintOp) * count);
         SUnaryUintOp *pElements = (SUnaryUintOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx)
           pElements[idx].input = inputs[idx % inputs.size()];
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("SUnaryUintOp", &data);

   SUnaryUintOp *pResults = (SUnaryUintOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned idx = 0; idx < count; ++idx) {
     SUnaryUintOp &result = pResults[idx];
     unsigned int expected = expectedValues[idx % expectedValues.size()];
     // Every value is logged twice: once in decimal and once in hex.
     LogCommentFmt(L"element #%u, input = %11u(0x%08x), output = %11u(0x%08x), "
         L"expected = %11u(0x%08x)",
         idx, result.input, result.input, result.output, result.output,
         expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Executes the "BinaryIntOp" shader op from ShaderOpArith.xml with data from
 // the BinaryIntOpParameters table. The op yields one or two outputs per
 // element; the second output is checked only when the table provides a
 // non-empty "Validation.Expected2" list.
 TEST_F(ExecutionTest, BinaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   size_t paramCount = sizeof(BinaryIntOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(BinaryIntOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<int> &lhsInputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_int32Table;
   std::vector<int> &rhsInputs =
       handler.GetTableParamByName(L"Validation.Input2")->m_int32Table;
   std::vector<int> &expectedFirst =
       handler.GetTableParamByName(L"Validation.Expected1")->m_int32Table;
   std::vector<int> &expectedSecond =
       handler.GetTableParamByName(L"Validation.Expected2")->m_int32Table;
   int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table determines how many elements are processed.
   size_t count = lhsInputs.size();

   size_t numExpected = expectedSecond.size() == 0 ? 1 : 2;

   // The callback runs while the test is creating its resources: it fills
   // the "SBinaryIntOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "BinaryIntOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
         Data.resize(sizeof(SBinaryIntOp) * count);
         SBinaryIntOp *pElements = (SBinaryIntOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx) {
           SBinaryIntOp &element = pElements[idx];
           element.input1 = lhsInputs[idx % lhsInputs.size()];
           element.input2 = rhsInputs[idx % rhsInputs.size()];
         }
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("SBinaryIntOp", &data);

   SBinaryIntOp *pResults = (SBinaryIntOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   switch (numExpected) {
   case 2: {
     // Two outputs per element: check both against their expected tables.
     for (unsigned idx = 0; idx < count; ++idx) {
       SBinaryIntOp &result = pResults[idx];
       int want1 = expectedFirst[idx % expectedFirst.size()];
       int want2 = expectedSecond[idx % expectedSecond.size()];
       LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
           L"%11i(0x%08x), output1 = "
           L"%11i(0x%08x), expected1 = %11i(0x%08x), output2 = "
           L"%11i(0x%08x), expected2 = %11i(0x%08x)",
           idx, result.input1, result.input1, result.input2, result.input2,
           result.output1, result.output1, want1, want1, result.output2,
           result.output2, want2, want2);
       VerifyOutputWithExpectedValueInt(result.output1, want1, tolerance);
       VerifyOutputWithExpectedValueInt(result.output2, want2, tolerance);
     }
     break;
   }
   case 1: {
     // Single output per element.
     for (unsigned idx = 0; idx < count; ++idx) {
       SBinaryIntOp &result = pResults[idx];
       int want1 = expectedFirst[idx % expectedFirst.size()];
       LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
                     L"%11i(0x%08x), output = "
                     L"%11i(0x%08x), expected = %11i(0x%08x)", idx,
                     result.input1, result.input1, result.input2,
                     result.input2, result.output1, result.output1, want1,
                     want1);
       VerifyOutputWithExpectedValueInt(result.output1, want1, tolerance);
     }
     break;
   }
   default:
     LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
     break;
   }
 }

 // Executes the "TertiaryIntOp" shader op from ShaderOpArith.xml with the
 // three input tables from TertiaryIntOpParameters and compares each
 // read-back output with the table's expected values.
 TEST_F(ExecutionTest, TertiaryIntOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Load the parameter table that drives this test.
   size_t paramCount = sizeof(TertiaryIntOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(TertiaryIntOpParameters, paramCount);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

   std::vector<int> &firstInputs =
       handler.GetTableParamByName(L"Validation.Input1")->m_int32Table;
   std::vector<int> &secondInputs =
       handler.GetTableParamByName(L"Validation.Input2")->m_int32Table;
   std::vector<int> &thirdInputs =
       handler.GetTableParamByName(L"Validation.Input3")->m_int32Table;
   std::vector<int> &expectedValues =
       handler.GetTableParamByName(L"Validation.Expected1")->m_int32Table;
   int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table determines how many elements are processed.
   size_t count = firstInputs.size();

   // The callback runs while the test is creating its resources: it fills
   // the "STertiaryIntOp" buffer and installs the shader from the table.
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
       pDevice, m_support, pStream, "TertiaryIntOp",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
         Data.resize(sizeof(STertiaryIntOp) * count);
         STertiaryIntOp *pElements = (STertiaryIntOp *)Data.data();
         for (size_t idx = 0; idx < count; ++idx) {
           STertiaryIntOp &element = pElements[idx];
           element.input1 = firstInputs[idx % firstInputs.size()];
           element.input2 = secondInputs[idx % secondInputs.size()];
           element.input3 = thirdInputs[idx % thirdInputs.size()];
         }
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       });

   MappedData data;
   test->Test->GetReadBackData("STertiaryIntOp", &data);

   STertiaryIntOp *pResults = (STertiaryIntOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned idx = 0; idx < count; ++idx) {
     STertiaryIntOp &result = pResults[idx];
     int expected = expectedValues[idx % expectedValues.size()];
     // Every value is logged twice: once in decimal and once in hex.
     LogCommentFmt(L"element #%u, input1 = %11i(0x%08x), input2 = "
         L"%11i(0x%08x), input3= %11i(0x%08x), output = "
         L"%11i(0x%08x), expected = %11i(0x%08x)",
         idx, result.input1, result.input1, result.input2, result.input2,
         result.input3, result.input3, result.output, result.output,
         expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Executes the "BinaryUintOp" shader op described in ShaderOpArith.xml with
 // the shader and the two uint32 input tables taken from
 // BinaryUintOpParameters. Depending on whether Validation.Expected2 is empty,
 // one or two outputs per element are checked against the expected tables
 // using Validation.Tolerance.
 TEST_F(ExecutionTest, BinaryUintOpTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice)) {
         // CreateDevice reported failure: there is nothing to run the op on.
         return;
     }

     // Look up the shader description and the validation data for this op.
     const size_t numParams = sizeof(BinaryUintOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(BinaryUintOpParameters, numParams);

     CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     const std::vector<unsigned int> &firstInputs =
         handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
     const std::vector<unsigned int> &secondInputs =
         handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
     const std::vector<unsigned int> &firstExpected =
         handler.GetTableParamByName(L"Validation.Expected1")->m_uint32Table;
     const std::vector<unsigned int> &secondExpected =
         handler.GetTableParamByName(L"Validation.Expected2")->m_uint32Table;
     const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
     // The first input table dictates how many elements are processed.
     const size_t numElements = firstInputs.size();
     // An empty Expected2 table means the op produces a single output.
     int numExpected = secondExpected.size() == 0 ? 1 : 2;

     // Called back while the test creates its resources: fills the
     // "SBinaryUintOp" buffer with the inputs and swaps in the table's shader.
     auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                             st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
         Data.resize(sizeof(SBinaryUintOp) * numElements);
         SBinaryUintOp *pElements = reinterpret_cast<SBinaryUintOp *>(Data.data());
         for (size_t idx = 0; idx < numElements; ++idx) {
             SBinaryUintOp &element = pElements[idx];
             // Each table is indexed modulo its own size, so shorter tables wrap.
             element.input1 = firstInputs[idx % firstInputs.size()];
             element.input2 = secondInputs[idx % secondInputs.size()];
         }

         // Replace the first shader's target and source with the table's.
         auto &shader = pShaderOp->Shaders.at(0);
         shader.Target = shaderTarget.m_psz;
         shader.Text = shaderText.m_psz;
     };

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "BinaryUintOp", initResource);

     MappedData data;
     test->Test->GetReadBackData("SBinaryUintOp", &data);

     SBinaryUintOp *pResults = (SBinaryUintOp *)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     switch (numExpected) {
     case 2:
         // Two outputs per element: check both against their expected tables.
         for (unsigned i = 0; i < numElements; ++i) {
             const SBinaryUintOp &result = pResults[i];
             unsigned int expected1 = firstExpected[i % firstExpected.size()];
             unsigned int expected2 = secondExpected[i % secondExpected.size()];
             LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
                           L"%11u(0x%08x), output1 = "
                           L"%11u(0x%08x), expected1 = %11u(0x%08x), output2 = "
                           L"%11u(0x%08x), expected2 = %11u(0x%08x)",
                           i, result.input1, result.input1,
                           result.input2, result.input2,
                           result.output1, result.output1, expected1, expected1,
                           result.output2, result.output2, expected2, expected2);
             VerifyOutputWithExpectedValueInt(result.output1, expected1, tolerance);
             VerifyOutputWithExpectedValueInt(result.output2, expected2, tolerance);
         }
         break;
     case 1:
         // Single output per element: only output1 is meaningful.
         for (unsigned i = 0; i < numElements; ++i) {
             const SBinaryUintOp &result = pResults[i];
             unsigned int expected1 = firstExpected[i % firstExpected.size()];
             LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
                           L"%11u(0x%08x), output = "
                           L"%11u(0x%08x), expected = %11u(0x%08x)",
                           i, result.input1, result.input1,
                           result.input2, result.input2,
                           result.output1, result.output1, expected1, expected1);
             VerifyOutputWithExpectedValueInt(result.output1, expected1, tolerance);
         }
         break;
     default:
         LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
         break;
     }
 }

 // Executes the "TertiaryUintOp" shader op described in ShaderOpArith.xml with
 // the shader and the three uint32 input tables taken from
 // TertiaryUintOpParameters, then checks every read-back output against
 // Validation.Expected1 using Validation.Tolerance.
 TEST_F(ExecutionTest, TertiaryUintOpTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice)) {
         // CreateDevice reported failure: there is nothing to run the op on.
         return;
     }

     // Look up the shader description and the validation data for this op.
     const size_t numParams = sizeof(TertiaryUintOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(TertiaryUintOpParameters, numParams);

     CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     const std::vector<unsigned int> &firstInputs =
         handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
     const std::vector<unsigned int> &secondInputs =
         handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
     const std::vector<unsigned int> &thirdInputs =
         handler.GetTableParamByName(L"Validation.Input3")->m_uint32Table;
     const std::vector<unsigned int> &expectedOutputs =
         handler.GetTableParamByName(L"Validation.Expected1")->m_uint32Table;
     const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
     // The first input table dictates how many elements are processed.
     const size_t numElements = firstInputs.size();

     // Called back while the test creates its resources: fills the
     // "STertiaryUintOp" buffer with the inputs and swaps in the table's shader.
     auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                             st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
         Data.resize(sizeof(STertiaryUintOp) * numElements);
         STertiaryUintOp *pElements = reinterpret_cast<STertiaryUintOp *>(Data.data());
         for (size_t idx = 0; idx < numElements; ++idx) {
             STertiaryUintOp &element = pElements[idx];
             // Each table is indexed modulo its own size, so shorter tables wrap.
             element.input1 = firstInputs[idx % firstInputs.size()];
             element.input2 = secondInputs[idx % secondInputs.size()];
             element.input3 = thirdInputs[idx % thirdInputs.size()];
         }

         // Replace the first shader's target and source with the table's.
         auto &shader = pShaderOp->Shaders.at(0);
         shader.Target = shaderTarget.m_psz;
         shader.Text = shaderText.m_psz;
     };

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "TertiaryUintOp", initResource);

     MappedData data;
     test->Test->GetReadBackData("STertiaryUintOp", &data);

     STertiaryUintOp *pResults = (STertiaryUintOp *)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (unsigned i = 0; i < numElements; ++i) {
         const STertiaryUintOp &result = pResults[i];
         unsigned int expected = expectedOutputs[i % expectedOutputs.size()];
         LogCommentFmt(L"element #%u, input1 = %11u(0x%08x), input2 = "
                       L"%11u(0x%08x), input3 = %11u(0x%08x), output = "
                       L"%11u(0x%08x), expected = %11u(0x%08x)",
                       i, result.input1, result.input1,
                       result.input2, result.input2,
                       result.input3, result.input3,
                       result.output, result.output,
                       expected, expected);
         VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
     }
 }

 // 16-bit integer type tests
 // Executes the "UnaryIntOp" shader op from ShaderOpArith.xml on signed 16-bit
 // data: the "SUnaryIntOp" buffer is laid out as SUnaryInt16Op elements, and
 // the shader target, text and arguments come from UnaryInt16OpParameters.
 // The test is reported as skipped when the device lacks native 16-bit ops.
 TEST_F(ExecutionTest, UnaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Look up the shader description and the validation data for this op.
   int numParams = sizeof(UnaryInt16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(UnaryInt16OpParameters, numParams);

   CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A shaderArguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   const std::vector<short> &inputs =
     handler.GetTableParamByName(L"Validation.Input1")->m_int16Table;
   const std::vector<short> &expectedOutputs =
     handler.GetTableParamByName(L"Validation.Expected1")->m_int16Table;
   const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   const size_t numElements = inputs.size();

   // Called back while the test creates its resources: fills the
   // "SUnaryIntOp" buffer with the inputs and swaps in the table's shader.
   auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                           st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryIntOp"));
     Data.resize(sizeof(SUnaryInt16Op) * numElements);
     SUnaryInt16Op *pElements = reinterpret_cast<SUnaryInt16Op *>(Data.data());
     for (size_t idx = 0; idx < numElements; ++idx) {
       pElements[idx].input = inputs[idx % inputs.size()];
     }

     // Replace the first shader's target, source and arguments with the table's.
     auto &shader = pShaderOp->Shaders.at(0);
     shader.Target = shaderTarget.m_psz;
     shader.Text = shaderText.m_psz;
     shader.Arguments = shaderArguments.m_psz;
   };

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "UnaryIntOp", initResource);

   MappedData data;
   test->Test->GetReadBackData("SUnaryIntOp", &data);

   SUnaryInt16Op *pResults = (SUnaryInt16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned i = 0; i < numElements; ++i) {
     const SUnaryInt16Op &result = pResults[i];
     short expected = expectedOutputs[i % expectedOutputs.size()];
     LogCommentFmt(L"element #%u, input = %5hi(0x%08x), output = %5hi(0x%08x), "
                   L"expected = %5hi(0x%08x)",
                   i, result.input, result.input,
                   result.output, result.output,
                   expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Executes the "UnaryUintOp" shader op from ShaderOpArith.xml on unsigned
 // 16-bit data: the "SUnaryUintOp" buffer is laid out as SUnaryUint16Op
 // elements, and the shader target, text and arguments come from
 // UnaryUint16OpParameters. The test is reported as skipped when the device
 // lacks native 16-bit ops.
 TEST_F(ExecutionTest, UnaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Look up the shader description and the validation data for this op.
   int numParams = sizeof(UnaryUint16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(UnaryUint16OpParameters, numParams);

   CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A shaderArguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   const std::vector<unsigned short> &inputs =
     handler.GetTableParamByName(L"Validation.Input1")->m_uint16Table;
   const std::vector<unsigned short> &expectedOutputs =
     handler.GetTableParamByName(L"Validation.Expected1")->m_uint16Table;
   const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   const size_t numElements = inputs.size();

   // Called back while the test creates its resources: fills the
   // "SUnaryUintOp" buffer with the inputs and swaps in the table's shader.
   auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                           st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "SUnaryUintOp"));
     Data.resize(sizeof(SUnaryUint16Op) * numElements);
     SUnaryUint16Op *pElements = reinterpret_cast<SUnaryUint16Op *>(Data.data());
     for (size_t idx = 0; idx < numElements; ++idx) {
       pElements[idx].input = inputs[idx % inputs.size()];
     }

     // Replace the first shader's target, source and arguments with the table's.
     auto &shader = pShaderOp->Shaders.at(0);
     shader.Target = shaderTarget.m_psz;
     shader.Text = shaderText.m_psz;
     shader.Arguments = shaderArguments.m_psz;
   };

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "UnaryUintOp", initResource);

   MappedData data;
   test->Test->GetReadBackData("SUnaryUintOp", &data);

   SUnaryUint16Op *pResults = (SUnaryUint16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned i = 0; i < numElements; ++i) {
     const SUnaryUint16Op &result = pResults[i];
     unsigned short expected = expectedOutputs[i % expectedOutputs.size()];
     LogCommentFmt(L"element #%u, input = %5hu(0x%08x), output = %5hu(0x%08x), "
                   L"expected = %5hu(0x%08x)",
                   i, result.input, result.input,
                   result.output, result.output,
                   expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Executes the "BinaryIntOp" shader op from ShaderOpArith.xml on signed 16-bit
 // data: the "SBinaryIntOp" buffer is laid out as SBinaryInt16Op elements, and
 // the shader target, text and arguments come from BinaryInt16OpParameters.
 // Depending on whether Validation.Expected2 is empty, one or two outputs per
 // element are checked against the expected tables using Validation.Tolerance.
 // The test is reported as skipped when the device lacks native 16-bit ops.
 TEST_F(ExecutionTest, BinaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Read data from the table
   size_t tableSize = sizeof(BinaryInt16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(BinaryInt16OpParameters, tableSize);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   std::vector<short> *Validation_Input1 =
     &handler.GetTableParamByName(L"Validation.Input1")->m_int16Table;
   std::vector<short> *Validation_Input2 =
     &handler.GetTableParamByName(L"Validation.Input2")->m_int16Table;
   std::vector<short> *Validation_Expected1 =
     &handler.GetTableParamByName(L"Validation.Expected1")->m_int16Table;
   std::vector<short> *Validation_Expected2 =
     &handler.GetTableParamByName(L"Validation.Expected2")->m_int16Table;
   int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table dictates how many elements are processed.
   size_t count = Validation_Input1->size();

   // An empty Expected2 table means the op produces a single output.
   // Kept as int (like the other binary-op tests) because the value is
   // passed to a "%i" conversion in the error message below.
   int numExpected = Validation_Expected2->size() == 0 ? 1 : 2;

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "BinaryIntOp",
     // This callback is invoked while the test is creating the resource
     // needed to run the op.
     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryIntOp"));
     size_t size = sizeof(SBinaryInt16Op) * count;
     Data.resize(size);
     SBinaryInt16Op *pPrimitives = (SBinaryInt16Op *)Data.data();
     for (size_t i = 0; i < count; ++i) {
       SBinaryInt16Op *p = &pPrimitives[i];
       // Each table is indexed modulo its own size, so shorter tables wrap.
       p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
       p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
     }

     // use shader from data table
     pShaderOp->Shaders.at(0).Target = Target.m_psz;
     pShaderOp->Shaders.at(0).Text = Text.m_psz;
     pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
   });

   MappedData data;
   test->Test->GetReadBackData("SBinaryIntOp", &data);

   SBinaryInt16Op *pPrimitives = (SBinaryInt16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   if (numExpected == 2) {
     // Two outputs per element: check both against their expected tables.
     for (unsigned i = 0; i < count; ++i) {
       SBinaryInt16Op *p = &pPrimitives[i];
       short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
       short val2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
       LogCommentFmt(L"element #%u, input1 = %5hi(0x%08x), input2 = "
         L"%5hi(0x%08x), output1 = "
         L"%5hi(0x%08x), expected1 = %5hi(0x%08x), output2 = "
         L"%5hi(0x%08x), expected2 = %5hi(0x%08x)",
         i, p->input1, p->input1, p->input2, p->input2, p->output1,
         p->output1, val1, val1, p->output2, p->output2, val2,
         val2);
       VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
       VerifyOutputWithExpectedValueInt(p->output2, val2, Validation_Tolerance);
     }
   }
   else if (numExpected == 1) {
     // Single output per element: only output1 is checked.
     for (unsigned i = 0; i < count; ++i) {
       SBinaryInt16Op *p = &pPrimitives[i];
       short val1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
       LogCommentFmt(L"element #%u, input1 = %5hi(0x%08x), input2 = "
         L"%5hi(0x%08x), output = "
         L"%5hi(0x%08x), expected = %5hi(0x%08x)", i,
         p->input1, p->input1, p->input2, p->input2,
         p->output1, p->output1, val1, val1);
       VerifyOutputWithExpectedValueInt(p->output1, val1, Validation_Tolerance);
     }
   }
   else {
     LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
   }
 }

 // Executes the "TertiaryIntOp" shader op from ShaderOpArith.xml on signed
 // 16-bit data: the "STertiaryIntOp" buffer is laid out as STertiaryInt16Op
 // elements, and the shader target, text and arguments come from
 // TertiaryInt16OpParameters. Every read-back output is checked against
 // Validation.Expected1 using Validation.Tolerance. The test is reported as
 // skipped when the device lacks native 16-bit ops.
 TEST_F(ExecutionTest, TertiaryInt16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Read data from the table
   size_t tableSize = sizeof(TertiaryInt16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(TertiaryInt16OpParameters, tableSize);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   std::vector<short> *Validation_Input1 =
     &handler.GetTableParamByName(L"Validation.Input1")->m_int16Table;
   std::vector<short> *Validation_Input2 =
     &handler.GetTableParamByName(L"Validation.Input2")->m_int16Table;
   std::vector<short> *Validation_Input3 =
     &handler.GetTableParamByName(L"Validation.Input3")->m_int16Table;
   std::vector<short> *Validation_Expected =
     &handler.GetTableParamByName(L"Validation.Expected1")->m_int16Table;
   int Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table dictates how many elements are processed.
   size_t count = Validation_Input1->size();

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "TertiaryIntOp",
     // This callback is invoked while the test is creating the resource
     // needed to run the op.
     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryIntOp"));
     size_t size = sizeof(STertiaryInt16Op) * count;
     Data.resize(size);
     STertiaryInt16Op *pPrimitives = (STertiaryInt16Op *)Data.data();
     for (size_t i = 0; i < count; ++i) {
       STertiaryInt16Op *p = &pPrimitives[i];
       // Each table is indexed modulo its own size, so shorter tables wrap.
       p->input1 = (*Validation_Input1)[i % Validation_Input1->size()];
       p->input2 = (*Validation_Input2)[i % Validation_Input2->size()];
       p->input3 = (*Validation_Input3)[i % Validation_Input3->size()];
     }

     // use shader from data table
     pShaderOp->Shaders.at(0).Target = Target.m_psz;
     pShaderOp->Shaders.at(0).Text = Text.m_psz;
     pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
   });

   MappedData data;
   test->Test->GetReadBackData("STertiaryIntOp", &data);

   STertiaryInt16Op *pPrimitives = (STertiaryInt16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned i = 0; i < count; ++i) {
     STertiaryInt16Op *p = &pPrimitives[i];
     short val1 = (*Validation_Expected)[i % Validation_Expected->size()];
     // Values are 16-bit, so log them with the same %5hi conversions the
     // other 16-bit tests use rather than the 32-bit %11i form.
     LogCommentFmt(L"element #%u, input1 = %5hi(0x%08x), input2 = "
       L"%5hi(0x%08x), input3 = %5hi(0x%08x), output = "
       L"%5hi(0x%08x), expected = %5hi(0x%08x)",
       i, p->input1, p->input1, p->input2, p->input2,
       p->input3, p->input3, p->output, p->output, val1,
       val1);
     VerifyOutputWithExpectedValueInt(p->output, val1, Validation_Tolerance);
   }
 }

 // Executes the "BinaryUintOp" shader op from ShaderOpArith.xml on unsigned
 // 16-bit data: the "SBinaryUintOp" buffer is laid out as SBinaryUint16Op
 // elements, and the shader target, text and arguments come from
 // BinaryUint16OpParameters. Depending on whether Validation.Expected2 is
 // empty, one or two outputs per element are checked against the expected
 // tables using Validation.Tolerance. The test is reported as skipped when the
 // device lacks native 16-bit ops.
 TEST_F(ExecutionTest, BinaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Look up the shader description and the validation data for this op.
   const size_t numParams = sizeof(BinaryUint16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(BinaryUint16OpParameters, numParams);

   CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A shaderArguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   const std::vector<unsigned short> &firstInputs =
     handler.GetTableParamByName(L"Validation.Input1")->m_uint16Table;
   const std::vector<unsigned short> &secondInputs =
     handler.GetTableParamByName(L"Validation.Input2")->m_uint16Table;
   const std::vector<unsigned short> &firstExpected =
     handler.GetTableParamByName(L"Validation.Expected1")->m_uint16Table;
   const std::vector<unsigned short> &secondExpected =
     handler.GetTableParamByName(L"Validation.Expected2")->m_uint16Table;
   const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table dictates how many elements are processed.
   const size_t numElements = firstInputs.size();
   // An empty Expected2 table means the op produces a single output.
   int numExpected = secondExpected.size() == 0 ? 1 : 2;

   // Called back while the test creates its resources: fills the
   // "SBinaryUintOp" buffer with the inputs and swaps in the table's shader.
   auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                           st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryUintOp"));
     Data.resize(sizeof(SBinaryUint16Op) * numElements);
     SBinaryUint16Op *pElements = reinterpret_cast<SBinaryUint16Op *>(Data.data());
     for (size_t idx = 0; idx < numElements; ++idx) {
       SBinaryUint16Op &element = pElements[idx];
       // Each table is indexed modulo its own size, so shorter tables wrap.
       element.input1 = firstInputs[idx % firstInputs.size()];
       element.input2 = secondInputs[idx % secondInputs.size()];
     }

     // Replace the first shader's target, source and arguments with the table's.
     auto &shader = pShaderOp->Shaders.at(0);
     shader.Target = shaderTarget.m_psz;
     shader.Text = shaderText.m_psz;
     shader.Arguments = shaderArguments.m_psz;
   };

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "BinaryUintOp", initResource);

   MappedData data;
   test->Test->GetReadBackData("SBinaryUintOp", &data);

   SBinaryUint16Op *pResults = (SBinaryUint16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   switch (numExpected) {
   case 2:
     // Two outputs per element: check both against their expected tables.
     for (unsigned i = 0; i < numElements; ++i) {
       const SBinaryUint16Op &result = pResults[i];
       unsigned short expected1 = firstExpected[i % firstExpected.size()];
       unsigned short expected2 = secondExpected[i % secondExpected.size()];
       LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
                     L"%5hu(0x%08x), output1 = "
                     L"%5hu(0x%08x), expected1 = %5hu(0x%08x), output2 = "
                     L"%5hu(0x%08x), expected2 = %5hu(0x%08x)",
                     i, result.input1, result.input1,
                     result.input2, result.input2,
                     result.output1, result.output1, expected1, expected1,
                     result.output2, result.output2, expected2, expected2);
       VerifyOutputWithExpectedValueInt(result.output1, expected1, tolerance);
       VerifyOutputWithExpectedValueInt(result.output2, expected2, tolerance);
     }
     break;
   case 1:
     // Single output per element: only output1 is checked.
     for (unsigned i = 0; i < numElements; ++i) {
       const SBinaryUint16Op &result = pResults[i];
       unsigned short expected1 = firstExpected[i % firstExpected.size()];
       LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
                     L"%5hu(0x%08x), output = "
                     L"%5hu(0x%08x), expected = %5hu(0x%08x)",
                     i, result.input1, result.input1,
                     result.input2, result.input2,
                     result.output1, result.output1, expected1, expected1);
       VerifyOutputWithExpectedValueInt(result.output1, expected1, tolerance);
     }
     break;
   default:
     LogErrorFmt(L"Unexpected number of expected values for operation %i", numExpected);
     break;
   }
 }

 // Executes the "TertiaryUintOp" shader op from ShaderOpArith.xml on unsigned
 // 16-bit data: the "STertiaryUintOp" buffer is laid out as STertiaryUint16Op
 // elements, and the shader target, text and arguments come from
 // TertiaryUint16OpParameters. Every read-back output is checked against
 // Validation.Expected1 using Validation.Tolerance. The test is reported as
 // skipped when the device lacks native 16-bit ops.
 TEST_F(ExecutionTest, TertiaryUint16OpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     // CreateDevice reported failure for shader model 6.2: nothing to run.
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Look up the shader description and the validation data for this op.
   const size_t numParams = sizeof(TertiaryUint16OpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(TertiaryUint16OpParameters, numParams);

   CW2A shaderTarget(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A shaderText(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A shaderArguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   const std::vector<unsigned short> &firstInputs =
     handler.GetTableParamByName(L"Validation.Input1")->m_uint16Table;
   const std::vector<unsigned short> &secondInputs =
     handler.GetTableParamByName(L"Validation.Input2")->m_uint16Table;
   const std::vector<unsigned short> &thirdInputs =
     handler.GetTableParamByName(L"Validation.Input3")->m_uint16Table;
   const std::vector<unsigned short> &expectedOutputs =
     handler.GetTableParamByName(L"Validation.Expected1")->m_uint16Table;
   const int tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_int32;
   // The first input table dictates how many elements are processed.
   const size_t numElements = firstInputs.size();

   // Called back while the test creates its resources: fills the
   // "STertiaryUintOp" buffer with the inputs and swaps in the table's shader.
   auto initResource = [&](LPCSTR Name, std::vector<BYTE> &Data,
                           st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryUintOp"));
     Data.resize(sizeof(STertiaryUint16Op) * numElements);
     STertiaryUint16Op *pElements = reinterpret_cast<STertiaryUint16Op *>(Data.data());
     for (size_t idx = 0; idx < numElements; ++idx) {
       STertiaryUint16Op &element = pElements[idx];
       // Each table is indexed modulo its own size, so shorter tables wrap.
       element.input1 = firstInputs[idx % firstInputs.size()];
       element.input2 = secondInputs[idx % secondInputs.size()];
       element.input3 = thirdInputs[idx % thirdInputs.size()];
     }

     // Replace the first shader's target, source and arguments with the table's.
     auto &shader = pShaderOp->Shaders.at(0);
     shader.Target = shaderTarget.m_psz;
     shader.Text = shaderText.m_psz;
     shader.Arguments = shaderArguments.m_psz;
   };

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "TertiaryUintOp", initResource);

   MappedData data;
   test->Test->GetReadBackData("STertiaryUintOp", &data);

   STertiaryUint16Op *pResults = (STertiaryUint16Op *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;
   for (unsigned i = 0; i < numElements; ++i) {
     const STertiaryUint16Op &result = pResults[i];
     unsigned short expected = expectedOutputs[i % expectedOutputs.size()];
     LogCommentFmt(L"element #%u, input1 = %5hu(0x%08x), input2 = "
                   L"%5hu(0x%08x), input3 = %5hu(0x%08x), output = "
                   L"%5hu(0x%08x), expected = %5hu(0x%08x)",
                   i, result.input1, result.input1,
                   result.input2, result.input2,
                   result.input3, result.input3,
                   result.output, result.output,
                   expected, expected);
     VerifyOutputWithExpectedValueInt(result.output, expected, tolerance);
   }
 }

 // Runs the "DotOp" shader op from ShaderOpArith.xml and compares the dot2,
 // dot3 and dot4 values it writes back against the expected values stored in
 // the DotOpParameters data table. Returns without testing when no device
 // can be created.
 TEST_F(ExecutionTest, DotTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice)) {
         return;
     }

     int tableSize = sizeof(DotOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(DotOpParameters, tableSize);

     // Shader target and source text come from the data table.
     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     // Inputs and expected values are stored as strings and parsed per element.
     std::vector<WEX::Common::String> *Validation_Input1 =
         &handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_Input2 =
         &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot2 =
         &handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot3 =
         &handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_dot4 =
         &handler.GetTableParamByName(L"Validation.Expected3")->m_StringTable;

     PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
     double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
     // The number of elements is driven by the first input table.
     size_t count = Validation_Input1->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "DotOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SDotOp"));
         size_t size = sizeof(SDotOp) * count;
         Data.resize(size);
         SDotOp *pPrimitives = (SDotOp*)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SDotOp *p = &pPrimitives[i];
             XMFLOAT4 val1,val2;
             VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input1)[i],
                                                     (float *)&val1, 4));
             VERIFY_SUCCEEDED(ParseDataToVectorFloat((*Validation_Input2)[i],
                                                     (float *)&val2, 4));
             p->input1 = val1;
             p->input2 = val2;
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SDotOp", &data);

     SDotOp *pPrimitives = (SDotOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (size_t i = 0; i < count; ++i) {
         SDotOp *p = &pPrimitives[i];
         float dot2, dot3, dot4;
         VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot2)[i], dot2));
         VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot3)[i], dot3));
         VERIFY_SUCCEEDED(ParseDataToFloat((*Validation_dot4)[i], dot4));
         // %u expects an unsigned int, so the size_t index is narrowed
         // explicitly instead of being passed through varargs as-is.
         LogCommentFmt(
             L"element #%u, input1 = (%f, %f, %f, %f), input2 = (%f, %f, "
             L"%f, %f), \n dot2 = %f, dot2_expected = %f, dot3 = %f, "
             L"dot3_expected = %f, dot4 = %f, dot4_expected = %f",
             static_cast<unsigned>(i), p->input1.x, p->input1.y, p->input1.z,
             p->input1.w, p->input2.x, p->input2.y, p->input2.z, p->input2.w,
             p->o_dot2, dot2, p->o_dot3, dot3, p->o_dot4, dot4);
         VerifyOutputWithExpectedValueFloat(p->o_dot2, dot2, Validation_type,
                                            tolerance);
         VerifyOutputWithExpectedValueFloat(p->o_dot3, dot3, Validation_type,
                                            tolerance);
         VerifyOutputWithExpectedValueFloat(p->o_dot4, dot4, Validation_type,
                                            tolerance);
     }
 }

 // Runs the "Dot2AddHalfOp" shader op from ShaderOpArith.xml and compares the
 // result it writes back against the expected values stored in the
 // Dot2AddHalfOpParameters data table. Requires a shader model 6.4 device and
 // is reported as skipped when the device lacks native 16-bit operations.
 TEST_F(ExecutionTest, Dot2AddHalfTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
         return;
     }

     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
         WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
         WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
         return;
     }

     int tableSize = sizeof(Dot2AddHalfOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(Dot2AddHalfOpParameters, tableSize);

     // Shader target, source text and compiler arguments come from the table.
     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
     CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

     // The two half2 inputs are stored as strings; the accumulator and the
     // expected result are stored as floats.
     std::vector<WEX::Common::String> *validation_input1 =
         &handler.GetTableParamByName(L"Validation.Input1")->m_StringTable;
     std::vector<WEX::Common::String> *validation_input2 =
         &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
     std::vector<float> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_floatTable;
     std::vector<float> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_floatTable;

     PCWSTR Validation_type = handler.GetTableParamByName(L"Validation.Type")->m_str;
     double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
     // The number of elements is driven by the first input table.
     size_t count = validation_input1->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "Dot2AddHalfOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot2AddHalfOp"));
         size_t size = sizeof(SDot2AddHalfOp) * count;
         Data.resize(size);
         SDot2AddHalfOp *pPrimitives = (SDot2AddHalfOp*)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SDot2AddHalfOp *p = &pPrimitives[i];
             Half2 val1,val2;
             VERIFY_SUCCEEDED(ParseDataToVectorHalf((*validation_input1)[i],
                                                     (uint16_t *)&val1, 2));
             VERIFY_SUCCEEDED(ParseDataToVectorHalf((*validation_input2)[i],
                                                     (uint16_t *)&val2, 2));
             p->input1 = val1;
             p->input2 = val2;
             p->acc = (*validation_acc)[i];
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
         pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SDot2AddHalfOp", &data);

     SDot2AddHalfOp *pPrimitives = (SDot2AddHalfOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (size_t i = 0; i < count; ++i) {
         SDot2AddHalfOp *p = &pPrimitives[i];
         float expectedResult = (*validation_result)[i];
         // Widen the half inputs to float so they can be logged with %f.
         float input1x = ConvertFloat16ToFloat32(p->input1.x);
         float input1y = ConvertFloat16ToFloat32(p->input1.y);
         float input2x = ConvertFloat16ToFloat32(p->input2.x);
         float input2y = ConvertFloat16ToFloat32(p->input2.y);
         // %u expects an unsigned int, so the size_t index is narrowed
         // explicitly instead of being passed through varargs as-is.
         LogCommentFmt(
             L"element #%u, input1 = (%f, %f), input2 = (%f, %f), acc = %f\n"
             L"result = %f, result_expected = %f",
             static_cast<unsigned>(i), input1x, input1y, input2x, input2y,
             p->acc, p->result, expectedResult);
         VerifyOutputWithExpectedValueFloat(p->result, expectedResult, Validation_type, tolerance);
     }
 }

 // Runs the "Dot4AddI8PackedOp" shader op from ShaderOpArith.xml and checks
 // that each result it writes back exactly matches (tolerance 0) the expected
 // value stored in the Dot4AddI8PackedOpParameters data table. Requires a
 // shader model 6.4 device.
 TEST_F(ExecutionTest, Dot4AddI8PackedTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
         return;
     }

     int tableSize = sizeof(Dot4AddI8PackedOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(Dot4AddI8PackedOpParameters, tableSize);

     // Shader target and source text come from the data table.
     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     // Inputs are read as uint32 values; accumulator and expected result are
     // read as int32 values.
     std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
     std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
     std::vector<int32_t> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_int32Table;
     std::vector<int32_t> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_int32Table;

     // The number of elements is driven by the first input table.
     size_t count = validation_input1->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "Dot4AddI8PackedOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddI8PackedOp"));
         size_t size = sizeof(SDot4AddI8PackedOp) * count;
         Data.resize(size);
         SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SDot4AddI8PackedOp *p = &pPrimitives[i];
             p->input1 = (*validation_input1)[i];
             p->input2 = (*validation_input2)[i];
             p->acc = (*validation_acc)[i];
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SDot4AddI8PackedOp", &data);

     SDot4AddI8PackedOp *pPrimitives = (SDot4AddI8PackedOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (size_t i = 0; i < count; ++i) {
         SDot4AddI8PackedOp *p = &pPrimitives[i];
         int32_t expectedResult = (*validation_result)[i];
         // %u expects an unsigned int, so the size_t index is narrowed
         // explicitly instead of being passed through varargs as-is.
         LogCommentFmt(
             L"element #%u, input1 = %u, input2 = %u, acc = %d \n"
             L"result = %d, result_expected = %d",
             static_cast<unsigned>(i), p->input1, p->input2, p->acc,
             p->result, expectedResult);
         VerifyOutputWithExpectedValueInt(p->result, expectedResult, 0);
     }
 }

 // Runs the "Dot4AddU8PackedOp" shader op from ShaderOpArith.xml and checks
 // that each result it writes back exactly matches (tolerance 0) the expected
 // value stored in the Dot4AddU8PackedOpParameters data table. Requires a
 // shader model 6.4 device.
 TEST_F(ExecutionTest, Dot4AddU8PackedTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_4, false)) {
         return;
     }

     int tableSize = sizeof(Dot4AddU8PackedOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(Dot4AddU8PackedOpParameters, tableSize);

     // Shader target and source text come from the data table.
     CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     // Inputs, accumulator and expected result are all read as uint32 values.
     std::vector<uint32_t> *validation_input1 = &handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
     std::vector<uint32_t> *validation_input2 = &handler.GetTableParamByName(L"Validation.Input2")->m_uint32Table;
     std::vector<uint32_t> *validation_acc = &handler.GetTableParamByName(L"Validation.Input3")->m_uint32Table;
     std::vector<uint32_t> *validation_result = &handler.GetTableParamByName(L"Validation.Expected1")->m_uint32Table;

     // The number of elements is driven by the first input table.
     size_t count = validation_input1->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "Dot4AddU8PackedOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SDot4AddU8PackedOp"));
         size_t size = sizeof(SDot4AddU8PackedOp) * count;
         Data.resize(size);
         SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SDot4AddU8PackedOp *p = &pPrimitives[i];
             p->input1 = (*validation_input1)[i];
             p->input2 = (*validation_input2)[i];
             p->acc = (*validation_acc)[i];
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Target = Target.m_psz;
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SDot4AddU8PackedOp", &data);

     SDot4AddU8PackedOp *pPrimitives = (SDot4AddU8PackedOp*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     for (size_t i = 0; i < count; ++i) {
         SDot4AddU8PackedOp *p = &pPrimitives[i];
         uint32_t expectedResult = (*validation_result)[i];
         // %u expects an unsigned int, so the size_t index is narrowed
         // explicitly instead of being passed through varargs as-is.
         LogCommentFmt(
             L"element #%u, input1 = %u, input2 = %u, acc = %u \n"
             L"result = %u, result_expected = %u, ",
             static_cast<unsigned>(i), p->input1, p->input2, p->acc,
             p->result, expectedResult);
         VerifyOutputWithExpectedValueUInt(p->result, expectedResult, 0);
     }
 }

 // Runs the "Msad4" shader op from ShaderOpArith.xml and compares the four
 // result components it writes back against the expected values stored in
 // the Msad4OpParameters data table, using the table's tolerance truncated
 // to an integer.
 TEST_F(ExecutionTest, Msad4Test) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice)) {
         return;
     }
     size_t tableSize = sizeof(Msad4OpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(Msad4OpParameters, tableSize);

     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
     double tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;

     // The reference value is read as uint32; source, accumulator and the
     // expected result are stored as strings and parsed into vectors.
     std::vector<unsigned int> *Validation_Reference =
         &handler.GetTableParamByName(L"Validation.Input1")->m_uint32Table;
     std::vector<WEX::Common::String> *Validation_Source =
         &handler.GetTableParamByName(L"Validation.Input2")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_Accum =
         &handler.GetTableParamByName(L"Validation.Input3")->m_StringTable;
     std::vector<WEX::Common::String> *Validation_Expected =
         &handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable;

     // The number of elements is driven by the expected-value table.
     size_t count = Validation_Expected->size();

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "Msad4",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SMsad4"));
         size_t size = sizeof(SMsad4) * count;
         Data.resize(size);
         SMsad4 *pPrimitives = (SMsad4*)Data.data();
         for (size_t i = 0; i < count; ++i) {
             SMsad4 *p = &pPrimitives[i];
             XMUINT2 src;
             XMUINT4 accum;
             VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Source)[i], (unsigned int*)&src, 2));
             VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Accum)[i], (unsigned int*)&accum, 4));
             p->ref = (*Validation_Reference)[i];
             p->src = src;
             p->accum = accum;
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
     });

     MappedData data;
     test->Test->GetReadBackData("SMsad4", &data);

     SMsad4 *pPrimitives = (SMsad4*)data.data();
     WEX::TestExecution::DisableVerifyExceptions dve;
     // The tolerance does not change per element, so convert it once.
     const int toleranceInt = static_cast<int>(tolerance);
     for (size_t i = 0; i < count; ++i) {
         SMsad4 *p = &pPrimitives[i];
         XMUINT4 result;
         VERIFY_SUCCEEDED(ParseDataToVectorUint((*Validation_Expected)[i],
                                                (unsigned int *)&result, 4));
         // %u expects an unsigned int, so the size_t index is narrowed
         // explicitly instead of being passed through varargs as-is.
         LogCommentFmt(
             L"element #%u, ref = %u(0x%08x), src = %u(0x%08x), %u(0x%08x), "
             L"accum = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
             L"result = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x),\n"
             L"expected = %u(0x%08x), %u(0x%08x), %u(0x%08x), %u(0x%08x)",
             static_cast<unsigned>(i),
             p->ref, p->ref, p->src.x, p->src.x, p->src.y, p->src.y, p->accum.x,
             p->accum.x, p->accum.y, p->accum.y, p->accum.z, p->accum.z,
             p->accum.w, p->accum.w, p->result.x, p->result.x, p->result.y,
             p->result.y, p->result.z, p->result.z, p->result.w, p->result.w,
             result.x, result.x, result.y, result.y, result.z, result.z,
             result.w, result.w);

         VerifyOutputWithExpectedValueInt(p->result.x, result.x, toleranceInt);
         VerifyOutputWithExpectedValueInt(p->result.y, result.y, toleranceInt);
         VerifyOutputWithExpectedValueInt(p->result.z, result.z, toleranceInt);
         VerifyOutputWithExpectedValueInt(p->result.w, result.w, toleranceInt);
     }
 }

 // Runs the "BinaryFPOp" shader op from ShaderOpArith.xml with the compiler
 // arguments taken from the DenormBinaryFPOpParameters data table and checks
 // the output against the table's expected values.
 //
 // The denorm mode is derived from the argument string: "-denorm preserve"
 // selects Preserve, "-denorm ftz" selects FTZ, anything else leaves Any.
 // In Any mode an output is accepted if it matches either Expected1 or
 // Expected2; otherwise only Expected1 is used. Requires a shader model 6.2
 // device.
 TEST_F(ExecutionTest, DenormBinaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }

   // Read data from the table
   int tableSize = sizeof(DenormBinaryFPOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(DenormBinaryFPOpParameters, tableSize);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   std::vector<WEX::Common::String> *Validation_Input1 =
     &(handler.GetTableParamByName(L"Validation.Input1")->m_StringTable);
   std::vector<WEX::Common::String> *Validation_Input2 =
     &(handler.GetTableParamByName(L"Validation.Input2")->m_StringTable);

   std::vector<WEX::Common::String> *Validation_Expected1 =
     &(handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable);
   // two expected outputs for any mode
   std::vector<WEX::Common::String> *Validation_Expected2 =
     &(handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable);

   LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
   double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();

   using namespace hlsl::DXIL;
   Float32DenormMode mode = Float32DenormMode::Any;
   if (strcmp(Arguments.m_psz, "-denorm preserve") == 0) {
     mode = Float32DenormMode::Preserve;
   }
   else if (strcmp(Arguments.m_psz, "-denorm ftz") == 0) {
     mode = Float32DenormMode::FTZ;
   }
   if (mode == Float32DenormMode::Any) {
     DXASSERT(Validation_Expected2->size() == Validation_Expected1->size(),
              "must have same number of expected values");
   }

   #if defined(_M_ARM64) || defined(_M_ARM64EC)
     if ((GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) && mode == Float32DenormMode::Preserve) {
       WEX::Logging::Log::Comment(L"WARP has an issue with DenormBinaryFloatOpTest with '-denorm preserve' on ARM64.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return;
     }
   #endif // defined(_M_ARM64) || defined(_M_ARM64EC)

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "BinaryFPOp",
     // this callback is called when the test
     // is creating the resource to run the test
     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "SBinaryFPOp"));
     size_t size = sizeof(SBinaryFPOp) * count;
     Data.resize(size);
     SBinaryFPOp *pPrimitives = (SBinaryFPOp *)Data.data();
     for (size_t i = 0; i < count; ++i) {
       SBinaryFPOp *p = &pPrimitives[i];
       PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->size()];
       PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->size()];
       float val1, val2;
       VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
       VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
       p->input1 = val1;
       p->input2 = val2;
     }

     // use shader from data table
     pShaderOp->Shaders.at(0).Target = Target.m_psz;
     pShaderOp->Shaders.at(0).Text = Text.m_psz;
     pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
   });

   MappedData data;
   test->Test->GetReadBackData("SBinaryFPOp", &data);

   SBinaryFPOp *pPrimitives = (SBinaryFPOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   for (unsigned i = 0; i < count; ++i) {
     SBinaryFPOp *p = &pPrimitives[i];
     if (mode == Float32DenormMode::Any) {
        // Either expected value is acceptable in Any mode.
        LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
        LPCWSTR str2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
        float val1;
        float val2;
        VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
        VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
        LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output = "
          L"%6.8f, expected = %6.8f(%x) or %6.8f(%x)",
          i, p->input1, p->input2, p->output1, val1, *(int *)&val1, val2, *(int *)&val2);
        VERIFY_IS_TRUE(
            CompareOutputWithExpectedValueFloat(
                p->output1, val1, Validation_Type, Validation_Tolerance, mode) ||
            CompareOutputWithExpectedValueFloat(
                p->output1, val2, Validation_Type, Validation_Tolerance, mode));
     }
     else {
        LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
        float val1;
        VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
        // The last argument is the raw bit pattern as an int, so it is
        // printed with %x (as in the Any branch) rather than %a, which
        // would expect a double.
        LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, output = "
          L"%6.8f, expected = %6.8f(%x)",
          i, p->input1, p->input2, p->output1, val1, *(int *)&val1);
        VerifyOutputWithExpectedValueFloat(p->output1, val1, Validation_Type,
           Validation_Tolerance, mode);
     }
   }
 }

 // Runs the "TertiaryFPOp" shader op from ShaderOpArith.xml with the compiler
 // arguments taken from the DenormTertiaryFPOpParameters data table and
 // checks the output against the table's expected values.
 //
 // The denorm mode is derived from the argument string: "-denorm preserve"
 // selects Preserve, "-denorm ftz" selects FTZ, anything else leaves Any.
 // In Any mode an output is accepted if it matches either Expected1 or
 // Expected2; otherwise only Expected1 is used. Requires a shader model 6.2
 // device.
 TEST_F(ExecutionTest, DenormTertiaryFloatOpTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
     WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL::D3D_SHADER_MODEL_6_2)) {
     return;
   }

   // Read data from the table
   int tableSize = sizeof(DenormTertiaryFPOpParameters) / sizeof(TableParameter);
   TableParameterHandler handler(DenormTertiaryFPOpParameters, tableSize);

   CW2A Target(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A Arguments(handler.GetTableParamByName(L"ShaderOp.Arguments")->m_str);

   std::vector<WEX::Common::String> *Validation_Input1 =
     &(handler.GetTableParamByName(L"Validation.Input1")->m_StringTable);
   std::vector<WEX::Common::String> *Validation_Input2 =
     &(handler.GetTableParamByName(L"Validation.Input2")->m_StringTable);
   std::vector<WEX::Common::String> *Validation_Input3 =
     &(handler.GetTableParamByName(L"Validation.Input3")->m_StringTable);

   std::vector<WEX::Common::String> *Validation_Expected1 =
     &(handler.GetTableParamByName(L"Validation.Expected1")->m_StringTable);

   // two expected outputs for any mode
   std::vector<WEX::Common::String> *Validation_Expected2 =
     &(handler.GetTableParamByName(L"Validation.Expected2")->m_StringTable);
   LPCWSTR Validation_Type = handler.GetTableParamByName(L"Validation.Type")->m_str;
   double Validation_Tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_double;
   size_t count = Validation_Input1->size();

   using namespace hlsl::DXIL;
   Float32DenormMode mode = Float32DenormMode::Any;
   if (strcmp(Arguments.m_psz, "-denorm preserve") == 0) {
     mode = Float32DenormMode::Preserve;
   }
   else if (strcmp(Arguments.m_psz, "-denorm ftz") == 0) {
     mode = Float32DenormMode::FTZ;
   }
   if (mode == Float32DenormMode::Any) {
     DXASSERT(Validation_Expected2->size() == Validation_Expected1->size(),
       "must have same number of expected values");
   }

 #if defined(_M_ARM64) || defined(_M_ARM64EC)
   if ((GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) && mode == Float32DenormMode::Preserve) {
     WEX::Logging::Log::Comment(L"WARP has an issue with DenormTertiaryFloatOpTest with '-denorm preserve' on ARM64.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }
 #endif // defined(_M_ARM64) || defined(_M_ARM64EC)

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
     pDevice, m_support, pStream, "TertiaryFPOp",
     // this callback is called when the test
     // is creating the resource to run the test
     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     VERIFY_IS_TRUE(0 == _stricmp(Name, "STertiaryFPOp"));
     size_t size = sizeof(STertiaryFPOp) * count;
     Data.resize(size);
     STertiaryFPOp *pPrimitives = (STertiaryFPOp *)Data.data();
     for (size_t i = 0; i < count; ++i) {
       STertiaryFPOp *p = &pPrimitives[i];
       PCWSTR str1 = (*Validation_Input1)[i % Validation_Input1->size()];
       PCWSTR str2 = (*Validation_Input2)[i % Validation_Input2->size()];
       PCWSTR str3 = (*Validation_Input3)[i % Validation_Input3->size()];
       float val1, val2, val3;
       VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
       VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
       VERIFY_SUCCEEDED(ParseDataToFloat(str3, val3));
       p->input1 = val1;
       p->input2 = val2;
       p->input3 = val3;
     }

     // use shader from data table
     pShaderOp->Shaders.at(0).Target = Target.m_psz;
     pShaderOp->Shaders.at(0).Text = Text.m_psz;
     pShaderOp->Shaders.at(0).Arguments = Arguments.m_psz;
   });

   MappedData data;
   test->Test->GetReadBackData("STertiaryFPOp", &data);

   STertiaryFPOp *pPrimitives = (STertiaryFPOp *)data.data();
   WEX::TestExecution::DisableVerifyExceptions dve;

   for (unsigned i = 0; i < count; ++i) {
     STertiaryFPOp *p = &pPrimitives[i];
     if (mode == Float32DenormMode::Any) {
         // Either expected value is acceptable in Any mode.
         LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
         LPCWSTR str2 = (*Validation_Expected2)[i % Validation_Expected2->size()];
         float val1;
         float val2;
         VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
         VERIFY_SUCCEEDED(ParseDataToFloat(str2, val2));
         LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output = "
             L"%6.8f, expected = %6.8f(%x) or %6.8f(%x)",
             i, p->input1, p->input2, p->input3, p->output, val1, *(int *)&val1, val2, *(int *)&val2);
         VERIFY_IS_TRUE(
             CompareOutputWithExpectedValueFloat(
                 p->output, val1, Validation_Type, Validation_Tolerance, mode) ||
             CompareOutputWithExpectedValueFloat(
                 p->output, val2, Validation_Type, Validation_Tolerance, mode));
     }
     else {
         LPCWSTR str1 = (*Validation_Expected1)[i % Validation_Expected1->size()];
         float val1;
         VERIFY_SUCCEEDED(ParseDataToFloat(str1, val1));
         // The last argument is the raw bit pattern as an int, so it is
         // printed with %x (as in the Any branch) rather than %a, which
         // would expect a double.
         LogCommentFmt(L"element #%u, input1 = %6.8f, input2 = %6.8f, input3 = %6.8f, output = "
             L"%6.8f, expected = %6.8f(%x)",
             i, p->input1, p->input2, p->input3, p->output, val1, *(int *)&val1);
         VerifyOutputWithExpectedValueFloat(p->output, val1, Validation_Type,
             Validation_Tolerance, mode);
     }
   }
 }

 // Setup for wave intrinsics tests
 //
 // Identifies which reference computation (computeExpected specialization)
 // a shader op name maps to; see ShaderOpKindTable for the name mapping.
 enum class ShaderOpKind {
   WaveSum,
   WaveProduct,
   WaveActiveMax,
   WaveActiveMin,
   WaveCountBits,
   WaveActiveAllEqual,
   WaveActiveAnyTrue,
   WaveActiveAllTrue,
   WaveActiveBitOr,
   WaveActiveBitAnd,
   WaveActiveBitXor,
   // Returned by GetShaderOpKind when the name is not in ShaderOpKindTable.
   ShaderOpInvalid
 };

 // One row of ShaderOpKindTable: a shader op name and the ShaderOpKind it
 // resolves to.
 struct ShaderOpKindPair {
   LPCWSTR name;      // op name, compared case-insensitively by GetShaderOpKind
   ShaderOpKind kind; // kind returned for that name
 };

 // Name -> ShaderOpKind lookup table scanned by GetShaderOpKind.
 // Signed and unsigned ("U") variants of an op, as well as its WaveActive*
 // and WavePrefix* forms, all map to the same kind.
 static ShaderOpKindPair ShaderOpKindTable[] = {
   { L"WaveActiveSum", ShaderOpKind::WaveSum },
   { L"WaveActiveUSum", ShaderOpKind::WaveSum },
   { L"WaveActiveProduct", ShaderOpKind::WaveProduct },
   { L"WaveActiveUProduct", ShaderOpKind::WaveProduct },
   { L"WaveActiveMax", ShaderOpKind::WaveActiveMax },
   { L"WaveActiveUMax", ShaderOpKind::WaveActiveMax },
   { L"WaveActiveMin", ShaderOpKind::WaveActiveMin },
   { L"WaveActiveUMin", ShaderOpKind::WaveActiveMin },
   { L"WaveActiveCountBits", ShaderOpKind::WaveCountBits },
   { L"WaveActiveAllEqual", ShaderOpKind::WaveActiveAllEqual },
   { L"WaveActiveAnyTrue", ShaderOpKind::WaveActiveAnyTrue },
   { L"WaveActiveAllTrue", ShaderOpKind::WaveActiveAllTrue },
   { L"WaveActiveBitOr", ShaderOpKind::WaveActiveBitOr },
   { L"WaveActiveBitAnd", ShaderOpKind::WaveActiveBitAnd },
   { L"WaveActiveBitXor", ShaderOpKind::WaveActiveBitXor },
   // Prefix variants reuse the kinds of the corresponding active ops.
   { L"WavePrefixSum", ShaderOpKind::WaveSum },
   { L"WavePrefixUSum", ShaderOpKind::WaveSum },
   { L"WavePrefixProduct", ShaderOpKind::WaveProduct },
   { L"WavePrefixUProduct", ShaderOpKind::WaveProduct },
   { L"WavePrefixMax", ShaderOpKind::WaveActiveMax },
   { L"WavePrefixUMax", ShaderOpKind::WaveActiveMax },
   { L"WavePrefixMin", ShaderOpKind::WaveActiveMin },
   { L"WavePrefixUMin", ShaderOpKind::WaveActiveMin },
   { L"WavePrefixCountBits", ShaderOpKind::WaveCountBits }
 };

 // Maps a shader op name to its ShaderOpKind by scanning ShaderOpKindTable
 // with a case-insensitive comparison.
 //
 // str: wide-character op name to look up.
 // Returns the kind of the first matching table entry. When no entry
 // matches, asserts and returns ShaderOpKind::ShaderOpInvalid.
 ShaderOpKind GetShaderOpKind(LPCWSTR str) {
   for (const ShaderOpKindPair &entry : ShaderOpKindTable) {
     if (_wcsicmp(entry.name, str) == 0) {
       return entry.kind;
     }
   }
   // str is a wide string; in a narrow format string it needs %ls, not %s.
   DXASSERT_ARGS(false, "Invalid ShaderOp name: %ls", str);
   return ShaderOpKind::ShaderOpInvalid;
 }

 // Primary template of the reference-value functor used by the wave
 // intrinsics tests. Specializations per ShaderOpKind compute the expected
 // value; this fallback ignores all of its arguments and returns 0.
 //
 // The parameters are left unnamed because they are unused here and this file
 // promotes warning C4100 (unreferenced formal parameter) to an error.
 template <typename InType, typename OutType, ShaderOpKind kind>
 struct computeExpected {
   OutType operator()(const std::vector<InType> & /*inputs*/,
                      const std::vector<int> & /*masks*/, int /*maskValue*/,
                      unsigned int /*index*/) {
     return 0;
   }
 };

 // Reference value for ShaderOpKind::WaveSum: the sum of inputs[i] over all
 // i in [0, index) whose masks[i] equals maskValue (0 if there are none).
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveSum> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType total = 0;
     for (size_t pos = 0; pos != index; ++pos) {
       if (masks.at(pos) != maskValue)
         continue; // element belongs to a different mask group
       total += inputs.at(pos);
     }
     return total;
   }
 };

 // Reference value for ShaderOpKind::WaveProduct: the product of inputs[i]
 // over all i in [0, index) whose masks[i] equals maskValue (1 if there are
 // none).
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveProduct> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType product = 1;
     for (size_t pos = 0; pos != index; ++pos) {
       if (masks.at(pos) != maskValue)
         continue; // element belongs to a different mask group
       product *= inputs.at(pos);
     }
     return product;
   }
 };

 // Reference value for ShaderOpKind::WaveActiveMax: the largest inputs[i]
 // over all i in [0, index) whose masks[i] equals maskValue.
 //
 // The running maximum is seeded with numeric_limits<OutType>::lowest(), the
 // most negative finite value. For integer types this equals min(); for
 // floating-point types min() would be the smallest positive value and
 // would give a wrong result when every selected input is negative.
 // If no element is selected, the seed value is returned.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType maximum = std::numeric_limits<OutType>::lowest();
     for (size_t i = 0; i < index; ++i) {
       if (masks.at(i) == maskValue && inputs.at(i) > maximum)
         maximum = inputs.at(i);
     }
     return maximum;
   }
 };

 // Reference value for ShaderOpKind::WaveActiveMin: the smallest inputs[i]
 // over all i in [0, index) whose masks[i] equals maskValue. If no element
 // is selected, numeric_limits<OutType>::max() is returned.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType smallest = std::numeric_limits<OutType>::max();
     for (size_t pos = 0; pos != index; ++pos) {
       if (masks.at(pos) != maskValue)
         continue; // element belongs to a different mask group
       if (inputs.at(pos) < smallest) {
         smallest = inputs.at(pos);
       }
     }
     return smallest;
   }
 };

 // Reference value for ShaderOpKind::WaveCountBits: the number of elements
 // i in [0, index) whose masks[i] equals maskValue and whose inputs[i] is
 // greater than 3.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveCountBits> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType matches = 0;
     for (size_t pos = 0; pos != index; ++pos) {
       if (masks.at(pos) != maskValue)
         continue; // element belongs to a different mask group
       if (inputs.at(pos) > 3) {
         matches++;
       }
     }
     return matches;
   }
 };

 // HLSL stores a boolean as a 4-byte (uint32) value, so the C++ bool type
 // cannot stand in for it here. HLSL yields 0 for false and 1 for true.
 //
 // Reference value for ShaderOpKind::WaveActiveAnyTrue: 1 if any element i
 // in [0, index) with masks[i] equal to maskValue has a non-zero input,
 // otherwise 0.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     for (size_t pos = 0; pos != index; ++pos) {
       if (masks.at(pos) != maskValue)
         continue; // element belongs to a different mask group
       if (inputs.at(pos) != 0)
         return 1;
     }
     return 0;
   }
 };

 // Returns 0 as soon as one of the first `index` lanes with a matching mask
 // has a zero input, otherwise 1 (including when no lane matches).
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     for (size_t lane = 0; lane < index; ++lane) {
       const bool participates = masks.at(lane) == maskValue;
       if (participates && inputs.at(lane) == 0)
         return 0;
     }
     return 1;
   }
 };

 // Returns 1 when every participating lane (mask equal to `maskValue`, within
 // the first `index` lanes) holds the same input as the participating lane
 // before it, and 0 at the first mismatch. With zero or one participating
 // lane the result is 1.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     bool havePrevious = false;
     size_t previousLane = 0;
     for (size_t lane = 0; lane < index; ++lane) {
       if (masks.at(lane) != maskValue)
         continue;
       // Compare against the most recent participating lane, if any.
       if (havePrevious && inputs.at(previousLane) != inputs.at(lane))
         return 0;
       havePrevious = true;
       previousLane = lane;
     }
     return 1;
   }
 };

 // Bitwise OR of the inputs of the first `index` lanes whose mask equals
 // `maskValue`, starting from zero.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType accumulated = 0x00000000;
     for (size_t lane = 0; lane < index; ++lane) {
       if (masks.at(lane) != maskValue)
         continue;
       accumulated |= inputs.at(lane);
     }
     return accumulated;
   }
 };

 // Bitwise AND of the inputs of the first `index` lanes whose mask equals
 // `maskValue`. The accumulator starts with every bit set, which is the
 // identity for AND; that value is returned when no lane participates.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     // Build the all-ones value from OutType itself instead of the literal
     // 0xffffffff: the literal only covers 32 bits and needs a narrowing
     // conversion for signed types. For 32-bit int/unsigned the value is the
     // same as before.
     OutType bits = static_cast<OutType>(~static_cast<OutType>(0));
     for (size_t i = 0; i < index; ++i) {
       if (masks.at(i) == maskValue) {
         bits &= inputs.at(i);
       }
     }
     return bits;
   }
 };

 // Bitwise XOR of the inputs of the first `index` lanes whose mask equals
 // `maskValue`, starting from zero.
 template <typename InType, typename OutType>
 struct computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor> {
   OutType operator()(const std::vector<InType> &inputs,
                      const std::vector<int> &masks, int maskValue,
                      unsigned int index) {
     OutType accumulated = 0x00000000;
     for (size_t lane = 0; lane < index; ++lane) {
       if (masks.at(lane) != maskValue)
         continue;
       accumulated ^= inputs.at(lane);
     }
     return accumulated;
   }
 };

 // Mask functions used to control active lanes.
 // Each takes a thread index and returns the mask value for that thread.
 // MaskAll ignores the index and marks every thread with 1.
 static int MaskAll(int i) {
   UNREFERENCED_PARAMETER(i);
   const int everyLaneActive = 1;
   return everyLaneActive;
 }

 // Returns 1 for indices divisible by 2 and 0 for all others.
 static int MaskEveryOther(int i) {
   if (i % 2 == 0) {
     return 1;
   }
   return 0;
 }

 // Returns 1 for indices divisible by 3 and 0 for all others.
 static int MaskEveryThird(int i) {
   if (i % 3 == 0) {
     return 1;
   }
   return 0;
 }

 // Signature shared by the mask functions: thread index in, mask value out.
 using MaskFunction = int (*)(int);

 // All mask patterns the wave intrinsic tests iterate over.
 static MaskFunction MaskFunctionTable[] = {
   MaskAll,
   MaskEveryOther,
   MaskEveryThird
 };

 // Computes the expected result of a wave intrinsic on the CPU.
 //
 // The shader op name `str` is mapped to a ShaderOpKind via GetShaderOpKind and
 // the matching computeExpected specialization is invoked with the same
 // arguments.
 //
 // inputs    - per-lane input values.
 // masks     - per-lane mask values.
 // maskValue - only lanes whose mask equals this value contribute.
 // index     - number of leading lanes to take into account.
 // str       - shader op name (wide string).
 //
 // Returns the expected value; for an unknown name it asserts and returns 0.
 template <typename InType, typename OutType>
 static OutType computeExpectedWithShaderOp(const std::vector<InType> &inputs,
                                            const std::vector<int> &masks,
                                            int maskValue, unsigned int index,
                                            LPCWSTR str) {
   ShaderOpKind kind = GetShaderOpKind(str);
   switch (kind) {
   case ShaderOpKind::WaveSum:
     return computeExpected<InType, OutType, ShaderOpKind::WaveSum>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveProduct:
     return computeExpected<InType, OutType, ShaderOpKind::WaveProduct>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveMax:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMax>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveMin:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveMin>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveCountBits:
     return computeExpected<InType, OutType, ShaderOpKind::WaveCountBits>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveBitOr:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitOr>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveBitAnd:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitAnd>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveBitXor:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveBitXor>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveAnyTrue:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAnyTrue>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveAllTrue:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllTrue>()(inputs, masks, maskValue, index);
   case ShaderOpKind::WaveActiveAllEqual:
     return computeExpected<InType, OutType, ShaderOpKind::WaveActiveAllEqual>()(inputs, masks, maskValue, index);
   default:
     // `str` is a wide string while the format string is narrow, so the
     // argument has to be consumed with %ls; a plain %s would interpret the
     // wchar_t buffer as a char string.
     DXASSERT_ARGS(false, "Invalid ShaderOp Name: %ls", str);
     return (OutType) 0;
   }
 }

 // A framework for testing individual wave intrinsics tests.
 // This test case assumes that 1) WaveIsFirstLane and 2) WaveGetLaneIndex are
 // correct for all lanes.
 //
 // For every input set (Validation.InputSet1..N) and every entry of
 // MaskFunctionTable the "WaveIntrinsicsOp" shader op is run with the shader
 // text taken from the parameter table. The read-back data is grouped into
 // waves by firstLaneId, ordered by laneIndex, and every lane's output is
 // compared with the value produced by computeExpectedWithShaderOp.
 //
 // pParameterList / numParameter - table parameters describing the test.
 // isPrefix - when true the expected value of a lane only covers the lanes
 //            before it; when false it covers every lane of the wave.
 template <class T1, class T2>
 void ExecutionTest::WaveIntrinsicsActivePrefixTest(
     TableParameter *pParameterList, size_t numParameter, bool isPrefix) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   // Resource representation for compute shader
   // firstLaneId is used to group different waves
   // laneIndex is used to identify lane within the wave.
   // Lane ids are not necessarily in same order as thread ids.
   struct PerThreadData {
       unsigned firstLaneId;
       unsigned laneIndex;
       int mask;
       T1 input;
       T2 output;
   };

   unsigned int NumThreadsX = 8;
   unsigned int NumThreadsY = 12;
   unsigned int NumThreadsZ = 1;

   static const unsigned int ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
   static const unsigned int DispatchGroupCount = 1;
   static const unsigned int ThreadCount = ThreadsPerGroup * DispatchGroupCount;
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice)) {
     return;
   }
   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   TableParameterHandler handler(pParameterList, numParameter);

   unsigned int numInputSet = handler.GetTableParamByName(L"Validation.NumInputSet")->m_uint;

   // Obtain the list of input lists
   std::vector<std::vector<T1>*> InputDataList;
   for (unsigned int i = 0;
     i < numInputSet; ++i) {
     std::wstring inputName = L"Validation.InputSet";
     inputName.append(std::to_wstring(i + 1));
     InputDataList.push_back(handler.GetDataArray<T1>(inputName.data()));
   }
   CW2A Text(handler.GetTableParamByName(L"ShaderOp.text")->m_str);

   // The op name selects the reference implementation. It is the same for
   // every lane, so it is looked up once instead of once per lane.
   LPCWSTR shaderOpName = handler.GetTableParamByName(L"ShaderOp.Name")->m_str;

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   // Running compute shader for each input set with different masks
   for (size_t setIndex = 0; setIndex < numInputSet; ++setIndex) {
     for (size_t maskIndex = 0; maskIndex < sizeof(MaskFunctionTable) / sizeof(MaskFunction); ++maskIndex) {
       std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
         pDevice, m_support, "WaveIntrinsicsOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "SWaveIntrinsicsOp"));
         size_t size = sizeof(PerThreadData) * ThreadCount;
         Data.resize(size);
         PerThreadData *pPrimitives = (PerThreadData*)Data.data();
         // The input set is repeated as often as needed to fill all threads.
         size_t index = 0;
         std::vector<T1> *IntList = InputDataList[setIndex];
         while (index < ThreadCount) {
           PerThreadData *p = &pPrimitives[index];
           // Sentinel values for the fields the shader is expected to write.
           p->firstLaneId = 0xFFFFBFFF;
           p->laneIndex = 0xFFFFBFFF;
           p->mask = MaskFunctionTable[maskIndex]((int)index);
           p->input = (*IntList)[index % IntList->size()];
           p->output = 0xFFFFBFFF;
           index++;
         }
         // use shader from data table
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
       }, ShaderOpSet);

       // Check the value
       MappedData data;
       test->Test->GetReadBackData("SWaveIntrinsicsOp", &data);

       PerThreadData *pPrimitives = (PerThreadData*)data.data();
       WEX::TestExecution::DisableVerifyExceptions dve;

       // Grouping data by waves
       std::vector<int> firstLaneIds;
       for (size_t i = 0; i < ThreadCount; ++i) {
         PerThreadData *p = &pPrimitives[i];
         int firstLaneId = p->firstLaneId;
         if (!contains(firstLaneIds, firstLaneId)) {
           firstLaneIds.push_back(firstLaneId);
         }
       }

       std::map<int, std::unique_ptr<std::vector<PerThreadData *>>> waves;
       for (size_t i = 0; i < firstLaneIds.size(); ++i) {
         waves[firstLaneIds.at(i)] = std::make_unique<std::vector<PerThreadData*>>();
       }

       for (size_t i = 0; i < ThreadCount; ++i) {
         PerThreadData *p = &pPrimitives[i];
         waves[p->firstLaneId].get()->push_back(p);
       }

       // validate for each wave
       for (size_t i = 0; i < firstLaneIds.size(); ++i) {
         // collect inputs and masks for a given wave
         std::vector<PerThreadData *> *waveData = waves[firstLaneIds.at(i)].get();
         std::vector<T1> inputList(waveData->size());
         // -1 marks a lane slot that has not been filled yet.
         std::vector<int> maskList(waveData->size(), -1);
         std::vector<T2> outputList(waveData->size());
         // sort inputList and masklist by lane id. input for each lane can be computed for its group index
         for (size_t j = 0, end = waveData->size(); j < end; ++j) {
           unsigned laneID = waveData->at(j)->laneIndex;
           // ensure that each lane ID is unique and within the range
           // (laneID is unsigned, so only the upper bound needs checking)
           VERIFY_IS_TRUE(laneID < waveData->size());
           VERIFY_IS_TRUE(maskList.at(laneID) == -1);
           maskList.at(laneID) = waveData->at(j)->mask;
           inputList.at(laneID) = waveData->at(j)->input;
           outputList.at(laneID) = waveData->at(j)->output;
         }
         std::wstring inputStr = L"Wave Inputs:  ";
         std::wstring maskStr =  L"Wave Masks:   ";
         std::wstring outputStr = L"Wave Outputs: ";
         // append input string and mask string in lane id order
         for (size_t j = 0, end = waveData->size(); j < end; ++j) {
           maskStr.append(std::to_wstring(maskList.at(j)));
           maskStr.append(L" ");
           inputStr.append(std::to_wstring(inputList.at(j)));
           inputStr.append(L" ");
           outputStr.append(std::to_wstring(outputList.at(j)));
           outputStr.append(L" ");
         }

         LogCommentFmt(inputStr.data());
         LogCommentFmt(maskStr.data());
         LogCommentFmt(outputStr.data());
         LogCommentFmt(L"\n");
         // Compute expected output for a given inputs, masks, and index
         for (size_t laneIndex = 0, laneEnd = inputList.size(); laneIndex < laneEnd; ++laneIndex) {
           // A prefix op only covers the lanes before the current one; an
           // active op is the same computation over every lane of the wave.
           unsigned index = isPrefix ? (unsigned)laneIndex : (unsigned)inputList.size();
           // Lanes with mask 1 are compared against the mask-1 group, all
           // other lanes against the mask-0 group.
           const int maskValue = (maskList.at(laneIndex) == 1) ? 1 : 0;
           T2 expected = computeExpectedWithShaderOp<T1, T2>(
               inputList, maskList, maskValue, index, shaderOpName);
           // TODO: use different comparison for floating point inputs
           bool equal = outputList.at(laneIndex) == expected;
           if (!equal) {
             // laneIndex is a size_t; cast it so it matches the %d specifier.
             LogCommentFmt(L"lane%d: %4d, Expected : %4d", (int)laneIndex, outputList.at(laneIndex), expected);
           }
           VERIFY_IS_TRUE(equal);
         }
       }
     }
   }
 }

 // Version passed to IsValidWarpDllVersion by the wave intrinsic tests before
 // they run on WARP.
 static constexpr unsigned int MinWarpVersionForWaveIntrinsics = 16202u;

 // Runs the WaveActive* table tests with signed int inputs and outputs.
 TEST_F(ExecutionTest, WaveIntrinsicsActiveIntTest) {
   // Bail out when WARP is requested but its DLL fails the version check.
   const bool warpUnusable =
       GetTestParamUseWARP(true) &&
       !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics);
   if (warpUnusable)
     return;

   const size_t parameterCount = _countof(WaveIntrinsicsActiveIntParameters);
   WaveIntrinsicsActivePrefixTest<int, int>(WaveIntrinsicsActiveIntParameters,
                                            parameterCount,
                                            /*isPrefix*/ false);
 }

 // Runs the WaveActive* table tests with unsigned int inputs and outputs.
 TEST_F(ExecutionTest, WaveIntrinsicsActiveUintTest) {
   // Bail out when WARP is requested but its DLL fails the version check.
   const bool warpUnusable =
       GetTestParamUseWARP(true) &&
       !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics);
   if (warpUnusable)
     return;

   const size_t parameterCount = _countof(WaveIntrinsicsActiveUintParameters);
   WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
       WaveIntrinsicsActiveUintParameters, parameterCount,
       /*isPrefix*/ false);
 }

 // Runs the WavePrefix* table tests with signed int inputs and outputs.
 TEST_F(ExecutionTest, WaveIntrinsicsPrefixIntTest) {
   // Bail out when WARP is requested but its DLL fails the version check.
   const bool warpUnusable =
       GetTestParamUseWARP(true) &&
       !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics);
   if (warpUnusable)
     return;

   const size_t parameterCount = _countof(WaveIntrinsicsPrefixIntParameters);
   WaveIntrinsicsActivePrefixTest<int, int>(WaveIntrinsicsPrefixIntParameters,
                                            parameterCount,
                                            /*isPrefix*/ true);
 }

 // Runs the WavePrefix* table tests with unsigned int inputs and outputs.
 TEST_F(ExecutionTest, WaveIntrinsicsPrefixUintTest) {
   // Bail out when WARP is requested but its DLL fails the version check.
   const bool warpUnusable =
       GetTestParamUseWARP(true) &&
       !IsValidWarpDllVersion(MinWarpVersionForWaveIntrinsics);
   if (warpUnusable)
     return;

   const size_t parameterCount = _countof(WaveIntrinsicsPrefixUintParameters);
   WaveIntrinsicsActivePrefixTest<unsigned int, unsigned int>(
       WaveIntrinsicsPrefixUintParameters, parameterCount,
       /*isPrefix*/ true);
 }

 // Returns the value the reference accumulator starts from for the given
 // WaveMultiPrefix test name (compared case-insensitively):
 //   Product / UProduct -> 1
 //   BitAnd  / UBitAnd  -> -1 converted to T
 //   Sum, BitOr, BitXor, CountBits (and their U variants) as well as any
 //   unrecognized name -> 0
 template <typename T>
 static T GetWaveMultiPrefixInitialAccumValue(LPCWSTR testName) {
   auto isTest = [testName](LPCWSTR candidate) {
     return _wcsicmp(testName, candidate) == 0;
   };

   if (isTest(L"WaveMultiPrefixProduct") ||
       isTest(L"WaveMultiPrefixUProduct")) {
     return static_cast<T>(1);
   }

   if (isTest(L"WaveMultiPrefixBitAnd") ||
       isTest(L"WaveMultiPrefixUBitAnd")) {
     return static_cast<T>(-1);
   }

   // Every remaining case, known or unknown, starts from zero.
   return static_cast<T>(0);
 }

 // Returns the binary function used to fold lane values into the reference
 // accumulator for the given WaveMultiPrefix test name (compared
 // case-insensitively). The first argument is the accumulator, the second the
 // lane value. Unrecognized names yield a function that always returns 0.
 template <typename T>
 std::function<T(T, T)> GetWaveMultiPrefixReferenceFunction(LPCWSTR testName) {
   auto isTest = [testName](LPCWSTR candidate) {
     return _wcsicmp(testName, candidate) == 0;
   };

   if (isTest(L"WaveMultiPrefixProduct") ||
       isTest(L"WaveMultiPrefixUProduct"))
     return [] (T accum, T value) -> T { return accum * value; };

   if (isTest(L"WaveMultiPrefixSum") ||
       isTest(L"WaveMultiPrefixUSum"))
     return [] (T accum, T value) -> T { return accum + value; };

   if (isTest(L"WaveMultiPrefixBitAnd") ||
       isTest(L"WaveMultiPrefixUBitAnd"))
     return [] (T accum, T value) -> T { return accum & value; };

   if (isTest(L"WaveMultiPrefixBitOr") ||
       isTest(L"WaveMultiPrefixUBitOr"))
     return [] (T accum, T value) -> T { return accum | value; };

   if (isTest(L"WaveMultiPrefixBitXor") ||
       isTest(L"WaveMultiPrefixUBitXor"))
     return [] (T accum, T value) -> T { return accum ^ value; };

   if (isTest(L"WaveMultiPrefixCountBits") ||
       isTest(L"WaveMultiPrefixUCountBits")) {
     // For CountBits every lane contributes a boolean: the test input is a
     // zero or non-zero integer, and a non-zero input adds one to the count.
     return [] (T accum, T value) -> T { return accum + (value ? 1 : 0); };
   }

   // Unknown test: ignore both arguments.
   return [] (T, T) -> T { return 0; };
 }

 // Table-driven test for the WaveMultiPrefix* intrinsics.
 //
 // For every mask function the "WaveIntrinsicsOp" shader op is run with the
 // shader text and target taken from the parameter table. The read-back data
 // is partitioned into waves by firstLaneId, sorted by laneId, and each lane's
 // result is compared with a CPU prefix computed over the earlier lanes that
 // share the lane's key and mask.
 template <class T>
 void
 ExecutionTest::WaveIntrinsicsMultiPrefixOpTest(TableParameter *pParameterList,
                                                size_t numParameters) {
   WEX::TestExecution::SetVerifyOutput
     verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   // Layout of one element of the buffer exchanged with the shader.
   struct PerThreadData {
     uint32_t key;
     uint32_t firstLaneId;
     uint32_t laneId;
     uint32_t mask;
     T value;
     T result;
   };

   constexpr size_t NumThreadsX = 8;
   constexpr size_t NumThreadsY = 12;
   constexpr size_t NumThreadsZ = 1;
   constexpr size_t DispatchGroupSize = 1;

   constexpr size_t ThreadsPerGroup = NumThreadsX * NumThreadsY * NumThreadsZ;
   constexpr size_t ThreadCount = ThreadsPerGroup * DispatchGroupSize;

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_5)) {
     return;
   }

   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   std::shared_ptr<st::ShaderOpSet>
     ShaderOpSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   TableParameterHandler handler(pParameterList, numParameters);
   CW2A shaderSource(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);
   CW2A shaderProfile(handler.GetTableParamByName(L"ShaderOp.Target")->m_str);
   auto testName = handler.GetTableParamByName(L"ShaderOp.Name")->m_str;

   std::vector<T> *keys = handler.GetDataArray<T>(L"Validation.Keys");
   std::vector<T> *values = handler.GetDataArray<T>(L"Validation.Values");

   for (size_t maskIndex = 0; maskIndex < _countof(MaskFunctionTable); ++maskIndex) {
     // Fills the shader's buffer and swaps in the shader from the data table.
     auto initResource = [&] (LPCSTR name, std::vector<BYTE> &data, st::ShaderOp *pShaderOp) {
       UNREFERENCED_PARAMETER(name);

       data.resize(sizeof(PerThreadData) * ThreadCount);
       PerThreadData *pThreadData = reinterpret_cast<PerThreadData *>(data.data());

       for (size_t thread = 0; thread != ThreadCount; ++thread) {
         PerThreadData &entry = pThreadData[thread];
         // Keys and values repeat as often as needed to cover all threads.
         entry.key = keys->at(thread % keys->size());
         entry.value = values->at(thread % values->size());
         // Sentinels for the fields the shader is expected to overwrite.
         entry.firstLaneId = 0xdeadbeef;
         entry.laneId = 0xdeadbeef;
         entry.mask = MaskFunctionTable[maskIndex]((int)thread);
         entry.result = 0xdeadbeef;
       }

       pShaderOp->Shaders.at(0).Text = shaderSource;
       pShaderOp->Shaders.at(0).Target = shaderProfile;
     };

     std::shared_ptr<ShaderOpTestResult> test =
       RunShaderOpTestAfterParse(pDevice, m_support, "WaveIntrinsicsOp",
                                 initResource, ShaderOpSet);

     MappedData mappedData;
     test->Test->GetReadBackData("SWaveIntrinsicsOp", &mappedData);
     PerThreadData *resultData = reinterpret_cast<PerThreadData *>(mappedData.data());

     // Partition our data into waves
     std::map<uint32_t, std::vector<PerThreadData *>> waves;
     for (size_t thread = 0; thread != ThreadCount; ++thread) {
       PerThreadData *elt = &resultData[thread];

       // Basic sanity checks: the shader must have replaced the sentinels
       VERIFY_IS_TRUE(elt->firstLaneId != 0xdeadbeef);
       VERIFY_IS_TRUE(elt->laneId != 0xdeadbeef);

       waves[elt->firstLaneId].push_back(elt);
     }

     // Verify each wave
     auto refFn = GetWaveMultiPrefixReferenceFunction<T>(testName);

     for (auto &waveEntry : waves) {
       std::vector<PerThreadData *> &lanes = waveEntry.second;

       // Need to sort based on the lane id
       std::sort(lanes.begin(), lanes.end(),
                 [] (PerThreadData *a, PerThreadData *b) {
                   return a->laneId < b->laneId;
                 });

       LogCommentFmt(L"LaneId    Mask      Key       Value     Result    Expected");
       LogCommentFmt(L"--------  --------  --------  --------  --------  --------");
       const size_t laneCount = lanes.size();
       for (size_t lane = 0; lane != laneCount; ++lane) {
         PerThreadData *current = lanes[lane];

         // Compute prefix operation over each previous lane element that has
         // the same key value, and is part of the same active thread group
         T accum = GetWaveMultiPrefixInitialAccumValue<T>(testName);
         for (size_t prev = 0; prev < lane; ++prev) {
           const PerThreadData *other = lanes[prev];
           if (other->key == current->key && other->mask == current->mask) {
             accum = refFn(accum, other->value);
           }
         }

         LogCommentFmt(L"%08X  %08X  %08X  %08X  %08X  %08X", current->laneId, current->mask, current->key, current->value, current->result, accum);

         VERIFY_IS_TRUE(accum == current->result);
       }
       LogCommentFmt(L"\n");
     }
   }
 }

 // Runs the WaveMultiPrefix table tests with signed int data.
 TEST_F(ExecutionTest, WaveIntrinsicsSM65IntTest) {
   const size_t parameterCount = _countof(WaveIntrinsicsMultiPrefixIntParameters);
   WaveIntrinsicsMultiPrefixOpTest<int>(WaveIntrinsicsMultiPrefixIntParameters,
                                        parameterCount);
 }

 // Runs the WaveMultiPrefix table tests with unsigned data.
 TEST_F(ExecutionTest, WaveIntrinsicsSM65UintTest) {
   const size_t parameterCount = _countof(WaveIntrinsicsMultiPrefixUintParameters);
   WaveIntrinsicsMultiPrefixOpTest<unsigned>(WaveIntrinsicsMultiPrefixUintParameters,
                                             parameterCount);
 }

 // Uploads four 16-bit values through constant buffer "CB0", runs the
 // "CBufferTestHalf" shader op and verifies that the values read back from
 // "RTarget" convert to the same floats as the inputs.
 TEST_F(ExecutionTest, CBufferTestHalf) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   // Single operation test at the moment.
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_2)) {
     return;
   }

   if (!DoesDeviceSupportNative16bitOps(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Raw 16-bit patterns; they are decoded with ConvertFloat16ToFloat32 below.
   uint16_t InputData[] = { 0x3F80, 0x3F00, 0x3D80, 0x7BFF };

   // Copies InputData into the constant buffer when the resource is created.
   auto fillConstantBuffer = [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     UNREFERENCED_PARAMETER(pShaderOp);
     VERIFY_IS_TRUE(0 == _stricmp(Name, "CB0"));
     // use shader from data table.
     Data.resize(sizeof(InputData));
     uint16_t *pDest = (uint16_t *)Data.data();
     for (size_t element = 0; element < 4; ++element) {
       pDest[element] = InputData[element];
     }
   };

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "CBufferTestHalf",
     fillConstantBuffer);
   {
     MappedData data;
     test->Test->GetReadBackData("RTarget", &data);
     const uint16_t *pPixels = (uint16_t *)data.data();

     for (int i = 0; i < 4; ++i) {
       const uint16_t output = pPixels[i];
       const float outputFloat = ConvertFloat16ToFloat32(output);
       const float inputFloat = ConvertFloat16ToFloat32(InputData[i]);
       LogCommentFmt(L"element #%u: input = %6.8f(0x%04x), output = %6.8f(0x%04x)",
           i, inputFloat, InputData[i], outputFloat, output);
       VERIFY_ARE_EQUAL(inputFloat, outputFloat);
     }
   }
 }

 // Renders with the "Barycentrics" shader op and checks, for a handful of
 // known barycentric weights, that the pixel at the corresponding position of
 // the triangle holds those weights (within a tolerance).
 TEST_F(ExecutionTest, BarycentricsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_1)) {
     return;
   }

   if (!DoesDeviceSupportBarycentrics(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support barycentrics.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, "Barycentrics", nullptr);
   MappedData data;
   D3D12_RESOURCE_DESC &targetDesc = test->ShaderOp->GetResourceByName("RTarget")->Desc;
   UINT width = (UINT)targetDesc.Width;
   UINT height = targetDesc.Height;
   UINT pixelSize = GetByteSizeForFormat(targetDesc.Format);

   test->Test->GetReadBackData("RTarget", &data);
   const float *pPixels = (float *)data.data();

   // Get the vertex of barycentric coordinate using VBuffer
   MappedData triangleData;
   test->Test->GetReadBackData("VBuffer", &triangleData);
   const float *pTriangleData = (float*)triangleData.data();

   // Size of one vertex measured in floats: the sum over all input elements.
   unsigned triangleVertexSizeInFloat = 0;
   for (const auto &element : test->ShaderOp->InputElements) {
     triangleVertexSizeInFloat += GetByteSizeForFormat(element.Format) / 4;
   }

   // The first two floats of each vertex are read as its x/y position.
   XMFLOAT2 p0(pTriangleData[0], pTriangleData[1]);
   XMFLOAT2 p1(pTriangleData[triangleVertexSizeInFloat], pTriangleData[triangleVertexSizeInFloat + 1]);
   XMFLOAT2 p2(pTriangleData[triangleVertexSizeInFloat * 2], pTriangleData[triangleVertexSizeInFloat * 2 + 1]);

   XMFLOAT3 barycentricWeights[4] = {
     XMFLOAT3(0.3333f, 0.3333f, 0.3333f),
     XMFLOAT3(0.5f, 0.25f, 0.25f),
     XMFLOAT3(0.25f, 0.5f, 0.25f),
     XMFLOAT3(0.25f, 0.25f, 0.50f)
   };

   float tolerance = 0.001f;
   for (const XMFLOAT3 &weights : barycentricWeights) {
     float w0 = weights.x;
     float w1 = weights.y;
     float w2 = weights.z;
     // Position of the weighted point
     float x1 = w0 * p0.x + w1 * p1.x + w2 * p2.x;
     float y1 = w0 * p0.y + w1 * p1.y + w2 * p2.y;
     // map from x1 y1 to rtv pixels
     int pixelX = (int)((x1 + 1) * (width - 1) / 2);
     int pixelY = (int)((1 - y1) * (height - 1) / 2);
     int offset = pixelSize * (pixelX + pixelY * width) / sizeof(pPixels[0]);
     LogCommentFmt(L"location  %u %u, value %f, %f, %f", pixelX, pixelY, pPixels[offset], pPixels[offset + 1], pPixels[offset + 2]);
     VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset], w0, tolerance));
     VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset + 1], w1, tolerance));
     VERIFY_IS_TRUE(CompareFloatEpsilon(pPixels[offset + 2], w2, tolerance));
   }
   //SavePixelsToFile(pPixels, DXGI_FORMAT_R32G32B32A32_FLOAT, width, height, L"barycentric.bmp");
 }

 // HLSL declarations shared by the raw buffer load/store test shaders.
 // Declares scalar/vector typedefs over COMPONENT_TYPE, the TestData struct
 // (one scalar plus 2-, 3- and 4-component vectors), the UavData struct made
 // of three TestData members, four SRVs (t0-t3) and four UAVs (u0-u3),
 // alternating between byte-address and structured buffers.
 // Substituted for the first %s of the shader templates below.
 static const char RawBufferTestShaderDeclarations[] =
 "// Note: COMPONENT_TYPE and COMPONENT_SIZE will be defined via compiler option -D\r\n"
 "typedef COMPONENT_TYPE scalar; \r\n"
 "typedef vector<COMPONENT_TYPE, 2> vector2; \r\n"
 "typedef vector<COMPONENT_TYPE, 3> vector3; \r\n"
 "typedef vector<COMPONENT_TYPE, 4> vector4; \r\n"
 "\r\n"
 "struct TestData { \r\n"
 "  scalar  v1; \r\n"
 "  vector2 v2; \r\n"
 "  vector3 v3; \r\n"
 "  vector4 v4; \r\n"
 "}; \r\n"
 "\r\n"
 "struct UavData {\r\n"
 "  TestData input; \r\n"
 "  TestData output; \r\n"
 "  TestData srvOut; \r\n"
 "}; \r\n"
 "\r\n"
 "ByteAddressBuffer           srv0 : register(t0); \r\n"
 "StructuredBuffer<TestData>  srv1 : register(t1); \r\n"
 "ByteAddressBuffer           srv2 : register(t2); \r\n"
 "StructuredBuffer<TestData>  srv3 : register(t3); \r\n"
 "\r\n"
 "RWByteAddressBuffer         uav0 : register(u0); \r\n"
 "RWStructuredBuffer<UavData> uav1 : register(u1); \r\n"
 "RWByteAddressBuffer         uav2 : register(u2); \r\n"
 "RWStructuredBuffer<UavData> uav3 : register(u3); \r\n";

 // HLSL statements shared by the raw buffer load/store test shaders.
 // For each SRV/UAV pair the SRV contents are copied into the 'srvOut' part of
 // the UAV, then the 'input' part of each UAV is copied into its 'output'
 // part. Byte-address buffers use Load<T>/Store with byte offsets derived from
 // COMPONENT_SIZE (a TestData holds 10 components); structured buffers use
 // member access.
 // Substituted for the second %s of the shader templates below.
 static const char RawBufferTestShaderBody[] =
 "  // offset of 'out' in 'UavData'\r\n"
 "  const int out_offset = COMPONENT_SIZE * 10; \r\n"
 "\r\n"
 "  // offset of 'srv_out' in 'UavData'\r\n"
 "  const int srv_out_offset = COMPONENT_SIZE * 10 * 2; \r\n"
 "\r\n"
 "  // offsets within the 'Data' struct\r\n"
 "  const int v1_offset = 0; \r\n"
 "  const int v2_offset = COMPONENT_SIZE; \r\n"
 "  const int v3_offset = COMPONENT_SIZE * 3; \r\n"
 "  const int v4_offset = COMPONENT_SIZE * 6; \r\n"
 "\r\n"
 "  uav0.Store(srv_out_offset + v1_offset, srv0.Load<scalar>(v1_offset)); \r\n"
 "  uav0.Store(srv_out_offset + v2_offset, srv0.Load<vector2>(v2_offset)); \r\n"
 "  uav0.Store(srv_out_offset + v3_offset, srv0.Load<vector3>(v3_offset)); \r\n"
 "  uav0.Store(srv_out_offset + v4_offset, srv0.Load<vector4>(v4_offset)); \r\n"
 "\r\n"
 "  uav1[0].srvOut.v1 = srv1[0].v1; \r\n"
 "  uav1[0].srvOut.v2 = srv1[0].v2; \r\n"
 "  uav1[0].srvOut.v3 = srv1[0].v3; \r\n"
 "  uav1[0].srvOut.v4 = srv1[0].v4; \r\n"
 "\r\n"
 "  uav2.Store(srv_out_offset + v1_offset, srv2.Load<scalar>(v1_offset)); \r\n"
 "  uav2.Store(srv_out_offset + v2_offset, srv2.Load<vector2>(v2_offset)); \r\n"
 "  uav2.Store(srv_out_offset + v3_offset, srv2.Load<vector3>(v3_offset)); \r\n"
 "  uav2.Store(srv_out_offset + v4_offset, srv2.Load<vector4>(v4_offset)); \r\n"
 "\r\n"
 "  uav3[0].srvOut.v1 = srv3[0].v1; \r\n"
 "  uav3[0].srvOut.v2 = srv3[0].v2; \r\n"
 "  uav3[0].srvOut.v3 = srv3[0].v3; \r\n"
 "  uav3[0].srvOut.v4 = srv3[0].v4; \r\n"
 "\r\n"
 "  uav0.Store(out_offset + v1_offset, uav0.Load<scalar>(v1_offset)); \r\n"
 "  uav0.Store(out_offset + v2_offset, uav0.Load<vector2>(v2_offset)); \r\n"
 "  uav0.Store(out_offset + v3_offset, uav0.Load<vector3>(v3_offset)); \r\n"
 "  uav0.Store(out_offset + v4_offset, uav0.Load<vector4>(v4_offset)); \r\n"
 "\r\n"
 "  uav1[0].output.v1 = uav1[0].input.v1; \r\n"
 "  uav1[0].output.v2 = uav1[0].input.v2; \r\n"
 "  uav1[0].output.v3 = uav1[0].input.v3; \r\n"
 "  uav1[0].output.v4 = uav1[0].input.v4; \r\n"
 "\r\n"
 "  uav2.Store(out_offset + v1_offset, uav2.Load<scalar>(v1_offset)); \r\n"
 "  uav2.Store(out_offset + v2_offset, uav2.Load<vector2>(v2_offset)); \r\n"
 "  uav2.Store(out_offset + v3_offset, uav2.Load<vector3>(v3_offset)); \r\n"
 "  uav2.Store(out_offset + v4_offset, uav2.Load<vector4>(v4_offset)); \r\n"
 "\r\n"
 "  uav3[0].output.v1 = uav3[0].input.v1; \r\n"
 "  uav3[0].output.v2 = uav3[0].input.v2; \r\n"
 "  uav3[0].output.v3 = uav3[0].input.v3; \r\n"
 "  uav3[0].output.v4 = uav3[0].input.v4; \r\n";


 // printf-style template for the compute shader variant of the raw buffer
 // load/store test: a single-thread group whose main() consists of the shared
 // body. Takes two %s arguments, in order: the declarations and the body.
 static const char RawBufferTestComputeShaderTemplate[] =
 "%s\r\n" // <- RawBufferTestShaderDeclarations
 "[numthreads(1, 1, 1)]\r\n"
 "void main(uint GI : SV_GroupIndex) {\r\n"
 "%s\r\n" // <- RawBufferTestShaderBody
 "};";

 // printf-style template for the pixel shader variant of the raw buffer
 // load/store test. The shared body only runs for the pixel whose position
 // satisfies x + y == 1.0 (pixel {0.5, 0.5}); every pixel returns
 // uint4(1, 2, 3, 4). Takes two %s arguments, in order: the declarations and
 // the body.
 static const char RawBufferTestGraphicsPixelShaderTemplate[] =
 "%s\r\n" // <- RawBufferTestShaderDeclarations
 "struct PSInput { \r\n"
 "  float4 pos : SV_POSITION; \r\n"
 "}; \r\n"
 "uint4 main(PSInput input) : SV_TARGET{ \r\n"
 "  if (input.pos.x + input.pos.y == 1.0f) { // pixel { 0.5, 0.5, 0 } \r\n"
 "%s\r\n" // <- RawBufferTestShaderBody
 "  } \r\n"
 "  return uint4(1, 2, 3, 4); \r\n"
 "};";

 // Compute-shader raw buffer load/store test with 32-bit signed integers.
 TEST_F(ExecutionTest, ComputeRawBufferLdStI32) {
   // Groups of 1, 2, 3 and 4 values.
   RawBufferLdStTestData<int32_t> testValues = {
     { 1 },
     { 2, -1 },
     { 256, -10517, 980 },
     { 465, 13, -89, MAXUINT32 / 2 }
   };
   RunComputeRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2,
                                        RawBufferLdStType::I32,
                                        "ComputeRawBufferLdSt32Bit",
                                        testValues);
 }

 // Compute-shader raw buffer load/store test with 32-bit floats.
 TEST_F(ExecutionTest, ComputeRawBufferLdStFloat) {
   // Groups of 1, 2, 3 and 4 values.
   RawBufferLdStTestData<float> testValues = {
     { 3e-10f },
     { 1.5f, -1.99988f },
     { 256.0f, -105.17f, 980.0f },
     { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f }
   };
   RunComputeRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2,
                                      RawBufferLdStType::Float,
                                      "ComputeRawBufferLdSt32Bit",
                                      testValues);
 }

 // Compute-shader raw buffer load/store test with 64-bit signed integers.
 TEST_F(ExecutionTest, ComputeRawBufferLdStI64) {
   // Groups of 1, 2, 3 and 4 values.
   RawBufferLdStTestData<int64_t> testValues = {
     { 1 },
     { 2, -1 },
     { 256, -105171532, 980 },
     { 465, 13, -89, MAXUINT64 / 2 }
   };
   RunComputeRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3,
                                        RawBufferLdStType::I64,
                                        "ComputeRawBufferLdSt64Bit",
                                        testValues);
 }

 // Compute-shader raw buffer load/store round trip with 64-bit doubles.
 TEST_F(ExecutionTest,  ComputeRawBufferLdStDouble)  {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<double> data = { { 3e-10 }, { 1.5, -1.99988 }, { 256.0, -105.17, 980.0 }, { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 } };
   // The data type must be Double (not I64) so that the shader is compiled with
   // COMPONENT_TYPE=double and the test is gated on double support rather than
   // int64 support, matching GraphicsRawBufferLdStDouble.
   RunComputeRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double, "ComputeRawBufferLdSt64Bit", data);
 }

 // Compute-shader raw buffer load/store round trip with 16-bit signed integers.
 TEST_F(ExecutionTest, ComputeRawBufferLdStI16) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<int16_t> values = {
     { 1 },
     { 2, -1 },
     { 256, -10517, 980 },
     { 465, 13, -89, MAXUINT16 / 2 }
   };
   RunComputeRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16,
                                        "ComputeRawBufferLdSt16Bit", values);
 }

 // Compute-shader raw buffer load/store round trip with 16-bit floats. The
 // inputs are authored as 32-bit floats and converted element by element to
 // half-precision bit patterns stored in uint16_t.
 TEST_F(ExecutionTest,  ComputeRawBufferLdStHalf)  {
   RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 980.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
   RawBufferLdStTestData<uint16_t> halfData;

   // Both structs are walked as flat arrays below, so they must hold the same
   // number of elements or the loop would run off the end of halfData.
   static_assert(sizeof(RawBufferLdStTestData<float>) / sizeof(float) ==
                 sizeof(RawBufferLdStTestData<uint16_t>) / sizeof(uint16_t),
                 "float and half test data must contain the same number of elements");
   const size_t elementCount = sizeof(floatData) / sizeof(float);

   const float *pFloats = reinterpret_cast<const float *>(&floatData);
   uint16_t *pHalves = reinterpret_cast<uint16_t *>(&halfData);
   // size_t index avoids the signed/unsigned comparison against the sizeof-derived count.
   for (size_t i = 0; i < elementCount; i++) {
     pHalves[i] = ConvertFloat32ToFloat16(pFloats[i]);
   }
   RunComputeRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "ComputeRawBufferLdSt16Bit", halfData);
 }

 // Pixel-shader raw buffer load/store round trip with 32-bit signed integers.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStI32) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<int32_t> values = {
     { 1 },
     { 2, -1 },
     { 256, -10517, 980 },
     { 465, 13, -89, MAXUINT32 / 2 }
   };
   RunGraphicsRawBufferLdStTest<int32_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I32,
                                         "GraphicsRawBufferLdSt32Bit", values);
 }

 // Pixel-shader raw buffer load/store round trip with 32-bit floats.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStFloat) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<float> values = {
     { 3e-10f },
     { 1.5f, -1.99988f },
     { 256.0f, -105.17f, 980.0f },
     { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f }
   };
   RunGraphicsRawBufferLdStTest<float>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Float,
                                       "GraphicsRawBufferLdSt32Bit", values);
 }

 // Pixel-shader raw buffer load/store round trip with 64-bit signed integers.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStI64) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<int64_t> values = {
     { 1 },
     { 2, -1 },
     { 256, -105171532, 980 },
     { 465, 13, -89, MAXUINT64 / 2 }
   };
   RunGraphicsRawBufferLdStTest<int64_t>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::I64,
                                         "GraphicsRawBufferLdSt64Bit", values);
 }

 // Pixel-shader raw buffer load/store round trip with 64-bit doubles.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStDouble) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<double> values = {
     { 3e-10 },
     { 1.5, -1.99988 },
     { 256.0, -105.17, 980.0 },
     { 465.1652, -1.5694e2, -0.8543e-2, 1333.5 }
   };
   RunGraphicsRawBufferLdStTest<double>(D3D_SHADER_MODEL_6_3, RawBufferLdStType::Double,
                                        "GraphicsRawBufferLdSt64Bit", values);
 }

 // Pixel-shader raw buffer load/store round trip with 16-bit signed integers.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStI16) {
   // One scalar followed by 2-, 3- and 4-component vectors.
   RawBufferLdStTestData<int16_t> values = {
     { 1 },
     { 2, -1 },
     { 256, -10517, 980 },
     { 465, 13, -89, MAXUINT16 / 2 }
   };
   RunGraphicsRawBufferLdStTest<int16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::I16,
                                         "GraphicsRawBufferLdSt16Bit", values);
 }

 // Pixel-shader raw buffer load/store round trip with 16-bit floats. The
 // inputs are authored as 32-bit floats and converted element by element to
 // half-precision bit patterns stored in uint16_t.
 TEST_F(ExecutionTest, GraphicsRawBufferLdStHalf) {
   RawBufferLdStTestData<float> floatData = { { 3e-10f }, { 1.5f, -1.99988f }, { 256.0f, 105.17f, 0.0f }, { 465.1652f, -1.5694e2f, -0.8543e-2f, 1333.5f } };
   RawBufferLdStTestData<uint16_t> halfData;

   // Both structs are walked as flat arrays below, so they must hold the same
   // number of elements or the loop would run off the end of halfData.
   static_assert(sizeof(RawBufferLdStTestData<float>) / sizeof(float) ==
                 sizeof(RawBufferLdStTestData<uint16_t>) / sizeof(uint16_t),
                 "float and half test data must contain the same number of elements");
   const size_t elementCount = sizeof(floatData) / sizeof(float);

   const float *pFloats = reinterpret_cast<const float *>(&floatData);
   uint16_t *pHalves = reinterpret_cast<uint16_t *>(&halfData);
   // size_t index avoids the signed/unsigned comparison against the sizeof-derived count.
   for (size_t i = 0; i < elementCount; i++) {
     pHalves[i] = ConvertFloat32ToFloat16(pFloats[i]);
   }
   RunGraphicsRawBufferLdStTest<uint16_t>(D3D_SHADER_MODEL_6_2, RawBufferLdStType::Half, "GraphicsRawBufferLdSt16Bit", halfData);
 }

 // Shared setup for the raw buffer load/store tests.
 //
 // Creates a device for <shaderModel>, checks that the device supports the
 // operations required by <dataType>, and selects the HLSL component type name
 // and any extra compiler options for it. On success the shader op
 // configuration (ShaderOpArith.xml) is read into <pStream>.
 //
 // Outputs:
 //   pDevice           - receives the created device.
 //   pStream           - receives the shader op configuration stream.
 //   sTy               - receives the HLSL type name (e.g. "int64_t", "half").
 //   additionalOptions - receives extra compiler flags ("" when none).
 //
 // Returns false when the device could not be created, when the device lacks
 // the required feature (the test is then marked Skipped), or when <dataType>
 // is not a known RawBufferLdStType (an error is logged). Callers must not run
 // the test in that case.
 bool ExecutionTest::SetupRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                                            CComPtr<ID3D12Device> &pDevice, CComPtr<IStream> &pStream,
                                            char *&sTy, char *&additionalOptions) {
   if (!CreateDevice(&pDevice, shaderModel)) {
     return false;
   }

   additionalOptions = "";

   switch (dataType) {
   case RawBufferLdStType::I64:
     if (!DoesDeviceSupportInt64(pDevice)) {
       WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return false;
     }
     sTy = "int64_t";
     break;
   case RawBufferLdStType::Double:
     if (!DoesDeviceSupportDouble(pDevice)) {
       WEX::Logging::Log::Comment(L"Device does not support double operations.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return false;
     }
     sTy = "double";
     break;
   case RawBufferLdStType::I16:
   case RawBufferLdStType::Half:
     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
       WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return false;
     }
     // 16-bit component types are only available with this compiler option.
     additionalOptions = "-enable-16bit-types";
     sTy = (dataType == RawBufferLdStType::I16 ? "int16_t" : "half");
     break;
   case RawBufferLdStType::I32:
     sTy = "int32_t";
     break;
   case RawBufferLdStType::Float:
     sTy = "float";
     break;
   default:
     // A bare string literal is always "true", so the condition must be forced
     // false for the assertion to fire. Bail out as well: sTy was never
     // assigned here and callers feed it to a "%s" format.
     DXASSERT_NOMSG(false && "Invalid RawBufferLdStType");
     WEX::Logging::Log::Error(L"Invalid RawBufferLdStType");
     return false;
   }

   // read shader config
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   return true;
 }

 // Reads back UAVBuffer0..UAVBuffer3 from <test> and checks that both the
 // `output` and `srvOut` sections of each buffer hold the scalar and the 2-, 3-
 // and 4-component vectors from <testData>.
 //
 //   test     - the executed shader op test to read results from.
 //   testData - the values that were written as shader input and are expected
 //              to come back unchanged.
 template <class Ty>
 void ExecutionTest::VerifyRawBufferLdStTestResults(const std::shared_ptr<st::ShaderOpTest> test, const RawBufferLdStTestData<Ty> &testData) {
   // read buffers back & verify expected values
   static const int UavBufferCount = 4;
   // The trailing 'X' is a placeholder overwritten with the buffer index below.
   char bufferName[11] = "UAVBufferX";

   for (unsigned i = 0; i < UavBufferCount; i++) {
     MappedData dataUav;
     RawBufferLdStUavData<Ty> *pOutData;

     // sizeof - 2 addresses the last character before the terminating NUL.
     bufferName[sizeof(bufferName) - 2] = (char)(i + '0');

     test->GetReadBackData(bufferName, &dataUav);
     // The read-back buffer must be exactly one RawBufferLdStUavData<Ty>.
     VERIFY_ARE_EQUAL(sizeof(RawBufferLdStUavData<Ty>), dataUav.size());
     pOutData = (RawBufferLdStUavData<Ty> *)dataUav.data();

     LogCommentFmt(L"Verifying UAVBuffer%d Load -> UAVBuffer%d Store", i, i);
     // scalar
     VERIFY_ARE_EQUAL(pOutData->output.v1, testData.v1);
     // vector 2
     VERIFY_ARE_EQUAL(pOutData->output.v2[0], testData.v2[0]);
     VERIFY_ARE_EQUAL(pOutData->output.v2[1], testData.v2[1]);
     // vector 3
     VERIFY_ARE_EQUAL(pOutData->output.v3[0], testData.v3[0]);
     VERIFY_ARE_EQUAL(pOutData->output.v3[1], testData.v3[1]);
     VERIFY_ARE_EQUAL(pOutData->output.v3[2], testData.v3[2]);
     // vector 4
     VERIFY_ARE_EQUAL(pOutData->output.v4[0], testData.v4[0]);
     VERIFY_ARE_EQUAL(pOutData->output.v4[1], testData.v4[1]);
     VERIFY_ARE_EQUAL(pOutData->output.v4[2], testData.v4[2]);
     VERIFY_ARE_EQUAL(pOutData->output.v4[3], testData.v4[3]);

     // verify SRV Store
     LogCommentFmt(L"Verifying SRVBuffer%d Load -> UAVBuffer%d Store", i, i);
     // scalar
     VERIFY_ARE_EQUAL(pOutData->srvOut.v1, testData.v1);
     // vector 2
     VERIFY_ARE_EQUAL(pOutData->srvOut.v2[0], testData.v2[0]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v2[1], testData.v2[1]);
     // vector 3
     VERIFY_ARE_EQUAL(pOutData->srvOut.v3[0], testData.v3[0]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v3[1], testData.v3[1]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v3[2], testData.v3[2]);
     // vector 4
     VERIFY_ARE_EQUAL(pOutData->srvOut.v4[0], testData.v4[0]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v4[1], testData.v4[1]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v4[2], testData.v4[2]);
     VERIFY_ARE_EQUAL(pOutData->srvOut.v4[3], testData.v4[3]);
   }
 }

 // Runs the raw buffer load/store test in a compute shader.
 //
 //   shaderModel  - shader model the device must support.
 //   dataType     - component type under test; selects the HLSL type name and
 //                  compiler options via SetupRawBufferLdStTest.
 //   shaderOpName - name of the shader op to run from ShaderOpArith.xml.
 //   testData     - values copied into every SRV/UAV buffer and expected back.
 //
 // Returns without running anything when SetupRawBufferLdStTest reports
 // failure.
 template <class Ty>
 void ExecutionTest::RunComputeRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                                                 const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {
    WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

    CComPtr<ID3D12Device> pDevice;
    CComPtr<IStream> pStream;
    char *sTy = nullptr, *additionalOptions = nullptr;

    if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
      return;
    }

    // format shader source
    // The buffer is sized as the sum of the template and both substituted
    // strings, which bounds the formatted length.
    char rawBufferTestShaderText[sizeof(RawBufferTestComputeShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
    VERIFY_IS_TRUE(sprintf_s(rawBufferTestShaderText, sizeof(rawBufferTestShaderText),
                             RawBufferTestComputeShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);

    // format compiler args
    // COMPONENT_TYPE / COMPONENT_SIZE are passed to the shader as defines.
    char compilerOptions[256];
    VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);

    // run the shader
    std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
      [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
      // Only resources named SRVBuffer0..3 or UAVBuffer0..3 are expected here.
      VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
                     (Name[9] >= '0' && Name[9] <= '3'));
      // Override the shader op's compiler arguments and source text.
      pShaderOp->Shaders.at(0).Arguments = compilerOptions;
      pShaderOp->Shaders.at(0).Text = rawBufferTestShaderText;

      // Seed the start of the resource with the test input values.
      VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
      RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*)Data.data();
      memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
    });

    // verify expected values
    VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
 }

 // Runs the raw buffer load/store test in a pixel shader.
 //
 //   shaderModel  - shader model the device must support.
 //   dataType     - component type under test; selects the HLSL type name and
 //                  compiler options via SetupRawBufferLdStTest.
 //   shaderOpName - name of the shader op to run from ShaderOpArith.xml.
 //   testData     - values copied into every SRV/UAV buffer and expected back.
 //
 // Returns without running anything when SetupRawBufferLdStTest reports
 // failure.
 template <class Ty>
 void ExecutionTest::RunGraphicsRawBufferLdStTest(D3D_SHADER_MODEL shaderModel, RawBufferLdStType dataType,
                                                  const char *shaderOpName, const RawBufferLdStTestData<Ty> &testData) {

   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<ID3D12Device> pDevice;
   CComPtr<IStream> pStream;
   char *sTy = nullptr, *additionalOptions = nullptr;

   if (!SetupRawBufferLdStTest(shaderModel, dataType, pDevice, pStream, sTy, additionalOptions)) {
     return;
   }

   // format shader source
   // The buffer is sized as the sum of the template and both substituted
   // strings, which bounds the formatted length.
   char rawBufferTestPixelShaderText[sizeof(RawBufferTestGraphicsPixelShaderTemplate) + sizeof(RawBufferTestShaderDeclarations) + sizeof(RawBufferTestShaderBody)];
   VERIFY_IS_TRUE(sprintf_s(rawBufferTestPixelShaderText, sizeof(rawBufferTestPixelShaderText),
                            RawBufferTestGraphicsPixelShaderTemplate, RawBufferTestShaderDeclarations, RawBufferTestShaderBody) != -1);

   // format compiler args
   // COMPONENT_TYPE / COMPONENT_SIZE are passed to the shader as defines.
   char compilerOptions[256];
   VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D COMPONENT_TYPE=%s -D COMPONENT_SIZE=%d %s", sTy, (int)sizeof(Ty), additionalOptions) != -1);

   // run the shader
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(pDevice, m_support, pStream, shaderOpName,
     [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
     // Only resources named SRVBuffer0..3 or UAVBuffer0..3 are expected here.
     VERIFY_IS_TRUE(((0 == strncmp(Name, "SRVBuffer", 9)) || (0 == strncmp(Name, "UAVBuffer", 9))) &&
       (Name[9] >= '0' && Name[9] <= '3'));
     // pixel shader is at index 1, vertex shader at index 0
     pShaderOp->Shaders.at(1).Arguments = compilerOptions;
     pShaderOp->Shaders.at(1).Text = rawBufferTestPixelShaderText;

     // Seed the start of the resource with the test input values.
     VERIFY_IS_TRUE(sizeof(RawBufferLdStTestData<Ty>) <= Data.size());
     RawBufferLdStTestData<Ty> *pInData = (RawBufferLdStTestData<Ty>*)Data.data();
     memcpy(pInData, &testData, sizeof(RawBufferLdStTestData<Ty>));
   });

   // verify expected values
   VerifyRawBufferLdStTestResults<Ty>(test->Test, testData);
 }

 // Packs the low 8 bits of each of the four lanes into one 32-bit value.
 // Lane 0 lands in bits 0-7, lane 1 in bits 8-15, and so on; any higher bits
 // of a lane are discarded (no clamping).
 template<typename T>
 uint32_t pack(std::array<T, 4> unpackedVals)
 {
     constexpr uint32_t lowByteMask = 0xFF;

     uint32_t packed = 0;
     uint32_t shift = 0;
     for (const T &lane : unpackedVals)
     {
         packed |= (lane & lowByteMask) << shift;
         shift += 8;
     }

     return packed;
 }

 // Clamps each of the four lanes to the uint8_t range [0, 255] and packs the
 // results into one 32-bit value, lane 0 in bits 0-7 through lane 3 in bits
 // 24-31.
 template <typename T>
 uint32_t pack_clamp_u8(std::array<T, 4> unpackedVals)
 {
     const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
     const int32_t clamp_max = std::numeric_limits<uint8_t>::max();

     uint32_t dst = 0;
     for (uint32_t i = 0U; i < 4U; ++i)
     {
         const int32_t clamped = std::min(std::max((int32_t)unpackedVals[i], clamp_min), clamp_max);
         // Widen to uint32_t before shifting: a uint8_t operand would be
         // promoted to signed int, and shifting a value >= 0x80 left by 24
         // would then move a bit into the sign bit.
         dst |= static_cast<uint32_t>(static_cast<uint8_t>(clamped)) << (i * 8);
     }

     return dst;
 }

 // Clamps each of the four lanes to the int8_t range [-128, 127] and packs the
 // two's-complement byte of each result into one 32-bit value, lane 0 in bits
 // 0-7 through lane 3 in bits 24-31.
 template <typename T>
 uint32_t pack_clamp_s8(std::array<T, 4> unpackedVals)
 {
     const int32_t clamp_min = std::numeric_limits<int8_t>::min();
     const int32_t clamp_max = std::numeric_limits<int8_t>::max();

     uint32_t dst = 0;
     for (uint32_t i = 0U; i < 4U; ++i)
     {
         const int32_t clamped = std::min(std::max((int32_t)unpackedVals[i], clamp_min), clamp_max);
         // Widen to uint32_t before shifting: a uint8_t operand would be
         // promoted to signed int, and shifting a value >= 0x80 (any negative
         // lane) left by 24 would then move a bit into the sign bit.
         dst |= static_cast<uint32_t>(static_cast<uint8_t>(clamped)) << (i * 8);
     }

     return dst;
 }

 // Splits a packed 32-bit value into its four bytes, zero-extending each byte
 // into a lane of type T. Lane 0 comes from bits 0-7, lane 3 from bits 24-31.
 template<typename T>
 std::array<T, 4> unpack_u(uint32_t packedVal)
 {
     std::array<T, 4> lanes;
     for (uint32_t lane = 0; lane < 4; ++lane)
     {
         const uint32_t byteBits = (packedVal >> (lane * 8)) & 0xFFu;
         lanes[lane] = static_cast<uint8_t>(byteBits);
     }

     return lanes;
 }

 // Splits a packed 32-bit value into its four bytes, interpreting each byte as
 // an int8_t and sign-extending it into a lane of type T. Lane 0 comes from
 // bits 0-7, lane 3 from bits 24-31.
 template<typename T>
 std::array<T, 4> unpack_s(uint32_t packedVal)
 {
     std::array<T, 4> lanes;
     for (uint32_t lane = 0; lane < 4; ++lane)
     {
         const uint32_t byteBits = (packedVal >> (lane * 8)) & 0xFFu;
         lanes[lane] = static_cast<int8_t>(byteBits);
     }

     return lanes;
 }


 // Runs the "PackUnpackOp" shader op and compares its packed and unpacked
 // outputs against reference values computed on the CPU with pack,
 // pack_clamp_u8/s8 and unpack_u/s. The input values come from the
 // PackUnpackOpParameters table and are consumed in groups of four.
 TEST_F(ExecutionTest, PackUnpackTest) {
     WEX::TestExecution::SetVerifyOutput verifySettings(
         WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
     CComPtr<IStream> pStream;
     ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

     CComPtr<ID3D12Device> pDevice;

     // With PACKUNPACK_PLACEHOLDER defined the shader is built for cs_6_2 on a
     // default device; otherwise it targets cs_6_6 and needs a 6.6 device.
 #ifdef PACKUNPACK_PLACEHOLDER
     string args = "-enable-16bit-types -DPACKUNPACK_PLACEHOLDER";
     string target = "cs_6_2";

     if (!CreateDevice(&pDevice)) {
         return;
     }
 #else
     string args = "-enable-16bit-types";
     string target = "cs_6_6";

     if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
         return;
     }
 #endif

     if (!DoesDeviceSupportNative16bitOps(pDevice)) {
         WEX::Logging::Log::Comment(L"Device does not support native 16-bit operations.");
         WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
         return;
     }

     int tableSize = sizeof(PackUnpackOpParameters) / sizeof(TableParameter);
     TableParameterHandler handler(PackUnpackOpParameters, tableSize);

     // Shader source text from the parameter table, converted to narrow chars.
     CW2A Text(handler.GetTableParamByName(L"ShaderOp.Text")->m_str);

     std::vector<uint32_t> *validation_input = &handler.GetTableParamByName(L"Validation.Input")->m_uint32Table;
     uint32_t validation_tolerance = handler.GetTableParamByName(L"Validation.Tolerance")->m_uint;

     // One expected packed/unpacked record per group of four input values.
     size_t count = validation_input->size();
     std::vector<SPackUnpackOpOutPacked> expectedPacked(count / 4);
     std::vector<SPackUnpackOpOutUnpacked> expectedUnpacked(count / 4);

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTest(
         pDevice, m_support, pStream, "PackUnpackOp",
         // this callback is called when the test
         // is creating the resource to run the test
         [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {

         if (0 == _stricmp(Name, "g_bufIn"))
         {
             // NOTE(review): the buffer is resized to hold 4 * count uint32
             // values while the loop below writes only count of them - confirm
             // that the extra space is intended.
             size_t size = sizeof(uint32_t) * 4 * count;
             Data.resize(size);
             uint32_t *pPrimitives = (uint32_t*)Data.data();

             for (size_t i = 0; i < count / 4; ++i) {
                 uint32_t *p = &pPrimitives[i * 4];
                 uint32_t x = (*validation_input)[i * 4 + 0];
                 uint32_t y = (*validation_input)[i * 4 + 1];
                 uint32_t z = (*validation_input)[i * 4 + 2];
                 uint32_t w = (*validation_input)[i * 4 + 3];

                 p[0] = x;
                 p[1] = y;
                 p[2] = z;
                 p[3] = w;

                 // The same four inputs reinterpreted at each tested width and signedness.
                 std::array<uint32_t, 4> inputUint32 = { x, y, z, w };
                 std::array<int32_t, 4> inputInt32 = { (int32_t)x, (int32_t)y, (int32_t)z, (int32_t)w };
                 std::array<uint16_t, 4> inputUint16 = { (uint16_t)x, (uint16_t)y, (uint16_t)z, (uint16_t)w };
                 std::array<int16_t, 4> inputInt16 = { (int16_t)x, (int16_t)y, (int16_t)z, (int16_t)w };

                 // Pack unclamped
                 expectedPacked[i].packedUint32 = pack(inputUint32);
                 expectedPacked[i].packedInt32 = pack(inputInt32);
                 expectedPacked[i].packedUint16 = pack(inputUint16);
                 expectedPacked[i].packedInt16 = pack(inputInt16);
                 // pack clamped
                 // Both clamped variants take the signed inputs.
                 expectedPacked[i].packedClampedUint32 = pack_clamp_u8(inputInt32);
                 expectedPacked[i].packedClampedInt32 = pack_clamp_s8(inputInt32);
                 expectedPacked[i].packedClampedUint16 = pack_clamp_u8(inputInt16);
                 expectedPacked[i].packedClampedInt16 = pack_clamp_s8(inputInt16);

                 // unpack
                 // Expected unpacked values are derived from the expected packed values above.
                 expectedUnpacked[i].outputUint32 = unpack_u<uint32_t>(expectedPacked[i].packedUint32);
                 expectedUnpacked[i].outputInt32  = unpack_s<int32_t >(expectedPacked[i].packedInt32 );
                 expectedUnpacked[i].outputUint16 = unpack_u<uint16_t>(expectedPacked[i].packedUint16);
                 expectedUnpacked[i].outputInt16  = unpack_s<int16_t >(expectedPacked[i].packedInt16 );
                 expectedUnpacked[i].outputClampedUint32 = unpack_u<uint32_t>(expectedPacked[i].packedClampedUint32);
                 expectedUnpacked[i].outputClampedInt32  = unpack_s<int32_t >(expectedPacked[i].packedClampedInt32 );
                 expectedUnpacked[i].outputClampedUint16 = unpack_u<uint16_t>(expectedPacked[i].packedClampedUint16);
                 expectedUnpacked[i].outputClampedInt16  = unpack_s<int16_t >(expectedPacked[i].packedClampedInt16 );
             }
         }
         else
         {
             // Every other resource starts zero-filled.
             std::fill(Data.begin(), Data.end(), (BYTE)0);
         }

         // use shader from data table
         pShaderOp->Shaders.at(0).Target = target.c_str();
         pShaderOp->Shaders.at(0).Text = Text.m_psz;
         pShaderOp->Shaders.at(0).Arguments = args.c_str();
     });

     MappedData packedData;
     test->Test->GetReadBackData("g_bufOutPacked", &packedData);
     SPackUnpackOpOutPacked *readBackPacked = (SPackUnpackOpOutPacked *)packedData.data();

     MappedData unpackedData;
     test->Test->GetReadBackData("g_bufOutPackedUnpacked", &unpackedData);
     SPackUnpackOpOutUnpacked *readBackUnpacked = (SPackUnpackOpOutUnpacked *)unpackedData.data();

     // Compare every packed field, then every lane of every unpacked field.
     for (size_t i = 0; i < count / 4; ++i)
     {
         VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedUint32, expectedPacked[i].packedUint32, validation_tolerance);
         VerifyOutputWithExpectedValueInt (readBackPacked[i].packedInt32 , expectedPacked[i].packedInt32 , validation_tolerance);
         VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedUint16, expectedPacked[i].packedUint16, validation_tolerance);
         VerifyOutputWithExpectedValueInt (readBackPacked[i].packedInt16 , expectedPacked[i].packedInt16 , validation_tolerance);
         VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedClampedUint32, expectedPacked[i].packedClampedUint32, validation_tolerance);
         VerifyOutputWithExpectedValueInt (readBackPacked[i].packedClampedInt32 , expectedPacked[i].packedClampedInt32 , validation_tolerance);
         VerifyOutputWithExpectedValueUInt(readBackPacked[i].packedClampedUint16, expectedPacked[i].packedClampedUint16, validation_tolerance);
         VerifyOutputWithExpectedValueInt (readBackPacked[i].packedClampedInt16 , expectedPacked[i].packedClampedInt16 , validation_tolerance);

         for (uint32_t j = 0; j < 4; ++j)
         {
             VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputUint32[j], expectedUnpacked[i].outputUint32[j], validation_tolerance);
             VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputInt32 [j], expectedUnpacked[i].outputInt32 [j], validation_tolerance);
             VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputUint16[j], expectedUnpacked[i].outputUint16[j], validation_tolerance);
             VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputInt16 [j], expectedUnpacked[i].outputInt16 [j], validation_tolerance);
             VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputClampedUint32[j], expectedUnpacked[i].outputClampedUint32[j], validation_tolerance);
             VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputClampedInt32 [j], expectedUnpacked[i].outputClampedInt32 [j], validation_tolerance);
             VerifyOutputWithExpectedValueUInt(readBackUnpacked[i].outputClampedUint16[j], expectedUnpacked[i].outputClampedUint16[j], validation_tolerance);
             VerifyOutputWithExpectedValueInt (readBackUnpacked[i].outputClampedInt16 [j], expectedUnpacked[i].outputClampedInt16 [j], validation_tolerance);
         }
     }
 }


 // This test expects a <pShader> that retrieves a signal value from each of a few
 // resources that are initialized here. <isDynamic> determines if it uses the
 // 6.6 Dynamic Resources feature.
 // Values are read back from the result UAV and compared to the expected signals
 //
 //   pDevice   - device used to create every object in the test.
 //   pShader   - HLSL source of the compute shader to run.
 //   sm        - shader target profile passed to CreateComputePSO.
 //   isDynamic - when false, a root signature with one descriptor range per
 //               resource/sampler is built and descriptor tables are bound;
 //               when true, only the "heap directly indexed" root signature
 //               flags are set and no tables are bound.
 void ExecutionTest::RunResourceTest(ID3D12Device *pDevice, const char *pShader,
                                     const wchar_t *sm, bool isDynamic) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   const int NumSRVs = 3;
   const int NumUAVs = 4;
   const int NumResources = NumSRVs + NumUAVs;
   const int NumSamplers = 2;
   // Number of float elements in every test resource.
   const int valueSize = 16;

   static const int DispatchGroupX = 1;
   static const int DispatchGroupY = 1;
   static const int DispatchGroupZ = 1;

   CComPtr<ID3D12GraphicsCommandList> pCommandList;
   CComPtr<ID3D12CommandQueue> pCommandQueue;
   CComPtr<ID3D12CommandAllocator> pCommandAllocator;
   FenceObj FO;

   UINT valueSizeInBytes = valueSize * sizeof(float);
   CreateComputeCommandQueue(pDevice, L"DynamicResourcesTest Command Queue", &pCommandQueue);
   InitFenceObj(pDevice, &FO);

   // Create root signature.
   CComPtr<ID3D12RootSignature> pRootSignature;
   if (!isDynamic) {
     // Not dynamic, create a range for each resource and from them, the root signature
     CD3DX12_DESCRIPTOR_RANGE ranges[NumResources];
     CD3DX12_DESCRIPTOR_RANGE srange[NumSamplers];
     // SRVs occupy ranges [0, NumSRVs) at registers t0..
     for (int i = 0; i < NumSRVs; i++)
       ranges[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, i, 0);

     // UAVs occupy ranges [NumSRVs, NumResources) at registers u0..
     for (int i = NumSRVs; i < NumResources; i++)
       ranges[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, i - NumSRVs, 0);

     for (int i = 0; i < NumSamplers; i++)
       srange[i].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 1, i, 0);

     CreateRootSignatureFromRanges(pDevice, &pRootSignature, ranges, NumResources, srange, NumSamplers);
   } else {
     // Dynamic just requires the flags indicating that the builtin arrays should be accessible
     // Fallback definitions for headers that do not declare these flags.
 #if !defined(D3D12_ROOT_SIGNATURE_FLAG_CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED)
 #define D3D12_ROOT_SIGNATURE_FLAG_CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED (D3D12_ROOT_SIGNATURE_FLAGS)0x400
 #define D3D12_ROOT_SIGNATURE_FLAG_SAMPLER_HEAP_DIRECTLY_INDEXED (D3D12_ROOT_SIGNATURE_FLAGS)0x800
 #endif
     CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc;
     rootSignatureDesc.Init(0, nullptr, 0, nullptr,
                            D3D12_ROOT_SIGNATURE_FLAG_CBV_SRV_UAV_HEAP_DIRECTLY_INDEXED |
                            D3D12_ROOT_SIGNATURE_FLAG_SAMPLER_HEAP_DIRECTLY_INDEXED);
     CreateRootSignatureFromDesc(pDevice, &rootSignatureDesc, &pRootSignature);
   }

   // Create pipeline state object.
   CComPtr<ID3D12PipelineState> pComputeState;
   CreateComputePSO(pDevice, pRootSignature, pShader, sm, &pComputeState);

   // Create a command allocator and list for compute.
   VERIFY_SUCCEEDED(pDevice->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, IID_PPV_ARGS(&pCommandAllocator)));
   VERIFY_SUCCEEDED(pDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE, pCommandAllocator, pComputeState, IID_PPV_ARGS(&pCommandList)));

   // Set up SRV resources
   CComPtr<ID3D12Resource> pSRVResources[NumSRVs];
   CComPtr<ID3D12Resource> pUAVResources[NumUAVs];
   CComPtr<ID3D12Resource> pUploadResources[NumResources];
   {
     D3D12_RESOURCE_DESC bufDesc = CD3DX12_RESOURCE_DESC::Buffer(valueSizeInBytes);
     float values[valueSize];
     // The first NumSRVs - 1 SRVs are buffers filled with the signal value 10 + i.
     for (int i = 0; i < NumSRVs - 1; i++) {
       for (int j = 0; j < valueSize; j++)
         values[j] = 10.0f + i;
       CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, bufDesc,
                           &pSRVResources[i], &pUploadResources[i]);
     }
     // The last SRV is a 4x4 R32_FLOAT texture filled with 10 + (NumSRVs - 1).
     D3D12_RESOURCE_DESC tex2dDesc = CD3DX12_RESOURCE_DESC::Tex2D(DXGI_FORMAT_R32_FLOAT, 4, 4);
     for (int j = 0; j < valueSize; j++)
       values[j] = 10.0 + (NumSRVs - 1);
     CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex2dDesc,
                         &pSRVResources[NumSRVs - 1], &pUploadResources[NumSRVs - 1]);
   }

   // Set up UAV resources
   CComPtr<ID3D12Resource> pReadBuffer;
   float values[valueSize];
   // The first NumUAVs - 2 UAVs are filled with the signal value 20 + i.
   for (int i = 0; i < NumUAVs - 2; i++) {
     for (int j = 0; j < valueSize; j++)
       values[j] = 20.0f + i;
     CreateTestUavs(pDevice, pCommandList, values, valueSizeInBytes,
                    &pUAVResources[i], &pUploadResources[NumSRVs + i]);
   }
   // UAV[NumUAVs - 2] is created together with pReadBuffer; it is the resource
   // copied back for verification at the end of the test.
   for (int j = 0; j < valueSize; j++)
     values[j] = 20.0 + (NumUAVs - 1);
   CreateTestUavs(pDevice, pCommandList, values, valueSizeInBytes,
                  &pUAVResources[NumUAVs - 2], &pUploadResources[NumResources - 2], &pReadBuffer);

   // The last UAV is a 1D R32_FLOAT texture filled with 20 + (NumUAVs - 2).
   for (int j = 0; j < valueSize; j++)
     values[j] = 20.0 + (NumUAVs - 2);
   D3D12_RESOURCE_DESC tex1dDesc = CD3DX12_RESOURCE_DESC::Tex1D(DXGI_FORMAT_R32_FLOAT, valueSize, 1, 0, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
   CreateTestResources(pDevice, pCommandList, values, valueSizeInBytes, tex1dDesc,
                       &pUAVResources[NumUAVs - 1], &pUploadResources[NumResources - 1]);

   // Close the command list and execute it to perform the GPU setup.
   pCommandList->Close();
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);
   // Reuse the allocator and command list for the dispatch recorded below.
   VERIFY_SUCCEEDED(pCommandAllocator->Reset());
   VERIFY_SUCCEEDED(pCommandList->Reset(pCommandAllocator, pComputeState));

   CComPtr<ID3D12DescriptorHeap> pResHeap;
   CComPtr<ID3D12DescriptorHeap> pSampHeap;
   CreateDefaultDescHeaps(pDevice, NumSRVs + NumUAVs, NumSamplers, &pResHeap, &pSampHeap);

   // Create Rootsignature and descriptor tables
   {
     ID3D12DescriptorHeap *descHeaps[2] = {pResHeap, pSampHeap};
     pCommandList->SetDescriptorHeaps(2, descHeaps);
     pCommandList->SetComputeRootSignature(pRootSignature);

     if (!isDynamic) {
       // Only non-dynamic resources require descriptortables
       pCommandList->SetComputeRootDescriptorTable(0, pResHeap->GetGPUDescriptorHandleForHeapStart());
       pCommandList->SetComputeRootDescriptorTable(1, pSampHeap->GetGPUDescriptorHandleForHeapStart());
     }
   }
   // NOTE(review): the Create*SRV/UAV helpers below all receive the same
   // baseHandle; presumably each one advances it to the next descriptor slot -
   // confirm against the helper definitions.
   CD3DX12_CPU_DESCRIPTOR_HANDLE baseHandle(pResHeap->GetCPUDescriptorHandleForHeapStart());
   // Create SRVs
   CreateRawSRV(pDevice, baseHandle, valueSize, pSRVResources[0]);
   CreateStructSRV(pDevice, baseHandle, valueSize, sizeof(float), pSRVResources[1]);
   CreateTex2DSRV(pDevice, baseHandle, DXGI_FORMAT_R32_FLOAT, pSRVResources[2]);
   // Create UAVs
   CreateRawUAV(pDevice, baseHandle, valueSize, pUAVResources[0]);
   CreateStructUAV(pDevice, baseHandle, valueSize, sizeof(float), pUAVResources[1]);
   CreateTypedUAV(pDevice, baseHandle, valueSize, DXGI_FORMAT_R32_FLOAT, pUAVResources[2]);
   CreateTex1DUAV(pDevice, baseHandle, DXGI_FORMAT_R32_FLOAT, pUAVResources[3]);

   // One plain and one comparison sampler, with border colors 30 and 31.
   D3D12_FILTER filters[] = {D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT, D3D12_FILTER_COMPARISON_MIN_MAG_LINEAR_MIP_POINT};
   float perSampleBorderColors[] = {30.0, 31.0};
   CreateDefaultSamplers(pDevice, pSampHeap->GetCPUDescriptorHandleForHeapStart(),
                         filters, perSampleBorderColors, NumSamplers);

   // Run the compute shader and copy the results back to readable memory.
   pCommandList->Dispatch(DispatchGroupX, DispatchGroupY, DispatchGroupZ);

   RecordTransitionBarrier(pCommandList, pUAVResources[NumUAVs - 2], D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
   pCommandList->CopyResource(pReadBuffer, pUAVResources[NumUAVs - 2]);

   pCommandList->Close();
   ExecuteCommandList(pCommandQueue, pCommandList);
   WaitForSignal(pCommandQueue, FO);

   MappedData data(pReadBuffer,  valueSize*sizeof(float));
   const float *pData = (float*)data.data();
   LogCommentFmt(L"Verify bound resources are properly selected");
   // Elements 0-2: the signal values written into the three SRVs.
   VERIFY_ARE_EQUAL(pData[0], 10);
   VERIFY_ARE_EQUAL(pData[1], 11);
   VERIFY_ARE_EQUAL(pData[2], 12);

   // Elements 3-5: the signal values written into UAV 0, UAV 1 and the 1D
   // texture UAV; element 6 matches the first sampler's border color.
   VERIFY_ARE_EQUAL(pData[3], 20);
   VERIFY_ARE_EQUAL(pData[4], 21);
   VERIFY_ARE_EQUAL(pData[5], 22);
   VERIFY_ARE_EQUAL(pData[6], 30);
   VERIFY_ARE_EQUAL(pData[7], 1); // samplecmp 1 means it matched 31
 }

 // Runs RunResourceTest with a shader whose resources and samplers are bound
 // through explicit register declarations (the non-dynamic path).
 TEST_F(ExecutionTest, SignatureResourcesTest) {
   static const char shaderSource[] =
     "ByteAddressBuffer         g_rawBuf      : register(t0);\n"
     "StructuredBuffer<float>   g_structBuf   : register(t1);\n"
     "Texture2D<float>          g_tex         : register(t2);\n"
     "RWByteAddressBuffer       g_rwRawBuf    : register(u0);\n"
     "RWStructuredBuffer<float> g_rwStructBuf : register(u1);\n"
     "RWBuffer<float>           g_result      : register(u2);\n"
     "RWTexture1D<float>        g_rwTex       : register(u3);\n"
     "SamplerState              g_samp        : register(s0);\n"
     "SamplerComparisonState    g_sampCmp     : register(s1);\n"
     "[NumThreads(1, 1, 1)]\n"
     "void main(uint ix : SV_GroupIndex) {\n"
     "  g_result[0] = g_rawBuf.Load<float>(0);\n"
     "  g_result[1] = g_structBuf.Load(0);\n"
     "  g_result[2] = g_tex.Load(0);\n"
     "  g_result[3] = g_rwRawBuf.Load<float>(0);\n"
     "  g_result[4] = g_rwStructBuf.Load(0);\n"
     "  g_result[5] = g_rwTex.Load(0);\n"
     "  g_result[6] = g_tex.SampleLevel(g_samp, -0.5, 0);\n"
     "  g_result[7] = g_tex.SampleCmpLevelZero(g_sampCmp, -0.5, 31.0);\n"
     "}\n";

   // Nothing to run when a shader model 6.6 device cannot be created.
   CComPtr<ID3D12Device> pDevice;
   const bool haveDevice = CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6);
   if (!haveDevice)
     return;

   RunResourceTest(pDevice, shaderSource, L"cs_6_6", /*isDynamic*/false);
 }

 // Runs RunResourceTest with a shader that fetches its resources and samplers
 // from ResourceDescriptorHeap / SamplerDescriptorHeap (the dynamic path).
 // The test is skipped on devices below Resource Binding Tier 3.
 TEST_F(ExecutionTest, DynamicResourcesTest) {
   static const char shaderSource[] =
     "static ByteAddressBuffer         g_rawBuf      = ResourceDescriptorHeap[0];\n"
     "static StructuredBuffer<float>   g_structBuf   = ResourceDescriptorHeap[1];\n"
     "static Texture2D<float>          g_tex         = ResourceDescriptorHeap[2];\n"
     "static RWByteAddressBuffer       g_rwRawBuf    = ResourceDescriptorHeap[3];\n"
     "static RWStructuredBuffer<float> g_rwStructBuf = ResourceDescriptorHeap[4];\n"
     "static RWBuffer<float>           g_result      = ResourceDescriptorHeap[5];\n"
     "static RWTexture1D<float>        g_rwTex       = ResourceDescriptorHeap[6];\n"
     "static SamplerState              g_samp        = SamplerDescriptorHeap[0];\n"
     "static SamplerComparisonState    g_sampCmp     = SamplerDescriptorHeap[1];\n"
     "[NumThreads(1, 1, 1)]\n"
     "void main(uint ix : SV_GroupIndex) {\n"
     "  g_result[0] = g_rawBuf.Load<float>(0);\n"
     "  g_result[1] = g_structBuf.Load(0);\n"
     "  g_result[2] = g_tex.Load(0);\n"
     "  g_result[3] = g_rwRawBuf.Load<float>(0);\n"
     "  g_result[4] = g_rwStructBuf.Load(0);\n"
     "  g_result[5] = g_rwTex.Load(0);\n"
     "  g_result[6] = g_tex.SampleLevel(g_samp, -0.5, 0);\n"
     "  g_result[7] = g_tex.SampleCmpLevelZero(g_sampCmp, -0.5, 31.0);\n"
     "}\n";

   // Nothing to run when a shader model 6.6 device cannot be created.
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;

   // ResourceDescriptorHeap/SamplerDescriptorHeap requires Resource Binding Tier 3
   D3D12_FEATURE_DATA_D3D12_OPTIONS devOptions;
   VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS, &devOptions, sizeof(devOptions)));

   const bool hasBindingTier3 =
       devOptions.ResourceBindingTier >= D3D12_RESOURCE_BINDING_TIER_3;
   if (hasBindingTier3) {
     RunResourceTest(pDevice, shaderSource, L"cs_6_6", /*isDynamic*/true);
     return;
   }

   WEX::Logging::Log::Comment(L"Device does not support Resource Binding Tier 3");
   WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
 }

 //void ExecutionTest::TestComputeShaderDynamicResourcesUniformIndexing()

 void EnableShaderBasedValidation() {
   CComPtr<ID3D12Debug> spDebugController0;
   CComPtr<ID3D12Debug1> spDebugController1;
   VERIFY_SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&spDebugController0)));
   VERIFY_SUCCEEDED(
       spDebugController0->QueryInterface(IID_PPV_ARGS(&spDebugController1)));
   spDebugController1->SetEnableGPUBasedValidation(true);
 }

 // Verifies, element by element, that the first <expectedResultsSize> floats of
 // <resultFloats> compare exactly equal to those of <expectedResults>.
 void VerifyFloatArraysAreEqual(const float* resultFloats, float *expectedResults, int expectedResultsSize)
 {
   int idx = 0;
   while (idx < expectedResultsSize) {
     VERIFY_ARE_EQUAL(resultFloats[idx], expectedResults[idx]);
     ++idx;
   }
 }

 // Runs the "DynamicResourcesDynamicIndexing" shader op from ShaderOpArith.xml
 // for each tested shader model, with and without the NON_UNIFORM define, first
 // as a compute shader and then as a vertex + pixel shader pair. Each run's
 // read-back floats are compared against a fixed table of 16 expected values.
 // The test is reported as skipped if no configuration ran, or if the device
 // lacks Resource Binding Tier 3.
 TEST_F(ExecutionTest, DynamicResourcesDynamicIndexingTest) {
   //EnableShaderBasedValidation();
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
   st::ShaderOp *pShaderOp =
       ShaderOpSet->GetShaderOp("DynamicResourcesDynamicIndexing");
   // Keep a copy of the parsed root values so they can be restored for the
   // fallback path after the non-fallback path has cleared them.
   vector<st::ShaderOpRootValue> fallbackRootValues = pShaderOp->RootValues;

   // Stays true unless at least one configuration runs to completion.
   bool Skipped = true;

   //D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_0}; // FALLBACK
   D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_6, D3D_SHADER_MODEL_6_0};

   // Expected read-back values when NON_UNIFORM is not defined.
   const int expectedResultsSize = 16;
   float expectedResultsUniform[expectedResultsSize] = {
     10.0, 10.0,
     12.0, 12.0,
     14.0, 14.0,
     20.0, 20.0,
     22.0, 22.0,
     24.0, 24.0,
     30.0, 30.0,
     32.0, 32.0};

   // Expected read-back values when NON_UNIFORM=1 is defined.
   float expectedResultsNonUniform[expectedResultsSize] = {
     10.0, 11.0,
     12.0, 13.0,
     14.0, 15.0,
     20.0, 21.0,
     22.0, 23.0,
     24.0, 25.0,
     30.0, 31.0,
     32.0, 33.0};

   // TestShaderModels will be an array, where the first x models are "non-fallback", and the rest of the models
   // are "fallback". If TestShaderModels has length y, and a test loops through all shader models, a convention
   // to test based on whether fallback is enabled or not is to limit the loop like this:
   // unsigned num_models_to_test = ExecutionTest::IsFallbackPathEnabled() ? y : x;
   unsigned num_models_to_test = ExecutionTest::IsFallbackPathEnabled() ? 2 : 1;
   for (unsigned i = 0; i < num_models_to_test; i++) {
     D3D_SHADER_MODEL sm = TestShaderModels[i];
     // The low four bits of the shader model value are logged as the minor version.
     LogCommentFmt(L"\r\nVerifying Dynamic Resources Dynamic Indexing in shader "
                   L"model 6.%1u",
                   ((UINT)sm & 0x0f));

     // A shader model for which no device can be created is passed over;
     // the remaining models are still attempted.
     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }
     D3D12_FEATURE_DATA_D3D12_OPTIONS devOptions;
     VERIFY_SUCCEEDED(
         pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS,
                                       &devOptions, sizeof(devOptions)));
     // Below Resource Binding Tier 3 the whole test is skipped, not just this model.
     if (devOptions.ResourceBindingTier < D3D12_RESOURCE_BINDING_TIER_3) {
       WEX::Logging::Log::Comment(
           L"Device does not support Resource Binding Tier 3");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return;
     }

     // Pass 0 tests uniform indexing, pass 1 tests non-uniform indexing.
     for (unsigned int non_uniform_bit = 0; non_uniform_bit < 2; non_uniform_bit++) {
       float *expectedResults = non_uniform_bit ? expectedResultsNonUniform : expectedResultsUniform;

       LogCommentFmt(L"Testing %s Resource Indexing.", non_uniform_bit ? L"NonUniform" : L"Uniform");

       // Add compile options
       std::string compilerOptions = "";
       if (sm==D3D_SHADER_MODEL_6_0)
         compilerOptions += " -D FALLBACK=1";
       if (non_uniform_bit)
         compilerOptions += " -D NON_UNIFORM=1";

       // by default a root value is added.
       // remove the root value if this is the non-fallback path
       if (sm==D3D_SHADER_MODEL_6_6)
       {
         pShaderOp->RootValues.clear();
       }
       else
       {
          pShaderOp->RootValues = fallbackRootValues;
       }

       // Update shader target in xml.
       for (st::ShaderOpShader &S : pShaderOp->Shaders){
         // Reset the arguments so options from a previous pass do not carry over.
         S.Arguments = NULL;
         if (!compilerOptions.empty()){
           S.Arguments = pShaderOp->GetString(compilerOptions.c_str());
         }
         // Set the target correctly. Setting here permanently overwrites
         // the Target string even in future iterations.
         // Only the last character of the target string is replaced.
         if (sm==D3D_SHADER_MODEL_6_0){
           std::string Target(S.Target);
           Target[Target.length() - 1] = '0';
           S.Target = pShaderOp->GetString(Target.c_str());
         }
         else if (sm==D3D_SHADER_MODEL_6_6){
           std::string Target(S.Target);
           Target[Target.length() - 1] = '6';
           S.Target = pShaderOp->GetString(Target.c_str());
         }
       }

       // Test Compute shader
       {
         pShaderOp->CS = pShaderOp->GetString("CS66");
         std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
             pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
             ShaderOpSet);

         MappedData resultData;
         test->Test->GetReadBackData("g_result", &resultData);
         const float *resultCSFloats = (float *)resultData.data();

         VerifyFloatArraysAreEqual(resultCSFloats, expectedResults, expectedResultsSize);
       }

       // Test Vertex + Pixel shader
       {
         // Clear the compute shader and select the graphics stages instead.
         pShaderOp->CS = nullptr;
         pShaderOp->VS = pShaderOp->GetString("VS66");
         pShaderOp->PS = pShaderOp->GetString("PS66");
         std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
             pDevice, m_support, "DynamicResourcesDynamicIndexing", nullptr,
             ShaderOpSet);

         MappedData resultVSData;
         MappedData resultPSData;
         test->Test->GetReadBackData("g_resultVS", &resultVSData);
         test->Test->GetReadBackData("g_resultPS", &resultPSData);
         const float *resultVSFloats = (float *)resultVSData.data();
         const float *resultPSFloats = (float *)resultPSData.data();
         // The pipeline statistics are fetched here but not checked.
         D3D12_QUERY_DATA_PIPELINE_STATISTICS Stats;
         test->Test->GetPipelineStats(&Stats);


         // VS
         VerifyFloatArraysAreEqual(resultVSFloats, expectedResults, expectedResultsSize);

         // PS
         VerifyFloatArraysAreEqual(resultPSFloats, expectedResults, expectedResultsSize);
       }
       Skipped = false;
     }
   }

   if (Skipped) {
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
   }

 }

 // Upper bound used by WaveSizeTest: the thread group is declared with
 // MAX_WAVESIZE*2 threads and MAX_WAVESIZE result entries are verified.
 #define MAX_WAVESIZE 128

 // Two-level stringification: strinfigy(X) expands X first (for example
 // MAX_WAVESIZE) and then turns the expansion into a string literal.
 #define strinfigy2(arg) #arg
 #define strinfigy(arg) strinfigy2(arg)

 void ExecutionTest::WaveSizeTest() {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
     return;
   }

   // Check Wave support
   if (!DoesDeviceSupportWaveOps(pDevice)) {
     // Optional feature, so it's correct to not support it if declared as such.
     WEX::Logging::Log::Comment(L"Device does not support wave operations.");
     return;
   }

   // Get supported wave sizes
   D3D12_FEATURE_DATA_D3D12_OPTIONS1 waveOpts;
   VERIFY_SUCCEEDED(pDevice->CheckFeatureSupport((D3D12_FEATURE)D3D12_FEATURE_D3D12_OPTIONS1, &waveOpts, sizeof(waveOpts)));
   UINT minWaveSize = waveOpts.WaveLaneCountMin;
   UINT maxWaveSize = waveOpts.WaveLaneCountMax;

   DXASSERT_NOMSG(minWaveSize <= maxWaveSize);
   DXASSERT((minWaveSize & (minWaveSize - 1)) == 0, "must be a power of 2");
   DXASSERT((maxWaveSize & (maxWaveSize - 1)) == 0, "must be a power of 2");

   // read shader config
   CComPtr<IStream> pStream;
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   // format shader source
   const char waveSizeTestShader[] =
     "struct TestData { \r\n"
     "  uint count; \r\n"
     "}; \r\n"
     "RWStructuredBuffer<TestData> data : register(u0); \r\n"
     "\r\n"
     "// Note: WAVESIZE will be defined via compiler option -D\r\n"
     "[wavesize(WAVESIZE)]\r\n"
     "[numthreads(" strinfigy(MAX_WAVESIZE) "*2,1,1)]\r\n"
     "void main(uint3 tid : SV_DispatchThreadID ) { \r\n"
     "  data[tid.x].count = WaveActiveSum(1); \r\n"
     "}\r\n";

   struct WaveSizeTestData {
     uint32_t count;
   };

   for (UINT waveSize = minWaveSize; waveSize <= maxWaveSize; waveSize *= 2) {
     // format compiler args
     char compilerOptions[32];
     VERIFY_IS_TRUE(sprintf_s(compilerOptions, sizeof(compilerOptions), "-D WAVESIZE=%d", waveSize) != -1);

     // run the shader
     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "WaveSizeTest",
       [&](LPCSTR Name, std::vector<BYTE> &Data, st::ShaderOp *pShaderOp) {
       VERIFY_IS_TRUE((0 == strncmp(Name, "UAVBuffer0", 10)));
       pShaderOp->Shaders.at(0).Arguments = compilerOptions;
       pShaderOp->Shaders.at(0).Text = waveSizeTestShader;

       VERIFY_IS_TRUE(sizeof(WaveSizeTestData)*MAX_WAVESIZE <= Data.size());
       WaveSizeTestData *pInData = (WaveSizeTestData *)Data.data();
       memset(&pInData, sizeof(WaveSizeTestData)*MAX_WAVESIZE, 0);
     }, ShaderOpSet);

     // verify expected values
     MappedData dataUav;
     WaveSizeTestData *pOutData;

     test->Test->GetReadBackData("UAVBuffer0", &dataUav);
     VERIFY_ARE_EQUAL(sizeof(WaveSizeTestData)*MAX_WAVESIZE, dataUav.size());
     pOutData = (WaveSizeTestData*)dataUav.data();

     LogCommentFmt(L"Verifying test result for wave size %d", waveSize);

     for (unsigned i = 0; i < MAX_WAVESIZE; i++) {
       if (!VERIFY_ARE_EQUAL(pOutData[i].count, waveSize))
         break;
     }
   }
 }


 // Atomic operation testing

 // Atomic tests take a single integer index as input and contort it into some
 // kind of interesting contributor to the operation in question.
 // So each vertex, pixel, thread, or other will have a unique index that produces
 // a contributing value to the calculation which is stored in a small resource

 // For arithmetic or bitwise operations, each contributor accumulates to the same
 // location in the resource indexed by the operation type. Addition is in index 0
 // umin/umax are in 1 and 2 and so on.

 // To make sure that the most significant bits are involved in the calculation,
 // particularly in the case of 64-bit values, each contributing value is duplicated
 // to the lower and upper halves of the value. There is an exception to this when
 // addition exceeds the available size and also for compare and exchange explained below.

 // For compare and exchange operations, 64 output locations are shared by the various lanes.
 // Each lane attempts to write to a location that is shared with several others.
 // The first one to write to it determines its contents, which will be the lane index <ix>
 // in the upper bits and the output location index in the lower bits.
 // This ensures that the compare operations consider the upper bits in the comparison.
 // The initial compare store is followed by a compare exchange that compares for the
 // value the current lane would have assigned there. Finally, the output of the cmpxchg
 // is used to determine if the current lane should perform the final unconditional exchange.
 // The values are verified by checking the lower bits for the matching location index
 // and ensuring that the upper bits undergoing the same transformation result in the location index.
 // For lane index <ix> the location is calculated and final result assigned as if by this code:
 //    g_outputBuf[(ix/3)%64] = (ix << shBits) | ((ix/3)%64);

 // Compares the first <size> bytes at <uResults> with the first <size> bytes of
 // the in-memory representation of <gold> (its low-order bytes on a
 // little-endian target). Returns true on a match. On a mismatch, logs the
 // actual and expected values and returns false.
 // A <size> of 4 is logged as a 32-bit value; any other size is logged as 64-bit.
 bool AtomicResultMatches(const BYTE *uResults, uint64_t gold, size_t size) {
   if (memcmp(uResults, &gold, size) == 0)
     return true;

   // The values are unsigned, so they are logged with unsigned conversions.
   // They are copied out with memcpy so the result buffer is neither read
   // through a misaligned pointer nor stripped of its const qualifier.
   if (size == 4) {
     uint32_t actual = 0;
     memcpy(&actual, uResults, sizeof(actual));
     LogCommentFmt(L"  value %u is not %u", actual, (uint32_t)gold);
   } else {
     uint64_t actual = 0;
     memcpy(&actual, uResults, sizeof(actual));
     LogCommentFmt(L"  value %llu is not %llu", actual, gold);
   }
   return false;
 }

 // Used to duplicate the lower half bits into the upper half bits of an integer
 // To verify that the full value is being considered, many tests duplicate the results into the upper half
 // The result is the low <bits> bits of <val> OR'd with <val> (widened to
 // 64 bits) shifted left by <bits>.
 #define SHIFT(val, bits) (((val)&((1ULL<<(bits))-1ULL)) | ((uint64_t)(val) << (bits)))

 // Symbolic constants for the results
 // Slot indices, in units of the result stride, relative to the start of the
 // unsigned results.
 #define ADD_IDX 0
 #define UMIN_IDX 1
 #define UMAX_IDX 2
 #define AND_IDX 3
 #define OR_IDX 4
 #define XOR_IDX 5

 // Slot indices relative to the start of the signed results.
 #define SMIN_IDX 0
 #define SMAX_IDX 1

 // Verify results for atomic operations. <uResults> and <sResults> are pointers to
 // the readback resource sections containing unsigned and signed integers respectively.
 // <pXchg> is a pointer to the readback resource containing the results of the compare
 // and exchange operations tests. <stride> is the number of bytes between results for
 // all of the results pointers. <maxIdx> is the number of indices that went into the results
 // which is used to determine what the results should be. <bitSize> is the size in bits of
 // the produced results, either 32 or 64.
 // Checks one set of atomic results against values computed on the host.
 //   uResults - start of the unsigned results (add, umin, umax, and, or, xor)
 //   sResults - start of the signed results (smin, smax)
 //   pXchg    - start of the 64 compare/exchange result slots
 //   stride   - distance in bytes between consecutive result slots
 //   maxIdx   - number of indices that contributed to the results
 //   bitSize  - width in bits of each result; callers pass 32 or 64
 // Failures are reported through the VERIFY macros; nothing is returned.
 void VerifyAtomicResults(const BYTE *uResults, const BYTE *sResults,
                          const BYTE *pXchg, size_t stride, uint64_t maxIdx, size_t bitSize) {
   // Each atomic test performs the test on the value in the lower half
   // and also duplicated in the upper half of the value. The SHIFT macros account for this.
   // This is to verify that the upper bits are considered
   uint64_t shBits = bitSize/2;
   size_t byteSize = bitSize/8;

   // Test ADD Operation
   // ADD just sums all the indices. The result should be the sum of the highest and lowest indices
   // multiplied by half the number of sums.
   uint64_t addResult = (maxIdx)*(maxIdx-1)/2;
   LogCommentFmt(L"Verifying %d-bit integer atomic add", bitSize);
   // For 32-bit values, the sum exceeds the 16 bit limit, so we can't duplicate
   // That's fine, the duplication is really for 64-bit values.
   if (bitSize < 64)
     VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*ADD_IDX, addResult, byteSize));
   else
     VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*ADD_IDX, SHIFT(addResult, shBits), byteSize));

   // Test MIN and MAX Operations

   // The result of a simple min and max of any sequence of indices would be fairly uninteresting
   // and certain erroneous behavior might mistakenly produce the correct results.

   // To make it interesting, the contributing values will change depending on the evenness of the index.
   // On an even index, min and max operate on the bitflipped index. For signed compares, this is
   // interpreted as a negative value and for unsigned, a very high value.

   // For unsigned min/max, index 0 will be bitflipped to ~0, which is interpreted as the maximum
   // Because zero is manipulated, this leaves 1 as the lowest value.
   LogCommentFmt(L"Verifying %d-bit integer atomic umin", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*UMIN_IDX, SHIFT(1ULL, shBits), byteSize)); // UMin
   LogCommentFmt(L"Verifying %d-bit integer atomic umax", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*UMAX_IDX, ~0ULL, byteSize)); // UMax

   // For signed min/max, the index just before the last will be bitflipped (maxIndex is always even).
   // This is interpreted as -(maxIndex-1) and will be the lowest
   // The maxIndex will be unaltered and interpreted as the highest.
   LogCommentFmt(L"Verifying %d-bit integer atomic smin", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(sResults + stride*SMIN_IDX, SHIFT(-((int64_t)maxIdx-1), shBits), byteSize)); // SMin
   LogCommentFmt(L"Verifying %d-bit integer atomic smax", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(sResults + stride*SMAX_IDX, SHIFT(maxIdx-1, shBits), byteSize)); // SMax

   // Test AND and OR operations.

   // For AND operations, all indices are bitflipped and ANDed to the previous result.
   // This means that the highest bits, which are never set by the contributing indices will be set
   // for all the indices, so they will be set in the final result.

   // For OR operations, the indices are ORed to the previous result unaltered
   // This means that any bit that is set in any index will be set in the final OR result.

   // In practice, this means that the cumulative result of the AND and OR operations
   // are bitflipped versions of each other.
   // Finding the most significant set bit by the max index or next power of two (pot)
   // gives us the pivot point for these results
   uint64_t nextPot = 1ULL << (bitSize - 1);
   // Walk down from the top bit until reaching the highest bit set in maxIdx-1,
   // then step one bit above it.
   for (;nextPot && !((maxIdx-1) & (nextPot)); nextPot >>= 1) {}
   nextPot <<= 1;
   LogCommentFmt(L"Verifying %d-bit integer atomic and", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*AND_IDX, ~SHIFT(nextPot-1, shBits), byteSize)); // And
   LogCommentFmt(L"Verifying %d-bit integer atomic or", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*OR_IDX, SHIFT(nextPot-1, shBits), byteSize)); // Or

   // Test XOR operation

   // For XOR operations, a 1 is shifted by the number of spaces equal to the index and XORed
   // to the previous result. Because this would rapidly shift off the end of the value,
   // giving undefined and uninteresting results, the index is moduloed to a value that will
   // fit within the type size.

   // Because many of the tests use total numbers of lanes that can be evenly divisible by 32 or 64,
   // these values aren't used for the modulo since the expected result might be zero,
   // which could be encountered through erroneous behavior.

   // Instead, one less than the type size in bits is used for the modulo.
   // Even though we don't know the actual order these operations are performed,
   // indices that make up a contiguous sequence of 31 or 63 values can be thought of as one of a series of "passes".
   // Each "pass" sets or clears the bits depending on what's already there.
   // if the number of the pass is odd, the bits are being unset and all above the mod position should be set.
   // If even, the bits are in the process of being set and bits below the mod position should be set.
   uint64_t xorResult = ((1ULL<<((maxIdx)%(bitSize-1))) -1);

   if (((maxIdx/(bitSize-1))&1)) {
     xorResult ^= ~0ULL;
     // The XOR above may set uninvolved upper bits, messing up the compare. So AND off the uninvolved bits.
     xorResult &= ((1ULL<<(bitSize-1)) - 1);
   }

   LogCommentFmt(L"Verifying %d-bit integer atomic xor", bitSize);
   VERIFY_IS_TRUE(AtomicResultMatches(uResults + stride*XOR_IDX, xorResult, byteSize));

   // Test CMP/XCHG Operations
   // This tests CompareStore, CompareExchange, and Exchange operations.

   // Unlike above, every lane isn't contributing to the same resource location
   // Instead, every lane competes with a few others to update the same resource location.
   // The first lane to find the contents of their location uninitialized will
   // update it. To verify that upper bits are considered in the comparison and
   // in the assignment, the value stored in the lowest bits is the location index.
   // This ensures that part will be the same for each of the competing lanes.
   // The uppermost bits are updated with the index of the lane that got there first.
   // Subsequent calls to CompareExchange will verify this value matches and alter
   // the content slightly. Finally, a simple check of the output value to what
   // the current lane would expect and a call to exchange will update the value once more

   // To verify this has gone through properly, the upper portion is converted as
   // if to calculate the location index and compared with the location index.
   // It could be the index of any of several lanes that assign to that location,
   // but this ensures that it is not any lane outside of that group.
   // The lower bits are compared to the location index as well.
   LogCommentFmt(L"Verifying %d-bit integer atomic cmp/xchg results", bitSize);
   for (size_t i = 0; i < 64; i++) {
     // NOTE(review): a full 64-bit value is read from each slot regardless of
     // bitSize; for 32-bit results this presumably relies on the bytes after
     // the 32-bit value being zero - confirm against the shader layout.
     uint64_t val = *((uint64_t*)(pXchg + i*stride));
     // Verify lower bits match location index exactly
     VERIFY_ARE_EQUAL(i, val & ((1ULL << shBits) - 1ULL));
     // Verify that upper bits contain original index that transforms to location index
     VERIFY_ARE_EQUAL(((val >> shBits)/3)%64, i);
   }
 }

 void VerifyAtomicsRawTest(std::shared_ptr<ShaderOpTestResult> test,
                           uint64_t maxIdx, size_t bitSize) {

   size_t stride = 8;
   // struct mirroring that in the shader
   struct AtomicStuff {
     float prepad[2][3];
     UINT uintEl[4];
     int  sintEl[4];
     struct useless {
       uint32_t unused[3];
     } postpad;
     float last;
   };

   MappedData uintData, xchgData;

   test->Test->GetReadBackData("U0", &uintData);
   test->Test->GetReadBackData("U1", &xchgData);

   const AtomicStuff *pStruct = (AtomicStuff *)uintData.data();
   const AtomicStuff *pStrXchg = (AtomicStuff *)xchgData.data();

   LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWStructuredBuffer resource", bitSize);

   VerifyAtomicResults((const BYTE*)&(pStruct[0].uintEl[2]), (const BYTE*)&(pStruct[1].sintEl[2]),
                       (const BYTE*)&(pStrXchg[0].uintEl[2]), sizeof(AtomicStuff), maxIdx, bitSize);

   const BYTE *pUint = nullptr;
   const BYTE *pXchg = nullptr;

   test->Test->GetReadBackData("U2", &uintData);
   test->Test->GetReadBackData("U3", &xchgData);

   pUint = (BYTE *)uintData.data();
   pXchg = (BYTE *)xchgData.data();

   LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWByteAddressBuffer resource", bitSize);

   VerifyAtomicResults(pUint, pUint + stride*6,
                       pXchg, stride, maxIdx, bitSize);

 }

 void VerifyAtomicsTypedTest(std::shared_ptr<ShaderOpTestResult> test,
                             uint64_t maxIdx, size_t bitSize) {


   size_t stride = 8;
   MappedData uintData, sintData, xchgData;
   const BYTE *pUint = nullptr;
   const BYTE *pSint = nullptr;
   const BYTE *pXchg = nullptr;

   // Typed resources can't share between 32 and 64 bits
   if (bitSize == 32) {
     test->Test->GetReadBackData("U6", &uintData);
     test->Test->GetReadBackData("U7", &sintData);
     test->Test->GetReadBackData("U8", &xchgData);
   } else {
     test->Test->GetReadBackData("U12", &uintData);
     test->Test->GetReadBackData("U13", &sintData);
     test->Test->GetReadBackData("U14", &xchgData);
   }

   pUint = (BYTE *)uintData.data();
   pSint = (BYTE *)sintData.data();
   pXchg = (BYTE *)xchgData.data();

   LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWBuffer resource", bitSize);

   VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize);

   // Typed resources can't share between 32 and 64 bits
   if (bitSize == 32) {
     test->Test->GetReadBackData("U9", &uintData);
     test->Test->GetReadBackData("U10", &sintData);
     test->Test->GetReadBackData("U11", &xchgData);
   } else {
     test->Test->GetReadBackData("U15", &uintData);
     test->Test->GetReadBackData("U16", &sintData);
     test->Test->GetReadBackData("U17", &xchgData);
   }

   pUint = (BYTE *)uintData.data();
   pSint = (BYTE *)sintData.data();
   pXchg = (BYTE *)xchgData.data();

   LogCommentFmt(L"Verifying %d-bit integer atomic operations on RWTexture resource", bitSize);

   VerifyAtomicResults(pUint, pSint + stride, pXchg, stride, maxIdx, bitSize);

 }

 void VerifyAtomicsSharedTest(std::shared_ptr<ShaderOpTestResult> test,
                              uint64_t maxIdx, size_t bitSize) {

   size_t stride = 8;
   MappedData uintData, xchgData;
   const BYTE *pUint = nullptr;
   const BYTE *pXchg = nullptr;

   test->Test->GetReadBackData("U4", &uintData);
   test->Test->GetReadBackData("U5", &xchgData);

   pUint = (BYTE *)uintData.data();
   pXchg = (BYTE *)xchgData.data();

   LogCommentFmt(L"Verifying %d-bit integer atomic operations on groupshared variables", bitSize);
   VerifyAtomicResults(pUint, pUint + stride*6,
                       pXchg, stride, maxIdx, bitSize);
 }

 // Runs both the raw-resource and the typed-resource atomic verifications on
 // the same completed test run, in that order.
 void VerifyAtomicsTest(std::shared_ptr<ShaderOpTestResult> test,
                        uint64_t maxIdx, size_t bitSize)
 {
   // Structured and byte address buffers.
   VerifyAtomicsRawTest(test, maxIdx, bitSize);

   // RWBuffer and RWTexture resources.
   VerifyAtomicsTypedTest(test, maxIdx, bitSize);
 }

 // Runs the "AtomicsHeap" shader op with its default (32-bit) shaders and
 // verifies the atomic results for three pipeline configurations in turn:
 // compute, amplification/mesh/pixel (only when mesh shaders are supported),
 // and vertex/pixel.
 TEST_F(ExecutionTest, AtomicsTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice)) {
     return;
   }

   auto opSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, opSet.get());
   st::ShaderOp *pAtomicsOp = opSet->GetShaderOp("AtomicsHeap");

   // Executes the shader op in whatever stage configuration is currently set.
   auto runAtomicsOp = [&]() {
     return RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, opSet);
   };

   // Test compute shader
   LogCommentFmt(L"Verifying 32-bit integer atomic operations in compute shader");
   std::shared_ptr<ShaderOpTestResult> result = runAtomicsOp();
   VerifyAtomicsTest(result, 32*32, 32);
   VerifyAtomicsSharedTest(result, 32*32, 32);

   // Test mesh shader if available
   // The compute shader is cleared first.
   pAtomicsOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 32-bit integer atomic operations in amp/mesh/pixel shaders");
     result = runAtomicsOp();
     VerifyAtomicsTest(result, 8*8*2 + 8*8*2 + 64*64, 32);
     VerifyAtomicsSharedTest(result, 8*8*2 + 8*8*2, 32);
   }

   // Test Vertex + Pixel shader
   // The mesh shader is cleared first.
   pAtomicsOp->MS = nullptr;
   LogCommentFmt(L"Verifying 32-bit integer atomic operations in vert/pixel shaders");
   result = runAtomicsOp();
   VerifyAtomicsTest(result, 64*64+6, 32);
 }

 // Runs the "AtomicsRoot" shader op with its 64-bit shaders and verifies the
 // raw-buffer atomic results for compute, amplification/mesh/pixel (only when
 // mesh shaders are supported) and vertex/pixel configurations.
 // Requires a shader model 6.6 device; reported as skipped when the device
 // lacks int64 support.
 TEST_F(ExecutionTest, Atomics64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
     return;
   }

   if (!DoesDeviceSupportInt64(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   auto opSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, opSet.get());
   st::ShaderOp *pRootOp = opSet->GetShaderOp("AtomicsRoot");

   // Reassign shader stages to 64-bit versions
   // Collect 64-bit shaders
   pRootOp->CS = pRootOp->GetString("CS");
   pRootOp->VS = pRootOp->GetString("VS");
   pRootOp->PS = pRootOp->GetString("PS");
   pRootOp->AS = pRootOp->GetString("AS");
   pRootOp->MS = pRootOp->GetString("MS");

   // Executes the shader op in whatever stage configuration is currently set.
   auto runRootOp = [&]() {
     return RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, opSet);
   };

   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in compute shader");
   std::shared_ptr<ShaderOpTestResult> result = runRootOp();
   VerifyAtomicsRawTest(result, 32*32, 64);

   // Test mesh shader if available
   // The compute shader is cleared first.
   pRootOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in amp/mesh/pixel shader");
     result = runRootOp();
     VerifyAtomicsRawTest(result, 8*8*2 + 8*8*2 + 64*64, 64);
   }

   // Test Vertex + Pixel shader
   // The mesh shader is cleared first.
   pRootOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on raw buffers in vert/pixel shader");
   result = runRootOp();
   VerifyAtomicsRawTest(result, 64*64+6, 64);
 }

 // Runs the "AtomicsHeap" shader op with its *64 shaders and verifies the
 // raw-buffer atomic results for compute, amplification/mesh/pixel (only when
 // mesh shaders are supported) and vertex/pixel configurations.
 // Requires a shader model 6.6 device; reported as skipped when the device
 // lacks int64 support or 64-bit atomics on heap resources.
 TEST_F(ExecutionTest, AtomicsRawHeap64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
     return;
   }

   if (!DoesDeviceSupportInt64(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!DoesDeviceSupportHeap64Atomics(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support 64-bit atomic operations on heap resources.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   auto opSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, opSet.get());
   st::ShaderOp *pHeapOp = opSet->GetShaderOp("AtomicsHeap");

   // Reassign shader stages to 64-bit versions
   // Collect 64-bit shaders
   pHeapOp->CS = pHeapOp->GetString("CS64");
   pHeapOp->VS = pHeapOp->GetString("VS64");
   pHeapOp->PS = pHeapOp->GetString("PS64");
   pHeapOp->AS = pHeapOp->GetString("AS64");
   pHeapOp->MS = pHeapOp->GetString("MS64");

   // Executes the shader op in whatever stage configuration is currently set.
   auto runHeapOp = [&]() {
     return RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, opSet);
   };

   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in compute shader");
   std::shared_ptr<ShaderOpTestResult> result = runHeapOp();
   VerifyAtomicsRawTest(result, 32*32, 64);

   // Test mesh shader if available
   // The compute shader is cleared first.
   pHeapOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in amp/mesh/pixel shader");
     result = runHeapOp();
     VerifyAtomicsRawTest(result, 8*8*2 + 8*8*2 + 64*64, 64);
   }

   // Test Vertex + Pixel shader
   // The mesh shader is cleared first.
   pHeapOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on heap raw buffers in vert/pixel shader");
   result = runHeapOp();
   VerifyAtomicsRawTest(result, 64*64+6, 64);
 }

 // Runs the "AtomicsHeap" shader op with its *TY64 shaders and verifies the
 // typed-resource atomic results for compute, amplification/mesh/pixel (only
 // when mesh shaders are supported) and vertex/pixel configurations.
 // Requires a shader model 6.6 device; reported as skipped when the device
 // lacks int64 support or int64 atomics on typed resources.
 TEST_F(ExecutionTest, AtomicsTyped64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);

   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6)) {
     return;
   }

   if (!DoesDeviceSupportInt64(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!DoesDeviceSupportTyped64Atomics(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 atomic operations on typed resources.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   auto opSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, opSet.get());
   st::ShaderOp *pHeapOp = opSet->GetShaderOp("AtomicsHeap");

   // Reassign shader stages to 64-bit versions
   // Collect 64-bit shaders
   pHeapOp->CS = pHeapOp->GetString("CSTY64");
   pHeapOp->VS = pHeapOp->GetString("VSTY64");
   pHeapOp->PS = pHeapOp->GetString("PSTY64");
   pHeapOp->AS = pHeapOp->GetString("ASTY64");
   pHeapOp->MS = pHeapOp->GetString("MSTY64");

   // Executes the shader op in whatever stage configuration is currently set.
   auto runHeapOp = [&]() {
     return RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsHeap", nullptr, opSet);
   };

   // Test compute shader
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in compute shader");
   std::shared_ptr<ShaderOpTestResult> result = runHeapOp();
   VerifyAtomicsTypedTest(result, 32*32, 64);

   // Test mesh shader if available
   // The compute shader is cleared first.
   pHeapOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in amp/mesh/pixel shader");
     result = runHeapOp();
     VerifyAtomicsTypedTest(result, 8*8*2 + 8*8*2 + 64*64, 64);
   }

   // Test Vertex + Pixel shader
   // The mesh shader is cleared first.
   pHeapOp->MS = nullptr;
   LogCommentFmt(L"Verifying 64-bit integer atomic operations on typed resources in vert/pixel shader");
   result = runHeapOp();
   VerifyAtomicsTypedTest(result, 64*64+6, 64);
 }

 // Runs the "AtomicsRoot" shader op from ShaderOpArith.xml with its CS/AS/MS
 // stages swapped for the "*SH64" shader strings, and checks the read-back
 // results with VerifyAtomicsSharedTest. The op is executed with a compute
 // shader and then, only when the device supports mesh shaders, with
 // amp/mesh shaders. There is no vertex/pixel run in this test.
 //
 // The test returns early when no SM 6.6 device can be created, and logs a
 // Skipped result when the device lacks int64 support or 64-bit atomics on
 // groupshared variables.
 TEST_F(ExecutionTest, AtomicsShared64Test) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice, D3D_SHADER_MODEL_6_6))
     return;

   if (!DoesDeviceSupportInt64(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 operations.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   if (!DoesDeviceSupportShared64Atomics(pDevice)) {
     WEX::Logging::Log::Comment(L"Device does not support int64 atomic operations on groupshared variables.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
   }

   // Parse once; the same ShaderOpSet is reused and mutated for both runs.
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("AtomicsRoot");

   // Reassign shader stages to 64-bit versions
   // Collect 64-bit shaders
   pShaderOp->CS = pShaderOp->GetString("CSSH64");
   pShaderOp->AS = pShaderOp->GetString("ASSH64");
   pShaderOp->MS = pShaderOp->GetString("MSSH64");

   LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared variables in compute shader");
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
   VerifyAtomicsSharedTest(test, 32*32, 64);

   // Test mesh shader if available
   // CS is cleared before the second run; presumably the runner picks the
   // pipeline from whichever stages are still set - TODO confirm.
   pShaderOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying 64-bit integer atomic operations on groupshared variables in amp/mesh/pixel shader");
     test = RunShaderOpTestAfterParse(pDevice, m_support, "AtomicsRoot", nullptr, ShaderOpSet);
     VerifyAtomicsSharedTest(test, 8*8*2 + 8*8*2, 64);
   }
 }


 // Float Atomics

 // These operations are almost the same as for the 32-bit and 64-bit integer tests
 // The difference is that there is no need to verify the upper bits.
 // So there is no storing of different parts in upper and lower halves.
 // Additionally, the only operations that are supported on floats
 // are compare and exchange operations. So that's all that is tested here.
 // Just as above, a number of lanes are assigned the same output value.
 // Unlike above, one location is needed for the result of the special NaN test
 // For this reason, the conversion is reduced by one and shifted by one to leave
 // the zero-indexed location available.

 // Verify results for a particular set of atomics results
 // Checks one 64-entry array of float atomic results.
 //   results[0]     - slot reserved for the NaN test; must hold the sentinel.
 //   results[1..63] - each entry, truncated to int, must map back to its own
 //                    index through (value / 3) % 63 + 1.
 // Mismatches are reported through the VERIFY_* macros.
 void VerifyAtomicFloatResults(const float *results) {
   // The sentinel value is 0.123; a range check is used rather than an exact
   // floating-point comparison.
   VERIFY_IS_TRUE(results[0] >= 0.120 && results[0] < 0.125);

   // Index 0 is the NaN slot, so the per-lane results begin at index 1.
   int i = 1;
   while (i < 64) {
     VERIFY_ARE_EQUAL((int(results[i])/3)%63 + 1, i);
     ++i;
   }
 }

 void VerifyAtomicsFloatSharedTest(std::shared_ptr<ShaderOpTestResult> test) {
   MappedData Data;
   const float *pData = nullptr;

   test->Test->GetReadBackData("U4", &Data);
   pData = (float *)Data.data();

   LogCommentFmt(L"Verifying float cmp/xchg atomic operations on groupshared variables");
   VerifyAtomicFloatResults(pData);
 }

 void VerifyAtomicsFloatTest(std::shared_ptr<ShaderOpTestResult> test) {

   // struct mirroring that in the shader
   struct AtomicStuff {
     float prepad[2][3];
     float fltEl[2];
     struct useless {
       uint32_t unused[3];
     } postpad;
   };

   // Test Compute Shader
   MappedData Data;
   const float *pData = nullptr;

   test->Test->GetReadBackData("U0", &Data);
   const AtomicStuff *pStructData = (AtomicStuff *)Data.data();
   LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWStructuredBuffer resources");
   VERIFY_IS_TRUE(pStructData[0].fltEl[1] >= 0.120 && pStructData[0].fltEl[1] < 0.125);
   for (int i = 1; i < 64; i++) {
     VERIFY_ARE_EQUAL((int(pStructData[i].fltEl[1])/3)%63 + 1, i);
   }

   test->Test->GetReadBackData("U1", &Data);
   pData = (float *)Data.data();
   LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWByteAddressBuffer resources");
   VerifyAtomicFloatResults(pData);

   test->Test->GetReadBackData("U2", &Data);
   pData = (float *)Data.data();
   LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWBuffer resources");
   VerifyAtomicFloatResults(pData);

   test->Test->GetReadBackData("U3", &Data);
   pData = (float *)Data.data();
   LogCommentFmt(L"Verifying float cmp/xchg atomic operations on RWTexture resources");
   VerifyAtomicFloatResults(pData);

 }

 // Runs the "FloatAtomics" shader op from ShaderOpArith.xml up to three times:
 // with a compute shader, with amp/mesh/pixel shaders (only when the device
 // supports mesh shaders), and with vertex/pixel shaders. Resource results are
 // checked after every run; the groupshared results ("U4") are checked only
 // after the compute and mesh runs.
 TEST_F(ExecutionTest, AtomicsFloatTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   // NOTE(review): unlike the 64-bit atomics tests, no shader model is passed
   // to CreateDevice here - confirm its default is sufficient for this op.
   CComPtr<ID3D12Device> pDevice;
   if (!CreateDevice(&pDevice))
     return;

   // Parse once; the same ShaderOpSet is reused and mutated for every run.
   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
     std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("FloatAtomics");

   // Test compute shader
   LogCommentFmt(L"Verifying float cmp/xchg atomic operations in compute shader");
   std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
   VerifyAtomicsFloatTest(test);
   VerifyAtomicsFloatSharedTest(test);

   // Test mesh shader if available
   // Clearing CS stays in effect for the remaining runs; presumably the runner
   // picks the pipeline from whichever stages are still set - TODO confirm.
   pShaderOp->CS = nullptr;
   if (DoesDeviceSupportMeshShaders(pDevice)) {
     LogCommentFmt(L"Verifying float cmp/xchg atomic operations in amp/mesh/pixel shaders");
     test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
     VerifyAtomicsFloatTest(test);
     VerifyAtomicsFloatSharedTest(test);
   }

   // Test Vertex + Pixel shader
   // The groupshared check is intentionally absent from this run.
   pShaderOp->MS = nullptr;
     LogCommentFmt(L"Verifying float cmp/xchg atomic operations in vert/pixel shaders");
   test = RunShaderOpTestAfterParse(pDevice, m_support, "FloatAtomics", nullptr, ShaderOpSet);
   VerifyAtomicsFloatTest(test);
 }

 // The IsHelperLane test renders a 3-pixel triangle into a 16x16 render target restricted
 // to a 2x2 viewport aligned at (0,0), which guarantees it will run in a single quad.
 //
 // Pixels to be rendered*
 // (0,0)*  (0,1)*
 // (1,0)   (1,1)*
 //
 // Pixel (1,0) is not rendered and is in helper lane.
 //
 // Each thread will use ddx_fine and ddy_fine to read the IsHelperLane() values from other threads.
 // The bottom right pixel will write the results into the UAV buffer.
 //
 // Then the top level pixel (0,0) is discarded and the process above is repeated.
 //
 // Runs with shader models 6.0 and 6.6 to test both the HLSL built-in IsHelperLane fallback
 // function (sm <= 6.5) and the IsHelperLane intrinsics (sm >= 6.6).
 //
 // Runs the "HelperLaneTestNoWave" shader op once per shader model in
 // TestShaderModels and checks the IsHelperLane() values the shader wrote to
 // "UAVBuffer0": record 0 holds the values before the discard, record 1 the
 // values after it. Shader models for which no device can be created are
 // silently skipped.
 TEST_F(ExecutionTest, HelperLaneTest) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

   // Extra compiler arguments; the define is forwarded to the shaders only
   // when this file itself is built with ISHELPERLANE_PLACEHOLDER.
 #ifdef ISHELPERLANE_PLACEHOLDER
   string args = "-DISHELPERLANE_PLACEHOLDER";
 #else
   string args = "";
 #endif

   D3D_SHADER_MODEL TestShaderModels[] = { D3D_SHADER_MODEL_6_0, D3D_SHADER_MODEL_6_6 };
   for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
     D3D_SHADER_MODEL sm = TestShaderModels[i];
     // The low nibble of the enum value is logged as the minor version.
     LogCommentFmt(L"Verifying IsHelperLane in shader model 6.%1u", ((UINT)sm & 0x0f));

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */))
       continue;

     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestNoWave",
       // This callback is invoked while the test is creating the resources it needs.
       // It pre-fills the UAV with 0xCC and sets the compiler arguments on the
       // first two shaders of the op.
       [&](LPCSTR Name, std::vector<BYTE>& Data, st::ShaderOp* pShaderOp) {
         VERIFY_IS_TRUE(0 == _stricmp(Name, "UAVBuffer0"));
         std::fill(Data.begin(), Data.end(), (BYTE)0xCC);
         pShaderOp->Shaders.at(0).Arguments = args.c_str();
         pShaderOp->Shaders.at(1).Arguments = args.c_str();
       }, ShaderOpSet);

     // Host-side view of one record the shader writes to UAVBuffer0.
     struct HelperLaneTestResult {
       int32_t is_helper_00;
       int32_t is_helper_10;
       int32_t is_helper_01;
       int32_t is_helper_11;
     };

     MappedData uavData;
     test->Test->GetReadBackData("UAVBuffer0", &uavData);
     HelperLaneTestResult* pTestResults = (HelperLaneTestResult*)uavData.data();

     // The render target is read back but its contents are not verified.
     MappedData renderData;
     test->Test->GetReadBackData("RTarget", &renderData);
     const uint32_t* pPixels = (uint32_t*)renderData.data();

     // before discard
     VERIFY_ARE_EQUAL(pTestResults[0].is_helper_00, 0);
     VERIFY_ARE_EQUAL(pTestResults[0].is_helper_10, 0);
     VERIFY_ARE_EQUAL(pTestResults[0].is_helper_01, 1);
     VERIFY_ARE_EQUAL(pTestResults[0].is_helper_11, 0);

     // after discard
     VERIFY_ARE_EQUAL(pTestResults[1].is_helper_00, 1);
     VERIFY_ARE_EQUAL(pTestResults[1].is_helper_10, 0);
     VERIFY_ARE_EQUAL(pTestResults[1].is_helper_01, 1);
     VERIFY_ARE_EQUAL(pTestResults[1].is_helper_11, 0);

     // Silences the unused-variable diagnostic for pPixels.
     UNREFERENCED_PARAMETER(pPixels);
   }
 }

 // Results of the shader model 6.0 wave operations exercised by the helper
 // lane wave test. The struct is overlaid on read-back UAV data, so the field
 // order and sizes must not change.
 // NOTE(review): the layout presumably mirrors a struct in the HLSL source in
 // ShaderOpArith.xml - keep the two in sync.
 struct HelperLaneWaveTestResult60 {
   // 6.0 wave ops
   int32_t anyTrue;
   int32_t allTrue;
   XMUINT4 ballot;
   int32_t waterfallLoopCount;
   int32_t allEqual;
   int32_t countBits;
   int32_t sum;
   int32_t product;
   int32_t bitAnd;
   int32_t bitOr;
   int32_t bitXor;
   int32_t min;
   int32_t max;
   int32_t prefixCountBits;
   int32_t prefixProduct;
   int32_t prefixSum;
 };

 // IsHelperLane() values for the reporting lane and for the three other lanes
 // of its quad (across X, across Y, and diagonal). Overlaid on read-back UAV
 // data, so the field order and sizes must not change.
 struct HelperLaneQuadTestResult {
   int32_t is_helper_this;
   int32_t is_helper_across_X;
   int32_t is_helper_across_Y;
   int32_t is_helper_across_Diag;
 };

 // Results of the shader model 6.5 wave operations (WaveMatch and the
 // WaveMultiPrefix* family) exercised by the helper lane wave test. Overlaid
 // on read-back UAV data, so the field order and sizes must not change.
 struct HelperLaneWaveTestResult65 {
   // 6.5 wave ops
   XMUINT4  match;
   int32_t mpCountBits;
   int32_t mpSum;
   int32_t mpProduct;
   int32_t mpBitAnd;
   int32_t mpBitOr;
   int32_t mpBitXor;
 };

 // One complete helper lane wave test record: the 6.0 wave results, the quad
 // results, then the 6.5 wave results, in that order. The tests index the
 // read-back UAV data as an array of these records.
 struct HelperLaneWaveTestResult {
   HelperLaneWaveTestResult60 sm60;
   HelperLaneQuadTestResult sm60_quad;
   HelperLaneWaveTestResult65 sm65;
 };

 // NOTE(review): these two structs and the globals `f` and `b` are not
 // referenced anywhere in the surrounding code shown here; they look like a
 // leftover experiment with nested aggregate initialization. Confirm they are
 // unused in the rest of the file before removing them.
 struct foo { int32_t a; int32_t b; int32_t c; };
 struct bar { foo f; int32_t d; XMUINT4 g; };
 foo f = {1, 2, 3};
 bar b = { { 1, 2, 3 }, 0, { 1, 2, 3, 4 } };

 // Expected-result tables for the helper lane wave test. Each table is an
 // aggregate initializer for HelperLaneWaveTestResult, so the values appear in
 // declaration order:
 //   HelperLaneWaveTestResult60: anyTrue, allTrue, ballot, waterfallLoopCount,
 //     allEqual, countBits, sum, product, bitAnd, bitOr, bitXor, min, max,
 //     prefixCountBits, prefixProduct, prefixSum
 //   HelperLaneQuadTestResult: is_helper_this, is_helper_across_X,
 //     is_helper_across_Y, is_helper_across_Diag
 //   HelperLaneWaveTestResult65: match, mpCountBits, mpSum, mpProduct,
 //     mpBitAnd, mpBitOr, mpBitXor

 // Compute shader: ballot 0x7, i.e. three active lanes and no helper lanes.
 HelperLaneWaveTestResult HelperLane_CS_ExpectedResults = {
   // HelperLaneWaveTestResult60
   { 0, 1, { 0x7, 0, 0, 0 }, 3, 1, 3, 12, 64, 1, 0, 0, 10, 1, 2, 16, 4 },
   // HelperLaneQuadTestResult
   { 0, 0, 0, 0 },
   // HelperLaneWaveTestResult65
   { {0x7, 0, 0, 0}, 2, 4, 16, 1, 0, 0 }
 };

 // Vertex shader expectations are a copy of the compute shader ones.
 HelperLaneWaveTestResult HelperLane_VS_ExpectedResults = HelperLane_CS_ExpectedResults;

 // Pixel shader before the discard: ballot 0xB (lanes 0, 1 and 3); the quad
 // row reports the lane across X as a helper lane.
 HelperLaneWaveTestResult HelperLane_PS_ExpectedResults = {
   // HelperLaneWaveTestResult60
   { 0, 1, { 0xB, 0, 0, 0 }, 3, 1, 3, 12, 64, 1, 0, 0, 10, 1, 2, 16, 4 },
   // HelperLaneQuadTestResult
   { 0, 1, 0, 0 },
   // HelperLaneWaveTestResult65
   { {0xB, 0, 0, 0}, 2, 4, 16, 1, 0, 0 }
 };

 // Pixel shader after one pixel is discarded: ballot 0xA (lanes 1 and 3); the
 // quad row additionally reports the diagonal lane as a helper lane.
 HelperLaneWaveTestResult HelperLane_PSAfterDiscard_ExpectedResults = {
   // HelperLaneWaveTestResult60
   { 0, 1, { 0xA, 0, 0, 0 }, 2, 1, 2, 8, 16, 1, 0, 0, 10, 1, 1, 4, 2 },
   // HelperLaneQuadTestResult
   { 0, 1, 0, 1 },
   // HelperLaneWaveTestResult65
   { {0xA, 0, 0, 0}, 1, 2, 4, 1, 0, 0 }
 };

 // Pixel shader expectations used for shader model 6.7 and above, where the
 // wave results cover all four lanes of the quad (ballot 0xF), helper lane
 // included.
 HelperLaneWaveTestResult IncludesHelperLane_PS_ExpectedResults = {
   // HelperLaneWaveTestResult60
   { 1, 0, { 0xF, 0, 0, 0 }, 4, 0, 4, 16, 256, 0, 1, 1, 1, 10, 3, 64, 6 },
   // HelperLaneQuadTestResult
   { 0, 1, 0, 0 },
   // HelperLaneWaveTestResult65
   { {0xF, 0, 0, 0}, 3, 6, 64, 0, 1, 1 }
 };

 // Shader model 6.7+ expectations after the discard. Compared with the table
 // above, only the xor results (bitXor, mpBitXor) and the diagonal quad entry
 // differ.
 HelperLaneWaveTestResult IncludesHelperLane_PSAfterDiscard_ExpectedResults = {
   // HelperLaneWaveTestResult60
   { 1, 0, { 0xF, 0, 0, 0 }, 4, 0, 4, 16, 256, 0, 1, 0, 1, 10, 3, 64, 6 },
   // HelperLaneQuadTestResult
   { 0, 1, 0, 1 },
   // HelperLaneWaveTestResult65
   { {0xF, 0, 0, 0}, 3, 6, 64, 0, 1, 0 }
 };

 // Compares a scalar result against its expected value and logs one line with
 // the description and both values. The line is prefixed with " - " on a match
 // and "FAILED: " on a mismatch. Returns true when the values are equal.
 bool HelperLaneResultLogAndVerify(const wchar_t* testDesc, uint32_t expectedValue, uint32_t actualValue) {
   const bool isMatch = !(expectedValue != actualValue);
   const wchar_t *statusPrefix = isMatch ? L" - " : L"FAILED: ";
   LogCommentFmt(L"%s%s, expected = %u, actual = %u", statusPrefix, testDesc, expectedValue, actualValue);
   return isMatch;
 }

 // Four-component overload: compares two XMUINT4 values component by component
 // and logs one line with the description and both values in hex. The line is
 // prefixed with " - " on a match and "FAILED: " on a mismatch. Returns true
 // only when all four components are equal.
 bool HelperLaneResultLogAndVerify(const wchar_t* testDesc, XMUINT4 expectedValue, XMUINT4 actualValue) {
   const bool anyComponentDiffers =
       expectedValue.x != actualValue.x || expectedValue.y != actualValue.y ||
       expectedValue.z != actualValue.z || expectedValue.w != actualValue.w;
   const bool isMatch = !anyComponentDiffers;
   const wchar_t *statusPrefix = isMatch ? L" - " : L"FAILED: ";
   LogCommentFmt(L"%s%s, expected = (0x%X,0x%X,0x%X,0x%X), actual = (0x%X,0x%X,0x%X,0x%X)", statusPrefix, testDesc,
     expectedValue.x, expectedValue.y, expectedValue.z, expectedValue.w,
     actualValue.x, actualValue.y, actualValue.z, actualValue.w);
   return isMatch;
 }


 // Compares one helper lane wave test record against a table of expected
 // values, logging every individual comparison.
 //   sm              - shader model of the run; the 6.5 wave results are only
 //                     compared when sm >= 6.5.
 //   testResults     - record read back from the GPU.
 //   expectedResults - values the record is expected to hold.
 //   verifyQuads     - when true, the quad results are compared as well.
 // Returns true only if every comparison performed matched. All comparisons
 // are always executed so that the log is complete even after a failure.
 bool VerifyHelperLaneWaveResults(ExecutionTest::D3D_SHADER_MODEL sm, HelperLaneWaveTestResult& testResults, HelperLaneWaveTestResult& expectedResults, bool verifyQuads) {
   bool passed = true;
   // Folds the outcome of one comparison into the overall verdict.
   auto record = [&passed](bool matched) { passed &= matched; };

   // Shader model 6.0 wave operations.
   const HelperLaneWaveTestResult60 &got60 = testResults.sm60;
   const HelperLaneWaveTestResult60 &want60 = expectedResults.sm60;

   record(HelperLaneResultLogAndVerify(L"WaveActiveAnyTrue(IsHelperLane())", want60.anyTrue, got60.anyTrue));
   record(HelperLaneResultLogAndVerify(L"WaveActiveAllTrue(!IsHelperLane())", want60.allTrue, got60.allTrue));
   record(HelperLaneResultLogAndVerify(L"WaveActiveBallot(true) has exactly 3 bits set", want60.ballot, got60.ballot));

   record(HelperLaneResultLogAndVerify(L"!WaveReadLaneFirst(IsHelperLane()) && WaveIsFirstLane() in a waterfall loop", want60.waterfallLoopCount, got60.waterfallLoopCount));
   record(HelperLaneResultLogAndVerify(L"WaveActiveAllEqual(IsHelperLane())", want60.allEqual, got60.allEqual));
   record(HelperLaneResultLogAndVerify(L"WaveActiveCountBits(true)", want60.countBits, got60.countBits));
   record(HelperLaneResultLogAndVerify(L"WaveActiveSum(4)", want60.sum, got60.sum));
   record(HelperLaneResultLogAndVerify(L"WaveActiveProduct(4)", want60.product, got60.product));

   record(HelperLaneResultLogAndVerify(L"WaveActiveBitAnd(!IsHelperLane())", want60.bitAnd, got60.bitAnd));
   record(HelperLaneResultLogAndVerify(L"WaveActiveBitOr(IsHelperLane())", want60.bitOr, got60.bitOr));
   record(HelperLaneResultLogAndVerify(L"WaveActiveBitXor(IsHelperLane())", want60.bitXor, got60.bitXor));

   record(HelperLaneResultLogAndVerify(L"WaveActiveMin(IsHelperLane() ? 1 : 10)", want60.min, got60.min));
   record(HelperLaneResultLogAndVerify(L"WaveActiveMax(IsHelperLane() ? 10 : 1)", want60.max, got60.max));

   record(HelperLaneResultLogAndVerify(L"WavePrefixCountBits(1)", want60.prefixCountBits, got60.prefixCountBits));
   record(HelperLaneResultLogAndVerify(L"WavePrefixProduct(4)", want60.prefixProduct, got60.prefixProduct));
   record(HelperLaneResultLogAndVerify(L"WavePrefixSum(2)", want60.prefixSum, got60.prefixSum));

   // Quad read results, on request only.
   if (verifyQuads) {
     const HelperLaneQuadTestResult &gotQuad = testResults.sm60_quad;
     const HelperLaneQuadTestResult &wantQuad = expectedResults.sm60_quad;
     record(HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 3 / pixel (1,1) - IsHelperLane()", wantQuad.is_helper_this, gotQuad.is_helper_this));
     record(HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 2 / pixel (0,1) - IsHelperLane()", wantQuad.is_helper_across_X, gotQuad.is_helper_across_X));
     record(HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 1 / pixel (1,0) - IsHelperLane()", wantQuad.is_helper_across_Y, gotQuad.is_helper_across_Y));
     record(HelperLaneResultLogAndVerify(L"QuadReadAcross* - lane 0 / pixel (0,0) - IsHelperLane()", wantQuad.is_helper_across_Diag, gotQuad.is_helper_across_Diag));
   }

   // Shader model 6.5 wave operations.
   if (sm >= ExecutionTest::D3D_SHADER_MODEL_6_5) {
     const HelperLaneWaveTestResult65 &got65 = testResults.sm65;
     const HelperLaneWaveTestResult65 &want65 = expectedResults.sm65;

     record(HelperLaneResultLogAndVerify(L"WaveMatch(true) has exactly 3 bits set", want65.match, got65.match));
     record(HelperLaneResultLogAndVerify(L"WaveMultiPrefixCountBits(1, no_masked_bits)", want65.mpCountBits, got65.mpCountBits));
     record(HelperLaneResultLogAndVerify(L"WaveMultiPrefixSum(2, no_masked_bits)", want65.mpSum, got65.mpSum));
     record(HelperLaneResultLogAndVerify(L"WaveMultiPrefixProduct(4, no_masked_bits)", want65.mpProduct, got65.mpProduct));

     record(HelperLaneResultLogAndVerify(L"WaveMultiPrefixAnd(IsHelperLane() ? 0 : 1, no_masked_bits)", want65.mpBitAnd, got65.mpBitAnd));
     record(HelperLaneResultLogAndVerify(L"WaveMultiPrefixOr(IsHelperLane() ? 1 : 0, no_masked_bits)", want65.mpBitOr, got65.mpBitOr));
     record(HelperLaneResultLogAndVerify(L"verify WaveMultiPrefixXor(IsHelperLane() ? 1 : 0, no_masked_bits)", want65.mpBitXor, got65.mpBitXor));
   }
   return passed;
 }
 // Contrary to compute or pixel shaders, the layout of lanes in vertex shaders is
 // not specified. A conforming implementation could, in the extreme case, decide
 // to dispatch three waves that each process only a single vertex.
 // So instead of comparing with fixed expected results, calculate the correct
 // results from the ballot.
 // Verifies one helper lane wave test record produced by a vertex shader.
 // The expected values are derived from the number of lanes set in the
 // reported WaveActiveBallot mask rather than taken from a fixed table.
 //   sm          - shader model of the run; the 6.5 wave results are only
 //                 compared when sm >= 6.5.
 //   testResults - record read back from the GPU.
 // Returns true only if every check matched, including the check that the
 // ballot has between 1 and 3 bits set. All checks are always executed so that
 // the log is complete even after a failure.
 bool VerifyHelperLaneWaveResultsForVS(ExecutionTest::D3D_SHADER_MODEL sm,
                                       HelperLaneWaveTestResult &testResults) {
   bool passed = true;
   XMUINT4 mask = testResults.sm60.ballot;

   // Number of active lanes reported by the ballot, across all four words.
   unsigned countBits = 0;
   countBits += (unsigned)std::bitset<32>(mask.x).count();
   countBits += (unsigned)std::bitset<32>(mask.y).count();
   countBits += (unsigned)std::bitset<32>(mask.z).count();
   countBits += (unsigned)std::bitset<32>(mask.w).count();

   // The prefix operations are expected to cover countBits - 1 lanes. Clamp at
   // zero so that an (already failing) empty ballot does not wrap around.
   const unsigned prefixLanes = countBits > 0 ? countBits - 1 : 0;

   // 4^exponent in 32-bit unsigned arithmetic. Avoids the floating-point
   // round trip through std::pow, whose conversion back to unsigned is
   // undefined once the value no longer fits.
   auto pow4 = [](unsigned exponent) {
     uint32_t value = 1;
     for (unsigned n = 0; n < exponent; ++n)
       value *= 4u;
     return value;
   };

   {
     // For VS, IsHelperLane always return false.
     HelperLaneWaveTestResult60 &tr60 = testResults.sm60;
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveAnyTrue(IsHelperLane())",
                                            0, tr60.anyTrue);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveActiveAllTrue(!IsHelperLane())", 1, tr60.allTrue);

     // Three vertices are drawn, so between one and three lanes may share the
     // reporting wave. Log the bit count (not the raw XMUINT4, which cannot be
     // passed to a %u slot) and make a violation fail the verification.
     bool ballotMatch = 1 <= countBits && countBits <= 3;
     LogCommentFmt(L"%sWaveActiveBallot(true) expected 1~3 bits set, actual = %u",
         ballotMatch ? L" - " : L"FAILED: ", countBits);
     passed &= ballotMatch;

     passed &= HelperLaneResultLogAndVerify(
         L"!WaveReadLaneFirst(IsHelperLane()) && WaveIsFirstLane() in a "
         L"waterfall loop",
         countBits, tr60.waterfallLoopCount);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveActiveAllEqual(IsHelperLane())", 1, tr60.allEqual);
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveCountBits(true)",
                                            countBits, tr60.countBits);
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveSum(4)", 4 * countBits,
                                            tr60.sum);
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveProduct(4)",
                                            pow4(countBits), tr60.product);

     passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitAnd(!IsHelperLane())",
                                            1, tr60.bitAnd);
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitOr(IsHelperLane())",
                                            0, tr60.bitOr);
     passed &= HelperLaneResultLogAndVerify(L"WaveActiveBitXor(IsHelperLane())",
                                            0, tr60.bitXor);

     passed &= HelperLaneResultLogAndVerify(
         L"WaveActiveMin(IsHelperLane() ? 1 : 10)", 10, tr60.min);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveActiveMax(IsHelperLane() ? 10 : 1)", 1, tr60.max);

     passed &= HelperLaneResultLogAndVerify(L"WavePrefixCountBits(1)",
                                            prefixLanes,
                                            tr60.prefixCountBits);
     passed &= HelperLaneResultLogAndVerify(L"WavePrefixProduct(4)",
                                            pow4(prefixLanes),
                                            tr60.prefixProduct);
     passed &= HelperLaneResultLogAndVerify(L"WavePrefixSum(2)",
                                            2 * prefixLanes, tr60.prefixSum);
   }

   if (sm >= ExecutionTest::D3D_SHADER_MODEL_6_5) {
     HelperLaneWaveTestResult65 &tr65 = testResults.sm65;

     // WaveMatch(true) must report exactly the lanes seen by the ballot.
     passed &= HelperLaneResultLogAndVerify(
         L"WaveMatch(true) has exactly 3 bits set", mask, tr65.match);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveMultiPrefixCountBits(1, no_masked_bits)", prefixLanes,
         tr65.mpCountBits);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveMultiPrefixSum(2, no_masked_bits)", 2 * prefixLanes, tr65.mpSum);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveMultiPrefixProduct(4, no_masked_bits)",
         pow4(prefixLanes),
         tr65.mpProduct);

     passed &= HelperLaneResultLogAndVerify(
         L"WaveMultiPrefixAnd(IsHelperLane() ? 0 : 1, no_masked_bits)",
         1, tr65.mpBitAnd);
     passed &= HelperLaneResultLogAndVerify(
         L"WaveMultiPrefixOr(IsHelperLane() ? 1 : 0, no_masked_bits)",
         0, tr65.mpBitOr);
     passed &= HelperLaneResultLogAndVerify(
         L"verify WaveMultiPrefixXor(IsHelperLane() ? 1 : 0, no_masked_bits)",
         0, tr65.mpBitXor);
   }
   return passed;
 }

 // Resource-initialization callback for the helper lane and quad tests.
 // Verifies that the buffer being initialized is "UAVBuffer0" (compared
 // case-insensitively) and overwrites every byte of it with 0xCC.
 // pShaderOp is not used.
 void CleanUAVBuffer0Buffer(LPCSTR BufferName, std::vector<BYTE>& Data, st::ShaderOp* pShaderOp) {
   UNREFERENCED_PARAMETER(pShaderOp);
   VERIFY_IS_TRUE(0 == _stricmp(BufferName, "UAVBuffer0"));
   for (BYTE &fillByte : Data)
     fillByte = (BYTE)0xCC;
 }

 //
 // The IsHelperLane test that uses Wave intrinsics to verify IsHelperLane() and Wave operations on active lanes.
 //
 // Runs with shader models 6.0, 6.5, 6.6 and 6.7 to test both the HLSL built-in IsHelperLane fallback
 // function (sm <= 6.5) and the IsHelperLane intrinsics (sm >= 6.6) and the shader model 6.5 wave intrinsics (sm >= 6.5).
 //
 // For compute and vertex shaders IsHelperLane() always returns false and might be optimized away in the front end.
 // However it can be exposed to the driver in CS/VS through an exported function in a library so drivers need
 // to be prepared to handle it. For this reason the test is compiled with disabled optimizations (/Od).
 // The tests are also validating that wave intrinsics operate correctly with 3 threads in a CS or 3 vertices
 // in a VS where the rest of the lanes in the wave are not active (dead lanes).
 //
 // Runs the "HelperLaneTestWave" shader op for every shader model in
 // TestShaderModels. For each one it executes a compute run and then a
 // vertex + pixel run, and compares the records read back from "UAVBuffer0"
 // with the expected-result tables (or, for the vertex shader, with values
 // derived from the reported ballot). Per-shader-model outcomes are
 // accumulated and a single VERIFY at the end decides the test result.
 TEST_F(ExecutionTest, HelperLaneTestWave) {
   WEX::TestExecution::SetVerifyOutput verifySettings(WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet = std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
   st::ShaderOp* pShaderOp = ShaderOpSet->GetShaderOp("HelperLaneTestWave");

   // Optimizations are always disabled; the define is forwarded only when
   // this file itself is built with ISHELPERLANE_PLACEHOLDER.
 #ifdef ISHELPERLANE_PLACEHOLDER
   LPCSTR args = "/Od -DISHELPERLANE_PLACEHOLDER";
 #else
   LPCSTR args = "/Od";
 #endif

   // Apply the arguments to every shader in the op.
   if (args[0]) {
     for (st::ShaderOpShader& S : pShaderOp->Shaders)
       S.Arguments = args;
   }

   bool testPassed = true;

   D3D_SHADER_MODEL TestShaderModels[] = { D3D_SHADER_MODEL_6_0, D3D_SHADER_MODEL_6_5, D3D_SHADER_MODEL_6_6, D3D_SHADER_MODEL_6_7 };
   for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
     D3D_SHADER_MODEL sm = TestShaderModels[i];
     // The low nibble of the enum value is logged as the minor version.
     LogCommentFmt(L"\r\nVerifying IsHelperLane using Wave intrinsics in shader model 6.%1u", ((UINT)sm & 0x0f));

     bool smPassed = true;

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }

     // On WARP / the basic adapter the whole test is skipped, not just this
     // shader model.
     if (GetTestParamUseWARP(UseWarpByDefault()) || IsDeviceBasicAdapter(pDevice)) {
       WEX::Logging::Log::Comment(L"WARP has a known issue with HelperLaneTestWave.");
       WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
       return;
     }

     if (!DoesDeviceSupportWaveOps(pDevice)) {
       LogCommentFmt(L"Device does not support wave operations in shader model 6.%1u", ((UINT)sm & 0x0f));
       continue;
     }

     // For 6.0 the stages parsed from the XML are used as-is. Every later
     // shader model reassigns CS, which also undoes the CS = nullptr done by
     // the previous iteration's vertex + pixel run.
     if (sm == D3D_SHADER_MODEL_6_5) {
       // Reassign shader stages to 6.5 versions
       pShaderOp->CS = pShaderOp->GetString("CS65");
       pShaderOp->VS = pShaderOp->GetString("VS65");
       pShaderOp->PS = pShaderOp->GetString("PS65");
     } else if (sm == D3D_SHADER_MODEL_6_6) {
       // Reassign shader stages to 6.6 versions
       pShaderOp->CS = pShaderOp->GetString("CS66");
       pShaderOp->VS = pShaderOp->GetString("VS66");
       pShaderOp->PS = pShaderOp->GetString("PS66");
     } else if (sm == D3D_SHADER_MODEL_6_7) {
       // Reassign shader stages to 6.7 versions
       pShaderOp->CS = pShaderOp->GetString("CS66");
       pShaderOp->VS = pShaderOp->GetString("VS66");
       // Only PS has SM 6.7 version to test new [WaveOpsIncludeHelperLanes] attribute
       pShaderOp->PS = pShaderOp->GetString("PS67");
     }

     // Record indices within UAVBuffer0. CS and VS both use record 0 because
     // they are produced by separate runs.
     const unsigned CS_INDEX = 0, VS_INDEX = 0, PS_INDEX = 1, PS_INDEX_AFTER_DISCARD = 2;

     // Test Compute shader
     {
       std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave",
         CleanUAVBuffer0Buffer, ShaderOpSet);

       MappedData uavData;
       test->Test->GetReadBackData("UAVBuffer0", &uavData);
       HelperLaneWaveTestResult* pTestResults = (HelperLaneWaveTestResult*)uavData.data();
       LogCommentFmt(L"\r\nCompute shader");
       smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[CS_INDEX], HelperLane_CS_ExpectedResults, true);
     }

     // From shader model 6.7 on, the pixel shader is compared against the
     // tables in which wave operations include helper lanes.
     HelperLaneWaveTestResult &PS_ExpectedResults =
         (sm >= D3D_SHADER_MODEL_6_7) ? IncludesHelperLane_PS_ExpectedResults
                                      : HelperLane_PS_ExpectedResults;
     HelperLaneWaveTestResult &PSAfterDiscard_ExpectedResults =
         (sm >= D3D_SHADER_MODEL_6_7)
             ? IncludesHelperLane_PSAfterDiscard_ExpectedResults
             : HelperLane_PSAfterDiscard_ExpectedResults;

     // Test Vertex + Pixel shader
     {
       // Presumably the runner picks the graphics pipeline once CS is unset -
       // TODO confirm.
       pShaderOp->CS = nullptr;
       std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(pDevice, m_support, "HelperLaneTestWave", CleanUAVBuffer0Buffer, ShaderOpSet);

       MappedData uavData;
       test->Test->GetReadBackData("UAVBuffer0", &uavData);
       HelperLaneWaveTestResult* pTestResults = (HelperLaneWaveTestResult*)uavData.data();
       LogCommentFmt(L"\r\nVertex shader");
       smPassed &= VerifyHelperLaneWaveResultsForVS(sm, pTestResults[VS_INDEX]);
       LogCommentFmt(L"\r\nPixel shader");
       smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[PS_INDEX], PS_ExpectedResults, true);
       LogCommentFmt(L"\r\nPixel shader with discarded pixel");
       smPassed &= VerifyHelperLaneWaveResults(sm, pTestResults[PS_INDEX_AFTER_DISCARD], PSAfterDiscard_ExpectedResults, true);

       // The render target is read back but its contents are not verified.
       MappedData renderData;
       test->Test->GetReadBackData("RTarget", &renderData);
       const uint32_t* pPixels = (uint32_t*)renderData.data();

       UNREFERENCED_PARAMETER(pPixels);
     }
     testPassed &= smPassed;
   }
   // A single verification at the end, so every shader model is run and
   // logged before the test is failed.
   VERIFY_ARE_EQUAL(testPassed, true);
 }

 // Host-side mirror of a two-component integer result entry.
 struct int2 {
   int x;
   int y;
 };

 // Checks one block of 64 QuadAny/QuadAll result entries.
 // The block is split into three contiguous ranges, each with a single
 // expected {x, y} pair:
 //   entries  0..3   -> {2, 4}
 //   entries  4..59  -> {1, 4}
 //   entries 60..63  -> {1, 3}
 // Returns true when all 64 entries match, false at the first mismatch.
 bool VerifyQuadAnyAllResults(int2 *Res) {
   struct ExpectedRange {
     int End; // one past the last index covered by this range
     int X;
     int Y;
   };
   static const ExpectedRange Ranges[] = {{4, 2, 4}, {60, 1, 4}, {64, 1, 3}};

   int Idx = 0;
   for (const ExpectedRange &Range : Ranges) {
     // Idx carries over from the previous range, so each range starts where
     // the last one ended.
     while (Idx < Range.End) {
       const int2 &Entry = Res[Idx];
       if (Entry.x != Range.X || Entry.y != Range.Y)
         return false;
       ++Idx;
     }
   }
   return true;
 }

 // Runs the "QuadAnyAll" shader op for every shader model in TestShaderModels.
 // Each supported shader model gets a compute run; shader models 6.5 and above
 // additionally get an amp/mesh run when the device supports mesh shaders.
 // Results are read back from "UAVBuffer0" and checked with
 // VerifyQuadAnyAllResults. The test is reported as Skipped only if no shader
 // model could be run at all.
 TEST_F(ExecutionTest, QuadAnyAll) {
   WEX::TestExecution::SetVerifyOutput verifySettings(
       WEX::TestExecution::VerifyOutputSettings::LogOnlyFailures);
   CComPtr<IStream> pStream;
   ReadHlslDataIntoNewStream(L"ShaderOpArith.xml", &pStream);

   std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
       std::make_shared<st::ShaderOpSet>();
   st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());
   st::ShaderOp *pShaderOp = ShaderOpSet->GetShaderOp("QuadAnyAll");

   // Compile every shader in the op with optimizations disabled.
   LPCSTR args = "/Od";

   if (args[0]) {
     for (st::ShaderOpShader &S : pShaderOp->Shaders)
       S.Arguments = args;
   }

   // Stays true until at least one shader model gets past all support checks.
   bool Skipped = true;
   D3D_SHADER_MODEL TestShaderModels[] = {D3D_SHADER_MODEL_6_0,
                                          D3D_SHADER_MODEL_6_5,
                                          D3D_SHADER_MODEL_6_7};
   for (unsigned i = 0; i < _countof(TestShaderModels); i++) {
     D3D_SHADER_MODEL sm = TestShaderModels[i];
     LogCommentFmt(L"\r\nVerifying QuadAny/QuadAll using Wave intrinsics in "
                   L"shader model 6.%1u",
                   ((UINT)sm & 0x0f));

     // Stage selection happens before the device checks, so it takes effect
     // even if this shader model ends up being skipped. For 6.7 CS is
     // reassigned, which also undoes the CS = nullptr from a previous
     // iteration's amp/mesh run.
     if (sm == D3D_SHADER_MODEL_6_5) {
       pShaderOp->MS = pShaderOp->GetString("MS");
       pShaderOp->AS = pShaderOp->GetString("AS");
     } else if (sm == D3D_SHADER_MODEL_6_7) {
       pShaderOp->AS = pShaderOp->GetString("AS67");
       pShaderOp->MS = pShaderOp->GetString("MS67");
       pShaderOp->CS = pShaderOp->GetString("CS67");
     }

     CComPtr<ID3D12Device> pDevice;
     if (!CreateDevice(&pDevice, sm, false /* skipUnsupported */)) {
       continue;
     }

     if (IsDeviceBasicAdapter(pDevice)) {
       WEX::Logging::Log::Comment(L"QuadAny/All fails on basic render driver.");
       continue;
     }

     if (!DoesDeviceSupportWaveOps(pDevice)) {
       LogCommentFmt(
           L"Device does not support wave operations in shader model 6.%1u",
           ((UINT)sm & 0x0f));
       continue;
     }
     Skipped = false;

     // test compute
     std::shared_ptr<ShaderOpTestResult> test = RunShaderOpTestAfterParse(
         pDevice, m_support, "QuadAnyAll", CleanUAVBuffer0Buffer, ShaderOpSet);

     MappedData uavData;
     test->Test->GetReadBackData("UAVBuffer0", &uavData);
     bool Result = VerifyQuadAnyAllResults((int2 *)uavData.data());
     VERIFY_IS_TRUE(Result);

     // The amp/mesh run is only attempted from shader model 6.5 on, and only
     // on devices with mesh shader support.
     if (sm < D3D_SHADER_MODEL_6_5 || !DoesDeviceSupportMeshShaders(pDevice))
       continue;

     pShaderOp->CS = nullptr;
     // test AS/MS
     test = RunShaderOpTestAfterParse(pDevice, m_support, "QuadAnyAll",
                                      CleanUAVBuffer0Buffer, ShaderOpSet);

     // Two consecutive 64-entry blocks are verified for this run.
     // NOTE(review): presumably one block per stage (AS and MS) - confirm
     // against the shader source.
     test->Test->GetReadBackData("UAVBuffer0", &uavData);
     Result = VerifyQuadAnyAllResults((int2 *)uavData.data());
     VERIFY_IS_TRUE(Result);
     Result = VerifyQuadAnyAllResults(&((int2 *)uavData.data())[64]);
     VERIFY_IS_TRUE(Result);
   }
   if (Skipped)
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
 }

 #ifndef _HLK_CONF
 // Produces a text dump of every resource in pShaderOp that is flagged for
 // read-back and hands it to the caller through pReadBackDump.
 //
 // For each such resource the descriptor heaps are searched for the first
 // descriptor whose ResName matches the resource name (case-insensitive).
 // Only buffer UAVs are dumped; any other kind or view dimension produces a
 // "not implemented" line instead. The dump ends with a count of the
 // resources that were flagged for read-back.
 //
 //   pShaderOp     - shader op whose Resources/DescriptorHeaps are inspected.
 //   pTest         - executed test used to fetch the read-back data.
 //   pReadBackDump - receives a null-terminated buffer detached from a
 //                   CComHeapPtr<char>; ownership passes to the caller.
 //
 // Throws std::bad_alloc if the output buffer cannot be allocated.
 static void WriteReadBackDump(st::ShaderOp *pShaderOp, st::ShaderOpTest *pTest,
                               char **pReadBackDump) {
   std::stringstream dump;
   unsigned readBackCount = 0;

   for (auto &Res : pShaderOp->Resources) {
     if (!Res.ReadBack)
       continue;

     ++readBackCount;
     dump << "Resource: " << Res.Name << "\r\n";

     // The first descriptor naming this resource decides how (and whether)
     // the contents get dumped; later descriptors are not considered.
     bool haveView = false;
     for (auto &Heap : pShaderOp->DescriptorHeaps) {
       for (auto &Desc : Heap.Descriptors) {
         if (_stricmp(Desc.ResName, Res.Name) == 0) {
           haveView = true;
           if (_stricmp(Desc.Kind, "UAV") != 0) {
             dump << "Resource dump for kind " << Desc.Kind << " not implemented yet.\r\n";
           }
           else if (Desc.UavDesc.ViewDimension != D3D12_UAV_DIMENSION_BUFFER) {
             dump << "Resource dump for this kind of view dimension not implemented yet.\r\n";
           }
           else {
             // The structure could be recovered from the shader for structured
             // buffers, but to keep things simple every 32-bit word is printed
             // as decimal, hex and float.
             MappedData mapped;
             pTest->GetReadBackData(Res.Name, &mapped);
             uint32_t *pWords = (uint32_t *)mapped.data();
             size_t wordCount = ((size_t)Res.Desc.Width) / sizeof(uint32_t);
             for (size_t idx = 0; idx < wordCount; ++idx) {
               uint32_t *pWord = pWords + idx;
               float asFloat = *(float *)pWord;
               dump << idx << ": 0n" << *pWord << "   0x" << std::hex << *pWord
                    << std::dec << "   " << asFloat << "\r\n";
             }
           }
           break;
         }
       }
       if (haveView)
         break;
     }

     if (!haveView) {
       dump << "Unable to find a view for the resource.\r\n";
     }
   }

   dump << "Resources read back: " << readBackCount << "\r\n";

   // Copy the text, including its terminator, into a heap buffer that the
   // caller takes ownership of.
   std::string text(dump.str());
   const size_t length = text.size();
   CComHeapPtr<char> buffer;
   if (!buffer.Allocate(length + 1))
     throw std::bad_alloc();
   memcpy(buffer.m_pData, text.c_str(), length + 1);
   *pReadBackDump = buffer.Detach();
 }

 // This is the interface exported for use by HLSLHost.exe.
 // It is mutually exclusive with using this DLL as a TAEF target.
 extern "C" {
   // Attempts to enable experimental shader models before any op test runs.
   // A failure is reported through pOutputStrFn (with pStrCtx as its context)
   // but is not treated as fatal: the function always returns S_OK.
   __declspec(dllexport) HRESULT WINAPI InitializeOpTests(void *pStrCtx, st::OutputStringFn pOutputStrFn) {
     const bool enabled =
         SUCCEEDED(ExecutionTest::EnableExperimentalShaderModels());
     if (!enabled)
       pOutputStrFn(pStrCtx, L"Unable to enable experimental shader models.\r\n.");
     return S_OK;
   }

   // Parses the shader-op description in pText, runs the single shader op it
   // contains on the caller-supplied device, and reports progress, pipeline
   // statistics and D3D info-queue messages through pOutputStrFn.
   //
   //   pStrCtx, pOutputStrFn - context and callback used for all text output.
   //   pText                 - null-terminated shader-op description; it must
   //                           define exactly one shader op.
   //   pDevice, pCommandQueue, pRenderTarget
   //                         - D3D12 objects the op is set up and run against.
   //   pReadBackDump         - optional; on success receives a text dump built
   //                           by WriteReadBackDump (ownership passes to the
   //                           caller). Set to nullptr on failure.
   //
   // Returns S_OK on success, E_FAIL when the description does not resolve to
   // a single shader op or a std::exception is caught, E_OUTOFMEMORY on
   // allocation failure, or the HRESULT carried by a caught CAtlException.
   //
   // Every exit path after the info-queue filters are pushed goes through the
   // drain/pop section at the end, so the filter stack of the caller's device
   // is always left balanced.
   __declspec(dllexport) HRESULT WINAPI
       RunOpTest(void *pStrCtx, st::OutputStringFn pOutputStrFn, LPCSTR pText,
                 ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue,
                 ID3D12Resource *pRenderTarget, char **pReadBackDump) {

     HRESULT hr = S_OK;
     if (pReadBackDump) *pReadBackDump = nullptr;
     st::SetOutputFn(pStrCtx, pOutputStrFn);
     CComPtr<ID3D12InfoQueue> pInfoQueue;
     CComHeapPtr<char> pDump;
     bool FilterCreation = false;
     if (SUCCEEDED(pDevice->QueryInterface(&pInfoQueue))) {
       // Push empty storage/retrieval filters; they are popped again once the
       // messages have been drained below.
       pInfoQueue->PushEmptyStorageFilter();
       pInfoQueue->PushEmptyRetrievalFilter();
       if (FilterCreation) {
         // Creation is largely driven by inputs, so don't log create/destroy messages.
         D3D12_INFO_QUEUE_FILTER filter;
         D3D12_MESSAGE_CATEGORY denyCategories[] = { D3D12_MESSAGE_CATEGORY_STATE_CREATION };
         ZeroMemory(&filter, sizeof(filter));
         filter.DenyList.NumCategories = _countof(denyCategories);
         filter.DenyList.pCategoryList = denyCategories;
         pInfoQueue->PushStorageFilter(&filter);
       }
     }
     else {
       pOutputStrFn(pStrCtx, L"Unable to enable info queue for D3D.\r\n.");
     }
     try {
       dxc::DxcDllSupport m_support;
       m_support.Initialize();

       // No op name is supplied through this entry point, so the description
       // is expected to contain exactly one shader op.
       const char *pName = nullptr;

       // SHCreateMemStream returns a stream the caller already owns. Attach
       // takes over that reference; constructing the CComPtr from the raw
       // pointer would AddRef a second time and leak the stream.
       CComPtr<IStream> pStream;
       pStream.Attach(SHCreateMemStream(reinterpret_cast<const BYTE *>(pText),
                                        (UINT)strlen(pText)));
       if (pStream == nullptr)
         throw std::bad_alloc();

       std::shared_ptr<st::ShaderOpSet> ShaderOpSet =
         std::make_shared<st::ShaderOpSet>();
       st::ParseShaderOpSetFromStream(pStream, ShaderOpSet.get());

       // Failures below record E_FAIL instead of returning so that the
       // info-queue cleanup after the try block still runs.
       st::ShaderOp *pShaderOp = nullptr;
       if (pName != nullptr) {
         pShaderOp = ShaderOpSet->GetShaderOp(pName);
       }
       else if (ShaderOpSet->ShaderOps.size() == 1) {
         pShaderOp = ShaderOpSet->ShaderOps[0].get();
       }
       else {
         pOutputStrFn(pStrCtx, L"Expected a single shader operation.\r\n");
         hr = E_FAIL;
       }

       if (SUCCEEDED(hr) && pShaderOp == nullptr) {
         std::string msg = "Unable to find shader op ";
         // Appending a null char pointer to std::string is undefined behavior.
         msg += pName ? pName : "[n/a]";
         msg += "; available ops";
         const char sep = ':';
         for (auto &pAvailOp : ShaderOpSet->ShaderOps) {
           msg += sep;
           msg += pAvailOp->Name ? pAvailOp->Name : "[n/a]";
         }
         CA2W msgWide(msg.c_str());
         pOutputStrFn(pStrCtx, msgWide);
         hr = E_FAIL;
       }

       if (SUCCEEDED(hr)) {
         std::shared_ptr<st::ShaderOpTest> test = std::make_shared<st::ShaderOpTest>();
         test->SetupRenderTarget(pShaderOp, pDevice, pCommandQueue, pRenderTarget);
         test->SetDxcSupport(&m_support);
         test->RunShaderOp(pShaderOp);
         test->PresentRenderTarget(pShaderOp, pCommandQueue, pRenderTarget);

         pOutputStrFn(pStrCtx, L"Rendering complete.\r\n");

         // Pipeline statistics are only reported for non-compute ops.
         if (!pShaderOp->IsCompute()) {
           D3D12_QUERY_DATA_PIPELINE_STATISTICS stats;
           test->GetPipelineStats(&stats);
           wchar_t statsText[400];
           StringCchPrintfW(statsText, _countof(statsText),
             L"Vertices/primitives read by input assembler: %I64u/%I64u\r\n"
             L"Vertex shader invocations: %I64u\r\n"
             L"Geometry shader invocations/output primitive: %I64u/%I64u\r\n"
             L"Primitives sent to rasterizer/rendered: %I64u/%I64u\r\n"
             L"PS/HS/DS/CS invocations: %I64u/%I64u/%I64u/%I64u\r\n",
             stats.IAVertices, stats.IAPrimitives, stats.VSInvocations,
             stats.GSInvocations, stats.GSPrimitives, stats.CInvocations,
             stats.CPrimitives, stats.PSInvocations, stats.HSInvocations,
             stats.DSInvocations, stats.CSInvocations);
           pOutputStrFn(pStrCtx, statsText);
         }

         if (pReadBackDump) {
           WriteReadBackDump(pShaderOp, test.get(), &pDump);
         }
       }
     }
     catch (const CAtlException &E)
     {
       hr = E.m_hr;
     }
     catch (const std::bad_alloc &)
     {
       hr = E_OUTOFMEMORY;
     }
     catch (const std::exception &)
     {
       hr = E_FAIL;
     }

     // Drain the device message queue if available.
     if (pInfoQueue != nullptr) {
       wchar_t buf[200];
       StringCchPrintfW(buf, _countof(buf),
         L"NumStoredMessages=%u limit/discarded by limit=%u/%u "
         L"allowed/denied by storage filter=%u/%u "
         L"NumStoredMessagesAllowedByRetrievalFilter=%u\r\n",
         (unsigned)pInfoQueue->GetNumStoredMessages(),
         (unsigned)pInfoQueue->GetMessageCountLimit(),
         (unsigned)pInfoQueue->GetNumMessagesDiscardedByMessageCountLimit(),
         (unsigned)pInfoQueue->GetNumMessagesAllowedByStorageFilter(),
         (unsigned)pInfoQueue->GetNumMessagesDeniedByStorageFilter(),
         (unsigned)pInfoQueue->GetNumStoredMessagesAllowedByRetrievalFilter());
       pOutputStrFn(pStrCtx, buf);

       WriteInfoQueueMessages(pStrCtx, pOutputStrFn, pInfoQueue);

       // Undo the pushes made at the top of the function, one pop per push.
       pInfoQueue->ClearStoredMessages();
       pInfoQueue->PopRetrievalFilter();
       pInfoQueue->PopStorageFilter();
       if (FilterCreation) {
         pInfoQueue->PopStorageFilter();
       }
     }

     // pDump is empty unless WriteReadBackDump succeeded, so failures leave
     // *pReadBackDump as nullptr.
     if (pReadBackDump) *pReadBackDump = pDump.Detach();

     return hr;
   }
 }
 #endif
 // MARKER: ExecutionTest/DxilConf Shared Implementation End
 // Do not remove the line above - it is used by TranslateExecutionTest.py