blob: 1c1d706315db128eadc372810af89f6ae4a7364a [file]
#version 450 core
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_long_vector : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_EXT_expect_assume : enable
#extension GL_EXT_bfloat16 : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_clustered : enable
#extension GL_NV_shader_subgroup_partitioned : enable
#extension GL_KHR_shader_subgroup_quad : enable
#extension GL_KHR_shader_subgroup_shuffle : enable
#extension GL_KHR_shader_subgroup_shuffle_relative : enable
#extension GL_KHR_shader_subgroup_rotate : enable
#extension GL_KHR_shader_subgroup_vote : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_EXT_shader_subgroup_extended_types_int64 : enable
// Compute workgroup size: 64 invocations along X, 1 along Y and Z.
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Entry point. Calls the subgroup built-ins (vote, broadcast, shuffle,
// rotate, arithmetic/bitwise reductions and scans, clustered, quad and
// NV partitioned operations) with 5-component long-vector operands
// (vector<T, 5>), one element type at a time: uint32_t, float, float64_t,
// bool and uint64_t. Each result is written back into the operand variable.
// No inputs are read and no outputs are written; the locals are never
// initialized, so the body only exercises overload resolution / code
// generation for these calls.
// NOTE(review): the statement order looks deliberate (compiler test input) —
// confirm before reordering or pruning anything here.
void main()
{
// Long-vector operands, one per element type.
// vf16, vu16, vi16, vi32 and vi64 are declared but not referenced anywhere
// in this function.
vector<float16_t, 5> vf16;
vector<float, 5> vf32;
vector<float64_t, 5> vf64;
vector<uint16_t, 5> vu16;
vector<int16_t, 5> vi16;
vector<uint32_t, 5> vu32;
vector<int32_t, 5> vi32;
vector<uint64_t, 5> vu64;
vector<int64_t, 5> vi64;
vector<bool, 5> vb;
// b   : receives the result of each subgroupAllEqual() call.
// u   : second argument (id / mask / delta) of the shuffle and rotate calls;
//       never assigned in this function.
// vu4 : receives the result of subgroupPartitionNV() and is then passed as
//       the second argument of the subgroupPartitioned*NV calls.
bool b;
uint u;
uvec4 vu4;

// ---- vector<uint32_t, 5> ----
// Vote, broadcast, shuffle and rotate.
b = subgroupAllEqual(vu32);
vu32 = subgroupBroadcast(vu32, 1u);
vu32 = subgroupBroadcastFirst(vu32);
vu32 = subgroupShuffle(vu32, u);
vu32 = subgroupShuffleXor(vu32, u);
vu32 = subgroupShuffleUp(vu32, u);
vu32 = subgroupShuffleDown(vu32, u);
vu32 = subgroupRotate(vu32, u);
vu32 = subgroupClusteredRotate(vu32, u, 4);
// Reductions.
vu32 = subgroupAdd(vu32);
vu32 = subgroupMul(vu32);
vu32 = subgroupMin(vu32);
vu32 = subgroupMax(vu32);
vu32 = subgroupAnd(vu32);
vu32 = subgroupOr(vu32);
vu32 = subgroupXor(vu32);
// Inclusive scans.
vu32 = subgroupInclusiveAdd(vu32);
vu32 = subgroupInclusiveMul(vu32);
vu32 = subgroupInclusiveMin(vu32);
vu32 = subgroupInclusiveMax(vu32);
vu32 = subgroupInclusiveAnd(vu32);
vu32 = subgroupInclusiveOr(vu32);
vu32 = subgroupInclusiveXor(vu32);
// Exclusive scans.
vu32 = subgroupExclusiveAdd(vu32);
vu32 = subgroupExclusiveMul(vu32);
vu32 = subgroupExclusiveMin(vu32);
vu32 = subgroupExclusiveMax(vu32);
vu32 = subgroupExclusiveAnd(vu32);
vu32 = subgroupExclusiveOr(vu32);
vu32 = subgroupExclusiveXor(vu32);
// Clustered reductions; the second argument is the constant 4.
vu32 = subgroupClusteredAdd(vu32, 4);
vu32 = subgroupClusteredMul(vu32, 4);
vu32 = subgroupClusteredMin(vu32, 4);
vu32 = subgroupClusteredMax(vu32, 4);
vu32 = subgroupClusteredAnd(vu32, 4);
vu32 = subgroupClusteredOr(vu32, 4);
vu32 = subgroupClusteredXor(vu32, 4);
// Quad operations; the broadcast's second argument is the constant 1.
vu32 = subgroupQuadBroadcast(vu32, 1);
vu32 = subgroupQuadSwapHorizontal(vu32);
vu32 = subgroupQuadSwapVertical(vu32);
vu32 = subgroupQuadSwapDiagonal(vu32);
// NV partitioned operations: compute a partition from the value, then run
// the partitioned reductions and inclusive/exclusive scans against it.
vu4 = subgroupPartitionNV(vu32);
vu32 = subgroupPartitionedAddNV(vu32, vu4);
vu32 = subgroupPartitionedMulNV(vu32, vu4);
vu32 = subgroupPartitionedMinNV(vu32, vu4);
vu32 = subgroupPartitionedMaxNV(vu32, vu4);
vu32 = subgroupPartitionedAndNV(vu32, vu4);
vu32 = subgroupPartitionedOrNV(vu32, vu4);
vu32 = subgroupPartitionedXorNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveAddNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveMulNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveMinNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveMaxNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveAndNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveOrNV(vu32, vu4);
vu32 = subgroupPartitionedInclusiveXorNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveAddNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveMulNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveMinNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveMaxNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveAndNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveOrNV(vu32, vu4);
vu32 = subgroupPartitionedExclusiveXorNV(vu32, vu4);

// ---- vector<float, 5> ----
// Same sequence as the uint32_t section. Every And/Or/Xor variant is left
// commented out here — presumably because the bitwise subgroup operations
// have no floating-point overloads (confirm against GL_KHR_shader_subgroup).
b = subgroupAllEqual(vf32);
vf32 = subgroupBroadcast(vf32, 1u);
vf32 = subgroupBroadcastFirst(vf32);
vf32 = subgroupShuffle(vf32, u);
vf32 = subgroupShuffleXor(vf32, u);
vf32 = subgroupShuffleUp(vf32, u);
vf32 = subgroupShuffleDown(vf32, u);
vf32 = subgroupRotate(vf32, u);
vf32 = subgroupClusteredRotate(vf32, u, 4);
vf32 = subgroupAdd(vf32);
vf32 = subgroupMul(vf32);
vf32 = subgroupMin(vf32);
vf32 = subgroupMax(vf32);
//vf32 = subgroupAnd(vf32);
//vf32 = subgroupOr(vf32);
//vf32 = subgroupXor(vf32);
vf32 = subgroupInclusiveAdd(vf32);
vf32 = subgroupInclusiveMul(vf32);
vf32 = subgroupInclusiveMin(vf32);
vf32 = subgroupInclusiveMax(vf32);
//vf32 = subgroupInclusiveAnd(vf32);
//vf32 = subgroupInclusiveOr(vf32);
//vf32 = subgroupInclusiveXor(vf32);
vf32 = subgroupExclusiveAdd(vf32);
vf32 = subgroupExclusiveMul(vf32);
vf32 = subgroupExclusiveMin(vf32);
vf32 = subgroupExclusiveMax(vf32);
//vf32 = subgroupExclusiveAnd(vf32);
//vf32 = subgroupExclusiveOr(vf32);
//vf32 = subgroupExclusiveXor(vf32);
vf32 = subgroupClusteredAdd(vf32, 4);
vf32 = subgroupClusteredMul(vf32, 4);
vf32 = subgroupClusteredMin(vf32, 4);
vf32 = subgroupClusteredMax(vf32, 4);
//vf32 = subgroupClusteredAnd(vf32, 4);
//vf32 = subgroupClusteredOr(vf32, 4);
//vf32 = subgroupClusteredXor(vf32, 4);
vf32 = subgroupQuadBroadcast(vf32, 1);
vf32 = subgroupQuadSwapHorizontal(vf32);
vf32 = subgroupQuadSwapVertical(vf32);
vf32 = subgroupQuadSwapDiagonal(vf32);
vu4 = subgroupPartitionNV(vf32);
vf32 = subgroupPartitionedAddNV(vf32, vu4);
vf32 = subgroupPartitionedMulNV(vf32, vu4);
vf32 = subgroupPartitionedMinNV(vf32, vu4);
vf32 = subgroupPartitionedMaxNV(vf32, vu4);
//vf32 = subgroupPartitionedAndNV(vf32, vu4);
//vf32 = subgroupPartitionedOrNV(vf32, vu4);
//vf32 = subgroupPartitionedXorNV(vf32, vu4);
vf32 = subgroupPartitionedInclusiveAddNV(vf32, vu4);
vf32 = subgroupPartitionedInclusiveMulNV(vf32, vu4);
vf32 = subgroupPartitionedInclusiveMinNV(vf32, vu4);
vf32 = subgroupPartitionedInclusiveMaxNV(vf32, vu4);
//vf32 = subgroupPartitionedInclusiveAndNV(vf32, vu4);
//vf32 = subgroupPartitionedInclusiveOrNV(vf32, vu4);
//vf32 = subgroupPartitionedInclusiveXorNV(vf32, vu4);
vf32 = subgroupPartitionedExclusiveAddNV(vf32, vu4);
vf32 = subgroupPartitionedExclusiveMulNV(vf32, vu4);
vf32 = subgroupPartitionedExclusiveMinNV(vf32, vu4);
vf32 = subgroupPartitionedExclusiveMaxNV(vf32, vu4);
//vf32 = subgroupPartitionedExclusiveAndNV(vf32, vu4);
//vf32 = subgroupPartitionedExclusiveOrNV(vf32, vu4);
//vf32 = subgroupPartitionedExclusiveXorNV(vf32, vu4);

// ---- vector<float64_t, 5> ----
// Same sequence as the float section, with the And/Or/Xor variants again
// left commented out.
b = subgroupAllEqual(vf64);
vf64 = subgroupBroadcast(vf64, 1u);
vf64 = subgroupBroadcastFirst(vf64);
vf64 = subgroupShuffle(vf64, u);
vf64 = subgroupShuffleXor(vf64, u);
vf64 = subgroupShuffleUp(vf64, u);
vf64 = subgroupShuffleDown(vf64, u);
vf64 = subgroupRotate(vf64, u);
vf64 = subgroupClusteredRotate(vf64, u, 4);
vf64 = subgroupAdd(vf64);
vf64 = subgroupMul(vf64);
vf64 = subgroupMin(vf64);
vf64 = subgroupMax(vf64);
//vf64 = subgroupAnd(vf64);
//vf64 = subgroupOr(vf64);
//vf64 = subgroupXor(vf64);
vf64 = subgroupInclusiveAdd(vf64);
vf64 = subgroupInclusiveMul(vf64);
vf64 = subgroupInclusiveMin(vf64);
vf64 = subgroupInclusiveMax(vf64);
//vf64 = subgroupInclusiveAnd(vf64);
//vf64 = subgroupInclusiveOr(vf64);
//vf64 = subgroupInclusiveXor(vf64);
vf64 = subgroupExclusiveAdd(vf64);
vf64 = subgroupExclusiveMul(vf64);
vf64 = subgroupExclusiveMin(vf64);
vf64 = subgroupExclusiveMax(vf64);
//vf64 = subgroupExclusiveAnd(vf64);
//vf64 = subgroupExclusiveOr(vf64);
//vf64 = subgroupExclusiveXor(vf64);
vf64 = subgroupClusteredAdd(vf64, 4);
vf64 = subgroupClusteredMul(vf64, 4);
vf64 = subgroupClusteredMin(vf64, 4);
vf64 = subgroupClusteredMax(vf64, 4);
//vf64 = subgroupClusteredAnd(vf64, 4);
//vf64 = subgroupClusteredOr(vf64, 4);
//vf64 = subgroupClusteredXor(vf64, 4);
vf64 = subgroupQuadBroadcast(vf64, 1);
vf64 = subgroupQuadSwapHorizontal(vf64);
vf64 = subgroupQuadSwapVertical(vf64);
vf64 = subgroupQuadSwapDiagonal(vf64);
vu4 = subgroupPartitionNV(vf64);
vf64 = subgroupPartitionedAddNV(vf64, vu4);
vf64 = subgroupPartitionedMulNV(vf64, vu4);
vf64 = subgroupPartitionedMinNV(vf64, vu4);
vf64 = subgroupPartitionedMaxNV(vf64, vu4);
//vf64 = subgroupPartitionedAndNV(vf64, vu4);
//vf64 = subgroupPartitionedOrNV(vf64, vu4);
//vf64 = subgroupPartitionedXorNV(vf64, vu4);
vf64 = subgroupPartitionedInclusiveAddNV(vf64, vu4);
vf64 = subgroupPartitionedInclusiveMulNV(vf64, vu4);
vf64 = subgroupPartitionedInclusiveMinNV(vf64, vu4);
vf64 = subgroupPartitionedInclusiveMaxNV(vf64, vu4);
//vf64 = subgroupPartitionedInclusiveAndNV(vf64, vu4);
//vf64 = subgroupPartitionedInclusiveOrNV(vf64, vu4);
//vf64 = subgroupPartitionedInclusiveXorNV(vf64, vu4);
vf64 = subgroupPartitionedExclusiveAddNV(vf64, vu4);
vf64 = subgroupPartitionedExclusiveMulNV(vf64, vu4);
vf64 = subgroupPartitionedExclusiveMinNV(vf64, vu4);
vf64 = subgroupPartitionedExclusiveMaxNV(vf64, vu4);
//vf64 = subgroupPartitionedExclusiveAndNV(vf64, vu4);
//vf64 = subgroupPartitionedExclusiveOrNV(vf64, vu4);
//vf64 = subgroupPartitionedExclusiveXorNV(vf64, vu4);

// ---- vector<bool, 5> ----
// Same sequence, but here the Add/Mul/Min/Max variants are the ones left
// commented out and And/Or/Xor are active — presumably because the
// arithmetic subgroup operations have no boolean overloads (confirm against
// GL_KHR_shader_subgroup).
b = subgroupAllEqual(vb);
vb = subgroupBroadcast(vb, 1u);
vb = subgroupBroadcastFirst(vb);
vb = subgroupShuffle(vb, u);
vb = subgroupShuffleXor(vb, u);
vb = subgroupShuffleUp(vb, u);
vb = subgroupShuffleDown(vb, u);
vb = subgroupRotate(vb, u);
vb = subgroupClusteredRotate(vb, u, 4);
//vb = subgroupAdd(vb);
//vb = subgroupMul(vb);
//vb = subgroupMin(vb);
//vb = subgroupMax(vb);
vb = subgroupAnd(vb);
vb = subgroupOr(vb);
vb = subgroupXor(vb);
//vb = subgroupInclusiveAdd(vb);
//vb = subgroupInclusiveMul(vb);
//vb = subgroupInclusiveMin(vb);
//vb = subgroupInclusiveMax(vb);
vb = subgroupInclusiveAnd(vb);
vb = subgroupInclusiveOr(vb);
vb = subgroupInclusiveXor(vb);
//vb = subgroupExclusiveAdd(vb);
//vb = subgroupExclusiveMul(vb);
//vb = subgroupExclusiveMin(vb);
//vb = subgroupExclusiveMax(vb);
vb = subgroupExclusiveAnd(vb);
vb = subgroupExclusiveOr(vb);
vb = subgroupExclusiveXor(vb);
//vb = subgroupClusteredAdd(vb, 4);
//vb = subgroupClusteredMul(vb, 4);
//vb = subgroupClusteredMin(vb, 4);
//vb = subgroupClusteredMax(vb, 4);
vb = subgroupClusteredAnd(vb, 4);
vb = subgroupClusteredOr(vb, 4);
vb = subgroupClusteredXor(vb, 4);
vb = subgroupQuadBroadcast(vb, 1);
vb = subgroupQuadSwapHorizontal(vb);
vb = subgroupQuadSwapVertical(vb);
vb = subgroupQuadSwapDiagonal(vb);
vu4 = subgroupPartitionNV(vb);
//vb = subgroupPartitionedAddNV(vb, vu4);
//vb = subgroupPartitionedMulNV(vb, vu4);
//vb = subgroupPartitionedMinNV(vb, vu4);
//vb = subgroupPartitionedMaxNV(vb, vu4);
vb = subgroupPartitionedAndNV(vb, vu4);
vb = subgroupPartitionedOrNV(vb, vu4);
vb = subgroupPartitionedXorNV(vb, vu4);
//vb = subgroupPartitionedInclusiveAddNV(vb, vu4);
//vb = subgroupPartitionedInclusiveMulNV(vb, vu4);
//vb = subgroupPartitionedInclusiveMinNV(vb, vu4);
//vb = subgroupPartitionedInclusiveMaxNV(vb, vu4);
vb = subgroupPartitionedInclusiveAndNV(vb, vu4);
vb = subgroupPartitionedInclusiveOrNV(vb, vu4);
vb = subgroupPartitionedInclusiveXorNV(vb, vu4);
//vb = subgroupPartitionedExclusiveAddNV(vb, vu4);
//vb = subgroupPartitionedExclusiveMulNV(vb, vu4);
//vb = subgroupPartitionedExclusiveMinNV(vb, vu4);
//vb = subgroupPartitionedExclusiveMaxNV(vb, vu4);
vb = subgroupPartitionedExclusiveAndNV(vb, vu4);
vb = subgroupPartitionedExclusiveOrNV(vb, vu4);
vb = subgroupPartitionedExclusiveXorNV(vb, vu4);

// ---- vector<uint64_t, 5> ----
// Full sequence, identical in shape to the uint32_t section (both the
// arithmetic and the bitwise variants are active).
b = subgroupAllEqual(vu64);
vu64 = subgroupBroadcast(vu64, 1u);
vu64 = subgroupBroadcastFirst(vu64);
vu64 = subgroupShuffle(vu64, u);
vu64 = subgroupShuffleXor(vu64, u);
vu64 = subgroupShuffleUp(vu64, u);
vu64 = subgroupShuffleDown(vu64, u);
vu64 = subgroupRotate(vu64, u);
vu64 = subgroupClusteredRotate(vu64, u, 4);
vu64 = subgroupAdd(vu64);
vu64 = subgroupMul(vu64);
vu64 = subgroupMin(vu64);
vu64 = subgroupMax(vu64);
vu64 = subgroupAnd(vu64);
vu64 = subgroupOr(vu64);
vu64 = subgroupXor(vu64);
vu64 = subgroupInclusiveAdd(vu64);
vu64 = subgroupInclusiveMul(vu64);
vu64 = subgroupInclusiveMin(vu64);
vu64 = subgroupInclusiveMax(vu64);
vu64 = subgroupInclusiveAnd(vu64);
vu64 = subgroupInclusiveOr(vu64);
vu64 = subgroupInclusiveXor(vu64);
vu64 = subgroupExclusiveAdd(vu64);
vu64 = subgroupExclusiveMul(vu64);
vu64 = subgroupExclusiveMin(vu64);
vu64 = subgroupExclusiveMax(vu64);
vu64 = subgroupExclusiveAnd(vu64);
vu64 = subgroupExclusiveOr(vu64);
vu64 = subgroupExclusiveXor(vu64);
vu64 = subgroupClusteredAdd(vu64, 4);
vu64 = subgroupClusteredMul(vu64, 4);
vu64 = subgroupClusteredMin(vu64, 4);
vu64 = subgroupClusteredMax(vu64, 4);
vu64 = subgroupClusteredAnd(vu64, 4);
vu64 = subgroupClusteredOr(vu64, 4);
vu64 = subgroupClusteredXor(vu64, 4);
vu64 = subgroupQuadBroadcast(vu64, 1);
vu64 = subgroupQuadSwapHorizontal(vu64);
vu64 = subgroupQuadSwapVertical(vu64);
vu64 = subgroupQuadSwapDiagonal(vu64);
vu4 = subgroupPartitionNV(vu64);
vu64 = subgroupPartitionedAddNV(vu64, vu4);
vu64 = subgroupPartitionedMulNV(vu64, vu4);
vu64 = subgroupPartitionedMinNV(vu64, vu4);
vu64 = subgroupPartitionedMaxNV(vu64, vu4);
vu64 = subgroupPartitionedAndNV(vu64, vu4);
vu64 = subgroupPartitionedOrNV(vu64, vu4);
vu64 = subgroupPartitionedXorNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveAddNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveMulNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveMinNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveMaxNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveAndNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveOrNV(vu64, vu4);
vu64 = subgroupPartitionedInclusiveXorNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveAddNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveMulNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveMinNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveMaxNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveAndNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveOrNV(vu64, vu4);
vu64 = subgroupPartitionedExclusiveXorNV(vu64, vu4);
}