// Copyright 2018 The Clspv Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <climits>
#include <iterator>
#include <string>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
#include "spirv/unified1/spirv.hpp"
#include "clspv/AddressSpace.h"
#include "clspv/Option.h"
#include "ArgKind.h"
#include "Builtins.h"
#include "Constants.h"
#include "DescriptorCounter.h"
#include "Passes.h"
#include "SpecConstant.h"
using namespace llvm;
#define DEBUG_TYPE "allocatedescriptors"
namespace {
// Constant that represents bitfield for UniformMemory Memory Semantics from
// SPIR-V. Used to test barrier semantics.
const uint32_t kMemorySemanticsUniformMemory = 0x40;
// Constant that represents bitfield for ImageMemory Memory Semantics from
// SPIR-V. Used to test barrier semantics.
const uint32_t kMemorySemanticsImageMemory = 0x800;
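// For example, an OpenCL barrier(CLK_GLOBAL_MEM_FENCE) lowers to an
// OpControlBarrier whose Memory Semantics operand has the UniformMemory bit
// set, so this pass treats it as a global barrier.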
cl::opt<bool> ShowDescriptors("show-desc", cl::init(false), cl::Hidden,
cl::desc("Show descriptors"));
using SamplerMapType = llvm::ArrayRef<std::pair<unsigned, std::string>>;
class AllocateDescriptorsPass final : public ModulePass {
public:
static char ID;
AllocateDescriptorsPass()
: ModulePass(ID), sampler_map_(), descriptor_set_(0), binding_(0) {}
bool runOnModule(Module &M) override;
SamplerMapType &sampler_map() { return sampler_map_; }
private:
// Allocates descriptors for all samplers and kernel arguments that have uses.
// Replaces their uses with calls to a special compiler builtin. Returns true
// if we changed the module.
bool AllocateDescriptors(Module &M);
// Allocates descriptors for literal samplers. Returns true if we changed the
// module.
bool AllocateLiteralSamplerDescriptors(Module &M);
// Allocates descriptors for kernel arguments with uses. Returns true if we
// changed the module.
bool AllocateKernelArgDescriptors(Module &M);
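// Allocates SpecIds for pointer-to-local kernel arguments and rewrites their
// uses via a workgroup variable accessor builtin. Returns true if we changed
// the module.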
bool AllocateLocalKernelArgSpecIds(Module &M);
// Allocates the next descriptor set and resets the tracked binding number to
// 0.
unsigned StartNewDescriptorSet(Module &M) {
// Claim the next descriptor set index; it must match our own counter.
binding_ = 0;
const auto set = clspv::TakeDescriptorIndex(&M);
assert(set == descriptor_set_);
descriptor_set_++;
return set;
}
// Returns true if |F| or any function in |F|'s call tree contains a global
// barrier.
// Specifically, it checks that the memory semantics operand contains
// UniformMemory memory semantics.
//
// The compiler targets OpenCL 1.2, which only provides relaxed atomics.
// Relaxed atomics cannot be used as synchronization primitives, so the pass
// does not consider them when adding coherence.
bool CallTreeContainsGlobalBarrier(Function *F);
// Returns a pair indicating if |V| is read and/or written to.
// Traces the use chain looking for loads and stores and proceeding through
// function calls until a non-pointer value is encountered.
//
// This function assumes loads, stores and function calls are the only
// instructions that can read or write to memory.
std::pair<bool, bool> HasReadsAndWrites(Value *V);
// Cache for which functions' call trees contain a global barrier.
DenseMap<Function *, bool> barrier_map_;
// The sampler map: an array ref of pairs, each holding the sampler
// constant as an integer followed by the string expression for the
// sampler.
SamplerMapType sampler_map_;
// Which descriptor set are we using?
int descriptor_set_;
// The next binding number to use.
int binding_;
// What makes a kernel argument require a new descriptor?
struct KernelArgDiscriminant {
KernelArgDiscriminant(Type *the_type = nullptr, int the_arg_index = 0,
int the_separation_token = 0, int is_coherent = 0)
: type(the_type), arg_index(the_arg_index),
separation_token(the_separation_token), coherent(is_coherent) {}
// Different argument type requires different descriptor since logical
// addressing requires strongly typed storage buffer variables.
Type *type;
// If we have multiple arguments of the same type to the same kernel,
// then we have to use distinct descriptors because the user could
// bind different storage buffers for them. Use argument index
// as a proxy for distinctness. This might overcount, but we
// don't worry about that yet.
int arg_index;
// An extra bit of data that can be used to separate resource
// variables that otherwise share the same type and argument index.
// By default this will be zero, and so it won't force any separation.
int separation_token;
// An extra bit that marks whether the variable is coherent. This means
// coherent and non-coherent variables will not share a binding.
int coherent;
};
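// For example, given
//   kernel void foo(global float *a, global float *b) { ... }
// both arguments have the same type, but their differing arg_index values
// yield distinct discriminants and therefore distinct descriptors.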
struct KADDenseMapInfo {
static KernelArgDiscriminant getEmptyKey() {
return KernelArgDiscriminant(nullptr, 0, 0);
}
static KernelArgDiscriminant getTombstoneKey() {
return KernelArgDiscriminant(nullptr, -1, 0);
}
static unsigned getHashValue(const KernelArgDiscriminant &key) {
return unsigned(uintptr_t(key.type)) ^ key.arg_index ^
key.separation_token ^ key.coherent;
}
static bool isEqual(const KernelArgDiscriminant &lhs,
const KernelArgDiscriminant &rhs) {
return lhs.type == rhs.type && lhs.arg_index == rhs.arg_index &&
lhs.separation_token == rhs.separation_token &&
lhs.coherent == rhs.coherent;
}
};
};
} // namespace
char AllocateDescriptorsPass::ID = 0;
INITIALIZE_PASS(AllocateDescriptorsPass, "AllocateDescriptorsPass",
"Allocate resource descriptors", false, false)
namespace clspv {
ModulePass *createAllocateDescriptorsPass(SamplerMapType sampler_map) {
auto *result = new AllocateDescriptorsPass();
result->sampler_map() = sampler_map;
return result;
}
} // namespace clspv
bool AllocateDescriptorsPass::runOnModule(Module &M) {
bool Changed = false;
// Samplers from the sampler map always grab descriptor set 0.
Changed |= AllocateLiteralSamplerDescriptors(M);
Changed |= AllocateKernelArgDescriptors(M);
Changed |= AllocateLocalKernelArgSpecIds(M);
return Changed;
}
bool AllocateDescriptorsPass::AllocateLiteralSamplerDescriptors(Module &M) {
if (ShowDescriptors) {
outs() << "Allocate literal sampler descriptors\n";
}
bool Changed = false;
auto init_fn = M.getFunction(clspv::TranslateSamplerInitializerFunction());
if (!init_fn)
return Changed;
if (clspv::Option::UseSamplerMap() && sampler_map_.empty()) {
errs() << "error: kernel uses a literal sampler but option -samplermap "
"has not been specified\n";
llvm_unreachable("Sampler literal in source without sampler map!");
}
const unsigned descriptor_set = StartNewDescriptorSet(M);
Changed = true;
if (!sampler_map_.empty()) {
if (ShowDescriptors) {
outs() << " Found " << sampler_map_.size()
<< " samplers in the sampler map\n";
}
}
// Replace all things that look like
// call %opencl.sampler_t addrspace(2)*
// @__translate_sampler_initializer(i32 sampler-literal-constant-value)
// #2
//
// with (if sampler map is provided):
//
// call %opencl.sampler_t addrspace(2)*
// @clspv.sampler.var.literal(i32 descriptor set, i32 binding, i32
// index-into-sampler-map)
//
// or (if no sampler map is provided):
//
// call %opencl.sampler_t addrspace(2)*
// @clspv.sampler.var.literal(i32 descriptor set, i32 binding, i32
// sampler-literal-value)
//
// We need to preserve the index into the sampler map so that later we can
// generate the sampler lines in the embedded reflection. That needs both the
// literal value and the string expression for the literal.
// Generate the function type for clspv::LiteralSamplerFunction()
IRBuilder<> Builder(M.getContext());
auto *sampler_struct_ty =
StructType::getTypeByName(M.getContext(), "opencl.sampler_t");
if (!sampler_struct_ty) {
sampler_struct_ty = StructType::create(M.getContext(), "opencl.sampler_t");
}
auto *sampler_ty =
sampler_struct_ty->getPointerTo(clspv::AddressSpace::Constant);
Type *i32 = Builder.getInt32Ty();
FunctionType *fn_ty = FunctionType::get(sampler_ty, {i32, i32, i32}, false);
auto var_fn = M.getOrInsertFunction(clspv::LiteralSamplerFunction(), fn_ty);
// Map sampler literal to binding number.
DenseMap<unsigned, unsigned> binding_for_value;
DenseMap<unsigned, unsigned> index_for_value;
unsigned index = 0;
if (!sampler_map_.empty()) {
for (auto sampler_info : sampler_map_) {
const unsigned value = sampler_info.first;
const std::string &expr = sampler_info.second;
if (0 == binding_for_value.count(value)) {
// Make a new entry.
binding_for_value[value] = binding_++;
index_for_value[value] = index;
if (ShowDescriptors) {
outs() << " Map " << value << " to (" << descriptor_set << ","
<< binding_for_value[value] << ") << " << expr << "\n";
}
}
index++;
}
}
// Now replace calls to __translate_sampler_initializer
if (init_fn) {
// Copy users, to avoid modifying the list in place.
SmallVector<User *, 8> users(init_fn->users());
for (auto user : users) {
if (auto *call = dyn_cast<CallInst>(user)) {
auto const_val = dyn_cast<ConstantInt>(call->getArgOperand(0));
if (!const_val) {
call->getArgOperand(0)->print(errs());
llvm_unreachable("Argument of sampler initializer was non-constant!");
}
const auto value = static_cast<unsigned>(const_val->getZExtValue());
auto where = binding_for_value.find(value);
if (where == binding_for_value.end()) {
if (!sampler_map_.empty()) {
errs() << "Sampler literal " << value
<< " was not in the sampler map\n";
llvm_unreachable("Sampler literal was not found in sampler map!");
} else {
// Allocate a binding for this sampler value.
binding_for_value.insert(std::make_pair(value, index++));
if (ShowDescriptors) {
outs() << " Map " << value << " to (" << descriptor_set << ","
<< binding_for_value[value] << ")\n";
}
}
}
const unsigned binding = binding_for_value[value];
// Third parameter is either the data mask if no sampler map is
// specified or the index into the sampler map if one is provided.
unsigned third_param = value;
if (!sampler_map_.empty()) {
// Use the sampler map index when a sampler map is provided.
third_param = index_for_value[value];
}
SmallVector<Value *, 3> args = {Builder.getInt32(descriptor_set),
Builder.getInt32(binding),
Builder.getInt32(third_param)};
if (ShowDescriptors) {
outs() << " translate literal sampler " << *const_val << " to ("
<< descriptor_set << "," << binding << ")\n";
}
auto *new_call =
CallInst::Create(var_fn, args, "", dyn_cast<Instruction>(call));
call->replaceAllUsesWith(new_call);
call->eraseFromParent();
}
}
if (!init_fn->user_empty()) {
errs() << "Function: " << init_fn->getName().str()
<< " still has users after rewrite\n";
for (auto U : init_fn->users()) {
errs() << " User: " << *U << "\n";
}
llvm_unreachable("Unexpected uses remain");
}
init_fn->eraseFromParent();
} else {
if (ShowDescriptors) {
outs() << " No sampler\n";
}
}
return Changed;
}
bool AllocateDescriptorsPass::AllocateKernelArgDescriptors(Module &M) {
bool Changed = false;
if (ShowDescriptors) {
outs() << "Allocate kernel arg descriptors\n";
}
// First classify all kernel arguments by their discriminant, which is
// the (type, arg index) pair extended by a separation token and a
// coherence bit.
//
// FIRST RULE: There will be at least one resource variable for each
// different discriminant.
// Map a discriminant to a unique index. We don't use a UniqueVector
// because that requires an operator< that I don't want to define on
// llvm::Type*.
using KernelArgDiscriminantMap =
DenseMap<KernelArgDiscriminant, int, KADDenseMapInfo>;
// Maps a discriminant to its unique index, starting at 0.
KernelArgDiscriminantMap discriminant_map;
// SECOND RULE: We can use several strategies for descriptor binding
// to these variables.
//
// It may not be obvious, but:
// - A single resource variable can only be decorated once with
// DescriptorSet and Binding. Otherwise it's impossible to interpret
// how to use the variable.
// - Different resource variables can have the same binding. (For example,
// do that to save on descriptors, or to save on the number of resource
// variables.)
// - SPIR-V (trivially) allows reuse of (set,binding) pairs.
// - Vulkan permits this as well, but requires that for a given entry
// point all such variables statically referenced by the entry point's
// call tree must have a type compatible with the descriptor actually
// bound to the pipeline.
// - When setting up a pipeline, Vulkan does not care about the resource
// variables that are *not* statically referenced by the used entry points'
// call trees.
// For more, see Vulkan 14.5.3 DescriptorSet and Binding Assignment
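//
// For example, if kernels foo and bar each declare argument 0 as
// global float*, that argument maps to the same discriminant in both
// kernels, so both kernels can share a single resource variable and a
// single (set,binding) pair for it.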
const bool always_distinct_sets =
clspv::Option::DistinctKernelDescriptorSets();
// The default is that all kernels use the same descriptor set.
const bool always_single_kernel_descriptor = true;
// By default look for as much sharing as possible. But sometimes we need to
// ensure each kernel argument that is an image or sampler gets a different
// resource variable.
const bool always_distinct_image_sampler =
clspv::Option::HackDistinctImageSampler();
// Bookkeeping:
// - Each discriminant remembers which functions use it.
// - Each function remembers the pairs associated with each argument.
// Maps an arg discriminant index to the list of functions using that
// discriminant.
using FunctionsUsedByDiscriminantMap =
SmallVector<SmallVector<Function *, 3>, 3>;
FunctionsUsedByDiscriminantMap functions_used_by_discriminant;
struct DiscriminantInfo {
int index;
KernelArgDiscriminant discriminant;
};
// Maps a function to an ordered list of discriminants and their indices.
// The -1 index is a sentinel indicating the argument does not use a
// descriptor.
// TODO(dneto): This probably shouldn't be a DenseMap because its value type
// is pretty big.
DenseMap<Function *, SmallVector<DiscriminantInfo, 3>>
discriminants_used_by_function;
// Remember the list of kernels with bodies, for convenience.
// This is in module-order.
SmallVector<Function *, 3> kernels_with_bodies;
int num_image_sampler_arguments = 0;
for (Function &F : M) {
// Only scan arguments of kernel functions that have bodies.
if (F.isDeclaration() || F.getCallingConv() != CallingConv::SPIR_KERNEL) {
continue;
}
kernels_with_bodies.push_back(&F);
auto &discriminants_list = discriminants_used_by_function[&F];
bool uses_barriers = CallTreeContainsGlobalBarrier(&F);
int arg_index = 0;
for (Argument &Arg : F.args()) {
Type *argTy = Arg.getType();
const auto arg_kind = clspv::GetArgKind(Arg);
int separation_token = 0;
switch (arg_kind) {
case clspv::ArgKind::SampledImage:
case clspv::ArgKind::StorageImage:
case clspv::ArgKind::Sampler:
if (always_distinct_image_sampler) {
separation_token = num_image_sampler_arguments;
}
num_image_sampler_arguments++;
break;
default:
break;
}
int coherent = 0;
if (uses_barriers && (arg_kind == clspv::ArgKind::Buffer ||
arg_kind == clspv::ArgKind::StorageImage)) {
// Coherency is only required if the argument is an SSBO or storage
// image that is both read and written to.
bool reads = false;
bool writes = false;
std::tie(reads, writes) = HasReadsAndWrites(&Arg);
coherent = (reads && writes) ? 1 : 0;
}
KernelArgDiscriminant key(argTy, arg_index, separation_token, coherent);
// First assume no descriptor is required.
discriminants_list.push_back(DiscriminantInfo{-1, key});
// Pointer-to-local arguments don't become resource variables.
if (arg_kind == clspv::ArgKind::Local) {
if (ShowDescriptors) {
errs() << "DBA: skip pointer-to-local\n\n";
}
} else {
int index;
auto where = discriminant_map.find(key);
if (where == discriminant_map.end()) {
index = int(discriminant_map.size());
// Save the new unique index for this discriminant.
discriminant_map[key] = index;
functions_used_by_discriminant.push_back(
SmallVector<Function *, 3>{&F});
} else {
index = where->second;
functions_used_by_discriminant[index].push_back(&F);
}
discriminants_list.back().index = index;
if (ShowDescriptors) {
outs() << F.getName() << " " << Arg.getName() << " -> index " << index
<< "\n";
}
}
arg_index++;
}
}
// Now map kernel arguments to descriptor sets and bindings.
// There are two buckets of descriptor sets:
// - The all_kernels_descriptor_set is for resources that are used
// by all kernels in the module.
// - Otherwise, each kernel gets its own descriptor set for its
// arguments that don't map to the same discriminant in *all*
// kernels. (It might map to a few, but not all.)
// The kUnallocated descriptor set value means "not yet allocated".
enum { kUnallocated = UINT_MAX };
unsigned all_kernels_descriptor_set = kUnallocated;
// Map the arg index to the binding to use in the all-descriptors descriptor
// set.
DenseMap<int, unsigned> all_kernels_binding_for_arg_index;
// Maps a function to the list of set and binding to use, per argument.
// For an argument that does not use a descriptor, its set and binding are
// both the kUnallocated value.
DenseMap<Function *, SmallVector<std::pair<unsigned, unsigned>, 3>>
set_and_binding_pairs_for_function;
// Determine set and binding for each kernel argument requiring a descriptor.
if (always_distinct_sets) {
for (Function *f_ptr : kernels_with_bodies) {
auto &set_and_binding_list = set_and_binding_pairs_for_function[f_ptr];
auto &discriminants_list = discriminants_used_by_function[f_ptr];
const auto set = clspv::TakeDescriptorIndex(&M);
unsigned binding = 0;
int arg_index = 0;
for (Argument &Arg : f_ptr->args()) {
set_and_binding_list.emplace_back(kUnallocated, kUnallocated);
if (discriminants_list[arg_index].index >= 0) {
if (clspv::GetArgKind(Arg) != clspv::ArgKind::PodPushConstant) {
// Don't assign a descriptor set to push constants.
set_and_binding_list.back().first = set;
}
set_and_binding_list.back().second = binding++;
}
arg_index++;
}
}
} else {
// Share resource variables.
for (Function *f_ptr : kernels_with_bodies) {
unsigned this_kernel_descriptor_set = kUnallocated;
unsigned this_kernel_next_binding = 0;
auto &discriminants_list = discriminants_used_by_function[f_ptr];
int arg_index = 0;
auto &set_and_binding_list = set_and_binding_pairs_for_function[f_ptr];
for (auto &info : discriminants_used_by_function[f_ptr]) {
set_and_binding_list.emplace_back(kUnallocated, kUnallocated);
if (discriminants_list[arg_index].index >= 0) {
// This argument will map to a resource.
unsigned set = kUnallocated;
unsigned binding = kUnallocated;
const bool is_push_constant_arg =
clspv::GetArgKind(*f_ptr->getArg(arg_index)) ==
clspv::ArgKind::PodPushConstant;
if (always_single_kernel_descriptor ||
functions_used_by_discriminant[info.index].size() ==
kernels_with_bodies.size() ||
is_push_constant_arg) {
// Reuse the descriptor because one of the following is true:
// - This kernel argument discriminant is consistent across all
// kernels.
// - Convention is to use a single descriptor for all kernels.
//
// Push constant args always take this path because they share a
// dummy descriptor, kUnallocated, that is never codegen'd.
if (!is_push_constant_arg) {
if (all_kernels_descriptor_set == kUnallocated) {
all_kernels_descriptor_set = clspv::TakeDescriptorIndex(&M);
}
set = all_kernels_descriptor_set;
}
auto where = all_kernels_binding_for_arg_index.find(arg_index);
if (where == all_kernels_binding_for_arg_index.end()) {
binding = all_kernels_binding_for_arg_index.size();
all_kernels_binding_for_arg_index[arg_index] = binding;
} else {
binding = where->second;
}
} else {
// Use a descriptor in the descriptor set dedicated to this
// kernel.
if (this_kernel_descriptor_set == kUnallocated) {
this_kernel_descriptor_set = clspv::TakeDescriptorIndex(&M);
}
set = this_kernel_descriptor_set;
binding = this_kernel_next_binding++;
}
set_and_binding_list.back().first = set;
set_and_binding_list.back().second = binding;
}
arg_index++;
}
}
}
// Rewrite the uses of the arguments.
IRBuilder<> Builder(M.getContext());
for (Function *f_ptr : kernels_with_bodies) {
auto &set_and_binding_list = set_and_binding_pairs_for_function[f_ptr];
auto &discriminants_list = discriminants_used_by_function[f_ptr];
const auto num_args = unsigned(set_and_binding_list.size());
if (!always_distinct_sets &&
(num_args != unsigned(discriminants_list.size()))) {
errs() << "num_args " << num_args << " != num discriminants "
<< discriminants_list.size() << "\n";
llvm_unreachable("Bad accounting in descriptor allocation");
}
const auto num_fun_args = unsigned(f_ptr->arg_end() - f_ptr->arg_begin());
if (num_fun_args != num_args) {
errs() << f_ptr->getName() << " has " << num_fun_args
<< " params but we have set_and_binding list of length "
<< num_args << "\n";
errs() << *f_ptr << "\n";
errs() << *(f_ptr->getType()) << "\n";
for (auto &arg : f_ptr->args()) {
errs() << " " << arg << "\n";
}
llvm_unreachable("Bad accounting in descriptor allocation. Mismatch with "
"function param list");
}
// Prepare to insert arg remapping instructions at the start of the
// function.
Builder.SetInsertPoint(f_ptr->getEntryBlock().getFirstNonPHI());
int arg_index = 0;
for (Argument &Arg : f_ptr->args()) {
if (discriminants_list[arg_index].index >= 0) {
Changed = true;
// This argument needs to be rewritten.
const auto set = set_and_binding_list[arg_index].first;
const auto binding = set_and_binding_list[arg_index].second;
#if 0
// TODO(dneto) Should we ignore unused arguments? It's probably not an
// issue in practice. Adding this condition would change a bunch of our
// tests.
if (!Arg.hasNUsesOrMore(1)) {
continue;
}
#endif
Type *argTy = discriminants_list[arg_index].discriminant.type;
assert(arg_index ==
discriminants_list[arg_index].discriminant.arg_index);
if (ShowDescriptors) {
outs() << "DBA: Function " << f_ptr->getName() << " arg " << arg_index
<< " type " << *argTy << "\n";
}
const auto arg_kind = clspv::GetArgKind(Arg);
Type *resource_type = nullptr;
unsigned addr_space = kUnallocated;
// TODO(dneto): Describe opaque case.
// For pointer-to-global and POD arguments, we will remap this
// kernel argument to a SPIR-V module-scope OpVariable, as follows:
//
// Create a %clspv.resource.var.<kind>.N function that returns
// the same kind of pointer that the OpVariable evaluates to.
// The first two arguments are the descriptor set and binding
// to use.
//
// For each call to a %clspv.resource.var.<kind>.N with a unique
// descriptor set and binding, the SPIRVProducer pass will:
// 1) Create a unique OpVariable
// 2) Map uses of the call to the function with the base pointer
// to use.
// For a storage buffer it's the elements in the runtime
// array in the module-scope storage buffer variable.
// So it's something that maps to:
// OpAccessChain %ptr_to_elem %the-var %uint_0 %uint_0
// For POD data, it's something like this:
// OpAccessChain %ptr_to_elem %the-var %uint_0
// 3) Generate no SPIR-V code for the call itself.
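//
// As an illustrative sketch (exact names vary), a storage buffer argument
//   float addrspace(1)* %arg
// has its uses rewritten roughly as:
//   %0 = call { [0 x float] } addrspace(1)* @clspv.resource.var.0(...)
//   %1 = getelementptr ... %0, i32 0, i32 0, i32 0
// so %arg's users now see a pointer to the first runtime-array element.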
switch (arg_kind) {
case clspv::ArgKind::Buffer: {
// If original argument is:
// Elem addrspace(1)*
// Then make a zero-length array to mimic a StorageBuffer struct
// whose first element is a RuntimeArray:
//
// { [0 x Elem] }
//
// Use unnamed struct types so we generate less SPIR-V code.
// Create the type only once.
auto *arr_type = ArrayType::get(argTy->getPointerElementType(), 0);
resource_type = StructType::get(arr_type);
// Preserve the address space in case the pointer is passed into a
// helper function: we don't want to change the type of the helper
// function parameter.
addr_space = argTy->getPointerAddressSpace();
break;
}
case clspv::ArgKind::BufferUBO: {
// If original argument is:
// Elem addrspace(2)*
// Then make an n-element array to mimic a Uniform struct whose
// first element is an array:
//
// { [n x Elem] }
//
// Use unnamed struct types so we generate less SPIR-V code.
// Max UBO size can be specified on the command line. Size the array
// to pretend we are using that space.
uint64_t struct_size = M.getDataLayout().getTypeAllocSize(
argTy->getPointerElementType());
uint64_t num_elements =
clspv::Option::MaxUniformBufferSize() / struct_size;
// Create the type only once.
auto *arr_type =
ArrayType::get(argTy->getPointerElementType(), num_elements);
resource_type = StructType::get(arr_type);
// Preserve the address space in case the pointer is passed into a
// helper function: we don't want to change the type of the helper
// function parameter.
addr_space = argTy->getPointerAddressSpace();
break;
}
case clspv::ArgKind::Pod:
case clspv::ArgKind::PodUBO:
case clspv::ArgKind::PodPushConstant: {
// If original argument is:
// Elem %arg
// Then make a StorageBuffer struct whose element is pod-type:
//
// { Elem }
//
// Use unnamed struct types so we generate less SPIR-V code.
resource_type = StructType::get(argTy);
if (arg_kind == clspv::ArgKind::PodUBO)
addr_space = clspv::AddressSpace::Uniform;
else if (arg_kind == clspv::ArgKind::PodPushConstant)
addr_space = clspv::AddressSpace::PushConstant;
else
addr_space = clspv::AddressSpace::Global;
break;
}
case clspv::ArgKind::Sampler:
case clspv::ArgKind::SampledImage:
case clspv::ArgKind::StorageImage:
// We won't be translating the value here. Keep the type the same,
// since calls using these values need to keep the same type.
resource_type = argTy->getPointerElementType();
addr_space = argTy->getPointerAddressSpace();
break;
default:
errs() << "Unhandled type " << *argTy << "\n";
llvm_unreachable("Allocation of descriptors: Unhandled type");
}
assert(resource_type);
auto fn_name = clspv::ResourceAccessorFunction() + "." +
std::to_string(discriminants_list[arg_index].index);
Function *var_fn = M.getFunction(fn_name);
if (!var_fn) {
// Make the function
PointerType *ptrTy = PointerType::get(resource_type, addr_space);
// The parameters are:
// descriptor set
// binding
// arg kind
// arg index
// discriminant index
// coherent
Type *i32 = Builder.getInt32Ty();
FunctionType *fnTy =
FunctionType::get(ptrTy, {i32, i32, i32, i32, i32, i32}, false);
var_fn =
cast<Function>(M.getOrInsertFunction(fn_name, fnTy).getCallee());
}
// Replace uses of this argument with something dependent on a GEP
// into the result of a call to the special builtin.
auto *set_arg = Builder.getInt32(set);
auto *binding_arg = Builder.getInt32(binding);
auto *arg_kind_arg = Builder.getInt32(unsigned(arg_kind));
auto *arg_index_arg = Builder.getInt32(arg_index);
auto *discriminant_index_arg =
Builder.getInt32(discriminants_list[arg_index].index);
auto *coherent_arg = Builder.getInt32(
discriminants_list[arg_index].discriminant.coherent);
auto *call = Builder.CreateCall(
var_fn, {set_arg, binding_arg, arg_kind_arg, arg_index_arg,
discriminant_index_arg, coherent_arg});
Value *replacement = nullptr;
Value *zero = Builder.getInt32(0);
switch (arg_kind) {
case clspv::ArgKind::Buffer:
case clspv::ArgKind::BufferUBO:
// Return a GEP to the first element
// in the runtime array we'll make.
replacement = Builder.CreateGEP(call, {zero, zero, zero});
break;
case clspv::ArgKind::Pod:
case clspv::ArgKind::PodUBO:
case clspv::ArgKind::PodPushConstant: {
// Replace with a load of the start of the (virtual) variable.
auto *gep = Builder.CreateGEP(call, {zero, zero});
replacement = Builder.CreateLoad(gep);
} break;
case clspv::ArgKind::SampledImage:
case clspv::ArgKind::StorageImage:
case clspv::ArgKind::Sampler: {
// The call returns a pointer to an opaque type. Eventually the
// SPIR-V will need to load the variable, so the natural thing would
// be to emit an LLVM load here. But LLVM does not allow a load of
// an opaque type because it's unsized. So keep the bare call here,
// and do the translation to a load in the SPIRVProducer pass.
replacement = call;
} break;
case clspv::ArgKind::Local:
llvm_unreachable("local is unhandled");
}
if (ShowDescriptors) {
outs() << "DBA: Map " << *argTy << " " << arg_index << "\n"
<< "DBA: index " << discriminants_list[arg_index].index
<< " -> (" << set << "," << binding << ")"
<< "\n";
outs() << "DBA: resource type " << *resource_type << "\n";
outs() << "DBA: var fn " << var_fn->getName() << "\n";
outs() << "DBA: var call " << *call << "\n";
outs() << "DBA: var replacement " << *replacement << "\n";
outs() << "DBA: var replacement ty " << *(replacement->getType())
<< "\n";
outs() << "\n\n";
}
Arg.replaceAllUsesWith(replacement);
}
arg_index++;
}
}
return Changed;
}
bool AllocateDescriptorsPass::AllocateLocalKernelArgSpecIds(Module &M) {
bool Changed = false;
if (ShowDescriptors) {
outs() << "Allocate local kernel arg spec ids\n";
}
// Maps argument type to assigned SpecIds.
DenseMap<Type *, SmallVector<uint32_t, 4>> spec_id_types;
// Tracks SpecIds assigned in the current function.
DenseSet<int> function_spec_ids;
// Tracks newly allocated spec ids.
std::vector<std::pair<Type *, uint32_t>> function_allocations;
// Allocates a SpecId for |type|.
auto GetSpecId = [&M, &spec_id_types, &function_spec_ids,
&function_allocations](Type *type) {
// Attempt to reuse a SpecId. If the SpecId is associated with the same type
// in another kernel and not yet assigned to this kernel it can be reused.
auto where = spec_id_types.find(type);
if (where != spec_id_types.end()) {
for (auto id : where->second) {
if (!function_spec_ids.count(id)) {
// Reuse |id| for |type| in this kernel. Record the use of |id| in
// this kernel.
function_allocations.emplace_back(type, id);
function_spec_ids.insert(id);
return id;
}
}
}
// Need to allocate a new SpecId.
uint32_t spec_id =
clspv::AllocateSpecConstant(&M, clspv::SpecConstant::kLocalMemorySize);
function_allocations.push_back(std::make_pair(type, spec_id));
function_spec_ids.insert(spec_id);
return spec_id;
};
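// For example, if kernel A's local float* argument was assigned SpecId 3,
// then kernel B's first local float* argument reuses SpecId 3, while a
// second local float* argument in the same kernel gets a fresh SpecId.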
IRBuilder<> Builder(M.getContext());
for (Function &F : M) {
// Only scan arguments of kernel functions that have bodies.
if (F.isDeclaration() || F.getCallingConv() != CallingConv::SPIR_KERNEL) {
continue;
}
// Prepare to insert arg remapping instructions at the start of the
// function.
Builder.SetInsertPoint(F.getEntryBlock().getFirstNonPHI());
function_allocations.clear();
function_spec_ids.clear();
int arg_index = 0;
for (Argument &Arg : F.args()) {
Type *argTy = Arg.getType();
const auto arg_kind = clspv::GetArgKind(Arg);
if (arg_kind == clspv::ArgKind::Local) {
// Assign a SpecId to this argument.
int spec_id = GetSpecId(Arg.getType());
if (ShowDescriptors) {
outs() << "DBA: " << F.getName() << " arg " << arg_index << " " << Arg
<< " allocated SpecId " << spec_id << "\n";
}
// The type returned by the accessor function is [0 x Elem]
// addrspace(3)*. The zero-sized array is used to match the correct
// indexing required by GEPs, but the zero size will eventually be
// codegen'd as an OpSpecConstant.
auto fn_name =
clspv::WorkgroupAccessorFunction() + "." + std::to_string(spec_id);
Function *var_fn = M.getFunction(fn_name);
auto *zero = Builder.getInt32(0);
auto *array_ty = ArrayType::get(argTy->getPointerElementType(), 0);
auto *ptr_ty =
PointerType::get(array_ty, argTy->getPointerAddressSpace());
if (!var_fn) {
// Generate the function.
Type *i32 = Builder.getInt32Ty();
FunctionType *fn_ty = FunctionType::get(ptr_ty, i32, false);
var_fn =
cast<Function>(M.getOrInsertFunction(fn_name, fn_ty).getCallee());
}
// Generate an accessor call.
auto *spec_id_arg = Builder.getInt32(spec_id);
auto *call = Builder.CreateCall(var_fn, {spec_id_arg});
// Add the correct GEP. Since the workgroup variable is [0 x <type>]
// addrspace(3)*, generate two zero indices for the GEP.
auto *replacement = Builder.CreateGEP(call, {zero, zero});
Arg.replaceAllUsesWith(replacement);
// We record the assignment of the spec id for this particular argument
// in module-level metadata. This allows us to reconstruct the
// connection during SPIR-V generation. We cannot use the argument as an
// operand to the function because DirectResourceAccess may generate
// these calls in different function scopes.
auto *arg_const = Builder.getInt32(arg_index);
NamedMDNode *nmd =
M.getOrInsertNamedMetadata(clspv::LocalSpecIdMetadataName());
Metadata *ops[3];
ops[0] = ValueAsMetadata::get(&F);
ops[1] = ConstantAsMetadata::get(arg_const);
ops[2] = ConstantAsMetadata::get(spec_id_arg);
MDTuple *tuple = MDTuple::get(M.getContext(), ops);
nmd->addOperand(tuple);
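// The recorded tuple has the shape { kernel function, arg index, spec id },
// e.g. a tuple associating argument 2 of @foo with SpecId 3.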
Changed = true;
}
++arg_index;
}
// Move newly allocated SpecIds for this function into the overall mapping.
for (auto &pair : function_allocations) {
spec_id_types[pair.first].push_back(pair.second);
}
}
return Changed;
}
bool AllocateDescriptorsPass::CallTreeContainsGlobalBarrier(Function *F) {
auto iter = barrier_map_.find(F);
if (iter != barrier_map_.end()) {
return iter->second;
}
bool uses_barrier = false;
for (auto &BB : *F) {
for (auto &I : BB) {
if (auto *call = dyn_cast<CallInst>(&I)) {
// For barrier and mem_fence semantics, only Uniform (covering Uniform
// and StorageBuffer storage classes) and Image semantics are checked
// because Workgroup variables are inherently coherent (and do not
// require the decoration).
auto &func_info = clspv::Builtins::Lookup(call->getCalledFunction());
if (func_info.getType() == clspv::Builtins::kSpirvOp) {
auto *arg0 = dyn_cast<ConstantInt>(call->getArgOperand(0));
spv::Op opcode = static_cast<spv::Op>(arg0->getZExtValue());
if (opcode == spv::OpControlBarrier) {
// barrier()
if (auto *semantics = dyn_cast<ConstantInt>(call->getOperand(3))) {
uses_barrier =
(semantics->getZExtValue() & kMemorySemanticsUniformMemory) ||
(semantics->getZExtValue() & kMemorySemanticsImageMemory);
}
} else if (opcode == spv::OpMemoryBarrier) {
// mem_fence()
if (auto *semantics = dyn_cast<ConstantInt>(call->getOperand(2))) {
uses_barrier =
(semantics->getZExtValue() & kMemorySemanticsUniformMemory) ||
(semantics->getZExtValue() & kMemorySemanticsImageMemory);
}
}
} else if (!call->getCalledFunction()->isDeclaration()) {
// Continue searching in the subfunction.
uses_barrier =
CallTreeContainsGlobalBarrier(call->getCalledFunction());
}
if (uses_barrier)
break;
}
if (uses_barrier)
break;
}
if (uses_barrier)
break;
}
barrier_map_.insert(std::make_pair(F, uses_barrier));
return uses_barrier;
}
std::pair<bool, bool> AllocateDescriptorsPass::HasReadsAndWrites(Value *V) {
// Atomics and OpenCL builtins modf and frexp are all represented as function
// calls.
//
// A user is interesting if it reads or writes memory or could eventually read
// or write memory.
auto IsInterestingUser = [](const User *user) {
if (isa<StoreInst>(user) || isa<LoadInst>(user) || isa<CallInst>(user) ||
user->getType()->isPointerTy())
return true;
return false;
};
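// For example, in
//   kernel void k(global int *p) { p[0] = p[1]; }
// tracing |p|'s uses finds both a load and a store, so this returns
// (read=true, write=true) and |p| would be marked coherent if the kernel
// also uses a global barrier.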
bool read = false;
bool write = false;
DenseSet<Value *> visited;
std::vector<std::pair<Value *, unsigned>> stack;
for (auto &Use : V->uses()) {
if (IsInterestingUser(Use.getUser()))
stack.push_back(std::make_pair(Use.getUser(), Use.getOperandNo()));
}
while (!stack.empty() && !(read && write)) {
Value *value = stack.back().first;
unsigned operand_no = stack.back().second;
stack.pop_back();
if (!visited.insert(value).second)
continue;
if (isa<LoadInst>(value)) {
read = true;
} else if (isa<StoreInst>(value)) {
write = true;
} else {
auto *call = dyn_cast<CallInst>(value);
if (call && !call->getCalledFunction()->isDeclaration()) {
// Trace through the function call and grab the right argument.
auto arg_iter = call->getCalledFunction()->arg_begin();
std::advance(arg_iter, operand_no);
for (auto &Use : arg_iter->uses()) {
auto *User = Use.getUser();
if (IsInterestingUser(User))
stack.push_back(std::make_pair(Use.getUser(), Use.getOperandNo()));
}
} else if (call) {
auto func_info = clspv::Builtins::Lookup(call->getCalledFunction());
// Note that image queries (e.g. get_image_width()) do not touch the
// actual image memory.
switch (func_info.getType()) {
case clspv::Builtins::kReadImagef:
case clspv::Builtins::kReadImagei:
case clspv::Builtins::kReadImageui:
case clspv::Builtins::kReadImageh:
read = true;
break;
case clspv::Builtins::kWriteImagef:
case clspv::Builtins::kWriteImagei:
case clspv::Builtins::kWriteImageui:
case clspv::Builtins::kWriteImageh:
write = true;
break;
case clspv::Builtins::kGetImageWidth:
case clspv::Builtins::kGetImageHeight:
case clspv::Builtins::kGetImageDepth:
case clspv::Builtins::kGetImageDim:
break;
default:
// For other calls, check the function attributes.
if (!call->getCalledFunction()->doesNotAccessMemory()) {
if (!call->getCalledFunction()->doesNotReadMemory())
read = true;
if (!call->getCalledFunction()->onlyReadsMemory())
write = true;
}
break;
}
} else {
// Trace uses that are still pointers or are function calls.
for (auto &U : value->uses()) {
auto *User = U.getUser();
if (IsInterestingUser(User))
stack.push_back(std::make_pair(U.getUser(), U.getOperandNo()));
}
}
}
}
return std::make_pair(read, write);
}