/*
* Copyright 2021 WebAssembly Community Group participants
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Find heap allocations that never escape the current function, and lower the
// allocation's data into locals. That is, avoid allocating a GC object, and
// instead use one local for each of its fields.
//
// To get a sense for what this pass does, here is an example. First, in
// pseudocode:
//
//   ref = new Int(42)
//   do {
//     ref.set(ref.get() + 1)
//   } while (import(ref.get()))
//
// That is, we allocate an int on the heap and use it as a counter,
// unnecessarily, as it could be a normal int on the stack.
//
// Wat:
//
// (module
//  ;; A boxed integer: an entire struct just to hold an int.
//  (type $boxed-int (struct (field (mut i32))))
//
//  (import "env" "import" (func $import (param i32) (result i32)))
//
//  (func $example
//   (local $ref (ref null $boxed-int))
//
//   ;; Allocate a boxed integer of 42 and save the reference to it.
//   (local.set $ref
//    (struct.new $boxed-int
//     (i32.const 42)
//    )
//   )
//
//   ;; Increment the integer in a loop, looking for some condition.
//   (loop $loop
//    (struct.set $boxed-int 0
//     (local.get $ref)
//     (i32.add
//      (struct.get $boxed-int 0
//       (local.get $ref)
//      )
//      (i32.const 1)
//     )
//    )
//    (br_if $loop
//     (call $import
//      (struct.get $boxed-int 0
//       (local.get $ref)
//      )
//     )
//    )
//   )
//  )
// )
//
// Before this pass, the optimizer could do essentially nothing with this. Even
// with this pass, running -O1 has no effect, as the pass is only used in -O2+.
// However, running --heap2local -O1 leads to this:
//
// (func $0
//  (local $0 i32)
//  (local.set $0
//   (i32.const 42)
//  )
//  (loop $loop
//   (br_if $loop
//    (call $import
//     (local.tee $0
//      (i32.add
//       (local.get $0)
//       (i32.const 1)
//      )
//     )
//    )
//   )
//  )
// )
//
// All the GC heap operations have been removed, and we just have a plain int
// now, allowing a bunch of other opts to run.
//
// For us to replace an allocation with locals, we need to prove two things:
//
// * It must not escape from the function. If it escapes, we must pass out a
// reference anyhow. (In theory we could do a whole-program transformation
// to replace the reference with parameters in some cases, but inlining can
// hopefully let us optimize most cases.)
// * It must be used "exclusively", without overlap. That is, we cannot
// handle the case where a local.get might return our allocation, but might
// also get some other value. We also cannot handle a select where one arm
// is our allocation and another is something else. If the use is exclusive
// then we have a simple guarantee of being able to replace the heap
// allocation with the locals.
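//
// For example (a sketch), this allocation escapes into the call and cannot
// be optimized:
//
//  (call $import
//   (struct.new $T ..)
//  )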
//
// Non-exclusive uses are optimizable too, but they require more work and add
// overhead. For example, here is a non-exclusive use:
//
//   var x;
//   if (..) {
//     x = new Something(); // the allocation we want to optimize
//   } else {
//     x = something_else;
//   }
//   // 'x' here is not used exclusively by our allocation
//   return x.field0;
//
// To optimize x.field0, we'd need to check if it contains our allocation or
// not, perhaps marking a boolean as true when it is, then doing an if on that
// local, etc.:
//
//   var x_is_our_alloc; // whether x is our allocation
//   var x; // keep x around for when it is not our allocation
//   var x_field0; // the value of field0 on x, when x is our allocation
//   if (..) {
//     x_field0 = ..default value for the type..
//     x_is_our_alloc = true;
//   } else {
//     x = something_else;
//     x_is_our_alloc = false;
//   }
//   return x_is_our_alloc ? x_field0 : x.field0;
//
// (node splitting/code duplication is another possible approach). On the other
// hand, if the allocation is used exclusively in all places (the if-else above
// does not have an else any more) then we can do this:
//
//   var x_field0; // the value of field0 on x
//   if (..) {
//     x_field0 = ..default value for the type..
//   }
//   return x_field0;
//
// This optimization focuses on such cases.
//
#include "ir/abstract.h"
#include "ir/bits.h"
#include "ir/branch-utils.h"
#include "ir/eh-utils.h"
#include "ir/find_all.h"
#include "ir/local-graph.h"
#include "ir/parents.h"
#include "ir/properties.h"
#include "ir/type-updating.h"
#include "ir/utils.h"
#include "pass.h"
#include "support/unique_deferring_queue.h"
#include "wasm-builder.h"
#include "wasm.h"
namespace wasm {
namespace {
// Interactions between a child and a parent, with regard to the behavior of the
// allocation.
enum class ParentChildInteraction : int8_t {
// The parent lets the child escape. E.g. the parent is a call.
Escapes,
// The parent fully consumes the child in a safe, non-escaping way, and
// after consuming it nothing remains to flow further through the parent.
// E.g. the parent is a struct.get, which reads from the allocated heap
// value and does nothing more with the reference.
FullyConsumes,
// The parent flows the child out, that is, the child is the single value
// that can flow out from the parent. E.g. the parent is a block with no
// branches and the child is the final value that is returned.
Flows,
// The parent does not consume the child completely, so the child's value
// can be used through it. However the child does not flow cleanly through.
  // E.g. the parent is a block with branches, and the values sent on those
  // branches may be returned from the block and not only the child. This
  // means the allocation
// is not used in an exclusive way, and we cannot optimize it.
Mixes,
// No interaction (not relevant to the analysis).
None,
};
// Core analysis that provides an escapes() method to check if an allocation
// escapes in a way that prevents optimizing it away as described above. It also
// stashes information about the relevant expressions as it goes, which helps
// optimization later (|reachedInteractions|).
struct EscapeAnalyzer {
// To find what escapes, we need to follow where values flow, both up to
// parents, and via branches, and through locals.
//
  // We use a lazy graph here because we only need this for reference locals,
  // and even among them, only the ones to which we see an allocation stored.
const LazyLocalGraph& localGraph;
const Parents& parents;
const BranchUtils::BranchTargets& branchTargets;
const PassOptions& passOptions;
Module& wasm;
EscapeAnalyzer(const LazyLocalGraph& localGraph,
const Parents& parents,
const BranchUtils::BranchTargets& branchTargets,
const PassOptions& passOptions,
Module& wasm)
: localGraph(localGraph), parents(parents), branchTargets(branchTargets),
passOptions(passOptions), wasm(wasm) {}
// We must track all the local.sets that write the allocation, to verify
// exclusivity.
std::unordered_set<LocalSet*> sets;
// A map of every expression we reached during the flow analysis (which is
// exactly all the places where our allocation is used) to the interaction of
// the allocation there. If we optimize, anything in this map will be fixed up
// at the end, and how we fix it up may depend on the interaction,
  // specifically, it can matter if the allocation flows out of here (Flows,
// which is the case for e.g. a Block that we flow through) or if it is fully
// consumed (FullyConsumes, e.g. for a struct.get). We do not store irrelevant
// things here (that is, anything not in the map has the interaction |None|,
// implicitly).
std::unordered_map<Expression*, ParentChildInteraction> reachedInteractions;
// Analyze an allocation to see if it escapes or not.
bool escapes(Expression* allocation) {
// A queue of flows from children to parents. When something is in the queue
    // here then it is assumed that it is ok for the allocation to be at the
    // child (that is, we have already checked the child before placing it in
    // the queue), and we need to check if it is ok to be at the parent, and
    // to flow from the child to the parent. We will analyze that (see
// ParentChildInteraction, above) and continue accordingly.
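    //
    // For example, given the following (a hypothetical sketch), we would flow
    // from the allocation to the local.set, note the set, and then flow from
    // the local.get that reads it to the struct.get, which fully consumes it:
    //
    //  (local.set $x (struct.new $T ..))
    //  (struct.get $T 0 (local.get $x))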
using ChildAndParent = std::pair<Expression*, Expression*>;
UniqueNonrepeatingDeferredQueue<ChildAndParent> flows;
// Start the flow from the allocation itself to its parent.
flows.push({allocation, parents.getParent(allocation)});
// Keep flowing while we can.
while (!flows.empty()) {
auto flow = flows.pop();
auto* child = flow.first;
auto* parent = flow.second;
auto interaction = getParentChildInteraction(allocation, parent, child);
if (interaction == ParentChildInteraction::Escapes ||
interaction == ParentChildInteraction::Mixes) {
// If the parent may let us escape, or the parent mixes other values
// up with us, give up.
return true;
}
// The parent either fully consumes us, or flows us onwards; either way,
// we can proceed here, hopefully.
assert(interaction == ParentChildInteraction::FullyConsumes ||
interaction == ParentChildInteraction::Flows);
// We can proceed, as the parent interacts with us properly, and we are
// the only allocation to get here.
if (interaction == ParentChildInteraction::Flows) {
// The value flows through the parent; we need to look further at the
// grandparent.
flows.push({parent, parents.getParent(parent)});
}
if (auto* set = parent->dynCast<LocalSet>()) {
// This is one of the sets we are written to, and so we must check for
// exclusive use of our allocation by all the gets that read the value.
// Note the set, and we will check the gets at the end once we know all
// of our sets.
sets.insert(set);
// We must also look at how the value flows from those gets.
for (auto* get : localGraph.getSetInfluences(set)) {
flows.push({get, parents.getParent(get)});
}
}
// If the parent may send us on a branch, we will need to look at the flow
// to the branch target(s).
for (auto name : branchesSentByParent(child, parent)) {
flows.push({child, branchTargets.getTarget(name)});
}
// If we got to here, then we can continue to hope that we can optimize
// this allocation. Mark the parent and child as reached by it, and
// continue. The child flows the value to the parent, and the parent's
// behavior was computed before.
reachedInteractions[child] = ParentChildInteraction::Flows;
reachedInteractions[parent] = interaction;
}
// We finished the loop over the flows. Do the final checks.
if (!getsAreExclusiveToSets()) {
return true;
}
// Nothing escapes, hurray!
return false;
}
ParentChildInteraction getParentChildInteraction(Expression* allocation,
Expression* parent,
Expression* child) const {
// If there is no parent then we are the body of the function, and that
// means we escape by flowing to the caller.
if (!parent) {
return ParentChildInteraction::Escapes;
}
struct Checker : public Visitor<Checker> {
Expression* allocation;
Expression* child;
// Assume escaping (or some other problem we cannot analyze) unless we are
// certain otherwise.
bool escapes = true;
// Assume we do not fully consume the value unless we are certain
// otherwise. If this is set to true, then we do not need to check any
// further. If it remains false, then we will analyze the value that
// falls through later to check for mixing.
//
// Note that this does not need to be set for expressions if their type
// proves that the value does not continue onwards (e.g. if their type is
// none, or not a reference type), but for clarity some do still mark this
// field as true when it is clearly so.
bool fullyConsumes = false;
// General operations
void visitBlock(Block* curr) {
escapes = false;
// We do not mark fullyConsumes as the value may continue through this
// and other control flow structures.
}
// Note that If is not supported here, because for our value to flow
// through it there must be an if-else, and that means there is no single
// value falling through anyhow.
void visitLoop(Loop* curr) { escapes = false; }
void visitDrop(Drop* curr) {
escapes = false;
fullyConsumes = true;
}
void visitBreak(Break* curr) { escapes = false; }
void visitSwitch(Switch* curr) { escapes = false; }
// Local operations. Locals by themselves do not escape; the analysis
// tracks where locals are used.
void visitLocalGet(LocalGet* curr) { escapes = false; }
void visitLocalSet(LocalSet* curr) { escapes = false; }
// Reference operations. TODO add more
void visitRefIsNull(RefIsNull* curr) {
// The reference is compared to null, but nothing more.
escapes = false;
fullyConsumes = true;
}
void visitRefEq(RefEq* curr) {
// The reference is compared for identity, but nothing more.
escapes = false;
fullyConsumes = true;
}
void visitRefAs(RefAs* curr) {
// TODO General OptimizeInstructions integration, that is, since we know
// that our allocation is what flows into this RefAs, we can
// know the exact outcome of the operation.
if (curr->op == RefAsNonNull) {
// As it is our allocation that flows through here, we know it is not
// null (so there is no trap), and we can continue to (hopefully)
// optimize this allocation.
escapes = false;
}
}
void visitRefTest(RefTest* curr) {
escapes = false;
fullyConsumes = true;
}
void visitRefCast(RefCast* curr) {
// Whether the cast succeeds or fails, it does not escape.
escapes = false;
// If the cast fails then the allocation is fully consumed and does not
// flow any further (instead, we trap).
if (!Type::isSubType(allocation->type, curr->type)) {
fullyConsumes = true;
}
}
// GC operations.
void visitStructSet(StructSet* curr) {
// The reference does not escape (but the value is stored to memory and
// therefore might).
if (curr->ref == child) {
escapes = false;
fullyConsumes = true;
}
}
void visitStructGet(StructGet* curr) {
escapes = false;
fullyConsumes = true;
}
void visitStructRMW(StructRMW* curr) {
if (curr->ref == child) {
escapes = false;
fullyConsumes = true;
}
}
void visitStructCmpxchg(StructCmpxchg* curr) {
if (curr->ref == child || curr->expected == child) {
escapes = false;
fullyConsumes = true;
}
}
void visitArraySet(ArraySet* curr) {
if (!curr->index->is<Const>()) {
// Array operations on nonconstant indexes do not escape in the normal
// sense, but they do escape from our being able to analyze them, so
// stop as soon as we see one.
return;
}
// As StructGet.
if (curr->ref == child) {
escapes = false;
fullyConsumes = true;
}
}
void visitArrayGet(ArrayGet* curr) {
if (!curr->index->is<Const>()) {
return;
}
escapes = false;
fullyConsumes = true;
}
// TODO other GC operations
} checker;
checker.allocation = allocation;
checker.child = child;
checker.visit(parent);
if (checker.escapes) {
return ParentChildInteraction::Escapes;
}
// If the parent returns a type that is not a reference, then by definition
// it fully consumes the value as it does not flow our allocation onward.
if (checker.fullyConsumes || !parent->type.isRef()) {
return ParentChildInteraction::FullyConsumes;
}
// Finally, check for mixing. If the child is the immediate fallthrough
// of the parent then no other values can be mixed in.
if (Properties::getImmediateFallthrough(parent, passOptions, wasm) ==
child) {
return ParentChildInteraction::Flows;
}
// Likewise, if the child branches to the parent, and it is the sole branch,
// with no other value exiting the block (in particular, no final value at
// the end that flows out), then there is no mixing.
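    // For example (a hypothetical sketch), here the br is the sole branch to
    // the block, it sends the child, and the block ends in an unreachable, so
    // nothing else can flow out:
    //
    //  (block $target (result (ref $T))
    //   (br $target
    //    (..the child, carrying our allocation..)
    //   )
    //   (unreachable)
    //  )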
auto branches =
branchTargets.getBranches(BranchUtils::getDefinedName(parent));
if (branches.size() == 1 &&
BranchUtils::getSentValue(*branches.begin()) == child) {
// TODO: support more types of branch targets.
if (auto* parentAsBlock = parent->dynCast<Block>()) {
if (parentAsBlock->list.back()->type == Type::unreachable) {
return ParentChildInteraction::Flows;
}
}
}
// TODO: Also check for safe merges where our allocation is in all places,
// like two if or select arms, or branches.
return ParentChildInteraction::Mixes;
}
const BranchUtils::NameSet branchesSentByParent(Expression* child,
Expression* parent) {
BranchUtils::NameSet names;
BranchUtils::operateOnScopeNameUsesAndSentValues(
parent, [&](Name name, Expression* value) {
if (value == child) {
names.insert(name);
}
});
return names;
}
// Verify exclusivity of all the gets for a bunch of sets. That is, assuming
// the sets are exclusive (they all write exactly our allocation, and nothing
// else), we need to check whether all the gets that read that value cannot
// read anything else (which would be the case if another set writes to that
// local, in the right live range).
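  //
  // For example, exclusivity fails here (a hypothetical sketch), as the final
  // get may read either set:
  //
  //  (local.set $x (struct.new $T ..)) ;; a set of our allocation
  //  (if (..)
  //   (local.set $x (..something else..))
  //  )
  //  (struct.get $T 0 (local.get $x))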
bool getsAreExclusiveToSets() {
// Find all the relevant gets (which may overlap between the sets).
std::unordered_set<LocalGet*> gets;
for (auto* set : sets) {
for (auto* get : localGraph.getSetInfluences(set)) {
gets.insert(get);
}
}
// Check that the gets can only read from the specific known sets.
for (auto* get : gets) {
for (auto* set : localGraph.getSets(get)) {
if (sets.count(set) == 0) {
return false;
}
}
}
return true;
}
// Helper function for Struct2Local and Array2Struct. Given an old expression
// that is being replaced by a new one, add the proper interaction for the
// replacement.
void applyOldInteractionToReplacement(Expression* old, Expression* rep) {
// We can only replace something relevant that we found in the analysis.
// (Not only would anything else be invalid to process, but also we wouldn't
// know what interaction to give the replacement.)
assert(reachedInteractions.count(old));
// The replacement should have the same interaction as the thing it
// replaces, since it is a drop-in replacement for it. The one exception is
// when we replace with something unreachable, which is the result of us
// figuring out that some code will trap at runtime. In that case, we've
// made the code unreachable and the allocation does not interact with that
// code at all.
if (rep->type != Type::unreachable) {
reachedInteractions[rep] = reachedInteractions[old];
}
}
// Get the interaction of an expression.
ParentChildInteraction getInteraction(Expression* curr) {
auto iter = reachedInteractions.find(curr);
if (iter == reachedInteractions.end()) {
      // The allocation does not interact with this expression at all.
return ParentChildInteraction::None;
}
return iter->second;
}
};
// An optimizer that handles the rewriting to turn a struct allocation into
// locals. We run this after proving that allocation does not escape.
//
// TODO: Doing a single rewrite walk at the end (for all structs) would be more
// efficient, but it would need to be more complex.
struct Struct2Local : PostWalker<Struct2Local> {
StructNew* allocation;
// The analyzer is not |const| because we update
// |analyzer.reachedInteractions| as we go (see replaceCurrent, below).
EscapeAnalyzer& analyzer;
Function* func;
Module& wasm;
Builder builder;
const FieldList& fields;
Struct2Local(StructNew* allocation,
EscapeAnalyzer& analyzer,
Function* func,
Module& wasm)
: allocation(allocation), analyzer(analyzer), func(func), wasm(wasm),
builder(wasm), fields(allocation->type.getHeapType().getStruct().fields) {
// Allocate locals to store the allocation's fields in.
for (auto field : fields) {
localIndexes.push_back(builder.addVar(func, field.type));
}
// Replace the things we need to using the visit* methods.
walk(func->body);
if (refinalize) {
ReFinalize().walkFunctionInModule(func, &wasm);
}
}
// Maps indexes in the struct to the local index that will replace them.
std::vector<Index> localIndexes;
// In rare cases we may need to refinalize, see below.
bool refinalize = false;
Expression* replaceCurrent(Expression* expression) {
analyzer.applyOldInteractionToReplacement(getCurrent(), expression);
PostWalker<Struct2Local>::replaceCurrent(expression);
return expression;
}
// Rewrite the code in visit* methods. The general approach taken is to
// replace the allocation with a null reference (which may require changing
// types in some places, like making a block return value nullable), and to
// remove all uses of it as much as possible, using the information we have
// (for example, when our allocation reaches a RefAsNonNull we can simply
// remove that operation as we know it would not throw). Some things are
// left to other passes, like getting rid of dropped code without side
// effects.
// Adjust the type that flows through an expression, updating that type as
// necessary.
void adjustTypeFlowingThrough(Expression* curr) {
if (analyzer.getInteraction(curr) != ParentChildInteraction::Flows) {
return;
}
// Our allocation passes through this expr. We must turn its type into a
// nullable one, because we will remove things like RefAsNonNull of it,
// which means we may no longer have a non-nullable value as our input,
// and we could fail to validate. It is safe to make this change in terms
// of our parent, since we know very specifically that only safe things
// will end up using our value, like a StructGet or a Drop, which do not
// care about non-nullability.
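    //
    // For example (a sketch), if a block flows our allocation out through a
    // RefAsNonNull,
    //
    //  (block (result (ref $T))
    //   (ref.as_non_null ..)
    //  )
    //
    // then after we remove the RefAsNonNull a null may be all that remains,
    // and the block's type must become (ref null $T).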
assert(curr->type.isRef());
curr->type = Type(curr->type.getHeapType(), Nullable);
}
void visitBlock(Block* curr) { adjustTypeFlowingThrough(curr); }
void visitLoop(Loop* curr) { adjustTypeFlowingThrough(curr); }
void visitLocalSet(LocalSet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// We don't need any sets of the reference to any of the locals it
// originally was written to.
if (curr->isTee()) {
replaceCurrent(curr->value);
} else {
replaceCurrent(builder.makeDrop(curr->value));
}
}
void visitLocalGet(LocalGet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// Uses of this get will drop it, so the value does not matter. Replace it
// with something else, which avoids issues with non-nullability (when
// non-nullable locals are enabled), which could happen like this:
//
// (local $x (ref $foo))
// (local.set $x ..)
// (.. (local.get $x))
//
// If we remove the set but not the get then the get would appear to read
// the default value of a non-nullable local, which is not allowed.
//
// For simplicity, replace the get with a null. We anyhow have null types
// in the places where our allocation was earlier, see notes on
// visitBlock, and so using a null here adds no extra complexity.
replaceCurrent(builder.makeRefNull(curr->type.getHeapType()));
}
void visitBreak(Break* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// Breaks that our allocation flows through may change type, as we now
// have a nullable type there.
curr->finalize();
}
void visitStructNew(StructNew* curr) {
if (curr != allocation) {
return;
}
// First, assign the initial values to the new locals.
std::vector<Expression*> contents;
if (!allocation->isWithDefault()) {
// We must assign the initial values to temp indexes, then copy them
// over all at once. If instead we did set them as we go, then we might
// hit a problem like this:
//
      //  (local.set X (new_X))
      //  (local.set Y (block (result ..)
      //    (.. (local.get X) ..) ;; returns new_X, wrongly
      //    (new_Y)
      //  ))
//
// Note how we assign to the local X and use it during the assignment to
// the local Y - but we should still see the old value of X, not new_X.
// Temp locals X', Y' can ensure that:
//
      //  (local.set X' (new_X))
      //  (local.set Y' (block (result ..)
      //    (.. (local.get X) ..) ;; returns the proper, old X
      //    (new_Y)
      //  ))
      //  ..
      //  (local.set X (local.get X'))
      //  (local.set Y (local.get Y'))
std::vector<Index> tempIndexes;
for (auto field : fields) {
tempIndexes.push_back(builder.addVar(func, field.type));
}
// Store the initial values into the temp locals.
for (Index i = 0; i < tempIndexes.size(); i++) {
contents.push_back(
builder.makeLocalSet(tempIndexes[i], allocation->operands[i]));
}
// Copy them to the normal ones.
for (Index i = 0; i < tempIndexes.size(); i++) {
auto* value = builder.makeLocalGet(tempIndexes[i], fields[i].type);
contents.push_back(builder.makeLocalSet(localIndexes[i], value));
}
      // TODO: Check whether the nondefault case increases code size in some
      //       cases. A heap allocation that implicitly sets the default
      //       values is smaller than multiple explicit settings of locals to
      //       defaults.
} else {
// Set the default values.
//
// Note that we must assign the defaults because we might be in a loop,
// that is, there might be a previous value.
for (Index i = 0; i < localIndexes.size(); i++) {
contents.push_back(builder.makeLocalSet(
localIndexes[i],
builder.makeConstantExpression(Literal::makeZero(fields[i].type))));
}
}
// Replace the allocation with a null reference. This changes the type
// from non-nullable to nullable, but as we optimize away the code that
// the allocation reaches, we will handle that.
contents.push_back(builder.makeRefNull(allocation->type.getHeapType()));
replaceCurrent(builder.makeBlock(contents));
}
void visitRefIsNull(RefIsNull* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
    // The result must be 0, since the allocation is not null. Drop the
    // RefIsNull and append that constant.
replaceCurrent(builder.makeSequence(
builder.makeDrop(curr), builder.makeConst(Literal(int32_t(0)))));
}
void visitRefEq(RefEq* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
if (curr->type == Type::unreachable) {
// The result does not matter. Leave things as they are (and let DCE
// handle it).
return;
}
// If our reference is compared to itself, the result is 1. If it is
// compared to something else, the result must be 0, as our reference does
// not escape to any other place.
int32_t result =
analyzer.getInteraction(curr->left) == ParentChildInteraction::Flows &&
analyzer.getInteraction(curr->right) == ParentChildInteraction::Flows;
replaceCurrent(builder.makeBlock({builder.makeDrop(curr->left),
builder.makeDrop(curr->right),
builder.makeConst(Literal(result))}));
}
void visitRefAs(RefAs* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// It is safe to optimize out this RefAsNonNull, since we proved it
// contains our allocation, and so cannot trap.
assert(curr->op == RefAsNonNull);
replaceCurrent(curr->value);
}
void visitRefTest(RefTest* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// This test operates on the allocation, which means we can compute whether
// it will succeed statically. We do not even need
// GCTypeUtils::evaluateCastCheck because we know the allocation's type
// precisely (it cannot be a strict subtype of the type - it is the type).
int32_t result = Type::isSubType(allocation->type, curr->castType);
// Remove the RefTest and leave only its reference child. If we kept it,
// we'd need to refinalize (as the input to the test changes, since the
// reference becomes a null, which has a different type).
replaceCurrent(builder.makeSequence(builder.makeDrop(curr->ref),
builder.makeConst(Literal(result))));
}
void visitRefCast(RefCast* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// We know this RefCast receives our allocation, so we can see whether it
// succeeds or fails.
if (Type::isSubType(allocation->type, curr->type)) {
// The cast succeeds, so it is a no-op, and we can skip it, since after we
// remove the allocation it will not even be needed for validation.
replaceCurrent(curr->ref);
} else {
// The cast fails, so this must trap.
replaceCurrent(builder.makeSequence(builder.makeDrop(curr->ref),
builder.makeUnreachable()));
}
// Either way, we need to refinalize here (we either added an unreachable,
// or we replaced a cast with the value being cast, which may have a less-
// refined type - it will not be used after we remove the allocation, but we
// must still fix that up for validation).
refinalize = true;
}
void visitStructSet(StructSet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// Drop the ref (leaving it to other opts to remove, when possible), and
// write the data to the local instead of the heap allocation.
auto* replacement = builder.makeSequence(
builder.makeDrop(curr->ref),
builder.makeLocalSet(localIndexes[curr->index], curr->value));
// This struct.set cannot possibly synchronize with other threads via the
// read value, since the struct never escapes this function, so we don't
// need a fence.
replaceCurrent(replacement);
}
void visitStructGet(StructGet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
auto& field = fields[curr->index];
auto type = field.type;
if (type != curr->type) {
// Normally we are just replacing a struct.get with a local.get of a
// local that was created to have the same type as the struct's field,
// but in some cases we may refine, if the struct.get's reference type
// is less refined than the reference that actually arrives, like here:
//
      //  (struct.get $parent 0
      //   (block (ref $parent)
      //    (struct.new $child)))
//
// We allocated locals for the field of the child, and are replacing a
// get of the parent field with a local of the same type as the child's,
// which may be more refined.
refinalize = true;
}
Expression* value = builder.makeLocalGet(localIndexes[curr->index], type);
// Note that in theory we could try to do better here than to fix up the
// packing and signedness on gets: we could truncate on sets. That would be
// more efficient if all gets are unsigned, as gets outnumber sets in
// general. However, signed gets make that more complicated, so leave this
// for other opts to handle.
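    // For a packed i8 field, for example, the fixup emitted here is roughly
    // the following (a sketch; the exact pattern is up to makePackedFieldGet,
    // and the local may hold an untruncated value since sets do not
    // truncate, see above):
    //
    //  (i32.and (local.get $field) (i32.const 0xff))  ;; unsigned get
    //  (i32.shr_s                                     ;; signed get
    //   (i32.shl (local.get $field) (i32.const 24))
    //   (i32.const 24))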
value = Bits::makePackedFieldGet(value, field, curr->signed_, wasm);
auto* replacement = builder.blockify(builder.makeDrop(curr->ref));
// Just like optimized struct.set, this struct.get cannot synchronize with
// anything, so we don't need a fence.
replaceCurrent(builder.blockify(replacement, value));
}
void visitStructRMW(StructRMW* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
[[maybe_unused]] auto& field = fields[curr->index];
auto type = curr->type;
assert(type == field.type);
assert(!field.isPacked());
// We need a scratch local to hold the old, unmodified field value while we
// update the original local with the modified value. We also need another
// scratch local to hold the evaluated modification value while we set the
// first scratch local in case the evaluation of the modification value ends
// up changing the field value. This is similar to the scratch locals used
// for struct.new.
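    //
    // For example, an atomic add on field i lowers to roughly the following
    // (a sketch; local names are hypothetical):
    //
    //  (drop ..ref..)
    //  (local.set $valScratch ..value..)
    //  (local.set $oldScratch (local.get $field_i))
    //  (local.set $field_i
    //   (i32.add (local.get $field_i) (local.get $valScratch)))
    //  (local.get $oldScratch) ;; the result is the old value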
auto oldScratch = builder.addVar(func, type);
auto valScratch = builder.addVar(func, type);
auto local = localIndexes[curr->index];
auto* block =
builder.makeSequence(builder.makeDrop(curr->ref),
builder.makeLocalSet(valScratch, curr->value));
// Stash the old value to return.
block->list.push_back(
builder.makeLocalSet(oldScratch, builder.makeLocalGet(local, type)));
// Store the updated value.
Expression* newVal = nullptr;
if (curr->op == RMWXchg) {
newVal = builder.makeLocalGet(valScratch, type);
} else {
Abstract::Op binop = Abstract::Add;
switch (curr->op) {
case RMWAdd:
binop = Abstract::Add;
break;
case RMWSub:
binop = Abstract::Sub;
break;
case RMWAnd:
binop = Abstract::And;
break;
case RMWOr:
binop = Abstract::Or;
break;
case RMWXor:
binop = Abstract::Xor;
break;
case RMWXchg:
WASM_UNREACHABLE("unexpected op");
}
newVal = builder.makeBinary(Abstract::getBinary(type, binop),
builder.makeLocalGet(local, type),
builder.makeLocalGet(valScratch, type));
}
block->list.push_back(builder.makeLocalSet(local, newVal));
// Unstash the old value.
block->list.push_back(builder.makeLocalGet(oldScratch, type));
block->type = type;
replaceCurrent(block);
}
void visitStructCmpxchg(StructCmpxchg* curr) {
if (analyzer.getInteraction(curr->ref) != ParentChildInteraction::Flows) {
// The allocation can't flow into `replacement` if we've made it this far,
// but it might flow into `expected`, in which case we don't need to do
// anything because we would still be performing the cmpxchg on a real
// struct. We only need to replace the cmpxchg if the ref is being
// replaced with locals.
return;
}
[[maybe_unused]] auto& field = fields[curr->index];
auto type = curr->type;
assert(type == field.type);
assert(!field.isPacked());
// Hold everything in scratch locals, just like for other RMW ops and
// struct.new.
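    //
    // The lowered form is roughly the following (a sketch; local names are
    // hypothetical):
    //
    //  (drop ..ref..)
    //  (local.set $expectedScratch ..expected..)
    //  (local.set $replacementScratch ..replacement..)
    //  (local.set $oldScratch (local.get $field_i))
    //  (if (..eq.. (local.get $field_i) (local.get $expectedScratch))
    //   (then
    //    (local.set $field_i (local.get $replacementScratch))
    //   )
    //  )
    //  (local.get $oldScratch) ;; the result is the old value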
auto oldScratch = builder.addVar(func, type);
auto expectedScratch = builder.addVar(func, type);
auto replacementScratch = builder.addVar(func, type);
auto local = localIndexes[curr->index];
auto* block = builder.makeBlock(
{builder.makeDrop(curr->ref),
builder.makeLocalSet(expectedScratch, curr->expected),
builder.makeLocalSet(replacementScratch, curr->replacement),
builder.makeLocalSet(oldScratch, builder.makeLocalGet(local, type))});
// Create the check for whether we should do the exchange.
auto* lhs = builder.makeLocalGet(local, type);
auto* rhs = builder.makeLocalGet(expectedScratch, type);
Expression* pred;
if (type.isRef()) {
pred = builder.makeRefEq(lhs, rhs);
} else {
pred =
builder.makeBinary(Abstract::getBinary(type, Abstract::Eq), lhs, rhs);
}
// The conditional exchange.
block->list.push_back(
builder.makeIf(pred,
builder.makeLocalSet(
local, builder.makeLocalGet(replacementScratch, type))));
// Unstash the old value.
block->list.push_back(builder.makeLocalGet(oldScratch, type));
block->type = type;
replaceCurrent(block);
}
};
// An optimizer that handles the rewriting to turn a nonescaping array
// allocation into a struct allocation. Struct2Local can then be run on that
// allocation.
// TODO: As with Struct2Local doing a single rewrite walk at the end (for all
// structs) would be more efficient, but more complex.
struct Array2Struct : PostWalker<Array2Struct> {
Expression* allocation;
EscapeAnalyzer& analyzer;
Function* func;
Builder builder;
// The original type of the allocation, before we turn it into a struct.
Type originalType;
// The type of the struct we are changing to (nullable and non-nullable
// variations).
Type nullStruct;
Type nonNullStruct;
Array2Struct(Expression* allocation,
EscapeAnalyzer& analyzer,
Function* func,
Module& wasm)
: allocation(allocation), analyzer(analyzer), func(func), builder(wasm),
originalType(allocation->type) {
// Build the struct type we need: as many fields as the size of the array,
// all of the same type as the array's element.
numFields = getArrayNewSize(allocation);
auto arrayType = allocation->type.getHeapType();
auto element = arrayType.getArray().element;
FieldList fields;
for (Index i = 0; i < numFields; i++) {
fields.push_back(element);
}
HeapType structType = Struct(fields);
// Generate a StructNew to replace the ArrayNew*.
if (auto* arrayNew = allocation->dynCast<ArrayNew>()) {
if (arrayNew->isWithDefault()) {
structNew = builder.makeStructNew(structType, {});
arrayNewReplacement = structNew;
} else {
        // The ArrayNew is writing the same value to each slot of the array. To
        // do the same for the struct, we store that value in a local and
        // generate multiple local.gets of it.
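        //
        // For example (a sketch; the struct type name is hypothetical),
        //
        //  (array.new $A (i32.const 7) (i32.const 3))
        //
        // becomes
        //
        //  (local.set $init (i32.const 7))
        //  (struct.new $struct.A
        //   (local.get $init)
        //   (local.get $init)
        //   (local.get $init)
        //  )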
auto local = builder.addVar(func, element.type);
auto* set = builder.makeLocalSet(local, arrayNew->init);
std::vector<Expression*> gets;
for (Index i = 0; i < numFields; i++) {
gets.push_back(builder.makeLocalGet(local, element.type));
}
structNew = builder.makeStructNew(structType, gets);
// The ArrayNew* will be replaced with a block containing the local.set
// and the structNew.
arrayNewReplacement = builder.makeSequence(set, structNew);
}
} else if (auto* arrayNewFixed = allocation->dynCast<ArrayNewFixed>()) {
// Simply use the same values as the array.
structNew = builder.makeStructNew(structType, arrayNewFixed->values);
arrayNewReplacement = structNew;
} else {
WASM_UNREACHABLE("bad allocation");
}
    // Mark the new expressions we created as flowing the allocation onward.
    // We need to inform the analysis of this because Struct2Local will only
    // process such code (it depends on the analysis to tell it what the
    // allocation is and where it flowed). Note that the two values here may
    // be identical, but there is no harm in doing this twice in that case.
analyzer.reachedInteractions[structNew] =
analyzer.reachedInteractions[arrayNewReplacement] =
ParentChildInteraction::Flows;
// Update types along the path reached by the allocation: whenever we see
    // the array type, it should be the struct type. Note that we do this
    // before the walk below, because the walk may read these types and
    // depend on them being valid.
//
    // Note that |reachedInteractions| contains array.get operations, which
    // are reached in the analysis, and so we will update their types if they
    // happen to have the array type (which can be the case for an array of
    // arrays). But that is
// fine to do as the array.get is rewritten to a struct.get which is then
// lowered away to locals anyhow.
auto nullArray = Type(arrayType, Nullable);
auto nonNullArray = Type(arrayType, NonNullable);
nullStruct = Type(structType, Nullable);
nonNullStruct = Type(structType, NonNullable);
for (auto& [reached, _] : analyzer.reachedInteractions) {
if (reached->is<RefCast>()) {
// Casts must be handled later: We need to see the old type, and to
// potentially replace the cast based on that, see below.
continue;
}
// We must check subtyping here because the allocation may be upcast as it
// flows around. If we do see such upcasting then we are refining here and
// must refinalize.
if (Type::isSubType(nullArray, reached->type)) {
if (nullArray != reached->type) {
refinalize = true;
}
reached->type = nullStruct;
} else if (Type::isSubType(nonNullArray, reached->type)) {
if (nonNullArray != reached->type) {
refinalize = true;
}
reached->type = nonNullStruct;
}
}
// Technically we should also fix up the types of locals as well, but after
// Struct2Local those locals will no longer be used anyhow (the locals hold
// allocations that are removed), so avoid that work (though it makes the
// IR temporarily invalid in between Array2Struct and Struct2Local).
// Replace the things we need to using the visit* methods.
walk(func->body);
if (refinalize) {
ReFinalize().walkFunctionInModule(func, &wasm);
}
}
Expression* replaceCurrent(Expression* expression) {
analyzer.applyOldInteractionToReplacement(getCurrent(), expression);
PostWalker<Array2Struct>::replaceCurrent(expression);
return expression;
}
// In rare cases we may need to refinalize, as with Struct2Local.
bool refinalize = false;
// The number of slots in the array (which will become the number of fields in
// the struct).
Index numFields;
// The StructNew that replaces the ArrayNew*. The user of this class can then
// optimize that StructNew using Struct2Local.
StructNew* structNew;
// The replacement for the original ArrayNew*. Typically this is |structNew|,
// unless we have additional code we need alongside it.
Expression* arrayNewReplacement;
void visitArrayNew(ArrayNew* curr) {
if (curr == allocation) {
replaceCurrent(arrayNewReplacement);
}
}
void visitArrayNewFixed(ArrayNewFixed* curr) {
if (curr == allocation) {
replaceCurrent(arrayNewReplacement);
}
}
void visitArraySet(ArraySet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// If this is an OOB array.set then we trap.
auto index = getIndex(curr->index);
if (index >= numFields) {
replaceCurrent(builder.makeBlock({builder.makeDrop(curr->ref),
builder.makeDrop(curr->value),
builder.makeUnreachable()}));
// We added an unreachable, and must propagate that type.
refinalize = true;
return;
}
// Convert the ArraySet into a StructSet.
// TODO: Handle atomic array accesses.
replaceCurrent(builder.makeStructSet(
index, curr->ref, curr->value, MemoryOrder::Unordered));
}
void visitArrayGet(ArrayGet* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// If this is an OOB array.get then we trap.
auto index = getIndex(curr->index);
if (index >= numFields) {
replaceCurrent(builder.makeSequence(builder.makeDrop(curr->ref),
builder.makeUnreachable()));
// We added an unreachable, and must propagate that type.
refinalize = true;
return;
}
// Convert the ArrayGet into a StructGet.
// TODO: Handle atomic array accesses.
replaceCurrent(builder.makeStructGet(
index, curr->ref, MemoryOrder::Unordered, curr->type, curr->signed_));
}
// Some additional operations need special handling
void visitRefTest(RefTest* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// When we ref.test an array allocation, we cannot simply turn the array
// into a struct, as then the test will behave differently. To properly
// handle this, check if the test succeeds or not, and write out the outcome
// here (similar to Struct2Local::visitRefTest). Note that we test on
// |originalType| here and not |allocation->type|, as the allocation has
// been turned into a struct.
int32_t result = Type::isSubType(originalType, curr->castType);
replaceCurrent(builder.makeSequence(builder.makeDrop(curr),
builder.makeConst(Literal(result))));
}
void visitRefCast(RefCast* curr) {
if (analyzer.getInteraction(curr) == ParentChildInteraction::None) {
return;
}
// As with RefTest, we need to check if the cast succeeds with the array
// type before we turn it into a struct type (as after that change, the
// outcome of the cast will look different).
if (!Type::isSubType(originalType, curr->type)) {
// The cast fails, ensure we trap with an unreachable.
replaceCurrent(builder.makeSequence(builder.makeDrop(curr),
builder.makeUnreachable()));
} else {
// The cast succeeds. Update the type. (It is ok to use the non-nullable
// type here unconditionally, since we know the allocation flows through
// here, and anyhow we will be removing the reference during Struct2Local,
// later.)
curr->type = nonNullStruct;
}
// Regardless of how we altered the type here, refinalize.
refinalize = true;
}
// Get the value in an expression we know must contain a constant index.
Index getIndex(Expression* curr) {
return curr->cast<Const>()->value.getUnsigned();
}
// Given an ArrayNew or ArrayNewFixed, return the size of the array that is
// being allocated.
Index getArrayNewSize(Expression* allocation) {
if (auto* arrayNew = allocation->dynCast<ArrayNew>()) {
return getIndex(arrayNew->size);
} else if (auto* arrayNewFixed = allocation->dynCast<ArrayNewFixed>()) {
return arrayNewFixed->values.size();
} else {
WASM_UNREACHABLE("bad allocation");
}
}
};
// Core Heap2Local optimization that operates on a function: Builds up the data
// structures we need (LocalGraph, etc.) that we will use across multiple
// analyses of allocations, and then runs those analyses and optimizes where
// possible.
struct Heap2Local {
Function* func;
Module& wasm;
const PassOptions& passOptions;
LazyLocalGraph localGraph;
Parents parents;
BranchUtils::BranchTargets branchTargets;
Heap2Local(Function* func, Module& wasm, const PassOptions& passOptions)
: func(func), wasm(wasm), passOptions(passOptions), localGraph(func, &wasm),
parents(func->body), branchTargets(func->body) {
// Find all the relevant allocations in the function: StructNew, ArrayNew,
// ArrayNewFixed.
struct AllocationFinder : public PostWalker<AllocationFinder> {
std::vector<StructNew*> structNews;
std::vector<Expression*> arrayNews;
void visitStructNew(StructNew* curr) {
// Ignore unreachable allocations that DCE will remove anyhow.
if (curr->type != Type::unreachable) {
structNews.push_back(curr);
}
}
void visitArrayNew(ArrayNew* curr) {
// Only new arrays of fixed size are relevant for us.
if (curr->type != Type::unreachable && isValidSize(curr->size)) {
arrayNews.push_back(curr);
}
}
void visitArrayNewFixed(ArrayNewFixed* curr) {
if (curr->type != Type::unreachable &&
isValidSize(curr->values.size())) {
arrayNews.push_back(curr);
}
}
bool isValidSize(Expression* size) {
// The size of an array is valid if it is constant, and its value is
// valid.
if (auto* c = size->dynCast<Const>()) {
return isValidSize(c->value.getUnsigned());
}
return false;
}
bool isValidSize(Index size) {
// Set a reasonable limit on the size here, as valid wasm can contain
// things like (array.new (i32.const -1)) which will likely fail at
// runtime on a VM limitation on array size. We also are converting a
// heap allocation to a stack allocation, which can be noticeable in
// some cases, so to be careful here use a fairly small limit.
return size < 20;
}
      // Also note if a pop exists here, as pops may require fixups.
bool hasPop = false;
void visitPop(Pop* curr) { hasPop = true; }
} finder;
finder.walk(func->body);
bool optimized = false;
// First, lower non-escaping arrays into structs. That allows us to handle
// arrays in a single place, and let all the rest of this pass assume we are
// working on structs. We are in fact only optimizing struct-like arrays
// here, that is, arrays of a fixed size and whose items are accessed using
// constant indexes, so they are effectively structs, and turning them into
// such allows uniform handling later.
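    //
    // For example (a sketch), the fixed-size array allocation
    //
    //  (array.new_fixed $A 2 (i32.const 1) (i32.const 2))
    //
    // behaves just like a two-field struct, so Array2Struct can rewrite it to
    // something like
    //
    //  (struct.new $struct.A (i32.const 1) (i32.const 2))
    //
    // which Struct2Local can then lower into locals.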
for (auto* allocation : finder.arrayNews) {
// The point of this optimization is to replace heap allocations with
// locals, so we must be able to place the data in locals.
if (!canHandleAsLocals(allocation->type)) {
continue;
}
EscapeAnalyzer analyzer(
localGraph, parents, branchTargets, passOptions, wasm);
if (!analyzer.escapes(allocation)) {
// Convert the allocation and all its uses into a struct. Then convert
// the struct into locals.
auto* structNew =
Array2Struct(allocation, analyzer, func, wasm).structNew;
Struct2Local(structNew, analyzer, func, wasm);
optimized = true;
}
}
// Next, process all structNews.
for (auto* allocation : finder.structNews) {
// As above, we must be able to use locals for this data.
if (!canHandleAsLocals(allocation->type)) {
continue;
}
// Check for escaping, noting relevant information as we go. If this does
// not escape, optimize it into locals.
EscapeAnalyzer analyzer(
localGraph, parents, branchTargets, passOptions, wasm);
if (!analyzer.escapes(allocation)) {
Struct2Local(allocation, analyzer, func, wasm);
optimized = true;
}
}
// We conservatively run the EH pop fixup if this function has a 'pop' and
// if we have ever optimized, as all of the things we do here involve
// creating blocks, so we might have moved pops into the blocks.
if (finder.hasPop && optimized) {
EHUtils::handleBlockNestedPops(func, wasm);
}
}
bool canHandleAsLocal(const Field& field) {
return TypeUpdating::canHandleAsLocal(field.type);
}
bool canHandleAsLocals(Type type) {
if (type == Type::unreachable) {
return false;
}
auto heapType = type.getHeapType();
if (heapType.isStruct()) {
auto& fields = heapType.getStruct().fields;
for (auto field : fields) {
if (!canHandleAsLocal(field)) {
return false;
}
}
return true;
}
assert(heapType.isArray());
return canHandleAsLocal(heapType.getArray().element);
}
};
struct Heap2LocalPass : public WalkerPass<PostWalker<Heap2LocalPass>> {
bool isFunctionParallel() override { return true; }
std::unique_ptr<Pass> create() override {
return std::make_unique<Heap2LocalPass>();
}
void doWalkFunction(Function* func) {
// Multiple rounds of optimization may work in theory, as once we turn one
// allocation into locals, references written to its fields become
// references written to locals, which we may see do not escape. However,
// this does not work yet, since we do not remove the original allocation -
// we just "detach" it from other things and then depend on other
// optimizations to remove it. That means this pass must be interleaved with
// vacuum, in particular, to optimize such nested allocations.
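    //
    // For example (a sketch), in
    //
    //  (struct.new $Outer
    //   (struct.new $Inner ..)
    //  )
    //
    // optimizing the outer allocation turns the inner one into a value
    // written to a local; once vacuum removes the detached outer allocation,
    // a later run of this pass could optimize the inner one as well.
    //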
// TODO Consider running multiple iterations here, and running vacuum in
// between them.
Heap2Local(func, *getModule(), getPassOptions());
}
};
} // anonymous namespace
Pass* createHeap2LocalPass() { return new Heap2LocalPass(); }
} // namespace wasm