/*
* Copyright 2022 WebAssembly Community Group participants
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Sorts globals by their static use count. This helps reduce the size of wasm
// binaries because fewer bytes are needed to encode references to frequently
// used globals.
//
// This pass also sorts by dependencies, as each global can only appear after
// those it refers to. Other passes can use this internally to fix the ordering
// after they add new globals.
//
// The "always" variant of this pass will always sort globals, even if there are
// so few they all fit in a single LEB. In "always" mode we sort regardless and
// we measure size by considering each subsequent index to have a higher cost.
// That is, in reality the LEB for every global index below 128 is a single
// byte, and only past that point does it grow to 2 bytes, while in "always"
// mode we increase the cost by 1/128 for each successive global, in a smooth
// manner. "Always" is mainly useful for testing.
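//
// For example (illustrative numbers): a global.get of the global at index 130
// really costs a 2-byte LEB, while one at index 127 costs 1 byte; in "always"
// mode we instead charge 1 + 130/128 ~= 2.02 and 1 + 127/128 ~= 1.99 "bytes"
// respectively.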
//
#include "memory"
#include "ir/find_all.h"
#include "pass.h"
#include "support/topological_sort.h"
#include "wasm.h"
namespace wasm {
// We'll count uses in parallel.
using AtomicNameCountMap = std::unordered_map<Name, std::atomic<Index>>;
struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> {
bool isFunctionParallel() override { return true; }
bool modifiesBinaryenIR() override { return false; }
UseCountScanner(AtomicNameCountMap& counts) : counts(counts) {}
std::unique_ptr<Pass> create() override {
return std::make_unique<UseCountScanner>(counts);
}
void visitGlobalGet(GlobalGet* curr) {
// We can't add a new element to the map in parallel.
assert(counts.count(curr->name) > 0);
counts[curr->name]++;
}
void visitGlobalSet(GlobalSet* curr) {
assert(counts.count(curr->name) > 0);
counts[curr->name]++;
}
private:
AtomicNameCountMap& counts;
};
struct ReorderGlobals : public Pass {
bool always;
ReorderGlobals(bool always) : always(always) {}
// For efficiency we will use global indices rather than names. That is, we
// use the index of the global in the original ordering to identify each
// global. A different ordering is then a vector of new indices, saying where
// each global moves to; logically it is a mapping from old index to new index.
using IndexIndexMap = std::vector<Index>;
// We will also track counts of uses for each global. We use floating-point
// values here since while the initial counts are integers, we will be
// considering fractional sums of them later.
using IndexCountMap = std::vector<double>;
  // We must take into account dependencies, so that each global appears before
  // any other global whose initializer uses it:
//
// (global $a i32 (i32.const 10))
// (global $b i32 (global.get $a)) ;; $b depends on $a; $a must be first
//
// To do so we construct a map from each global to those it depends on. We
// also build the reverse map, of those that it is depended upon by.
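  //
  // For the example above that gives dependsOn[$b] = { $a } and
  // dependedUpon[$a] = { $b } (written with names here for clarity; the actual
  // maps use the globals' original indices).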
struct Dependencies {
std::unordered_map<Index, std::unordered_set<Index>> dependsOn;
std::unordered_map<Index, std::unordered_set<Index>> dependedUpon;
};
void run(Module* module) override {
auto& globals = module->globals;
if (globals.size() < 128 && !always) {
      // The module has so few globals that they all fit in a single-byte
      // U32LEB value, so no reordering we can do can actually decrease code
      // size. Note that this is the common case with wasm MVP modules, where
      // the only globals are typically the stack pointer and perhaps a handful
      // of others (however, with features like wasm GC there may be a great
      // many globals).
// TODO: we still need to sort here to fix dependencies sometimes
return;
}
AtomicNameCountMap atomicCounts;
// Fill in info, as we'll operate on it in parallel.
for (auto& global : globals) {
atomicCounts[global->name];
}
// Count uses.
UseCountScanner scanner(atomicCounts);
scanner.run(getPassRunner(), module);
scanner.runOnModuleCode(getPassRunner(), module);
// Switch to non-atomic for all further processing, and convert names to
// indices.
std::unordered_map<Name, Index> originalIndices;
for (Index i = 0; i < globals.size(); i++) {
originalIndices[globals[i]->name] = i;
}
IndexCountMap counts(globals.size());
for (auto& [name, count] : atomicCounts) {
counts[originalIndices[name]] = count;
}
// Compute dependencies.
Dependencies deps;
for (Index i = 0; i < globals.size(); i++) {
auto& global = globals[i];
if (!global->imported()) {
for (auto* get : FindAll<GlobalGet>(global->init).list) {
auto getIndex = originalIndices[get->name];
deps.dependsOn[i].insert(getIndex);
deps.dependedUpon[getIndex].insert(i);
}
}
}
// Compute various sorting options. All the options use a variation of the
// algorithm in doSort() below, see there for more details; the only
// difference between the sorts is in the use counts we provide it.
struct SortAndSize {
IndexIndexMap sort;
double size;
SortAndSize(IndexIndexMap&& sort, double size)
: sort(std::move(sort)), size(size) {}
};
std::vector<SortAndSize> options;
auto addOption = [&](const IndexCountMap& customCounts) {
// Compute the sort using custom counts that guide us how to order.
auto sort = doSort(customCounts, deps, module);
// Compute the size using the true counts.
auto size = computeSize(sort, counts);
options.emplace_back(std::move(sort), size);
};
    // Compute the closest thing we can to the original, unoptimized sort, by
    // setting all counts to zero, so that the sort only takes into account
    // dependencies and the original ordering, and nothing else.
    //
    // We put this sort first because if all the options end up with equal size
    // we prefer it (as it avoids pointless changes).
IndexCountMap zeroes(globals.size());
addOption(zeroes);
    // A simple sort that uses the counts. As the algorithm in doSort() is
    // greedy (see below), this is a pure greedy sort (at each point in time it
    // picks the available global with the highest count).
addOption(counts);
// We can make the sort less greedy by adding to each global's count some of
// the count of its children. Then we'd still be picking in a greedy manner
// but our choices would be influenced by the bigger picture of what can be
// unlocked by emitting a particular global (it may have a low count itself,
// but allow emitting something that depends on it that has a high count).
    // We try two variations of this, one which is a simple sum (add the
    // dependents' counts to the global's) and one which decreases them
    // exponentially (that is, we add only a small multiple of the dependents'
    // counts; this may help since the dependents may themselves depend on yet
    // other things, so a simple sum may be too naive).
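    //
    // As a small worked example (illustrative counts): if global $a has count
    // 1 and a single global $b that depends on it has count 100, then
    // sumCounts[$a] = 1 + 100 = 101, while exponentialCounts[$a] =
    // 1 + 0.095 * 100 = 10.5; both boost $a's priority, just by different
    // amounts.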
double const EXPONENTIAL_FACTOR = 0.095;
IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size());
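    // To fill those in we must process each global after all of its
    // dependents, so that their counts are already computed when we add them
    // in; this topological sort (treating dependents as "predecessors")
    // provides such an order.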
struct Sort : public TopologicalSort<Index, Sort> {
const Dependencies& deps;
Sort(Index numGlobals, const Dependencies& deps) : deps(deps) {
for (Index i = 0; i < numGlobals; i++) {
push(i);
}
}
void pushPredecessors(Index global) {
auto iter = deps.dependedUpon.find(global);
if (iter == deps.dependedUpon.end()) {
return;
}
for (auto dep : iter->second) {
push(dep);
}
}
} sort(globals.size(), deps);
for (auto global : sort) {
      // We can compute this global's count now since, in this sorted order,
      // the counts of all its dependents have already been computed. Start
      // with the self-count, then add the dependents.
sumCounts[global] = exponentialCounts[global] = counts[global];
for (auto dep : deps.dependedUpon[global]) {
sumCounts[global] += sumCounts[dep];
exponentialCounts[global] +=
EXPONENTIAL_FACTOR * exponentialCounts[dep];
}
}
addOption(sumCounts);
addOption(exponentialCounts);
// Pick the best out of all the options we computed.
IndexIndexMap* best = nullptr;
double bestSize;
for (auto& [sort, size] : options) {
if (!best || size < bestSize) {
best = &sort;
bestSize = size;
}
}
// Apply the indices we computed.
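    // |best| maps each original index to a new one, that is, the global
    // originally at index i ends up at position (*best)[i].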
std::vector<std::unique_ptr<Global>> old(std::move(globals));
globals.resize(old.size());
for (Index i = 0; i < old.size(); i++) {
globals[(*best)[i]] = std::move(old[i]);
}
module->updateMaps();
}
IndexIndexMap doSort(const IndexCountMap& counts,
const Dependencies& originalDeps,
Module* module) {
auto& globals = module->globals;
// Copy the deps as we will operate on them as we go.
auto deps = originalDeps;
// To sort the globals we do a simple greedy approach of always picking the
// global with the highest count at every point in time, subject to the
// constraint that we can only emit globals that have all of their
    // dependencies already emitted. To do so we keep a list of the "available"
    // globals, which are those with no remaining dependencies. By keeping that
    // list in heap form we can simply pop the largest from the heap each time,
    // and push newly-available globals as their last dependency is emitted.
//
// Other approaches here could be to do a topological sort, but the optimal
// order may not require strict ordering by topological depth, e.g.:
    //
    //       $c - $a
    //      /
    //    $e
    //      \
    //       $d - $b
    //
// Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
// partial order, as $d can be before or after $a, for example. As a result,
// if we sorted topologically by sub-trees here then we'd keep $c and $a
// together, and $d and $b, but a better order might interleave them. A good
// order also may not keep topological depths separated, e.g. we may want to
// put $a in between $c and $d despite it having a greater depth.
//
    // The greedy approach here may also be suboptimal, however. Consider that
    // we might see that the best available global is $a, but if we popped $b
    // instead that could unlock $c which depends on $b, and $c may have a much
    // higher use count than $a. For that reason we try several variations of
    // this with different counts, see earlier.
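    //
    // As a concrete illustration (made-up counts): if $a has count 10, $b has
    // count 1, and $c depends on $b and has count 100, then pure greedy emits
    // $a before $b (10 > 1), even though emitting $b first would unlock $c and
    // its 100 uses sooner.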
std::vector<Index> availableHeap;
    // Comparison function. Given a and b, returns whether a should be ordered
    // before b. This is used in a heap, where "highest" means "popped first",
    // so see the notes below on how we order.
auto cmp = [&](Index a, Index b) {
// Imports always go first. The binary writer takes care of this itself
// anyhow, but it is better to do it here in the IR so we can actually
// see what the final layout will be.
auto aImported = globals[a]->imported();
auto bImported = globals[b]->imported();
// The highest items will be popped first off the heap, so we want imports
// to be at higher indexes, that is,
//
// unimported, unimported, imported, imported.
//
// Then the imports are popped first.
if (aImported != bImported) {
return bImported;
}
// Sort by the counts. We want higher counts at higher indexes so they are
// popped first, that is,
//
// 10, 20, 30, 40
//
auto aCount = counts[a];
auto bCount = counts[b];
if (aCount != bCount) {
return aCount < bCount;
}
// Break ties using the original order, which means just using the
// indices we have. We need lower indexes at the top so they are popped
// first, that is,
//
// 3, 2, 1, 0
//
return a > b;
};
// Push an item that just became available to the available heap.
auto push = [&](Index global) {
availableHeap.push_back(global);
std::push_heap(availableHeap.begin(), availableHeap.end(), cmp);
};
// The initially available globals are those with no dependencies.
for (Index i = 0; i < globals.size(); i++) {
if (deps.dependsOn[i].empty()) {
push(i);
}
}
    // Pop from the heap: each global we pop is "emitted", receiving the next
    // sorted index. Keep doing that until we finish processing all the
    // globals.
    IndexIndexMap sortedIndices(globals.size());
    Index numSortedIndices = 0;
while (!availableHeap.empty()) {
std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp);
auto global = availableHeap.back();
      sortedIndices[global] = numSortedIndices++;
availableHeap.pop_back();
      // Emitting this global may make others available: any global whose only
      // remaining dependency was this one can now be popped as well.
for (auto other : deps.dependedUpon[global]) {
assert(deps.dependsOn[other].count(global));
deps.dependsOn[other].erase(global);
if (deps.dependsOn[other].empty()) {
push(other);
}
}
}
// All globals must have been handled. Cycles would prevent this, but they
// cannot exist in valid IR.
    assert(numSortedIndices == globals.size());
    return sortedIndices;
}
// Given an indexing of the globals and the counts of how many times each is
// used, estimate the size of relevant parts of the wasm binary (that is, of
// LEBs in global.gets).
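  //
  // For example (illustrative numbers): a global that is used 100 times
  // contributes 100 bytes of global.get LEBs if it ends up at an index below
  // 128, but 200 bytes if it ends up at index 128 or above, where the LEB
  // needs two bytes.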
double computeSize(IndexIndexMap& indices, IndexCountMap& counts) {
// |indices| maps each old index to its new position in the sort. We need
// the reverse map here, which at index 0 has the old index of the global
// that will be first, and so forth.
IndexIndexMap actualOrder(indices.size());
for (Index i = 0; i < indices.size(); i++) {
      // Each global has a unique new index, so each slot is written only once
      // (we assert it still has its default value of 0), and the new index
      // must be in bounds.
assert(indices[i] < indices.size());
assert(actualOrder[indices[i]] == 0);
actualOrder[indices[i]] = i;
}
if (always) {
// In this mode we gradually increase the cost of later globals, in an
// unrealistic but smooth manner.
double total = 0;
for (Index i = 0; i < actualOrder.size(); i++) {
// Multiply the count for this global by a smoothed LEB factor, which
// starts at 1 (for 1 byte) at index 0, and then increases linearly with
// i, so that after 128 globals we reach 2 (which is the true index at
// which the LEB size normally jumps from 1 to 2), and so forth.
total += counts[actualOrder[i]] * (1.0 + (i / 128.0));
}
return total;
}
// The total size we are computing.
double total = 0;
    // Track the current LEB size in bytes and the next index at which that
    // size increases. At the first iteration we'll compute the size of the LEB
    // for index 0, and so forth.
    size_t sizeInBytes = 0;
    size_t nextSizeIncrease = 0;
    for (Index i = 0; i < actualOrder.size(); i++) {
      if (i == nextSizeIncrease) {
        sizeInBytes++;
        // At the current size we have 7 * sizeInBytes bits to use. For
        // example, at index 0 the size is 1 byte and we'll compute 128 here,
        // and indeed after emitting 128 globals (0,..,127) the 129th (at index
        // 128) requires a larger LEB.
        nextSizeIncrease = 1 << (7 * sizeInBytes);
      }
      total += counts[actualOrder[i]] * sizeInBytes;
}
return total;
}
};
Pass* createReorderGlobalsPass() { return new ReorderGlobals(false); }
Pass* createReorderGlobalsAlwaysPass() { return new ReorderGlobals(true); }
} // namespace wasm