/*
* Copyright 2022 WebAssembly Community Group participants
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Sorts globals by their static use count. This helps reduce the size of wasm
// binaries because fewer bytes are needed to encode references to frequently
// used globals.
//
// This pass also sorts by dependencies, as each global can only appear after
// those it refers to. Other passes can use this internally to fix the ordering
// after they add new globals.
//
// The "always" variant of this pass will always sort globals, even if there are
// so few they all fit in a single LEB. In "always" mode we sort regardless and
// we measure size by considering each subsequent index to have a higher cost.
// That is, in reality the LEB for every global index below 128 is a single
// byte, and only past that point does it grow to 2 bytes, while in "always"
// mode we increase the cost by 1/128 for each successive global, in a smooth
// manner. "Always" is mainly useful for testing.
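//
// For example (illustrative numbers): a global.get of the global at index 130
// really costs a 2-byte LEB, while one at index 127 costs 1 byte; in "always"
// mode we instead charge 1 + 130/128 ~= 2.02 and 1 + 127/128 ~= 1.99 "bytes"
// respectively.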
//
#include "memory"
#include "ir/find_all.h"
#include "pass.h"
#include "support/topological_sort.h"
#include "wasm.h"
namespace wasm {
// We'll count uses in parallel.
using AtomicNameCountMap = std::unordered_map<Name, std::atomic<Index>>;
struct UseCountScanner : public WalkerPass<PostWalker<UseCountScanner>> {
bool isFunctionParallel() override { return true; }
bool modifiesBinaryenIR() override { return false; }
UseCountScanner(AtomicNameCountMap& counts) : counts(counts) {}
std::unique_ptr<Pass> create() override {
return std::make_unique<UseCountScanner>(counts);
}
void visitGlobalGet(GlobalGet* curr) {
// We can't add a new element to the map in parallel.
assert(counts.count(curr->name) > 0);
counts[curr->name]++;
}
void visitGlobalSet(GlobalSet* curr) {
assert(counts.count(curr->name) > 0);
counts[curr->name]++;
}
private:
AtomicNameCountMap& counts;
};
struct ReorderGlobals : public Pass {
bool always;
ReorderGlobals(bool always) : always(always) {}
// For efficiency we will use global indices rather than names. That is, we
// use the index of the global in the original ordering to identify each
// global. A different ordering is then a vector of new indices, saying where
// each global moves to; logically it is a mapping from old index to new index.
using IndexIndexMap = std::vector<Index>;
// We will also track counts of uses for each global. We use floating-point
// values here since while the initial counts are integers, we will be
// considering fractional sums of them later.
using IndexCountMap = std::vector<double>;
  // We must take into account dependencies, so that each global appears before
  // any other global whose initializer uses it:
//
// (global $a i32 (i32.const 10))
// (global $b i32 (global.get $a)) ;; $b depends on $a; $a must be first
//
// To do so we construct a map from each global to those it depends on. We
// also build the reverse map, of those that it is depended upon by.
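  //
  // For the example above that gives dependsOn[$b] = { $a } and
  // dependedUpon[$a] = { $b } (written with names here for clarity; the actual
  // maps use the globals' original indices).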
struct Dependencies {
std::unordered_map<Index, std::unordered_set<Index>> dependsOn;
std::unordered_map<Index, std::unordered_set<Index>> dependedUpon;
};
void run(Module* module) override {
auto& globals = module->globals;
if (globals.size() < 128 && !always) {
      // The module has so few globals that they all fit in a single-byte
      // U32LEB value, so no reordering we can do can actually decrease code
      // size. Note that this is the common case with wasm MVP modules, where
      // the only globals are typically the stack pointer and perhaps a handful
      // of others (however, with features like wasm GC there may be a great
      // many globals).
// TODO: we still need to sort here to fix dependencies sometimes
return;
}
AtomicNameCountMap atomicCounts;
// Fill in info, as we'll operate on it in parallel.
for (auto& global : globals) {
atomicCounts[global->name];
}
// Count uses.
UseCountScanner scanner(atomicCounts);
scanner.run(getPassRunner(), module);
scanner.runOnModuleCode(getPassRunner(), module);
// Switch to non-atomic for all further processing, and convert names to
// indices.
std::unordered_map<Name, Index> originalIndices;
for (Index i = 0; i < globals.size(); i++) {
originalIndices[globals[i]->name] = i;
}
IndexCountMap counts(globals.size());
for (auto& [name, count] : atomicCounts) {
counts[originalIndices[name]] = count;
}
// Compute dependencies.
Dependencies deps;
for (Index i = 0; i < globals.size(); i++) {
auto& global = globals[i];
if (!global->imported()) {
for (auto* get : FindAll<GlobalGet>(global->init).list) {
auto getIndex = originalIndices[get->name];
deps.dependsOn[i].insert(getIndex);
deps.dependedUpon[getIndex].insert(i);
}
}
}
// Compute various sorting options. All the options use a variation of the
// algorithm in doSort() below, see there for more details; the only
// difference between the sorts is in the use counts we provide it.
struct SortAndSize {
IndexIndexMap sort;
double size;
SortAndSize(IndexIndexMap&& sort, double size)
: sort(std::move(sort)), size(size) {}
};
std::vector<SortAndSize> options;
auto addOption = [&](const IndexCountMap& customCounts) {
// Compute the sort using custom counts that guide us how to order.
auto sort = doSort(customCounts, deps, module);
// Compute the size using the true counts.
auto size = computeSize(sort, counts);
options.emplace_back(std::move(sort), size);
};
    // Compute the closest thing we can to the original, unoptimized sort, by
    // setting all counts to zero, so that the sort only takes into account
    // dependencies and the original ordering, and nothing else.
    //
    // We put this sort first because if all the options end up with equal size
    // we prefer it (as it avoids pointless changes).
IndexCountMap zeroes(globals.size());
addOption(zeroes);
    // A simple sort that uses the counts. As the algorithm in doSort() is
    // greedy (see below), this is a pure greedy sort (at each point in time it
    // picks the available global with the highest count).
addOption(counts);
// We can make the sort less greedy by adding to each global's count some of
// the count of its children. Then we'd still be picking in a greedy manner
// but our choices would be influenced by the bigger picture of what can be
// unlocked by emitting a particular global (it may have a low count itself,
// but allow emitting something that depends on it that has a high count).
    // We try two variations of this, one which is a simple sum (add the
    // dependents' counts to the global's) and one which decreases them
    // exponentially (that is, we add only a small multiple of the dependents'
    // counts; this may help since the dependents may themselves depend on yet
    // other things, so a simple sum may be too naive).
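    //
    // As a small worked example (illustrative counts): if global $a has count
    // 1 and a single global $b that depends on it has count 100, then
    // sumCounts[$a] = 1 + 100 = 101, while exponentialCounts[$a] =
    // 1 + 0.095 * 100 = 10.5; both boost $a's priority, just by different
    // amounts.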
double const EXPONENTIAL_FACTOR = 0.095;
IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size());
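    // To fill those in we must process each global after all of its
    // dependents, so that their counts are already computed when we add them
    // in; this topological sort (treating dependents as "predecessors")
    // provides such an order.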
struct Sort : public TopologicalSort<Index, Sort> {
const Dependencies& deps;
Sort(Index numGlobals, const Dependencies& deps) : deps(deps) {
for (Index i = 0; i < numGlobals; i++) {
push(i);
}
}
void pushPredecessors(Index global) {
auto iter = deps.dependedUpon.find(global);
if (iter == deps.dependedUpon.end()) {
return;
}
for (auto dep : iter->second) {
push(dep);
}
}
} sort(globals.size(), deps);
for (auto global : sort) {
      // We can compute this global's count now since, in this sorted order,
      // the counts of all its dependents have already been computed. Start
      // with the self-count, then add the dependents.
sumCounts[global] = exponentialCounts[global] = counts[global];
for (auto dep : deps.dependedUpon[global]) {
sumCounts[global] += sumCounts[dep];
exponentialCounts[global] +=
EXPONENTIAL_FACTOR * exponentialCounts[dep];
}
}
addOption(sumCounts);
addOption(exponentialCounts);
// Pick the best out of all the options we computed.
IndexIndexMap* best = nullptr;
double bestSize;
for (auto& [sort, size] : options) {
if (!best || size < bestSize) {
best = &sort;
bestSize = size;
}
}
// Apply the indices we computed.
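    // |best| maps each original index to a new one, that is, the global
    // originally at index i ends up at position (*best)[i].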
std::vector<std::unique_ptr<Global>> old(std::move(globals));
globals.resize(old.size());
for (Index i = 0; i < old.size(); i++) {
globals[(*best)[i]] = std::move(old[i]);
}
module->updateMaps();
}
IndexIndexMap doSort(const IndexCountMap& counts,
const Dependencies& originalDeps,
Module* module) {
auto& globals = module->globals;
// Copy the deps as we will operate on them as we go.
auto deps = originalDeps;
// To sort the globals we do a simple greedy approach of always picking the
// global with the highest count at every point in time, subject to the
// constraint that we can only emit globals that have all of their
    // dependencies already emitted. To do so we keep a list of the "available"
    // globals, which are those with no remaining dependencies. By keeping that
    // list in heap form we can simply pop the largest from the heap each time,
    // and push newly-available globals as their last dependency is emitted.
//
// Other approaches here could be to do a topological sort, but the optimal
// order may not require strict ordering by topological depth, e.g.:
    //
    //       $c - $a
    //      /
    //    $e
    //      \
    //       $d - $b
    //
// Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
// partial order, as $d can be before or after $a, for example. As a result,
// if we sorted topologically by sub-trees here then we'd keep $c and $a
// together, and $d and $b, but a better order might interleave them. A good
// order also may not keep topological depths separated, e.g. we may want to
// put $a in between $c and $d despite it having a greater depth.
//
    // The greedy approach here may also be suboptimal, however. Consider that
    // we might see that the best available global is $a, but if we popped $b
    // instead that could unlock $c which depends on $b, and $c may have a much
    // higher use count than $a. For that reason we try several variations of
    // this with different counts, see earlier.
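    //
    // As a concrete illustration (made-up counts): if $a has count 10, $b has
    // count 1, and $c depends on $b and has count 100, then pure greedy emits
    // $a before $b (10 > 1), even though emitting $b first would unlock $c and
    // its 100 uses sooner.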
std::vector<Index> availableHeap;
    // Comparison function. Given a and b, returns whether a should be ordered
    // before b. This is used in a heap, where "highest" means "popped first",
    // so see the notes below on how we order.
auto cmp = [&](Index a, Index b) {
// Imports always go first. The binary writer takes care of this itself
// anyhow, but it is better to do it here in the IR so we can actually
// see what the final layout will be.
auto aImported = globals[a]->imported();
auto bImported = globals[b]->imported();
// The highest items will be popped first off the heap, so we want imports
// to be at higher indexes, that is,
//
// unimported, unimported, imported, imported.
//
// Then the imports are popped first.
if (aImported != bImported) {
return bImported;
}
// Sort by the counts. We want higher counts at higher indexes so they are
// popped first, that is,
//
// 10, 20, 30, 40
//
auto aCount = counts[a];
auto bCount = counts[b];
if (aCount != bCount) {
return aCount < bCount;
}
// Break ties using the original order, which means just using the
// indices we have. We need lower indexes at the top so they are popped
// first, that is,
//
// 3, 2, 1, 0
//
return a > b;
};
// Push an item that just became available to the available heap.
auto push = [&](Index global) {
availableHeap.push_back(global);
std::push_heap(availableHeap.begin(), availableHeap.end(), cmp);
};
// The initially available globals are those with no dependencies.
for (Index i = 0; i < globals.size(); i++) {
if (deps.dependsOn[i].empty()) {
push(i);
}
}
    // Pop from the heap: each global we pop is "emitted", receiving the next
    // sorted index. Keep doing that until we finish processing all the
    // globals.
    IndexIndexMap sortedIndices(globals.size());
    Index numSortedIndices = 0;
while (!availableHeap.empty()) {
std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp);
auto global = availableHeap.back();
      sortedIndices[global] = numSortedIndices++;
availableHeap.pop_back();
      // Emitting this global may make others available: any global whose only
      // remaining dependency was this one can now be popped as well.
for (auto other : deps.dependedUpon[global]) {
assert(deps.dependsOn[other].count(global));
deps.dependsOn[other].erase(global);
if (deps.dependsOn[other].empty()) {
push(other);
}
}
}
// All globals must have been handled. Cycles would prevent this, but they
// cannot exist in valid IR.
    assert(numSortedIndices == globals.size());
    return sortedIndices;
}
// Given an indexing of the globals and the counts of how many times each is
// used, estimate the size of relevant parts of the wasm binary (that is, of
// LEBs in global.gets).
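  //
  // For example (illustrative numbers): a global that is used 100 times
  // contributes 100 bytes of global.get LEBs if it ends up at an index below
  // 128, but 200 bytes if it ends up at index 128 or above, where the LEB
  // needs two bytes.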
double computeSize(IndexIndexMap& indices, IndexCountMap& counts) {
// |indices| maps each old index to its new position in the sort. We need
// the reverse map here, which at index 0 has the old index of the global
// that will be first, and so forth.
IndexIndexMap actualOrder(indices.size());
for (Index i = 0; i < indices.size(); i++) {
      // Each global has a unique new index, so each slot is written only once
      // (we assert it still has its default value of 0), and the new index
      // must be in bounds.
assert(indices[i] < indices.size());
assert(actualOrder[indices[i]] == 0);
actualOrder[indices[i]] = i;
}
if (always) {
// In this mode we gradually increase the cost of later globals, in an
// unrealistic but smooth manner.
double total = 0;
for (Index i = 0; i < actualOrder.size(); i++) {
// Multiply the count for this global by a smoothed LEB factor, which
// starts at 1 (for 1 byte) at index 0, and then increases linearly with
// i, so that after 128 globals we reach 2 (which is the true index at
// which the LEB size normally jumps from 1 to 2), and so forth.
total += counts[actualOrder[i]] * (1.0 + (i / 128.0));
}
return total;
}
// The total size we are computing.
double total = 0;
    // Track the current LEB size in bytes and the next index at which that
    // size increases. At the first iteration we'll compute the size of the LEB
    // for index 0, and so forth.
    size_t sizeInBytes = 0;
    size_t nextSizeIncrease = 0;
    for (Index i = 0; i < actualOrder.size(); i++) {
      if (i == nextSizeIncrease) {
        sizeInBytes++;
        // At the current size we have 7 * sizeInBytes bits to use. For
        // example, at index 0 the size is 1 byte and we'll compute 128 here,
        // and indeed after emitting 128 globals (0,..,127) the 129th (at index
        // 128) requires a larger LEB.
        nextSizeIncrease = 1 << (7 * sizeInBytes);
      }
      total += counts[actualOrder[i]] * sizeInBytes;
}
return total;
}
};
Pass* createReorderGlobalsPass() { return new ReorderGlobals(false); }
Pass* createReorderGlobalsAlwaysPass() { return new ReorderGlobals(true); }
} // namespace wasm