| /* |
| * Copyright 2016 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // |
| // Coalesce locals, in order to reduce the total number of locals. This |
| // is similar to register allocation, however, there is never any |
| // spilling, and there isn't a fixed number of locals. |
| // |
| // NB: This pass is nonlinear in the number of locals. It is best to run it |
| // after the number of locals has been somewhat reduced by other passes, |
| // for example by simplify-locals (to remove unneeded uses of locals) and |
| // reorder-locals (to sort them by # of uses and remove all unneeded ones). |
| // |
| |
| #include <algorithm> |
| #include <memory> |
| #include <unordered_set> |
| |
| #include "cfg/liveness-traversal.h" |
| #include "ir/utils.h" |
| #include "pass.h" |
| #include "support/learning.h" |
| #include "support/permutations.h" |
| #include "wasm.h" |
| #ifdef CFG_PROFILE |
| #include "support/timing.h" |
| #endif |
| |
| namespace wasm { |
| |
| struct CoalesceLocals |
| : public WalkerPass<LivenessWalker<CoalesceLocals, Visitor<CoalesceLocals>>> { |
| bool isFunctionParallel() override { return true; } |
| |
| // This pass merges locals, mapping the originals to new ones. |
| // FIXME DWARF updating does not handle local changes yet. |
| bool invalidatesDWARF() override { return true; } |
| |
| Pass* create() override { return new CoalesceLocals; } |
| |
| // main entry point |
| |
| void doWalkFunction(Function* func); |
| |
| void increaseBackEdgePriorities(); |
| |
| void calculateInterferences(); |
| |
| void calculateInterferences(const SetOfLocals& locals); |
| |
| void pickIndicesFromOrder(std::vector<Index>& order, |
| std::vector<Index>& indices); |
| void pickIndicesFromOrder(std::vector<Index>& order, |
| std::vector<Index>& indices, |
| Index& removedCopies); |
| |
| // returns a vector of oldIndex => newIndex |
| virtual void pickIndices(std::vector<Index>& indices); |
| |
| void applyIndices(std::vector<Index>& indices, Expression* root); |
| |
| // interference state |
| |
| // canonicalized - accesses should check (low, high) |
| std::vector<bool> interferences; |
| |
| void interfere(Index i, Index j) { |
| if (i == j) { |
| return; |
| } |
| interferences[std::min(i, j) * numLocals + std::max(i, j)] = 1; |
| } |
| |
| // optimized version where you know that low < high |
| void interfereLowHigh(Index low, Index high) { |
| assert(low < high); |
| interferences[low * numLocals + high] = 1; |
| } |
| |
| void unInterfere(Index i, Index j) { |
| interferences[std::min(i, j) * numLocals + std::max(i, j)] = 0; |
| } |
| |
| bool interferes(Index i, Index j) { |
| return interferences[std::min(i, j) * numLocals + std::max(i, j)]; |
| } |
| }; |
| |
| void CoalesceLocals::doWalkFunction(Function* func) { |
| if (!canRun(func)) { |
| return; |
| } |
| super::doWalkFunction(func); |
| // prioritize back edges |
| increaseBackEdgePriorities(); |
| // use liveness to find interference |
| calculateInterferences(); |
| // pick new indices |
| std::vector<Index> indices; |
| pickIndices(indices); |
| // apply indices |
| applyIndices(indices, func->body); |
| } |
| |
| // A copy on a backedge can be especially costly, forcing us to branch just to |
| // do that copy. Add weight to such copies, so we prioritize getting rid of |
| // them. |
| void CoalesceLocals::increaseBackEdgePriorities() { |
| for (auto* loopTop : loopTops) { |
| // ignore the first edge, it is the initial entry, we just want backedges |
| auto& in = loopTop->in; |
| for (Index i = 1; i < in.size(); i++) { |
| auto* arrivingBlock = in[i]; |
| if (arrivingBlock->out.size() > 1) { |
| // we just want unconditional branches to the loop top, true phi |
| // fragments |
| continue; |
| } |
| for (auto& action : arrivingBlock->contents.actions) { |
| if (action.isSet()) { |
| auto* set = (*action.origin)->cast<LocalSet>(); |
| if (auto* get = getCopy(set)) { |
| // this is indeed a copy, add to the cost (default cost is 2, so |
| // this adds 50%, and can mostly break ties) |
| addCopy(set->index, get->index); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| void CoalesceLocals::calculateInterferences() { |
| interferences.resize(numLocals * numLocals); |
| std::fill(interferences.begin(), interferences.end(), false); |
| for (auto& curr : basicBlocks) { |
| if (liveBlocks.count(curr.get()) == 0) { |
| continue; // ignore dead blocks |
| } |
| // everything coming in might interfere, as it might come from a different |
| // block |
| auto live = curr->contents.end; |
| calculateInterferences(live); |
| // scan through the block itself |
| auto& actions = curr->contents.actions; |
| for (int i = int(actions.size()) - 1; i >= 0; i--) { |
| auto& action = actions[i]; |
| auto index = action.index; |
| if (action.isGet()) { |
| // new live local, interferes with all the rest |
| live.insert(index); |
| for (auto i : live) { |
| interfere(i, index); |
| } |
| } else { |
| if (live.erase(index)) { |
| action.effective = true; |
| } |
| } |
| } |
| } |
| // Params have a value on entry, so mark them as live, as variables |
| // live at the entry expect their zero-init value. |
| SetOfLocals start = entry->contents.start; |
| auto numParams = getFunction()->getNumParams(); |
| for (Index i = 0; i < numParams; i++) { |
| start.insert(i); |
| } |
| calculateInterferences(start); |
| } |
| |
| void CoalesceLocals::calculateInterferences(const SetOfLocals& locals) { |
| Index size = locals.size(); |
| for (Index i = 0; i < size; i++) { |
| for (Index j = i + 1; j < size; j++) { |
| interfereLowHigh(locals[i], locals[j]); |
| } |
| } |
| } |
| |
| // Indices decision making |
| |
| void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, |
| std::vector<Index>& indices) { |
| Index removedCopies; |
| pickIndicesFromOrder(order, indices, removedCopies); |
| } |
| |
| void CoalesceLocals::pickIndicesFromOrder(std::vector<Index>& order, |
| std::vector<Index>& indices, |
| Index& removedCopies) { |
| // mostly-simple greedy coloring |
| #if CFG_DEBUG |
| std::cerr << "\npickIndicesFromOrder on " << getFunction()->name << '\n'; |
| std::cerr << getFunction()->body << '\n'; |
| std::cerr << "order:\n"; |
| for (auto i : order) |
| std::cerr << i << ' '; |
| std::cerr << '\n'; |
| std::cerr << "interferences:\n"; |
| for (Index i = 0; i < numLocals; i++) { |
| for (Index j = 0; j < i + 1; j++) { |
| std::cerr << " "; |
| } |
| for (Index j = i + 1; j < numLocals; j++) { |
| std::cerr << int(interferes(i, j)) << ' '; |
| } |
| std::cerr << " : $" << i << '\n'; |
| } |
| std::cerr << "copies:\n"; |
| for (Index i = 0; i < numLocals; i++) { |
| for (Index j = 0; j < i + 1; j++) { |
| std::cerr << " "; |
| } |
| for (Index j = i + 1; j < numLocals; j++) { |
| std::cerr << int(getCopies(i, j)) << ' '; |
| } |
| std::cerr << " : $" << i << '\n'; |
| } |
| std::cerr << "total copies:\n"; |
| for (Index i = 0; i < numLocals; i++) { |
| std::cerr << " $" << i << ": " << totalCopies[i] << '\n'; |
| } |
| #endif |
| // TODO: take into account distribution (99-1 is better than 50-50 with two |
| // registers, for gzip) |
| std::vector<Type> types; |
| // new index * numLocals => list of all interferences of locals merged to it |
| std::vector<bool> newInterferences; |
| // new index * numLocals => list of all copies of locals merged to it |
| std::vector<uint8_t> newCopies; |
| indices.resize(numLocals); |
| types.resize(numLocals); |
| newInterferences.resize(numLocals * numLocals); |
| std::fill(newInterferences.begin(), newInterferences.end(), false); |
| auto numParams = getFunction()->getNumParams(); |
| // start with enough room for the params |
| newCopies.resize(numParams * numLocals); |
| std::fill(newCopies.begin(), newCopies.end(), 0); |
| Index nextFree = 0; |
| removedCopies = 0; |
| // we can't reorder parameters, they are fixed in order, and cannot coalesce |
| Index i = 0; |
| for (; i < numParams; i++) { |
| assert(order[i] == i); // order must leave the params in place |
| indices[i] = i; |
| types[i] = getFunction()->getLocalType(i); |
| for (Index j = numParams; j < numLocals; j++) { |
| newInterferences[numLocals * i + j] = interferes(i, j); |
| newCopies[numLocals * i + j] = getCopies(i, j); |
| } |
| nextFree++; |
| } |
| for (; i < numLocals; i++) { |
| Index actual = order[i]; |
| Index found = -1; |
| uint8_t foundCopies = -1; |
| for (Index j = 0; j < nextFree; j++) { |
| if (!newInterferences[j * numLocals + actual] && |
| getFunction()->getLocalType(actual) == types[j]) { |
| // this does not interfere, so it might be what we want. but pick the |
| // one eliminating the most copies (we could stop looking forward when |
| // there are no more items that have copies anyhow, but it doesn't seem |
| // to help) |
| auto currCopies = newCopies[j * numLocals + actual]; |
| if (found == Index(-1) || currCopies > foundCopies) { |
| indices[actual] = found = j; |
| foundCopies = currCopies; |
| } |
| } |
| } |
| if (found == Index(-1)) { |
| indices[actual] = found = nextFree; |
| types[found] = getFunction()->getLocalType(actual); |
| nextFree++; |
| removedCopies += getCopies(found, actual); |
| newCopies.resize(nextFree * numLocals); |
| } else { |
| removedCopies += foundCopies; |
| } |
| #if CFG_DEBUG |
| std::cerr << "set local $" << actual << " to $" << found << '\n'; |
| #endif |
| // merge new interferences and copies for the new index |
| for (Index k = i + 1; k < numLocals; k++) { |
| // go in the order, we only need to update for those we will see later |
| auto j = order[k]; |
| newInterferences[found * numLocals + j] = |
| newInterferences[found * numLocals + j] || interferes(actual, j); |
| newCopies[found * numLocals + j] += getCopies(actual, j); |
| } |
| } |
| } |
| |
| // given a baseline order, adjust it based on an important order of priorities |
| // (higher values are higher priority). The priorities take precedence, unless |
| // they are equal and then the original order should be kept. |
| std::vector<Index> adjustOrderByPriorities(std::vector<Index>& baseline, |
| std::vector<Index>& priorities) { |
| std::vector<Index> ret = baseline; |
| std::vector<Index> reversed = makeReversed(baseline); |
| std::sort(ret.begin(), ret.end(), [&priorities, &reversed](Index x, Index y) { |
| return priorities[x] > priorities[y] || |
| (priorities[x] == priorities[y] && reversed[x] < reversed[y]); |
| }); |
| return ret; |
| } |
| |
| void CoalesceLocals::pickIndices(std::vector<Index>& indices) { |
| if (numLocals == 0) { |
| return; |
| } |
| if (numLocals == 1) { |
| indices.push_back(0); |
| return; |
| } |
| // take into account total copies. but we must keep params in place, so give |
| // them max priority |
| auto adjustedTotalCopies = totalCopies; |
| auto numParams = getFunction()->getNumParams(); |
| for (Index i = 0; i < numParams; i++) { |
| adjustedTotalCopies[i] = std::numeric_limits<Index>::max(); |
| } |
| // first try the natural order. this is less arbitrary than it seems, as the |
| // program may have a natural order of locals inherent in it. |
| auto order = makeIdentity(numLocals); |
| order = adjustOrderByPriorities(order, adjustedTotalCopies); |
| Index removedCopies; |
| pickIndicesFromOrder(order, indices, removedCopies); |
| auto maxIndex = *std::max_element(indices.begin(), indices.end()); |
| // next try the reverse order. this both gives us another chance at something |
| // good, and also the very naturalness of the simple order may be quite |
| // suboptimal |
| setIdentity(order); |
| for (Index i = numParams; i < numLocals; i++) { |
| order[i] = numParams + numLocals - 1 - i; |
| } |
| order = adjustOrderByPriorities(order, adjustedTotalCopies); |
| std::vector<Index> reverseIndices; |
| Index reverseRemovedCopies; |
| pickIndicesFromOrder(order, reverseIndices, reverseRemovedCopies); |
| auto reverseMaxIndex = |
| *std::max_element(reverseIndices.begin(), reverseIndices.end()); |
| // prefer to remove copies foremost, as it matters more for code size (minus |
| // gzip), and improves throughput. |
| if (reverseRemovedCopies > removedCopies || |
| (reverseRemovedCopies == removedCopies && reverseMaxIndex < maxIndex)) { |
| indices.swap(reverseIndices); |
| } |
| } |
| |
| void CoalesceLocals::applyIndices(std::vector<Index>& indices, |
| Expression* root) { |
| assert(indices.size() == numLocals); |
| for (auto& curr : basicBlocks) { |
| auto& actions = curr->contents.actions; |
| for (auto& action : actions) { |
| if (action.isGet()) { |
| auto* get = (*action.origin)->cast<LocalGet>(); |
| get->index = indices[get->index]; |
| } else if (action.isSet()) { |
| auto* set = (*action.origin)->cast<LocalSet>(); |
| set->index = indices[set->index]; |
| // in addition, we can optimize out redundant copies and ineffective |
| // sets |
| LocalGet* get; |
| if ((get = set->value->dynCast<LocalGet>()) && |
| get->index == set->index) { |
| action.removeCopy(); |
| continue; |
| } |
| // remove ineffective actions |
| if (!action.effective) { |
| // value may have no side effects, further optimizations can eliminate |
| // it |
| *action.origin = set->value; |
| if (!set->isTee()) { |
| // we need to drop it |
| Drop* drop = ExpressionManipulator::convert<LocalSet, Drop>(set); |
| drop->value = *action.origin; |
| *action.origin = drop; |
| } |
| continue; |
| } |
| } |
| } |
| } |
| // update type list |
| auto numParams = getFunction()->getNumParams(); |
| Index newNumLocals = 0; |
| for (auto index : indices) { |
| newNumLocals = std::max(newNumLocals, index + 1); |
| } |
| auto oldVars = getFunction()->vars; |
| getFunction()->vars.resize(newNumLocals - numParams); |
| for (Index index = numParams; index < numLocals; index++) { |
| Index newIndex = indices[index]; |
| if (newIndex >= numParams) { |
| getFunction()->vars[newIndex - numParams] = oldVars[index - numParams]; |
| } |
| } |
| // names are gone |
| getFunction()->localNames.clear(); |
| getFunction()->localIndices.clear(); |
| } |
| |
| struct CoalesceLocalsWithLearning : public CoalesceLocals { |
| virtual Pass* create() override { return new CoalesceLocalsWithLearning; } |
| |
| virtual void pickIndices(std::vector<Index>& indices) override; |
| }; |
| |
| void CoalesceLocalsWithLearning::pickIndices(std::vector<Index>& indices) { |
| if (getFunction()->getNumVars() <= 1) { |
| // nothing to think about here |
| CoalesceLocals::pickIndices(indices); |
| return; |
| } |
| |
| struct Order : public std::vector<Index> { |
| void setFitness(double f) { fitness = f; } |
| double getFitness() { return fitness; } |
| void dump(std::string text) { |
| std::cout << text + ": ( "; |
| for (Index i = 0; i < size(); i++) { |
| std::cout << (*this)[i] << " "; |
| } |
| std::cout << ")\n"; |
| std::cout << "of quality: " << getFitness() << "\n"; |
| } |
| |
| private: |
| double fitness; |
| }; |
| |
| struct Generator { |
| Generator(CoalesceLocalsWithLearning* parent) : parent(parent), noise(42) {} |
| |
| void calculateFitness(Order* order) { |
| // apply the order |
| std::vector<Index> indices; // the phenotype |
| Index removedCopies; |
| parent->pickIndicesFromOrder(*order, indices, removedCopies); |
| auto maxIndex = *std::max_element(indices.begin(), indices.end()); |
| assert(maxIndex <= parent->numLocals); |
| // main part of fitness is the number of locals |
| double fitness = parent->numLocals - maxIndex; // higher fitness is better |
| // secondarily, it is nice to not reorder locals unnecessarily |
| double fragment = 1.0 / (2.0 * parent->numLocals); |
| for (Index i = 0; i < parent->numLocals; i++) { |
| if ((*order)[i] == i) { |
| fitness += fragment; // boost for each that wasn't moved |
| } |
| } |
| // removing copies is a secondary concern |
| fitness = (100 * fitness) + removedCopies; |
| order->setFitness(fitness); |
| } |
| |
| Order* makeRandom() { |
| auto* ret = new Order; |
| ret->resize(parent->numLocals); |
| for (Index i = 0; i < parent->numLocals; i++) { |
| (*ret)[i] = i; |
| } |
| if (first) { |
| // as the first guess, use the natural order. this is not arbitrary for |
| // two reasons. first, there may be an inherent order in the input |
| // (frequent indices are lower, etc.). second, by ensuring we start with |
| // the natural order, we ensure we are at least as good as the |
| // non-learning variant. |
| // TODO: use ::pickIndices from the parent, so we literally get the |
| // simpler approach as our first option |
| first = false; |
| } else { |
| // leave params alone, shuffle the rest |
| std::shuffle(ret->begin() + parent->getFunction()->getNumParams(), |
| ret->end(), |
| noise); |
| } |
| calculateFitness(ret); |
| #ifdef CFG_LEARN_DEBUG |
| order->dump("new rando"); |
| #endif |
| return ret; |
| } |
| |
| Order* makeMixture(Order* left, Order* right) { |
| // perturb left using right. this is useful since |
| // we don't care about absolute locations, relative ones matter more, |
| // and a true merge of two vectors could obscure that (e.g. |
| // a.......... and ..........a would merge a into the middle, for no |
| // reason), and cause a lot of unnecessary noise |
| Index size = left->size(); |
| Order reverseRight; // reverseRight[x] is the index of x in right |
| reverseRight.resize(size); |
| for (Index i = 0; i < size; i++) { |
| reverseRight[(*right)[i]] = i; |
| } |
| auto* ret = new Order; |
| *ret = *left; |
| assert(size >= 1); |
| for (Index i = parent->getFunction()->getNumParams(); i < size - 1; i++) { |
| // if (i, i + 1) is in reverse order in right, flip them |
| if (reverseRight[(*ret)[i]] > reverseRight[(*ret)[i + 1]]) { |
| std::swap((*ret)[i], (*ret)[i + 1]); |
| // if we don't skip, we might end up pushing an element all the way to |
| // the end, which is not very perturbation-y |
| i++; |
| } |
| } |
| calculateFitness(ret); |
| #ifdef CFG_LEARN_DEBUG |
| ret->dump("new mixture"); |
| #endif |
| return ret; |
| } |
| |
| private: |
| CoalesceLocalsWithLearning* parent; |
| std::mt19937 noise; |
| bool first = true; |
| }; |
| |
| #ifdef CFG_LEARN_DEBUG |
| std::cout << "[learning for " << getFunction()->name << "]\n"; |
| #endif |
| auto numVars = this->getFunction()->getNumVars(); |
| const int GENERATION_SIZE = |
| std::min(Index(numVars * (numVars - 1)), Index(20)); |
| Generator generator(this); |
| GeneticLearner<Order, double, Generator> learner(generator, GENERATION_SIZE); |
| #ifdef CFG_LEARN_DEBUG |
| learner.getBest()->dump("first best"); |
| #endif |
| // keep working while we see improvement |
| auto oldBest = learner.getBest()->getFitness(); |
| while (1) { |
| learner.runGeneration(); |
| auto newBest = learner.getBest()->getFitness(); |
| if (newBest == oldBest) { |
| break; // unlikely we can improve |
| } |
| oldBest = newBest; |
| #ifdef CFG_LEARN_DEBUG |
| learner.getBest()->dump("current best"); |
| #endif |
| } |
| #ifdef CFG_LEARN_DEBUG |
| learner.getBest()->dump("the best"); |
| #endif |
| // TODO: cache indices in Orders, at the cost of more memory? |
| this->pickIndicesFromOrder(*learner.getBest(), indices); |
| } |
| |
| // declare passes |
| |
| Pass* createCoalesceLocalsPass() { return new CoalesceLocals(); } |
| |
| Pass* createCoalesceLocalsWithLearningPass() { |
| return new CoalesceLocalsWithLearning(); |
| } |
| |
| } // namespace wasm |