blob: 2c652e7c2cdb3172941fcfb84c97c42f72ee31b9 [file] [log] [blame]
// Copyright 2023 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_WASM_INLINING_TREE_H_
#define V8_WASM_INLINING_TREE_H_
#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif // !V8_ENABLE_WEBASSEMBLY
#include <cinttypes>
#include <cstdint>
#include <queue>
#include <vector>
#include "src/utils/utils.h"
#include "src/wasm/compilation-environment.h"
#include "src/wasm/wasm-module.h"
namespace v8::internal::wasm {
// Represents a tree of inlining decisions.
// A node in the tree represents a function frame, and `function_calls_`
// represent all direct/call_ref/call_indirect function calls in this frame.
// Each element of `function_calls_` is itself a `Vector` of `InliningTree`s,
// corresponding to the different speculative candidates for a
// call_ref/call_indirect; for a direct call, it has a single element.
// If a transitive element of `function_calls_` has its `is_inlined_` field set,
// it should be inlined into the caller.
// We have this additional datastructure for Turboshaft, since nodes in the
// Turboshaft IR aren't easily expanded incrementally, so all the inlining
// decisions are already made before graph building on this abstracted form of
// the code.
class InliningTree : public ZoneObject {
 private:
  struct Data;

 public:
  // The speculative inlining candidates for one call site (one entry for a
  // direct call, several for call_ref/call_indirect).
  using CasesPerCallSite = base::Vector<InliningTree*>;

  // Creates the tree for the topmost caller `function_index` and immediately
  // runs `FullyExpand()` so that all inlining decisions are made.
  static InliningTree* CreateRoot(Zone* zone, const WasmModule* module,
                                  uint32_t function_index) {
    InliningTree* tree = zone->New<InliningTree>(
        zone->New<Data>(zone, module, function_index), function_index,
        0,  // Call count.
        0,  // Wire byte size. `0` causes the root node to always get
            // expanded, regardless of budget.
        -1, -1, -1,  // Caller, feedback slot, case.
        0            // Inlining depth.
    );
    tree->FullyExpand();
    return tree;
  }

  // This should stay roughly in sync with the full logic below, but not rely
  // on having observed any call counts. Since it therefore can't simulate
  // regular behavior accurately anyway, it may be a very coarse approximation.
  static int NoLiftoffBudget(const WasmModule* module, uint32_t func_index) {
    size_t wirebytes = module->functions[func_index].code.length();
    // Shares the growth-factor computation with `Data`'s constructor via
    // `MaxGrowthFactor()` so the two cannot drift apart.
    return std::max(static_cast<int>(v8_flags.wasm_inlining_min_budget),
                    static_cast<int>(MaxGrowthFactor(module) * wirebytes));
  }

  // Priority of this candidate: frequently-called functions score high, large
  // functions score low.
  int64_t score() const {
    // Note that the zero-point is arbitrary. Functions with negative score
    // can still get inlined.
    constexpr int count_factor = 2;
    constexpr int size_factor = 3;
    return int64_t{call_count_} * count_factor -
           int64_t{wire_byte_size_} * size_factor;
  }

  // TODO(dlehmann,manoskouk): We are running into this limit, e.g., for the
  // "argon2-wasm" benchmark.
  // IIUC, this limit is in place because of the encoding of inlining IDs in
  // a 6-bit bitfield in Turboshaft IR, which we should revisit.
  static constexpr int kMaxInlinedCount = 60;

  // Limit the nesting depth of inlining. Inlining decisions are based on call
  // counts. A small function with high call counts that is called recursively
  // would be inlined until all budget is used.
  // TODO(14108): This still might not lead to ideal results. Other options
  // could be explored like penalizing nested inlinees.
  static constexpr uint32_t kMaxInliningNestingDepth = 7;

  base::Vector<CasesPerCallSite> function_calls() { return function_calls_; }
  base::Vector<bool> has_non_inlineable_targets() {
    return has_non_inlineable_targets_;
  }
  bool feedback_found() { return feedback_found_; }
  bool is_inlined() { return is_inlined_; }
  uint32_t function_index() { return function_index_; }

 private:
  friend class v8::internal::Zone;  // For `zone->New<InliningTree>`.

  static double BudgetScaleFactor(const WasmModule* module) {
    // If there are few small functions, that indicates that the toolchain
    // already performed significant inlining, so we reduce the budget
    // significantly as further inlining has diminishing benefits.
    // For both major knobs, we apply a smoothened step function based on
    // the module's percentage of small functions (sfp):
    //   sfp <= 25%: use "low" budget
    //   sfp >= 50%: use "high" budget
    //   25% < sfp < 50%: interpolate linearly between both budgets.
    double small_function_percentage =
        module->num_small_functions * 100.0 / module->num_declared_functions;
    if (small_function_percentage <= 25) {
      return 0;
    } else if (small_function_percentage >= 50) {
      return 1;
    } else {
      return (small_function_percentage - 25) / 25;
    }
  }

  // Computes the allowed code-size growth factor for `module`. Shared between
  // `NoLiftoffBudget()` and `Data`'s constructor, which must agree.
  static double MaxGrowthFactor(const WasmModule* module) {
    double scaled = BudgetScaleFactor(module);
    // We found experimentally that we need to allow a larger growth factor
    // for Turboshaft to achieve similar inlining decisions as in Turbofan;
    // presumably because some functions that have a small wire size of their
    // own still need to be allowed to inline some callees.
    // TODO(jkummerow): When TF is gone, remove this adjustment by folding
    // it into the flag's default value.
    constexpr int kTurboshaftAdjustment = 2;
    int high_growth =
        static_cast<int>(v8_flags.wasm_inlining_factor) + kTurboshaftAdjustment;
    // A value of 1 would be equivalent to disabling inlining entirely.
    constexpr int kLowestUsefulValue = 2;
    int low_growth = std::max(kLowestUsefulValue, high_growth - 3);
    return low_growth * (1 - scaled) + high_growth * scaled;
  }

  // State shared by all nodes of one inlining tree.
  struct Data {
    Data(Zone* zone, const WasmModule* module, uint32_t topmost_caller_index)
        : zone(zone),
          module(module),
          max_growth_factor(MaxGrowthFactor(module)),
          topmost_caller_index(topmost_caller_index) {
      double scaled = BudgetScaleFactor(module);
      // The {wasm_inlining_budget} value has been tuned for Turbofan node
      // counts. Turboshaft looks at wire bytes instead, and on average there
      // are about 0.74 TF nodes per wire byte, so we apply a small factor to
      // account for the difference, so we get similar inlining decisions in
      // both compilers.
      // TODO(jkummerow): When TF is gone, remove this factor by folding it
      // into the flag's default value.
      constexpr double kTurboshaftCorrectionFactor = 1.4;
      double high_cap =
          v8_flags.wasm_inlining_budget * kTurboshaftCorrectionFactor;
      double low_cap = high_cap / 10;
      budget_cap = low_cap * (1 - scaled) + high_cap * scaled;
    }
    Zone* zone;
    const WasmModule* module;
    double max_growth_factor;  // See `MaxGrowthFactor()`.
    size_t budget_cap;         // Absolute wire-byte cap for large functions.
    uint32_t topmost_caller_index;
  };

  InliningTree(Data* shared, uint32_t function_index, int call_count,
               int wire_byte_size, uint32_t caller_index, int feedback_slot,
               int the_case, uint32_t depth)
      : data_(shared),
        function_index_(function_index),
        call_count_(call_count),
        wire_byte_size_(wire_byte_size),
        depth_(depth),
        caller_index_(caller_index),
        feedback_slot_(feedback_slot),
        case_(the_case) {}

  // Recursively expand the tree by expanding this node and children nodes etc.
  // Nodes are prioritized by their `score`. Expansion continues until
  // `kMaxInlinedCount` nodes are expanded or `budget` (in wire-bytes size) is
  // depleted.
  void FullyExpand();

  // Mark this function call as inline and initialize `function_calls_` based
  // on the `module_->type_feedback`.
  void Inline();
  bool SmallEnoughToInline(size_t initial_wire_byte_size,
                           size_t inlined_wire_byte_count);

  Data* data_;
  uint32_t function_index_;
  int call_count_;
  int wire_byte_size_;
  bool is_inlined_ = false;
  bool feedback_found_ = false;
  base::Vector<CasesPerCallSite> function_calls_{};
  base::Vector<bool> has_non_inlineable_targets_{};
  uint32_t depth_;
  // For tracing.
  uint32_t caller_index_;
  int feedback_slot_;
  int case_;
};
void InliningTree::Inline() {
is_inlined_ = true;
auto& feedback_map = data_->module->type_feedback.feedback_for_function;
auto feedback_it = feedback_map.find(function_index_);
if (feedback_it == feedback_map.end()) return;
const FunctionTypeFeedback& feedback = feedback_it->second;
base::Vector<CallSiteFeedback> type_feedback =
feedback.feedback_vector.as_vector();
if (type_feedback.empty()) return; // No feedback yet.
DCHECK_EQ(type_feedback.size(), feedback.call_targets.size());
feedback_found_ = true;
function_calls_ =
data_->zone->AllocateVector<CasesPerCallSite>(type_feedback.size());
has_non_inlineable_targets_ =
data_->zone->AllocateVector<bool>(type_feedback.size());
for (size_t i = 0; i < type_feedback.size(); i++) {
function_calls_[i] = data_->zone->AllocateVector<InliningTree*>(
type_feedback[i].num_cases());
has_non_inlineable_targets_[i] =
type_feedback[i].has_non_inlineable_targets();
for (int the_case = 0; the_case < type_feedback[i].num_cases();
the_case++) {
uint32_t callee_index = type_feedback[i].function_index(the_case);
// TODO(jkummerow): Experiment with propagating relative call counts
// into the nested InliningTree, and weighting scores there accordingly.
function_calls_[i][the_case] = data_->zone->New<InliningTree>(
data_, callee_index, type_feedback[i].call_count(the_case),
data_->module->functions[callee_index].code.length(), function_index_,
static_cast<int>(i), the_case, depth_ + 1);
}
}
}
// Strict weak ordering for the priority queue in `FullyExpand()`: the "best"
// candidate (highest score; ties broken by lowest function index, to keep the
// ordering strict) ends up at the top of the queue.
struct TreeNodeOrdering {
  bool operator()(InliningTree* t1, InliningTree* t2) {
    // `t1` sorts below `t2` if its score is smaller, ...
    if (t1->score() != t2->score()) return t1->score() < t2->score();
    // ... or, on equal scores, if its function index is larger.
    return t2->function_index() < t1->function_index();
  }
};
// Greedy best-first expansion of the inlining tree, starting at the topmost
// caller. Candidates are drawn from a priority queue ordered by `score()`
// (see `TreeNodeOrdering`), and each inlined node's call sites are re-queued
// as new candidates. Expansion stops when the queue is drained, when
// `kMaxInlinedCount` nodes have been inlined, or when candidates no longer
// pass `SmallEnoughToInline()`.
void InliningTree::FullyExpand() {
  // Only ever called on the root node (see `CreateRoot`).
  DCHECK_EQ(this->function_index_, data_->topmost_caller_index);
  size_t initial_wire_byte_size =
      data_->module->functions[function_index_].code.length();
  // Running total of wire bytes added by inlining so far; feeds the budget
  // check in `SmallEnoughToInline()`.
  size_t inlined_wire_byte_count = 0;
  std::priority_queue<InliningTree*, std::vector<InliningTree*>,
                      TreeNodeOrdering>
      queue;
  queue.push(this);
  int inlined_count = 0;
  // Guards the type feedback read by `Inline()` below for the whole
  // expansion.
  base::MutexGuard mutex_guard(&data_->module->type_feedback.mutex);
  while (!queue.empty() && inlined_count < kMaxInlinedCount) {
    InliningTree* top = queue.top();
    if (v8_flags.trace_wasm_inlining) {
      if (top != this) {
        PrintF(
            "[function %d: in function %d, considering call #%d, case #%d, to "
            "function %d (count=%d, size=%d, score=%" PRId64 ")... ",
            data_->topmost_caller_index, top->caller_index_,
            top->feedback_slot_, static_cast<int>(top->case_),
            static_cast<int>(top->function_index_), top->call_count_,
            top->wire_byte_size_, static_cast<int64_t>(top->score()));
      } else {
        PrintF("[function %d: expanding topmost caller... ",
               data_->topmost_caller_index);
      }
    }
    queue.pop();
    // Imported functions have no wasm body to inline.
    if (top->function_index_ < data_->module->num_imported_functions) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("imported function]\n");
      }
      continue;
    }
    // asm.js functions are never inlined.
    if (is_asmjs_module(data_->module)) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("cannot inline asm.js function]\n");
      }
      continue;
    }
    // Key idea: inlining hot calls is good, inlining big functions is bad,
    // so inline when a candidate is "hotter than it is big". Exception:
    // tiny candidates can get inlined regardless of their call count.
    // (The root node, `top == this`, is always expanded.)
    if (top != this && top->wire_byte_size_ >= 12 &&
        !v8_flags.wasm_inlining_ignore_call_counts) {
      if (top->call_count_ < top->wire_byte_size_ / 2) {
        if (v8_flags.trace_wasm_inlining) {
          PrintF("not called often enough]\n");
        }
        continue;
      }
    }
    // Budget check; rejected candidates are dropped, not re-queued.
    if (!top->SmallEnoughToInline(initial_wire_byte_size,
                                  inlined_wire_byte_count)) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("not enough inlining budget]\n");
      }
      continue;
    }
    if (v8_flags.trace_wasm_inlining && top != this) {
      PrintF("decided to inline! ");
    }
    top->Inline();
    inlined_count++;
    // For tiny functions, inlining may actually decrease generated code size
    // because we have one less call and don't need to push arguments, etc.
    // Subtract a little bit from the code size increase, such that inlining
    // these tiny functions doesn't use up any of the budget.
    constexpr int kOneLessCall = 6;  // Guesstimated savings per call.
    inlined_wire_byte_count += std::max(top->wire_byte_size_ - kOneLessCall, 0);
    // Queue the inlinee's own call sites as new candidates, unless the
    // nesting depth limit is reached or there is no feedback for them.
    if (!top->feedback_found()) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("no feedback yet or no callees]\n");
      }
    } else if (top->depth_ < kMaxInliningNestingDepth) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("queueing %zu callee(s)]\n", top->function_calls_.size());
      }
      for (CasesPerCallSite cases : top->function_calls_) {
        for (InliningTree* call : cases) {
          if (call != nullptr) {
            queue.push(call);
          }
        }
      }
    } else if (v8_flags.trace_wasm_inlining) {
      PrintF("max inlining depth reached]\n");
    }
  }
  if (v8_flags.trace_wasm_inlining && !queue.empty()) {
    PrintF("[function %d: too many inlining candidates, stopping...]\n",
           data_->topmost_caller_index);
  }
}
// Returns true if there is still enough budget left to inline the current
// candidate given the initial graph size and the already inlined wire bytes.
bool InliningTree::SmallEnoughToInline(size_t initial_wire_byte_size,
                                       size_t inlined_wire_byte_count) {
  // Hard per-candidate size limit, independent of any budget.
  if (wire_byte_size_ > static_cast<int>(v8_flags.wasm_inlining_max_size)) {
    return false;
  }
  // Tiny candidates get a bonus: pretend up to 100 fewer wire bytes have
  // been inlined so far.
  // TODO(dlehmann): Since we don't use up budget (i.e., increase
  // `inlined_wire_byte_count` see above) for very tiny functions, we might be
  // able to remove/simplify this code in the future.
  if (wire_byte_size_ < 12) {
    inlined_wire_byte_count =
        inlined_wire_byte_count > 100 ? inlined_wire_byte_count - 100 : 0;
  }
  // For small-ish functions, the inlining budget is defined by the larger of
  // 1) the wasm_inlining_min_budget and
  // 2) the max_growth_factor * initial_wire_byte_size.
  // Inlining a little bit should always be fine even for tiny functions (1),
  // otherwise (2) makes sure that the budget scales in relation with the
  // original function size, to limit the compile time increase caused by
  // inlining.
  size_t small_function_budget =
      std::max<size_t>(v8_flags.wasm_inlining_min_budget,
                       data_->max_growth_factor * initial_wire_byte_size);
  // For large functions, growing by the same factor would add too much
  // compilation effort, so we also apply a fixed cap. However, independent
  // of the budget cap, for large functions we should still allow a little
  // inlining, which is why we allow 10% of the graph size is the minimal
  // budget even for large functions that exceed the regular budget.
  //
  // Note for future tuning: it might make sense to allow 20% here, and in
  // turn perhaps lower --wasm-inlining-budget. The drawback is that this
  // would allow truly huge functions to grow even bigger; the benefit is
  // that we wouldn't fall off as steep a cliff when hitting the cap.
  size_t large_function_budget =
      std::max<size_t>(data_->budget_cap, initial_wire_byte_size * 1.1);
  // Projected total size if this candidate were inlined now.
  size_t projected_size = initial_wire_byte_size + inlined_wire_byte_count +
                          static_cast<size_t>(wire_byte_size_);
  if (v8_flags.trace_wasm_inlining) {
    PrintF("budget=min(%zu, %zu), size %zu->%zu ", small_function_budget,
           large_function_budget,
           (initial_wire_byte_size + inlined_wire_byte_count), projected_size);
  }
  // The effective budget is the smaller of the two regimes.
  return projected_size <
         std::min<size_t>(small_function_budget, large_function_budget);
}
} // namespace v8::internal::wasm
#endif // V8_WASM_INLINING_TREE_H_