blob: 456a6d2919cea691d6dd6bcc7d8f5de767814141 [file] [log] [blame] [edit]
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/bigint/bigint-internal.h"
#include "src/bigint/vector-arithmetic.h"
namespace v8 {
namespace bigint {
// The classic algorithm: for every part, multiply the accumulator with
// the appropriate multiplier, and add the part. O(n²) overall.
void ProcessorImpl::FromStringClassic(RWDigits Z,
FromStringAccumulator* accumulator) {
// We always have at least one part to process.
DCHECK(accumulator->stack_parts_used_ > 0);
Z[0] = accumulator->stack_parts_[0];
RWDigits already_set(Z, 0, 1);
for (int i = 1; i < Z.len(); i++) Z[i] = 0;
// The {FromStringAccumulator} uses stack-allocated storage for the first
// few parts; if heap storage is used at all then all parts are copied there.
int num_stack_parts = accumulator->stack_parts_used_;
if (num_stack_parts == 1) return;
const std::vector<digit_t>& heap_parts = accumulator->heap_parts_;
int num_heap_parts = static_cast<int>(heap_parts.size());
// All multipliers are the same, except possibly for the last.
const digit_t max_multiplier = accumulator->max_multiplier_;
if (num_heap_parts == 0) {
for (int i = 1; i < num_stack_parts - 1; i++) {
MultiplySingle(Z, already_set, max_multiplier);
Add(Z, accumulator->stack_parts_[i]);
already_set.set_len(already_set.len() + 1);
}
MultiplySingle(Z, already_set, accumulator->last_multiplier_);
Add(Z, accumulator->stack_parts_[num_stack_parts - 1]);
return;
}
// Parts are stored on the heap.
for (int i = 1; i < num_heap_parts - 1; i++) {
MultiplySingle(Z, already_set, max_multiplier);
Add(Z, accumulator->heap_parts_[i]);
already_set.set_len(already_set.len() + 1);
}
MultiplySingle(Z, already_set, accumulator->last_multiplier_);
Add(Z, accumulator->heap_parts_.back());
}
// The fast algorithm: combine parts in a balanced-binary-tree like order:
// Multiply-and-add neighboring pairs of parts, then loop, until only one
// part is left. The benefit is that the multiplications will have inputs of
// similar sizes, which makes them amenable to fast multiplication algorithms.
// We have to do more multiplications than the classic algorithm though,
// because we also have to multiply the multipliers.
// Optimizations:
// - We can skip the multiplier for the first part, because we never need it.
// - Most multipliers are the same; we can avoid repeated multiplications and
// just copy the previous result. (In theory we could even de-dupe them, but
// as the parts/multipliers grow, we'll need most of the memory anyway.)
// Copied results are marked with a * below.
// - We can re-use memory using a system of three buffers whose usage rotates:
// - one is considered empty, and is overwritten with the new parts,
// - one holds the multipliers (and will be "empty" in the next round), and
// - one initially holds the parts and is overwritten with the new multipliers
// Parts and multipliers both grow in each iteration, and get fewer, so we
// use the space of two adjacent old chunks for one new chunk.
// Since the {heap_parts_} vectors has the right size, and so does the
// result {Z}, we can use that memory, and only need to allocate one scratch
// vector. If the final result ends up in the wrong bucket, we have to copy it
// to the correct one.
// - We don't have to keep track of the positions and sizes of the chunks,
// because we can deduce their precise placement from the iteration index.
//
// Example, assuming digit_t is 4 bits, fitting one decimal digit:
// Initial state:
// parts_: 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
// multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
// After the first iteration of the outer loop:
// parts: 12 34 56 78 90 12 34 5
// multipliers: 100 *100 *100 *100 *100 *100 10
// After the second iteration:
// parts: 1234 5678 9012 345
// multipliers: 10000 *10000 1000
// After the third iteration:
// parts: 12345678 9012345
// multipliers: 10000000
// And then there's an obvious last iteration.
void ProcessorImpl::FromStringLarge(RWDigits Z,
FromStringAccumulator* accumulator) {
int num_parts = static_cast<int>(accumulator->heap_parts_.size());
DCHECK(num_parts >= 2);
DCHECK(Z.len() >= num_parts);
RWDigits parts(accumulator->heap_parts_.data(), num_parts);
Storage multipliers_storage(num_parts);
RWDigits multipliers(multipliers_storage.get(), num_parts);
RWDigits temp(Z, 0, num_parts);
// Unrolled and specialized first iteration: part_len == 1, so instead of
// Digits sub-vectors we have individual digit_t values, and the multipliers
// are known up front.
{
digit_t max_multiplier = accumulator->max_multiplier_;
digit_t last_multiplier = accumulator->last_multiplier_;
RWDigits new_parts = temp;
RWDigits new_multipliers = parts;
int i = 0;
for (; i + 1 < num_parts; i += 2) {
digit_t p_in = parts[i];
digit_t p_in2 = parts[i + 1];
digit_t m_in = max_multiplier;
digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier;
// p[j] = p[i] * m[i+1] + p[i+1]
digit_t p_high;
digit_t p_low = digit_mul(p_in, m_in2, &p_high);
digit_t carry;
new_parts[i] = digit_add2(p_low, p_in2, &carry);
new_parts[i + 1] = p_high + carry;
// m[j] = m[i] * m[i+1]
if (i > 0) {
if (i > 2 && m_in2 != last_multiplier) {
new_multipliers[i] = new_multipliers[i - 2];
new_multipliers[i + 1] = new_multipliers[i - 1];
} else {
digit_t m_high;
new_multipliers[i] = digit_mul(m_in, m_in2, &m_high);
new_multipliers[i + 1] = m_high;
}
}
}
// Trailing last part (if {num_parts} was odd).
if (i < num_parts) {
new_parts[i] = parts[i];
new_multipliers[i] = last_multiplier;
i += 2;
}
num_parts = i >> 1;
RWDigits new_temp = multipliers;
parts = new_parts;
multipliers = new_multipliers;
temp = new_temp;
AddWorkEstimate(num_parts);
}
int part_len = 2;
// Remaining iterations.
while (num_parts > 1) {
RWDigits new_parts = temp;
RWDigits new_multipliers = parts;
int new_part_len = part_len * 2;
int i = 0;
for (; i + 1 < num_parts; i += 2) {
int start = i * part_len;
Digits p_in(parts, start, part_len);
Digits p_in2(parts, start + part_len, part_len);
Digits m_in(multipliers, start, part_len);
Digits m_in2(multipliers, start + part_len, part_len);
RWDigits p_out(new_parts, start, new_part_len);
RWDigits m_out(new_multipliers, start, new_part_len);
// p[j] = p[i] * m[i+1] + p[i+1]
Multiply(p_out, p_in, m_in2);
if (should_terminate()) return;
digit_t overflow = AddAndReturnOverflow(p_out, p_in2);
DCHECK(overflow == 0);
USE(overflow);
// m[j] = m[i] * m[i+1]
if (i > 0) {
bool copied = false;
if (i > 2) {
int prev_start = (i - 2) * part_len;
Digits m_in_prev(multipliers, prev_start, part_len);
Digits m_in2_prev(multipliers, prev_start + part_len, part_len);
if (Compare(m_in, m_in_prev) == 0 &&
Compare(m_in2, m_in2_prev) == 0) {
copied = true;
Digits m_out_prev(new_multipliers, prev_start, new_part_len);
for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k];
}
}
if (!copied) {
Multiply(m_out, m_in, m_in2);
if (should_terminate()) return;
}
}
}
// Trailing last part (if {num_parts} was odd).
if (i < num_parts) {
Digits p_in(parts, i * part_len, part_len);
Digits m_in(multipliers, i * part_len, part_len);
RWDigits p_out(new_parts, i * part_len, new_part_len);
RWDigits m_out(new_multipliers, i * part_len, new_part_len);
int k = 0;
for (; k < p_in.len(); k++) p_out[k] = p_in[k];
for (; k < p_out.len(); k++) p_out[k] = 0;
k = 0;
for (; k < m_in.len(); k++) m_out[k] = m_in[k];
for (; k < m_out.len(); k++) m_out[k] = 0;
i += 2;
}
num_parts = i >> 1;
part_len = new_part_len;
RWDigits new_temp = multipliers;
parts = new_parts;
multipliers = new_multipliers;
temp = new_temp;
}
// Copy the result to Z, if it doesn't happen to be there already.
if (parts.digits() != Z.digits()) {
int i = 0;
for (; i < parts.len(); i++) Z[i] = parts[i];
// Z might be bigger than we requested; be robust towards that.
for (; i < Z.len(); i++) Z[i] = 0;
}
}
// Specialized algorithms for power-of-two radixes. Designed to work with
// {ParsePowerTwo}: {max_multiplier_} isn't saved, but {radix_} is, and
// {last_multiplier_} has special meaning, namely the number of unpopulated bits
// in the last part.
// For these radixes, {parts} already is a list of correct bit sequences, we
// just have to put them together in the right way:
// - The parts are currently in reversed order. The highest-index parts[i]
// will go into Z[0].
// - All parts, possibly except for the last, are maximally populated.
// - A maximally populated part stores a non-fractional number of characters,
// i.e. the largest fitting multiple of {char_bits} of it is populated.
// - The populated bits in a part are at the low end.
// - The number of unused bits in the last part is stored in
// {accumulator->last_multiplier_}.
//
// Example: Given the following parts vector, where letters are used to
// label bits, bit order is big endian (i.e. [00000101] encodes "5"),
// 'x' means "unpopulated", kDigitBits == 8, radix == 8, and char_bits == 3:
//
// parts[0] -> [xxABCDEF][xxGHIJKL][xxMNOPQR][xxxxxSTU] <- parts[3]
//
// We have to assemble the following result:
//
// Z[0] -> [NOPQRSTU][FGHIJKLM][xxxABCDE] <- Z[2]
//
void ProcessorImpl::FromStringBasePowerOfTwo(
RWDigits Z, FromStringAccumulator* accumulator) {
const int num_parts = accumulator->ResultLength();
DCHECK(num_parts >= 1);
DCHECK(Z.len() >= num_parts);
Digits parts(accumulator->heap_parts_.size() > 0
? accumulator->heap_parts_.data()
: accumulator->stack_parts_,
num_parts);
uint8_t radix = accumulator->radix_;
DCHECK(radix == 2 || radix == 4 || radix == 8 || radix == 16 || radix == 32);
const int char_bits = BitLength(radix - 1);
const int unused_last_part_bits =
static_cast<int>(accumulator->last_multiplier_);
const int unused_part_bits = kDigitBits % char_bits;
const int max_part_bits = kDigitBits - unused_part_bits;
int z_index = 0;
int part_index = num_parts - 1;
// If the last part is fully populated, then all parts must be, and we can
// simply copy them (in reversed order).
if (unused_last_part_bits == 0) {
DCHECK(kDigitBits % char_bits == 0);
while (part_index >= 0) {
Z[z_index++] = parts[part_index--];
}
for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
return;
}
// Otherwise we have to shift parts contents around as needed.
// Holds the next Z digit that we want to store...
digit_t digit = parts[part_index--];
// ...and the number of bits (at the right end) we already know.
int digit_bits = kDigitBits - unused_last_part_bits;
while (part_index >= 0) {
// Holds the last part that we read from {parts}...
digit_t part;
// ...and the number of bits (at the right end) that we haven't used yet.
int part_bits;
while (digit_bits < kDigitBits) {
part = parts[part_index--];
part_bits = max_part_bits;
digit |= part << digit_bits;
int part_shift = kDigitBits - digit_bits;
if (part_shift > part_bits) {
digit_bits += part_bits;
part = 0;
part_bits = 0;
if (part_index < 0) break;
} else {
digit_bits = kDigitBits;
part >>= part_shift;
part_bits -= part_shift;
}
}
Z[z_index++] = digit;
digit = part;
digit_bits = part_bits;
}
if (digit_bits > 0) {
Z[z_index++] = digit;
}
for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
}
void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
if (accumulator->inline_everything_) {
int i = 0;
for (; i < accumulator->stack_parts_used_; i++) {
Z[i] = accumulator->stack_parts_[i];
}
for (; i < Z.len(); i++) Z[i] = 0;
} else if (accumulator->stack_parts_used_ == 0) {
for (int i = 0; i < Z.len(); i++) Z[i] = 0;
} else if (IsPowerOfTwo(accumulator->radix_)) {
FromStringBasePowerOfTwo(Z, accumulator);
} else if (accumulator->ResultLength() < kFromStringLargeThreshold) {
FromStringClassic(Z, accumulator);
} else {
FromStringLarge(Z, accumulator);
}
}
Status Processor::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
impl->FromString(Z, accumulator);
return impl->get_and_clear_status();
}
} // namespace bigint
} // namespace v8