blob: ff2f8aa88a9a9c1b9ee4d203a897cd02c9400af8 [file] [log] [blame]
// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/debug/liveedit-diff.h"
#include <cmath>
#include <map>
#include <vector>
#include "src/base/logging.h"
#include "src/base/optional.h"
namespace v8 {
namespace internal {
namespace {
// Implements Myer's Algorithm from
// "An O(ND) Difference Algorithm and Its Variations", particularly the
// linear space refinement mentioned in section 4b.
//
// The differ is input agnostic.
//
// The algorithm works by finding the shortest edit string (SES) in the edit
// graph. The SES describes how to get from a string A of length N to a string
// B of length M via deleting from A and inserting from B.
//
// Example: A = "abbaa", B = "abab"
//
// A
//
// a b b a a
// o---o---o---o---o---o
// a | \ | | | \ | \ |
// o---o---o---o---o---o
// b | | \ | \ | | |
// B o---o---o---o---o---o
// a | \ | | | \ | \ |
// o---o---o---o---o---o
// b | | \ | \ | | |
// o---o---o---o---o---o
//
// The edit graph is constructed with the characters from string A on the x-axis
// and the characters from string B on the y-axis. Starting from (0, 0) we can:
//
// - Move right, which is equivalent to deleting from A
// - Move downwards, which is equivalent to inserting from B
// - Move diagonally if the characters from string A and B match, which
// means no insertion or deletion.
//
// Any path from (0, 0) to (N, M) describes a valid edit string, but we try to
// find the path with the most diagonals, conversely that is the path with the
// least insertions or deletions.
// Note that a path with "D" insertions/deletions is called a D-path.
class MyersDiffer {
private:
// A point in the edit graph.
struct Point {
int x, y;
// Less-than for a point in the edit graph is defined as less than in both
// components (i.e. at least one diagonal away).
bool operator<(const Point& other) const {
return x < other.x && y < other.y;
}
};
// Describes a rectangle in the edit graph.
struct EditGraphArea {
Point top_left, bottom_right;
int width() const { return bottom_right.x - top_left.x; }
int height() const { return bottom_right.y - top_left.y; }
int size() const { return width() + height(); }
int delta() const { return width() - height(); }
};
// A path or path-segment through the edit graph. Not all points along
// the path are necessarily listed since it is trivial to figure out all
// the concrete points along a snake.
struct Path {
std::vector<Point> points;
void Add(const Point& p) { points.push_back(p); }
void Add(const Path& p) {
points.insert(points.end(), p.points.begin(), p.points.end());
}
};
// A snake is a path between two points that is either:
//
// - A single right or down move followed by a (possibly empty) list of
// diagonals (in the normal case).
// - A (possibly empty) list of diagonals followed by a single right or
// or down move (in the reverse case).
struct Snake {
Point from, to;
};
// A thin wrapper around std::vector<int> that allows negative indexing.
//
// This class stores the x-value of the furthest reaching path
// for each k-diagonal. k-diagonals are numbered from -M to N and defined
// by y(x) = x - k.
//
// We only store the x-value instead of the full point since we can
// calculate y via y = x - k.
class FurthestReaching {
public:
explicit FurthestReaching(std::vector<int>::size_type size) : v_(size) {}
int& operator[](int index) {
const size_t idx = index >= 0 ? index : v_.size() + index;
return v_[idx];
}
const int& operator[](int index) const {
const size_t idx = index >= 0 ? index : v_.size() + index;
return v_[idx];
}
private:
std::vector<int> v_;
};
class ResultWriter;
Comparator::Input* input_;
Comparator::Output* output_;
// Stores the x-value of the furthest reaching path for each k-diagonal.
// k-diagonals are numbered from '-height' to 'width', centered on (0,0) and
// are defined by y(x) = x - k.
FurthestReaching fr_forward_;
// Stores the x-value of the furthest reaching reverse path for each
// l-diagonal. l-diagonals are numbered from '-width' to 'height' and centered
// on 'bottom_right' of the edit graph area.
// k-diagonals and l-diagonals represent the same diagonals. While we refer to
// the diagonals as k-diagonals when calculating SES from (0,0), we refer to
// the diagonals as l-diagonals when calculating SES from (M,N).
// The corresponding k-diagonal name of an l-diagonal is: k = l + delta
// where delta = width -height.
FurthestReaching fr_reverse_;
MyersDiffer(Comparator::Input* input, Comparator::Output* output)
: input_(input),
output_(output),
fr_forward_(input->GetLength1() + input->GetLength2() + 1),
fr_reverse_(input->GetLength1() + input->GetLength2() + 1) {
// Length1 + Length2 + 1 is the upper bound for our work arrays.
// We allocate the work arrays once and re-use them for all invocations of
// `FindMiddleSnake`.
}
base::Optional<Path> FindEditPath() {
return FindEditPath(Point{0, 0},
Point{input_->GetLength1(), input_->GetLength2()});
}
// Returns the path of the SES between `from` and `to`.
base::Optional<Path> FindEditPath(Point from, Point to) {
// Divide the area described by `from` and `to` by finding the
// middle snake ...
base::Optional<Snake> snake = FindMiddleSnake(from, to);
if (!snake) return base::nullopt;
// ... and then conquer the two resulting sub-areas.
base::Optional<Path> head = FindEditPath(from, snake->from);
base::Optional<Path> tail = FindEditPath(snake->to, to);
// Combine `head` and `tail` or use the snake start/end points for
// zero-size areas.
Path result;
if (head) {
result.Add(*head);
} else {
result.Add(snake->from);
}
if (tail) {
result.Add(*tail);
} else {
result.Add(snake->to);
}
return result;
}
// Returns the snake in the middle of the area described by `from` and `to`.
//
// Incrementally calculates the D-paths (starting from 'from') and the
// "reverse" D-paths (starting from 'to') until we find a "normal" and a
// "reverse" path that overlap. That is we first calculate the normal
// and reverse 0-path, then the normal and reverse 1-path and so on.
//
// If a step from a (d-1)-path to a d-path overlaps with a reverse path on
// the same diagonal (or the other way around), then we consider that step
// our middle snake and return it immediately.
base::Optional<Snake> FindMiddleSnake(Point from, Point to) {
EditGraphArea area{from, to};
if (area.size() == 0) return base::nullopt;
// Initialise the furthest reaching vectors with an "artificial" edge
// from (0, -1) -> (0, 0) and (N, -M) -> (N, M) to serve as the initial
// snake when d = 0.
fr_forward_[1] = area.top_left.x;
fr_reverse_[-1] = area.bottom_right.x;
for (int d = 0; d <= std::ceil(area.size() / 2.0f); ++d) {
if (auto snake = ShortestEditForward(area, d)) return snake;
if (auto snake = ShortestEditReverse(area, d)) return snake;
}
return base::nullopt;
}
// Greedily calculates the furthest reaching `d`-paths for each k-diagonal
// where k is in [-d, d]. For each k-diagonal we look at the furthest
// reaching `d-1`-path on the `k-1` and `k+1` depending on which is further
// along the x-axis we either add an insertion from the `k+1`-diagonal or
// a deletion from the `k-1`-diagonal. Then we follow all possible diagonal
// moves and finally record the result as the furthest reaching path on the
// k-diagonal.
base::Optional<Snake> ShortestEditForward(const EditGraphArea& area, int d) {
Point from, to;
// We alternate between looking at odd and even k-diagonals. That is
// because when we extend a `d-path` by a single move we can at most move
// one diagonal over. That is either move from `k-1` to `k` or from `k+1` to
// `k`. That is if `d` is even (odd) then we require only the odd (even)
// k-diagonals calculated in step `d-1`.
for (int k = -d; k <= d; k += 2) {
if (k == -d || (k != d && fr_forward_[k - 1] < fr_forward_[k + 1])) {
// Move downwards, i.e. add an insertion, because either we are at the
// edge and downwards is the only way we can move, or because the
// `d-1`-path along the `k+1` diagonal reaches further on the x-axis
// than the `d-1`-path along the `k-1` diagonal.
from.x = to.x = fr_forward_[k + 1];
} else {
// Move right, i.e. add a deletion.
from.x = fr_forward_[k - 1];
to.x = from.x + 1;
}
// Calculate y via y = x - k. We need to adjust k though since the k=0
// diagonal is centered on `area.top_left` and not (0, 0).
to.y = area.top_left.y + (to.x - area.top_left.x) - k;
from.y = (d == 0 || from.x != to.x) ? to.y : to.y - 1;
// Extend the snake diagonally as long as we can.
while (to < area.bottom_right && input_->Equals(to.x, to.y)) {
++to.x;
++to.y;
}
fr_forward_[k] = to.x;
// Check whether there is a reverse path on this k-diagonal which we
// are overlapping with. If yes, that is our snake.
const bool odd = area.delta() % 2 != 0;
const int l = k - area.delta();
if (odd && l >= (-d + 1) && l <= d - 1 && to.x >= fr_reverse_[l]) {
return Snake{from, to};
}
}
return base::nullopt;
}
// Greedily calculates the furthest reaching reverse `d`-paths for each
// l-diagonal where l is in [-d, d].
// Works the same as `ShortestEditForward` but we move upwards and left
// instead.
base::Optional<Snake> ShortestEditReverse(const EditGraphArea& area, int d) {
Point from, to;
// We alternate between looking at odd and even l-diagonals. That is
// because when we extend a `d-path` by a single move we can at most move
// one diagonal over. That is either move from `l-1` to `l` or from `l+1` to
// `l`. That is if `d` is even (odd) then we require only the odd (even)
// l-diagonals calculated in step `d-1`.
for (int l = d; l >= -d; l -= 2) {
if (l == d || (l != -d && fr_reverse_[l - 1] > fr_reverse_[l + 1])) {
// Move upwards, i.e. add an insertion, because either we are at the
// edge and upwards is the only way we can move, or because the
// `d-1`-path along the `l-1` diagonal reaches further on the x-axis
// than the `d-1`-path along the `l+1` diagonal.
from.x = to.x = fr_reverse_[l - 1];
} else {
// Move left, i.e. add a deletion.
from.x = fr_reverse_[l + 1];
to.x = from.x - 1;
}
// Calculate y via y = x - k. We need to adjust k though since the k=0
// diagonal is centered on `area.top_left` and not (0, 0).
const int k = l + area.delta();
to.y = area.top_left.y + (to.x - area.top_left.x) - k;
from.y = (d == 0 || from.x != to.x) ? to.y : to.y + 1;
// Extend the snake diagonally as long as we can.
while (area.top_left < to && input_->Equals(to.x - 1, to.y - 1)) {
--to.x;
--to.y;
}
fr_reverse_[l] = to.x;
// Check whether there is a path on this k-diagonal which we
// are overlapping with. If yes, that is our snake.
const bool even = area.delta() % 2 == 0;
if (even && k >= -d && k <= d && to.x <= fr_forward_[k]) {
// Invert the points so the snake goes left to right, top to bottom.
return Snake{to, from};
}
}
return base::nullopt;
}
// Small helper class that converts a "shortest edit script" path into a
// source mapping. The result is a list of "chunks" where each "chunk"
// describes a range in the input string and where it can now be found
// in the output string.
//
// The list of chunks can be calculated in a simple pass over all the points
// of the edit path:
//
// - For any diagonal we close and report the current chunk if there is
// one open at the moment.
// - For an insertion or deletion we open a new chunk if none is ongoing.
class ResultWriter {
public:
explicit ResultWriter(Comparator::Output* output) : output_(output) {}
void RecordNoModification(const Point& from) {
if (!change_is_ongoing_) return;
// We close the current chunk, going from `change_start_` to `from`.
CHECK(change_start_);
output_->AddChunk(change_start_->x, change_start_->y,
from.x - change_start_->x, from.y - change_start_->y);
change_is_ongoing_ = false;
}
void RecordInsertionOrDeletion(const Point& from) {
if (change_is_ongoing_) return;
// We start a new chunk beginning at `from`.
change_start_ = from;
change_is_ongoing_ = true;
}
private:
Comparator::Output* output_;
bool change_is_ongoing_ = false;
base::Optional<Point> change_start_;
};
// Takes an edit path and "fills in the blanks". That is we notify the
// `ResultWriter` after each single downwards, left or diagonal move.
void WriteResult(const Path& path) {
ResultWriter writer(output_);
for (size_t i = 1; i < path.points.size(); ++i) {
Point p1 = path.points[i - 1];
Point p2 = path.points[i];
p1 = WalkDiagonal(writer, p1, p2);
const int cmp = (p2.x - p1.x) - (p2.y - p1.y);
if (cmp == -1) {
writer.RecordInsertionOrDeletion(p1);
p1.y++;
} else if (cmp == 1) {
writer.RecordInsertionOrDeletion(p1);
p1.x++;
}
p1 = WalkDiagonal(writer, p1, p2);
DCHECK(p1.x == p2.x && p1.y == p2.y);
}
// Write one diagonal in the end to flush out any open chunk.
writer.RecordNoModification(path.points.back());
}
Point WalkDiagonal(ResultWriter& writer, Point p1, Point p2) {
while (p1.x < p2.x && p1.y < p2.y && input_->Equals(p1.x, p1.y)) {
writer.RecordNoModification(p1);
p1.x++;
p1.y++;
}
return p1;
}
public:
static void MyersDiff(Comparator::Input* input, Comparator::Output* output) {
MyersDiffer differ(input, output);
auto result = differ.FindEditPath();
if (!result) return; // Empty input doesn't produce a path.
differ.WriteResult(*result);
}
};
} // namespace
void Comparator::CalculateDifference(Comparator::Input* input,
Comparator::Output* result_writer) {
MyersDiffer::MyersDiff(input, result_writer);
}
} // namespace internal
} // namespace v8