blob: ff2f8aa88a9a9c1b9ee4d203a897cd02c9400af8 [file] [log] [blame]
 // Copyright 2022 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/debug/liveedit-diff.h" #include #include #include #include "src/base/logging.h" #include "src/base/optional.h" namespace v8 { namespace internal { namespace { // Implements Myer's Algorithm from // "An O(ND) Difference Algorithm and Its Variations", particularly the // linear space refinement mentioned in section 4b. // // The differ is input agnostic. // // The algorithm works by finding the shortest edit string (SES) in the edit // graph. The SES describes how to get from a string A of length N to a string // B of length M via deleting from A and inserting from B. // // Example: A = "abbaa", B = "abab" // // A // // a b b a a // o---o---o---o---o---o // a | \ | | | \ | \ | // o---o---o---o---o---o // b | | \ | \ | | | // B o---o---o---o---o---o // a | \ | | | \ | \ | // o---o---o---o---o---o // b | | \ | \ | | | // o---o---o---o---o---o // // The edit graph is constructed with the characters from string A on the x-axis // and the characters from string B on the y-axis. Starting from (0, 0) we can: // // - Move right, which is equivalent to deleting from A // - Move downwards, which is equivalent to inserting from B // - Move diagonally if the characters from string A and B match, which // means no insertion or deletion. // // Any path from (0, 0) to (N, M) describes a valid edit string, but we try to // find the path with the most diagonals, conversely that is the path with the // least insertions or deletions. // Note that a path with "D" insertions/deletions is called a D-path. class MyersDiffer { private: // A point in the edit graph. struct Point { int x, y; // Less-than for a point in the edit graph is defined as less than in both // components (i.e. at least one diagonal away). bool operator<(const Point& other) const { return x < other.x && y < other.y; } }; // Describes a rectangle in the edit graph. struct EditGraphArea { Point top_left, bottom_right; int width() const { return bottom_right.x - top_left.x; } int height() const { return bottom_right.y - top_left.y; } int size() const { return width() + height(); } int delta() const { return width() - height(); } }; // A path or path-segment through the edit graph. Not all points along // the path are necessarily listed since it is trivial to figure out all // the concrete points along a snake. struct Path { std::vector points; void Add(const Point& p) { points.push_back(p); } void Add(const Path& p) { points.insert(points.end(), p.points.begin(), p.points.end()); } }; // A snake is a path between two points that is either: // // - A single right or down move followed by a (possibly empty) list of // diagonals (in the normal case). // - A (possibly empty) list of diagonals followed by a single right or // or down move (in the reverse case). struct Snake { Point from, to; }; // A thin wrapper around std::vector that allows negative indexing. // // This class stores the x-value of the furthest reaching path // for each k-diagonal. k-diagonals are numbered from -M to N and defined // by y(x) = x - k. // // We only store the x-value instead of the full point since we can // calculate y via y = x - k. class FurthestReaching { public: explicit FurthestReaching(std::vector::size_type size) : v_(size) {} int& operator[](int index) { const size_t idx = index >= 0 ? index : v_.size() + index; return v_[idx]; } const int& operator[](int index) const { const size_t idx = index >= 0 ? index : v_.size() + index; return v_[idx]; } private: std::vector v_; }; class ResultWriter; Comparator::Input* input_; Comparator::Output* output_; // Stores the x-value of the furthest reaching path for each k-diagonal. // k-diagonals are numbered from '-height' to 'width', centered on (0,0) and // are defined by y(x) = x - k. FurthestReaching fr_forward_; // Stores the x-value of the furthest reaching reverse path for each // l-diagonal. l-diagonals are numbered from '-width' to 'height' and centered // on 'bottom_right' of the edit graph area. // k-diagonals and l-diagonals represent the same diagonals. While we refer to // the diagonals as k-diagonals when calculating SES from (0,0), we refer to // the diagonals as l-diagonals when calculating SES from (M,N). // The corresponding k-diagonal name of an l-diagonal is: k = l + delta // where delta = width -height. FurthestReaching fr_reverse_; MyersDiffer(Comparator::Input* input, Comparator::Output* output) : input_(input), output_(output), fr_forward_(input->GetLength1() + input->GetLength2() + 1), fr_reverse_(input->GetLength1() + input->GetLength2() + 1) { // Length1 + Length2 + 1 is the upper bound for our work arrays. // We allocate the work arrays once and re-use them for all invocations of // `FindMiddleSnake`. } base::Optional FindEditPath() { return FindEditPath(Point{0, 0}, Point{input_->GetLength1(), input_->GetLength2()}); } // Returns the path of the SES between `from` and `to`. base::Optional FindEditPath(Point from, Point to) { // Divide the area described by `from` and `to` by finding the // middle snake ... base::Optional snake = FindMiddleSnake(from, to); if (!snake) return base::nullopt; // ... and then conquer the two resulting sub-areas. base::Optional head = FindEditPath(from, snake->from); base::Optional tail = FindEditPath(snake->to, to); // Combine `head` and `tail` or use the snake start/end points for // zero-size areas. Path result; if (head) { result.Add(*head); } else { result.Add(snake->from); } if (tail) { result.Add(*tail); } else { result.Add(snake->to); } return result; } // Returns the snake in the middle of the area described by `from` and `to`. // // Incrementally calculates the D-paths (starting from 'from') and the // "reverse" D-paths (starting from 'to') until we find a "normal" and a // "reverse" path that overlap. That is we first calculate the normal // and reverse 0-path, then the normal and reverse 1-path and so on. // // If a step from a (d-1)-path to a d-path overlaps with a reverse path on // the same diagonal (or the other way around), then we consider that step // our middle snake and return it immediately. base::Optional FindMiddleSnake(Point from, Point to) { EditGraphArea area{from, to}; if (area.size() == 0) return base::nullopt; // Initialise the furthest reaching vectors with an "artificial" edge // from (0, -1) -> (0, 0) and (N, -M) -> (N, M) to serve as the initial // snake when d = 0. fr_forward_[1] = area.top_left.x; fr_reverse_[-1] = area.bottom_right.x; for (int d = 0; d <= std::ceil(area.size() / 2.0f); ++d) { if (auto snake = ShortestEditForward(area, d)) return snake; if (auto snake = ShortestEditReverse(area, d)) return snake; } return base::nullopt; } // Greedily calculates the furthest reaching `d`-paths for each k-diagonal // where k is in [-d, d]. For each k-diagonal we look at the furthest // reaching `d-1`-path on the `k-1` and `k+1` depending on which is further // along the x-axis we either add an insertion from the `k+1`-diagonal or // a deletion from the `k-1`-diagonal. Then we follow all possible diagonal // moves and finally record the result as the furthest reaching path on the // k-diagonal. base::Optional ShortestEditForward(const EditGraphArea& area, int d) { Point from, to; // We alternate between looking at odd and even k-diagonals. That is // because when we extend a `d-path` by a single move we can at most move // one diagonal over. That is either move from `k-1` to `k` or from `k+1` to // `k`. That is if `d` is even (odd) then we require only the odd (even) // k-diagonals calculated in step `d-1`. for (int k = -d; k <= d; k += 2) { if (k == -d || (k != d && fr_forward_[k - 1] < fr_forward_[k + 1])) { // Move downwards, i.e. add an insertion, because either we are at the // edge and downwards is the only way we can move, or because the // `d-1`-path along the `k+1` diagonal reaches further on the x-axis // than the `d-1`-path along the `k-1` diagonal. from.x = to.x = fr_forward_[k + 1]; } else { // Move right, i.e. add a deletion. from.x = fr_forward_[k - 1]; to.x = from.x + 1; } // Calculate y via y = x - k. We need to adjust k though since the k=0 // diagonal is centered on `area.top_left` and not (0, 0). to.y = area.top_left.y + (to.x - area.top_left.x) - k; from.y = (d == 0 || from.x != to.x) ? to.y : to.y - 1; // Extend the snake diagonally as long as we can. while (to < area.bottom_right && input_->Equals(to.x, to.y)) { ++to.x; ++to.y; } fr_forward_[k] = to.x; // Check whether there is a reverse path on this k-diagonal which we // are overlapping with. If yes, that is our snake. const bool odd = area.delta() % 2 != 0; const int l = k - area.delta(); if (odd && l >= (-d + 1) && l <= d - 1 && to.x >= fr_reverse_[l]) { return Snake{from, to}; } } return base::nullopt; } // Greedily calculates the furthest reaching reverse `d`-paths for each // l-diagonal where l is in [-d, d]. // Works the same as `ShortestEditForward` but we move upwards and left // instead. base::Optional ShortestEditReverse(const EditGraphArea& area, int d) { Point from, to; // We alternate between looking at odd and even l-diagonals. That is // because when we extend a `d-path` by a single move we can at most move // one diagonal over. That is either move from `l-1` to `l` or from `l+1` to // `l`. That is if `d` is even (odd) then we require only the odd (even) // l-diagonals calculated in step `d-1`. for (int l = d; l >= -d; l -= 2) { if (l == d || (l != -d && fr_reverse_[l - 1] > fr_reverse_[l + 1])) { // Move upwards, i.e. add an insertion, because either we are at the // edge and upwards is the only way we can move, or because the // `d-1`-path along the `l-1` diagonal reaches further on the x-axis // than the `d-1`-path along the `l+1` diagonal. from.x = to.x = fr_reverse_[l - 1]; } else { // Move left, i.e. add a deletion. from.x = fr_reverse_[l + 1]; to.x = from.x - 1; } // Calculate y via y = x - k. We need to adjust k though since the k=0 // diagonal is centered on `area.top_left` and not (0, 0). const int k = l + area.delta(); to.y = area.top_left.y + (to.x - area.top_left.x) - k; from.y = (d == 0 || from.x != to.x) ? to.y : to.y + 1; // Extend the snake diagonally as long as we can. while (area.top_left < to && input_->Equals(to.x - 1, to.y - 1)) { --to.x; --to.y; } fr_reverse_[l] = to.x; // Check whether there is a path on this k-diagonal which we // are overlapping with. If yes, that is our snake. const bool even = area.delta() % 2 == 0; if (even && k >= -d && k <= d && to.x <= fr_forward_[k]) { // Invert the points so the snake goes left to right, top to bottom. return Snake{to, from}; } } return base::nullopt; } // Small helper class that converts a "shortest edit script" path into a // source mapping. The result is a list of "chunks" where each "chunk" // describes a range in the input string and where it can now be found // in the output string. // // The list of chunks can be calculated in a simple pass over all the points // of the edit path: // // - For any diagonal we close and report the current chunk if there is // one open at the moment. // - For an insertion or deletion we open a new chunk if none is ongoing. class ResultWriter { public: explicit ResultWriter(Comparator::Output* output) : output_(output) {} void RecordNoModification(const Point& from) { if (!change_is_ongoing_) return; // We close the current chunk, going from `change_start_` to `from`. CHECK(change_start_); output_->AddChunk(change_start_->x, change_start_->y, from.x - change_start_->x, from.y - change_start_->y); change_is_ongoing_ = false; } void RecordInsertionOrDeletion(const Point& from) { if (change_is_ongoing_) return; // We start a new chunk beginning at `from`. change_start_ = from; change_is_ongoing_ = true; } private: Comparator::Output* output_; bool change_is_ongoing_ = false; base::Optional change_start_; }; // Takes an edit path and "fills in the blanks". That is we notify the // `ResultWriter` after each single downwards, left or diagonal move. void WriteResult(const Path& path) { ResultWriter writer(output_); for (size_t i = 1; i < path.points.size(); ++i) { Point p1 = path.points[i - 1]; Point p2 = path.points[i]; p1 = WalkDiagonal(writer, p1, p2); const int cmp = (p2.x - p1.x) - (p2.y - p1.y); if (cmp == -1) { writer.RecordInsertionOrDeletion(p1); p1.y++; } else if (cmp == 1) { writer.RecordInsertionOrDeletion(p1); p1.x++; } p1 = WalkDiagonal(writer, p1, p2); DCHECK(p1.x == p2.x && p1.y == p2.y); } // Write one diagonal in the end to flush out any open chunk. writer.RecordNoModification(path.points.back()); } Point WalkDiagonal(ResultWriter& writer, Point p1, Point p2) { while (p1.x < p2.x && p1.y < p2.y && input_->Equals(p1.x, p1.y)) { writer.RecordNoModification(p1); p1.x++; p1.y++; } return p1; } public: static void MyersDiff(Comparator::Input* input, Comparator::Output* output) { MyersDiffer differ(input, output); auto result = differ.FindEditPath(); if (!result) return; // Empty input doesn't produce a path. differ.WriteResult(*result); } }; } // namespace void Comparator::CalculateDifference(Comparator::Input* input, Comparator::Output* result_writer) { MyersDiffer::MyersDiff(input, result_writer); } } // namespace internal } // namespace v8