blob: 6325554009aa5b1b4cf0657cca1de7441e899e4f [file] [log] [blame]
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: dsites@google.com (Dick Sites)
//
#ifndef UTIL_UTF8_OFFSETMAP_H_
#define UTIL_UTF8_OFFSETMAP_H_
#include <string> // for string
#include "integral_types.h" // for uint32
// ***************************** OffsetMap **************************
//
// An OffsetMap object is a container for a mapping from offsets in one text
// buffer A' to offsets in another text buffer A. It is most useful when A' is
// built from A via substitutions that occasionally do not preserve byte length.
//
// A series of operators are used to build the correspondence map, then
// calls can be made to map an offset in A' to an offset in A, or vice versa.
// The map starts with offset 0 in A corresponding to offset 0 in A'.
// The mapping is then built sequentially, adding on byte ranges that are
// identical in A and A', byte ranges that are inserted in A', and byte ranges
// that are deleted from A. All bytes beyond those specified when building the
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
// end of the map.
//
// The internal data structure records positions at which bytes are added or
// deleted. Using the map is O(1) when increasing the A' or A offset
// monotonically, and O(n) when accessing random offsets, where n is the
// number of differences.
//
namespace CLD2 {
class OffsetMap {
public:
// Constructor, destructor
OffsetMap();
~OffsetMap();
// Clear the map
void Clear();
// Add to mapping from A to A', specifying how many next bytes correspond
// in A and A'
void Copy(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// inserted in A' while not advancing in A at all
void Insert(int bytes);
// Add to mapping from A to A', specifying how many next bytes are
// deleted from A while not advancing in A' at all
void Delete(int bytes);
// Print map to file, for debugging
void Printmap(const char* filename);
// [Finish building map,] Re-position to offset 0
// This call is optional; MapForward and MapBack finish building the map
// if necessary
void Reset();
// Map an offset in A' to the corresponding offset in A
int MapBack(int aprimeoffset);
// Map an offset in A to the corresponding offset in A'
int MapForward(int aoffset);
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
// from A' to A'' and h is from A to A''.
//
// Note that g->MoveForward(f->MoveForward(aoffset)) always equals
// to h->MoveForward(aoffset), while
// f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
// to h->MoveBack(aprimeprimeoffset). This happens when deletion in
// f and insertion in g are at the same place. For example,
//
// A 1 2 3 4
// ^ | ^ ^
// | | / | f
// v vv v
// A' 1' 2' 3'
// ^ ^^ ^
// | | \ | g
// v | v v
// A'' 1'' 2'' 3'' 4''
//
// results in:
//
// A 1 2 3 4
// ^ ^\ ^ ^
// | | \ | | h
// v | vv v
// A'' 1'' 2'' 3'' 4''
//
// 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
// the latter figure.
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
// For debugging only; writes to stderr
void DumpWindow();
// For testing only -- force a mapping
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
private:
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
void Flush();
void FlushAll();
void MaybeFlushAll();
void Emit(MapOp op, int len);
void SetLeft();
void SetRight();
// Back up over previous range, 1..5 bytes
// Return subscript at the beginning of that. Pins at 0
int Backup(int sub);
// Parse next range, 1..5 bytes
// Return subscript just off the end of that
int ParseNext(int sub, MapOp* op, int* length);
// Parse previous range, 1..5 bytes
// Return current subscript
int ParsePrevious(int sub, MapOp* op, int* length);
void PrintPosition(const char* str);
bool MoveRight(); // Returns true if OK
bool MoveLeft(); // Returns true if OK
void DumpString();
// Copies insert operations from source to dest. Returns true if no
// other operations are found.
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
// Copies delete operations from source to dest. Returns true if no other
// operations are found.
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
std::string diffs_;
MapOp pending_op_;
uint32 pending_length_;
// Offsets in the ranges below correspond to each other, with A' = A + diff
int next_diff_sub_;
int current_lo_aoffset_;
int current_hi_aoffset_;
int current_lo_aprimeoffset_;
int current_hi_aprimeoffset_;
int current_diff_;
int max_aoffset_;
int max_aprimeoffset_;
};
} // namespace CLD2
#endif // UTIL_UTF8_OFFSETMAP_H_