blob: 97e731d59b36fa48156937d3ce93dab3bbfd0516 [file] [log] [blame]
/*
Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.
*/
#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_SEGMENTED_STRING_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_SEGMENTED_STRING_H_
#include "base/check_op.h"
#include "base/compiler_specific.h"
#include "base/memory/raw_ptr.h"
#include "base/memory/raw_ptr_exclusion.h"
#include "third_party/blink/renderer/platform/platform_export.h"
#include "third_party/blink/renderer/platform/wtf/allocator/allocator.h"
#include "third_party/blink/renderer/platform/wtf/deque.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/string_view.h"
#include "third_party/blink/renderer/platform/wtf/text/text_position.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
namespace blink {
class SegmentedString;
// A read cursor over a single String.
//
// The String is retained in |string_|; the current position is tracked with a
// raw character pointer (|data_|) so that reading and stepping are a pointer
// dereference / increment.
//
// Pointer bookkeeping:
//  * |data_| is a union of an 8-bit and a 16-bit character pointer;
//    |is_8bit_| says which member is the meaningful one.
//  * |data_start_| and |data_last_char_| are always stored as LChar (byte)
//    pointers, even for 16-bit strings. Distances computed from them are
//    therefore in bytes and are converted to character counts with
//    `>> !is_8bit_` (shift by 0 for 8-bit, by 1 for 16-bit).
//  * An empty substring is represented by all three pointers being nullptr
//    with |is_8bit_| set to true.
class PLATFORM_EXPORT SegmentedSubstring {
  DISALLOW_NEW();

 public:
  // Creates an empty substring (see Clear()).
  SegmentedSubstring() { Clear(); }

  // Creates a cursor positioned on the first character of |str|. A
  // zero-length |str| yields the same pointer state as Clear().
  explicit SegmentedSubstring(const String& str) : string_(str) {
    unsigned len = str.length();
    if (len) {
      if (string_.Is8Bit()) {
        is_8bit_ = true;
        data_.string8_ptr = UNSAFE_TODO(string_.Characters8());
        // SAFETY: len is length of string and checked to be non-zero.
        data_last_char_ = UNSAFE_BUFFERS(data_.string8_ptr + len - 1);
      } else {
        is_8bit_ = false;
        data_.string16_ptr = UNSAFE_TODO(string_.Characters16());
        // SAFETY: len is length of string and checked to be non-zero.
        data_last_char_ = UNSAFE_BUFFERS(
            reinterpret_cast<const LChar*>(data_.string16_ptr + len - 1));
      }
    } else {
      is_8bit_ = true;
      data_.string8_ptr = nullptr;
      data_last_char_ = nullptr;
    }
    // Both union members alias the same address, so reading it through the
    // 8-bit member gives the start of the buffer as a byte pointer regardless
    // of the string's width.
    data_start_ = data_.string8_ptr;
  }

  // Resets the cursor pointers to the empty state. |string_| and the
  // line-number flag are not touched.
  void Clear() {
    is_8bit_ = true;
    data_.string8_ptr = nullptr;
    data_start_ = data_last_char_ = nullptr;
  }

  // The flag is stored in its negated form; these two accessors are exact
  // complements of each other.
  bool ExcludeLineNumbers() const { return !do_not_exclude_line_numbers_; }
  bool DoNotExcludeLineNumbers() const { return do_not_exclude_line_numbers_; }
  void SetExcludeLineNumbers() { do_not_exclude_line_numbers_ = false; }

  // Number of characters between the start of the string and the cursor.
  int NumberOfCharactersConsumed() const { return offset(); }

  // Appends the not-yet-consumed characters to |builder|.
  void AppendTo(StringBuilder& builder) const {
    int off = offset();
    int len = length();
    if (!off) {
      // Nothing consumed: append the String itself rather than building a
      // substring of it. Nothing is appended when the substring is empty.
      if (len)
        builder.Append(string_);
    } else {
      builder.Append(string_.Substring(off, len));
    }
  }

  // Moves the cursor back by one character, but only if the character just
  // before the cursor equals |c|. Returns true if the cursor was moved.
  bool PushIfPossible(UChar c) {
    // This checks if either 8 or 16 bit strings are in the first character
    // or they are both nullptr (where we can't rewind).
    if (data_.string8_ptr == data_start_)
      return false;
    // SAFETY: if the string pointer is not at the start, then it points
    // into the string data and is safe to index one before it.
    if (is_8bit_) {
      if (*(UNSAFE_BUFFERS(data_.string8_ptr - 1)) != c) {
        return false;
      }
      UNSAFE_BUFFERS(--data_.string8_ptr);
    } else {
      if (*(UNSAFE_BUFFERS(data_.string16_ptr - 1)) != c) {
        return false;
      }
      UNSAFE_BUFFERS(--data_.string16_ptr);
    }
    return true;
  }

  // Returns the character under the cursor. The pointer is dereferenced
  // unconditionally, so this must not be called on an empty substring (where
  // the pointer is nullptr).
  ALWAYS_INLINE UChar GetCurrentChar() const {
    if (is_8bit_)
      return *data_.string8_ptr;
    return *data_.string16_ptr;
  }

  // True if the cursor is strictly before the last character, i.e. the
  // no-argument Advance() would still land on a character of this string.
  // Always false for an empty substring (both pointers are nullptr).
  ALWAYS_INLINE bool CanAdvance() const {
    return data_.string8_ptr < data_last_char_;
  }

  // Advances up to `delta` characters, returning how many characters were
  // advanced. This will not advance past the last character.
  unsigned Advance(unsigned delta) {
    DCHECK_NE(0, length());
    // Clamp so that the cursor ends on the last character at most.
    delta = std::min(static_cast<unsigned>(length()) - 1, delta);
    // Unsafe since a stronger check for non-zero length is required.
    if (is_8bit_)
      UNSAFE_TODO(data_.string8_ptr += delta);
    else
      UNSAFE_TODO(data_.string16_ptr += delta);
    return delta;
  }

  // Steps the cursor forward by one character and returns the character it
  // lands on. No bounds check is performed here; SegmentedString::Advance()
  // guards the call with CanAdvance().
  ALWAYS_INLINE UChar Advance() {
    return UNSAFE_TODO(is_8bit_ ? *++data_.string8_ptr : *++data_.string16_ptr);
  }

  // Returns a view of |len| characters of the underlying string starting at
  // the cursor.
  StringView CurrentSubString(unsigned len) const {
    return StringView(string_, offset(), len);
  }

  // Cursor position in characters, measured from the start of the string.
  ALWAYS_INLINE int offset() const {
    DCHECK_LE(data_start_, data_.string8_ptr);
    // Byte distance, halved for 16-bit strings.
    return static_cast<int>(data_.string8_ptr - data_start_) >> !is_8bit_;
  }

  // Number of characters from the cursor (inclusive) to the end of the
  // string; 0 for an empty substring.
  ALWAYS_INLINE int length() const {
    DCHECK_LE(data_.string8_ptr, data_last_char_);
    // Byte distance, halved for 16-bit strings.
    return static_cast<int>(data_end() - data_.string8_ptr) >> !is_8bit_;
  }

 private:
  // One-past-the-end of the character data as a byte pointer, or nullptr for
  // an empty substring. |data_last_char_| addresses the first byte of the last
  // character, so the end is 1 byte further for 8-bit strings and 2 bytes
  // further for 16-bit strings.
  ALWAYS_INLINE const LChar* data_end() const {
    if (!data_last_char_)
      return nullptr;
    return UNSAFE_TODO(data_last_char_ + 1 + !is_8bit_);
  }

  // Current cursor position; which member is meaningful depends on
  // |is_8bit_|.
  union {
    // RAW_PTR_EXCLUSION: #union
    RAW_PTR_EXCLUSION const LChar* string8_ptr;
    RAW_PTR_EXCLUSION const UChar* string16_ptr;
  } data_;
  // Start of the character data, as a byte pointer (nullptr when empty).
  raw_ptr<const LChar, AllowPtrArithmetic | DanglingUntriaged> data_start_;
  // |data_last_char_| points to the last character (or nullptr). This is to
  // avoid extra computation in |CanAdvance|, which is in the critical path of
  // HTMLTokenizer.
  // RAW_PTR_EXCLUSION: End of the buffer already protected by raw_ptr.
  RAW_PTR_EXCLUSION const LChar* data_last_char_;
  bool do_not_exclude_line_numbers_ = true;
  bool is_8bit_ = true;
  // The string the pointers above point into.
  String string_;
};
// A character stream made of one "current" SegmentedSubstring plus a queue of
// further substrings (|substrings_|). The character under the cursor is cached
// in |current_char_| so that CurrentChar() is a plain member read.
//
// Several members are only declared here and defined out of line; their
// comments below are limited to what this header shows.
class PLATFORM_EXPORT SegmentedString {
  DISALLOW_NEW();

 public:
  // Creates an empty, open stream.
  SegmentedString()
      : number_of_characters_consumed_prior_to_current_string_(0),
        number_of_characters_consumed_prior_to_current_line_(0),
        current_line_(0),
        closed_(false),
        empty_(true),
        current_char_('\0') {}

  // Creates a stream whose current substring is |str|. |current_char_| is the
  // first character of |str|, or '\0' when |str| has zero length.
  // Note: |empty_| is declared before |current_char_|, so it is already
  // initialized when the |current_char_| initializer reads it.
  SegmentedString(const String& str)
      : current_string_(str),
        number_of_characters_consumed_prior_to_current_string_(0),
        number_of_characters_consumed_prior_to_current_line_(0),
        current_line_(0),
        closed_(false),
        empty_(!str.length()),
        current_char_(empty_ ? '\0' : current_string_.GetCurrentChar()) {}

  void Clear();
  void Close();
  void Append(const SegmentedString&);

  enum class PrependType {
    kNewInput = 0,
    kUnconsume = 1,
  };
  void Prepend(const SegmentedString&, PrependType);

  // Accessors for a non-owning link to another SegmentedString. This header
  // only stores and returns the pointer.
  const SegmentedString* NextSegmentedString() const {
    return next_segmented_string_;
  }
  void SetNextSegmentedString(const SegmentedString* next) {
    next_segmented_string_ = next;
  }

  // Reports the line-number exclusion flag of the current substring.
  bool ExcludeLineNumbers() const {
    return current_string_.ExcludeLineNumbers();
  }
  void SetExcludeLineNumbers();

  void Push(UChar);

  bool IsEmpty() const { return empty_; }
  unsigned length() const;
  bool IsClosed() const { return closed_; }

  enum LookAheadResult {
    kDidNotMatch,
    kDidMatch,
    kNotEnoughCharacters,
  };

  // Tests whether the upcoming characters equal |string| (case-sensitively, or
  // ignoring ASCII case). The cursor position is the same before and after the
  // call; see LookAheadSlowCase() for how that is achieved when the match
  // spans several substrings.
  LookAheadResult LookAhead(const String& string) {
    return LookAheadInline(string, kTextCaseSensitive);
  }
  LookAheadResult LookAheadIgnoringCase(const String& string) {
    return LookAheadInline(string, kTextCaseASCIIInsensitive);
  }

  // Used to advance by multiple characters. Specifically this advances by
  // `num_chars` and `num_lines`. This function advances without analyzing the
  // input string in any way. As a result, the caller must know `num_lines` and
  // `current_column`.
  void Advance(unsigned num_chars, unsigned num_lines, int current_column);

  // Advances by one character and returns the new current character. The
  // common case stays inside the current substring; otherwise the work is
  // delegated to the out-of-line AdvanceSubstring().
  ALWAYS_INLINE UChar Advance() {
    if (current_string_.CanAdvance()) [[likely]] {
      current_char_ = current_string_.Advance();
      return current_char_;
    }
    return AdvanceSubstring();
  }

  // Records that a new line starts after the current character. Does nothing
  // when the current substring excludes line numbers.
  ALWAYS_INLINE void UpdateLineNumber() {
    if (current_string_.DoNotExcludeLineNumbers()) [[likely]] {
      ++current_line_;
      // Plus 1 because NumberOfCharactersConsumed() has not yet moved past the
      // current character; the callers in this class call Advance() right
      // after this function.
      number_of_characters_consumed_prior_to_current_line_ =
          NumberOfCharactersConsumed() + 1;
    }
  }

  // Advances by one character, updating the line number first if the current
  // character is a newline.
  ALWAYS_INLINE UChar AdvanceAndUpdateLineNumber() {
    DCHECK_GE(current_string_.length(), 1);
    if (current_char_ == '\n')
      UpdateLineNumber();
    return Advance();
  }

  // Advance() variants that additionally DCHECK what the current character is.
  ALWAYS_INLINE UChar AdvanceAndASSERT(UChar expected_character) {
    DCHECK_EQ(expected_character, CurrentChar());
    return Advance();
  }
  ALWAYS_INLINE UChar AdvanceAndASSERTIgnoringCase(UChar expected_character) {
    DCHECK_EQ(unicode::FoldCase(CurrentChar()),
              unicode::FoldCase(expected_character));
    return Advance();
  }
  ALWAYS_INLINE UChar AdvancePastNonNewline() {
    DCHECK_NE(CurrentChar(), '\n');
    return Advance();
  }
  ALWAYS_INLINE UChar AdvancePastNewlineAndUpdateLineNumber() {
    DCHECK_EQ(CurrentChar(), '\n');
    DCHECK_GE(current_string_.length(), 1);
    UpdateLineNumber();
    return Advance();
  }

  // Total number of characters consumed: those accounted for before the
  // current substring plus those consumed within it.
  ALWAYS_INLINE int NumberOfCharactersConsumed() const {
    return number_of_characters_consumed_prior_to_current_string_ +
           current_string_.NumberOfCharactersConsumed();
  }

  String ToString() const;

  // Returns the cached character under the cursor ('\0' for a stream
  // constructed empty).
  ALWAYS_INLINE UChar CurrentChar() const { return current_char_; }

  // This method is moderately slow compared to CurrentLine().
  OrdinalNumber CurrentColumn() const;
  OrdinalNumber CurrentLine() const;

  // Sets the value of the line/column variables. The column is specified
  // indirectly by `column_aftre_prolog`, which is the column value that we
  // should get after a prolog (the first `prolog_length` characters) has been
  // consumed.
  void SetCurrentPosition(OrdinalNumber line,
                          OrdinalNumber column_aftre_prolog,
                          int prolog_length);

 private:
  void Append(const SegmentedSubstring&);
  void Prepend(const SegmentedSubstring&, PrependType);
  UChar AdvanceSubstring();

  // Consume characters into `characters`, which should not be bigger than
  // `length()`.
  void AdvanceAndCollect(base::span<UChar> characters);

  // Fast path of LookAhead*(): when |string| is no longer than what remains of
  // the current substring, compare against a view of that substring directly.
  // The view is requested with exactly string.length() characters. Otherwise
  // falls back to LookAheadSlowCase().
  inline LookAheadResult LookAheadInline(const String& string,
                                         TextCaseSensitivity case_sensitivity) {
    if (string.length() <= static_cast<unsigned>(current_string_.length())) {
      StringView current_substring =
          current_string_.CurrentSubString(string.length());
      if (string.StartsWith(current_substring, case_sensitivity))
        return kDidMatch;
      return kDidNotMatch;
    }
    return LookAheadSlowCase(string, case_sensitivity);
  }

  // Slow path of LookAhead*(): reports kNotEnoughCharacters if fewer than
  // string.length() characters are available in total. Otherwise consumes that
  // many characters into a temporary string, compares, and then prepends the
  // consumed characters back with PrependType::kUnconsume.
  LookAheadResult LookAheadSlowCase(const String& string,
                                    TextCaseSensitivity case_sensitivity) {
    unsigned count = string.length();
    if (count > length())
      return kNotEnoughCharacters;
    base::span<UChar> consumed_characters;
    String consumed_string =
        String::CreateUninitialized(count, consumed_characters);
    AdvanceAndCollect(consumed_characters);
    LookAheadResult result = kDidNotMatch;
    if (consumed_string.StartsWith(string, case_sensitivity))
      result = kDidMatch;
    Prepend(SegmentedString(consumed_string), PrependType::kUnconsume);
    return result;
  }

  // True if there are queued substrings besides the current one.
  bool IsComposite() const { return !substrings_.empty(); }

  SegmentedSubstring current_string_;
  int number_of_characters_consumed_prior_to_current_string_;
  int number_of_characters_consumed_prior_to_current_line_;
  int current_line_;
  Deque<SegmentedSubstring> substrings_;
  bool closed_;
  bool empty_;
  // Cached copy of the character under the cursor; see CurrentChar().
  UChar current_char_;
  raw_ptr<const SegmentedString> next_segmented_string_ = nullptr;
};
} // namespace blink
#endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_TEXT_SEGMENTED_STRING_H_