blob: 201bdd9b799433b7a90539ebca6c74394d5d8b33 [file] [log] [blame]
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/builtins/builtins-utils-inl.h"
#include "src/builtins/builtins.h"
#include "src/logging/counters.h"
#include "src/objects/objects-inl.h"
#include "src/regexp/regexp-utils.h"
#include "src/regexp/regexp.h"
#include "src/strings/string-builder-inl.h"
namespace v8 {
namespace internal {
// -----------------------------------------------------------------------------
// ES6 section 21.2 RegExp Objects
// RegExp.prototype.toString: produces "/" + ToString(recv.source) + "/" +
// ToString(recv.flags). The receiver is checked against JSReceiver (via
// CHECK_RECEIVER) rather than JSRegExp, and both properties are read through
// ordinary property lookup, so either lookup or conversion may fail; in that
// case the builtin returns early via ASSIGN_RETURN_FAILURE_ON_EXCEPTION.
BUILTIN(RegExpPrototypeToString) {
  HandleScope scope(isolate);
  CHECK_RECEIVER(JSReceiver, recv, "RegExp.prototype.toString");

  // Bump the use counter when the receiver is the RegExp function's
  // prototype object itself.
  const bool receiver_is_regexp_prototype =
      *recv == isolate->regexp_function()->prototype();
  if (receiver_is_regexp_prototype) {
    isolate->CountUsage(v8::Isolate::kRegExpPrototypeToString);
  }

  IncrementalStringBuilder result(isolate);

  // Opening slash, then the stringified "source" property.
  result.AppendCharacter('/');
  Handle<Object> source_value;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, source_value,
      JSReceiver::GetProperty(isolate, recv,
                              isolate->factory()->source_string()));
  DirectHandle<String> source_text;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, source_text,
                                     Object::ToString(isolate, source_value));
  result.AppendString(source_text);

  // Closing slash, then the stringified "flags" property.
  result.AppendCharacter('/');
  Handle<Object> flags_value;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
      isolate, flags_value,
      JSReceiver::GetProperty(isolate, recv,
                              isolate->factory()->flags_string()));
  DirectHandle<String> flags_text;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, flags_text,
                                     Object::ToString(isolate, flags_value));
  result.AppendString(flags_text);

  RETURN_RESULT_OR_FAILURE(isolate, result.Finish());
}
// The properties $1..$9 are the first nine capturing substrings of the last
// successful match, or ''. One getter builtin is defined below for each
// capture index from 1 to 9.
// Capture indices for which a RegExpCapture<N>Getter builtin is generated.
#define CAPTURE_GETTER_INDEX_LIST(V) V(1) V(2) V(3) V(4) V(5) V(6) V(7) V(8) V(9)

// Defines the builtin RegExpCapture<index>Getter, which forwards to
// RegExpUtils::GenericCaptureGetter with the isolate's last match info and
// the given capture index.
#define DEFINE_CAPTURE_GETTER(index)                        \
  BUILTIN(RegExpCapture##index##Getter) {                   \
    HandleScope scope(isolate);                             \
    return *RegExpUtils::GenericCaptureGetter(              \
        isolate, isolate->regexp_last_match_info(), index); \
  }
CAPTURE_GETTER_INDEX_LIST(DEFINE_CAPTURE_GETTER)
#undef DEFINE_CAPTURE_GETTER
#undef CAPTURE_GETTER_INDEX_LIST
// The properties `input` and `$_` are aliases for each other. When this
// value is set, the value it is set to is coerced to a string.
// Getter and setter for the input.
// Getter for the input: returns the last_input stored in the isolate's last
// match info, or the empty string when that slot is undefined.
BUILTIN(RegExpInputGetter) {
  HandleScope scope(isolate);
  DirectHandle<Object> last_input(
      isolate->regexp_last_match_info()->last_input(), isolate);
  if (IsUndefined(*last_input, isolate)) {
    return ReadOnlyRoots(isolate).empty_string();
  }
  return Cast<String>(*last_input);
}
// Setter for the input: converts the first argument (or undefined when it is
// absent) to a string and stores it as last_input in the isolate's last match
// info. Returns undefined; if the string conversion fails the builtin returns
// early and nothing is stored.
BUILTIN(RegExpInputSetter) {
  HandleScope scope(isolate);
  Handle<Object> new_input = args.atOrUndefined(isolate, 1);

  DirectHandle<String> new_input_string;
  ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, new_input_string,
                                     Object::ToString(isolate, new_input));

  // Fetched only after the conversion above has completed.
  DirectHandle<RegExpMatchInfo> match_info = isolate->regexp_last_match_info();
  match_info->set_last_input(*new_input_string);
  return ReadOnlyRoots(isolate).undefined_value();
}
// Getters for the static properties lastMatch, lastParen, leftContext, and
// rightContext of the RegExp constructor. The properties are computed based
// on the captures array of the last successful match and the subject string
// of the last successful match.
// Getter for lastMatch: delegates to the generic capture getter with capture
// index 0 of the isolate's last match info.
BUILTIN(RegExpLastMatchGetter) {
  HandleScope scope(isolate);
  constexpr int kLastMatchCaptureIndex = 0;
  return *RegExpUtils::GenericCaptureGetter(
      isolate, isolate->regexp_last_match_info(), kLastMatchCaptureIndex);
}
// Getter for lastParen: returns the substring for the highest-numbered capture
// of the last match, or the empty string when the match info holds no more
// than two capture registers.
BUILTIN(RegExpLastParenGetter) {
  HandleScope scope(isolate);
  DirectHandle<RegExpMatchInfo> last_match = isolate->regexp_last_match_info();

  const int register_count = last_match->number_of_capture_registers();
  const bool has_captures = register_count > 2;
  if (!has_captures) {
    return ReadOnlyRoots(isolate).empty_string();  // No captures.
  }

  // Registers come in pairs, so the count is expected to be even.
  DCHECK_EQ(0, register_count % 2);
  const int final_capture_index = (register_count / 2) - 1;

  // We match the SpiderMonkey behavior: return the substring defined by the
  // last pair (after the first pair) of elements of the capture array even if
  // it is empty.
  return *RegExpUtils::GenericCaptureGetter(isolate, last_match,
                                            final_capture_index);
}
// Getter for leftContext: the portion of the last subject string from index 0
// up to the value stored in capture register 0.
BUILTIN(RegExpLeftContextGetter) {
  HandleScope scope(isolate);
  DirectHandle<RegExpMatchInfo> last_match = isolate->regexp_last_match_info();
  Handle<String> subject(last_match->last_subject(), isolate);
  const int match_start = last_match->capture(0);
  return *isolate->factory()->NewSubString(subject, 0, match_start);
}
// Getter for rightContext: the portion of the last subject string from the
// value stored in capture register 1 up to the end of the subject.
BUILTIN(RegExpRightContextGetter) {
  HandleScope scope(isolate);
  DirectHandle<RegExpMatchInfo> last_match = isolate->regexp_last_match_info();
  Handle<String> subject(last_match->last_subject(), isolate);
  const int match_end = last_match->capture(1);
  const int subject_length = subject->length();
  return *isolate->factory()->NewSubString(subject, match_end, subject_length);
}
namespace {
// Marker results of GetAsciiEscape. Any other result is the character to
// emit after a backslash.
//   kNoEscape:    the character is emitted unchanged.
//   kEscapeToHex: the character is emitted as a hexadecimal escape.
constexpr uint8_t kNoEscape = 0;
constexpr uint8_t kEscapeToHex = std::numeric_limits<uint8_t>::max();

// Classifies a single ASCII character for escaping. Returns kNoEscape,
// kEscapeToHex, or the character that should follow a backslash.
constexpr uint8_t GetAsciiEscape(char c) {
  // ControlEscape :: one of
  //   f n r t v
  switch (c) {
    case '\t':
      return 't';
    case '\n':
      return 'n';
    case '\v':
      return 'v';
    case '\f':
      return 'f';
    case '\r':
      return 'r';
    default:
      break;
  }

  // SyntaxCharacter :: one of
  //   ^ $ \ . * + ? ( ) [ ] { } |
  //
  // SyntaxCharacter and U+002F (SOLIDUS) are escaped as-is. The scan stops at
  // the terminating NUL, so c == '\0' never matches here.
  for (const char* it = "^$\\.*+?()[]{}|/"; *it != '\0'; ++it) {
    if (*it == c) return c;
  }

  // One of ",-=<>#&!%:;@~'`", the code unit 0x0022 (QUOTATION MARK), and the
  // ASCII space are escaped to hex.
  for (const char* it = ",-=<>#&!%:;@~'`\" "; *it != '\0'; ++it) {
    if (*it == c) return kEscapeToHex;
  }

  return kNoEscape;
}
constexpr const uint8_t kAsciiEscapes[128]{
#define GET_ASCII_ESCAPE(c) GetAsciiEscape(c),
INT_0_TO_127_LIST(GET_ASCII_ESCAPE)
#undef GET_ASCII_ESCAPE
};
// Builds the escaped form of `source` and returns it as a String.
//
// `source` is an owned copy of the input code units (one-byte or two-byte,
// selected by CharT). It must be non-empty: source[0] is read unconditionally.
// The result is whatever IncrementalStringBuilder::Finish() yields.
template <typename CharT>
MaybeDirectHandle<String> RegExpEscapeImpl(Isolate* isolate,
                                           base::OwnedVector<CharT> source) {
  // Scratch space reused for every hexadecimal conversion below.
  char radix_storage[kDoubleToRadixMaxChars];
  base::Vector<char> radix_scratch = base::ArrayVector(radix_storage);

  // 2. Let escaped be the empty String.
  IncrementalStringBuilder out(isolate);
  if constexpr (sizeof(CharT) == 2) {
    out.ChangeEncoding();
  }

  // 3. Let cpList be StringToCodePoints(S).
  // 4. For each code point c of cpList, do
  //    (Done below.)

  // a. If escaped is the empty String and c is matched by either
  //    DecimalDigit or AsciiLetter, then
  //      i. NOTE: Escaping a leading digit ensures that output corresponds
  //         with pattern text which may be used after a \0 character escape
  //         or a DecimalEscape such as \1 and still match S rather than be
  //         interpreted as an extension of the preceding escape sequence.
  //         Escaping a leading ASCII letter does the same for the context
  //         after \c.
  //     ii. Let numericValue be the numeric value of c.
  //    iii. Let hex be Number::toString(𝔽(numericValue), 16).
  //     iv. Assert: The length of hex is 2.
  //      v. Set escaped to the string-concatenation of the code unit 0x005C
  //         (REVERSE SOLIDUS), "x", and hex.
  const std::remove_const_t<CharT> leading_unit = source[0];
  const bool escape_leading_unit = IsAlphaNumeric(leading_unit);
  if (escape_leading_unit) {
    out.AppendCStringLiteral("\\x");
    std::string_view leading_hex =
        DoubleToRadixStringView(leading_unit, 16, radix_scratch);
    out.AppendString(leading_hex);
  }

  // EncodeForRegExpEscape
  //
  // 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then
  //    a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and
  //       UTF16EncodeCodePoint(c).
  // 2. Else if c is the code point listed in some cell of the "Code Point"
  //    column of Table 63, then
  //    a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and the
  //       string in the "ControlEscape" column of the row whose "Code Point"
  //       column contains c.
  // 3. Let otherPunctuators be the string-concatenation of ",-=<>#&!%:;@~'`"
  //    and the code unit 0x0022 (QUOTATION MARK).
  // 4. Let toEscape be StringToCodePoints(otherPunctuators).
  // 5. If toEscape contains c, c is matched by either WhiteSpace or
  //    LineTerminator, or c has the same numeric value as a leading surrogate
  //    or trailing surrogate, then
  //    a. Let cNum be the numeric value of c.
  //    b. If cNum ≤ 0xFF, then
  //         i. Let hex be Number::toString(𝔽(cNum), 16).
  //        ii. Return the string-concatenation of the code unit 0x005C
  //            (REVERSE SOLIDUS), "x", and StringPad(hex, 2, "0", start).
  //    c. Let escaped be the empty String.
  //    d. Let codeUnits be UTF16EncodeCodePoint(c).
  //    e. For each code unit cu of codeUnits, do
  //         i. Set escaped to the string-concatenation of escaped and
  //            UnicodeEscape(cu).
  //    f. Return escaped.
  // 6. Return UTF16EncodeCodePoint(c).
  //
  // Steps 1-2 above are done by table lookup in kAsciiEscapes. For step 3,
  // matching otherPunctuators, quotation mark, and ASCII whitespace is done by
  // table lookup in kAsciiEscapes. Non-ASCII whitespace and line terminators
  // in step 5 are matched manually below.
  for (size_t index = escape_leading_unit ? 1 : 0; index < source.size();
       index++) {
    CharT unit = source[index];
    base::uc32 code_point = unit;
    uint8_t action = kNoEscape;

    if (IsAscii(unit)) {
      action = kAsciiEscapes[unit];
    } else {
      if constexpr (sizeof(CharT) == 2) {
        const bool is_lead = unibrow::Utf16::IsLeadSurrogate(unit);
        if (is_lead && index + 1 < source.size() &&
            unibrow::Utf16::IsTrailSurrogate(source[index + 1])) {
          // Well-formed surrogate pair: combine it and consume the trail unit.
          code_point =
              unibrow::Utf16::CombineSurrogatePair(unit, source[index + 1]);
          index++;
        } else if (is_lead || unibrow::Utf16::IsTrailSurrogate(unit)) {
          // Lone lead or lone trailing surrogate.
          action = kEscapeToHex;
        }
      }
      // ASCII whitespace and line terminators are hardcoded in the
      // kAsciiEscapes table, so only the non-ASCII ones are checked here.
      if (IsWhiteSpaceOrLineTerminator(code_point)) {
        action = kEscapeToHex;
      }
    }

    switch (action) {
      case kNoEscape: {
        // Code point does not need to be escaped.
        if (code_point == unit) {
          out.Append<CharT, CharT>(code_point);
        } else {
          // A surrogate pair was combined above; `index` now refers to its
          // trail unit. Emit both original code units.
          DCHECK_LT(index, source.size());
          DCHECK(unibrow::Utf16::IsSurrogatePair(unit, source[index]));
          DCHECK_EQ(code_point,
                    unibrow::Utf16::CombineSurrogatePair(unit, source[index]));
          out.Append<CharT, CharT>(unit);
          out.Append<CharT, CharT>(source[index]);
        }
        break;
      }
      case kEscapeToHex: {
        // An escape to hex. Output \x or \u depending on how many code units.
        out.AppendCStringLiteral(code_point <= 0xFF ? "\\x" : "\\u");
        std::string_view hex =
            DoubleToRadixStringView(code_point, 16, radix_scratch);
        out.AppendString(hex);
        break;
      }
      default: {
        // A manual, non-hex escape. See table in kAsciiEscapes.
        out.AppendCharacter('\\');
        out.AppendCharacter(action);
        break;
      }
    }
  }

  return out.Finish();
}
} // namespace
// RegExp.escape(S): throws a TypeError when the first argument is not a
// String, returns the empty string for empty input, and otherwise returns the
// result of RegExpEscapeImpl run over a copy of the flattened input. The use
// counter is bumped before the argument is validated.
BUILTIN(RegExpEscape) {
  HandleScope scope(isolate);
  Handle<Object> input = args.atOrUndefined(isolate, 1);
  isolate->CountUsage(v8::Isolate::kRegExpEscape);

  // 1. If S is not a String, throw a TypeError exception.
  if (!IsString(*input)) {
    THROW_NEW_ERROR_RETURN_FAILURE(
        isolate, NewTypeError(MessageTemplate::kArgumentIsNonString,
                              isolate->factory()->input_string()));
  }

  Handle<String> subject = Cast<String>(input);
  if (subject->length() == 0) return ReadOnlyRoots(isolate).empty_string();

  // RegExpEscapeImpl appends to an IncrementalStringBuilder, which may
  // allocate, so it is handed its own copy of the characters. The flat content
  // is only accessed inside the DisallowGarbageCollection scopes below.
  subject = String::Flatten(isolate, subject);

  DirectHandle<String> result;
  if (!subject->IsOneByteRepresentation()) {
    base::OwnedVector<const base::uc16> two_byte_units;
    {
      DisallowGarbageCollection no_gc;
      two_byte_units =
          base::OwnedCopyOf(subject->GetFlatContent(no_gc).ToUC16Vector());
    }
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
        isolate, result, RegExpEscapeImpl(isolate, std::move(two_byte_units)));
  } else {
    base::OwnedVector<const uint8_t> one_byte_units;
    {
      DisallowGarbageCollection no_gc;
      one_byte_units =
          base::OwnedCopyOf(subject->GetFlatContent(no_gc).ToOneByteVector());
    }
    ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
        isolate, result, RegExpEscapeImpl(isolate, std::move(one_byte_units)));
  }
  return *result;
}
} // namespace internal
} // namespace v8