| // Copyright 2019 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/objects/js-regexp.h" |
| |
| #include "src/base/strings.h" |
| #include "src/common/globals.h" |
| #include "src/objects/code.h" |
| #include "src/objects/js-array-inl.h" |
| #include "src/objects/js-regexp-inl.h" |
| #include "src/regexp/regexp.h" |
| |
| namespace v8 { |
| namespace internal { |
| |
| Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices( |
| Isolate* isolate, Handle<RegExpMatchInfo> match_info, |
| Handle<Object> maybe_names) { |
| Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast( |
| isolate->factory()->NewJSObjectFromMap( |
| isolate->regexp_result_indices_map()))); |
| |
| // Initialize indices length to avoid having a partially initialized object |
| // should GC be triggered by creating a NewFixedArray. |
| indices->set_length(Smi::zero()); |
| |
| // Build indices array from RegExpMatchInfo. |
| int num_indices = match_info->number_of_capture_registers(); |
| int num_results = num_indices >> 1; |
| Handle<FixedArray> indices_array = |
| isolate->factory()->NewFixedArray(num_results); |
| JSArray::SetContent(indices, indices_array); |
| |
| for (int i = 0; i < num_results; i++) { |
| const int start_offset = |
| match_info->capture(RegExpMatchInfo::capture_start_index(i)); |
| const int end_offset = |
| match_info->capture(RegExpMatchInfo::capture_end_index(i)); |
| |
| // Any unmatched captures are set to undefined, otherwise we set them to a |
| // subarray of the indices. |
| if (start_offset == -1) { |
| indices_array->set(i, ReadOnlyRoots(isolate).undefined_value()); |
| } else { |
| Handle<FixedArray> indices_sub_array( |
| isolate->factory()->NewFixedArray(2)); |
| indices_sub_array->set(0, Smi::FromInt(start_offset)); |
| indices_sub_array->set(1, Smi::FromInt(end_offset)); |
| Handle<JSArray> indices_sub_jsarray = |
| isolate->factory()->NewJSArrayWithElements(indices_sub_array, |
| PACKED_SMI_ELEMENTS, 2); |
| indices_array->set(i, *indices_sub_jsarray); |
| } |
| } |
| |
| // If there are no capture groups, set the groups property to undefined. |
| FieldIndex groups_index = FieldIndex::ForDescriptor( |
| indices->map(), InternalIndex(kGroupsDescriptorIndex)); |
| if (IsUndefined(*maybe_names, isolate)) { |
| indices->FastPropertyAtPut(groups_index, |
| ReadOnlyRoots(isolate).undefined_value()); |
| return indices; |
| } |
| |
| // Create a groups property which returns a dictionary of named captures to |
| // their corresponding capture indices. |
| Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names)); |
| int num_names = names->length() >> 1; |
| Handle<HeapObject> group_names; |
| if constexpr (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) { |
| group_names = isolate->factory()->NewSwissNameDictionary(num_names); |
| } else { |
| group_names = isolate->factory()->NewNameDictionary(num_names); |
| } |
| Handle<PropertyDictionary> group_names_dict = |
| Handle<PropertyDictionary>::cast(group_names); |
| for (int i = 0; i < num_names; i++) { |
| int base_offset = i * 2; |
| int name_offset = base_offset; |
| int index_offset = base_offset + 1; |
| Handle<String> name(String::cast(names->get(name_offset)), isolate); |
| Tagged<Smi> smi_index = Smi::cast(names->get(index_offset)); |
| Handle<Object> capture_indices(indices_array->get(smi_index.value()), |
| isolate); |
| if (!IsUndefined(*capture_indices, isolate)) { |
| capture_indices = Handle<JSArray>::cast(capture_indices); |
| } |
| InternalIndex group_entry = group_names_dict->FindEntry(isolate, name); |
| // Duplicate group entries are possible if the capture groups are in |
| // different alternatives, i.e. only one of them can actually match. |
| // Therefore when we find a duplicate entry, either the current entry is |
| // undefined (didn't match anything) or the indices for the current capture |
| // are undefined. In the latter case we don't do anything, in the former |
| // case we update the entry. |
| if (group_entry.is_found()) { |
| DCHECK(v8_flags.js_regexp_duplicate_named_groups); |
| if (!IsUndefined(*capture_indices, isolate)) { |
| DCHECK(IsUndefined(group_names_dict->ValueAt(group_entry), isolate)); |
| group_names_dict->ValueAtPut(group_entry, *capture_indices); |
| } |
| } else { |
| group_names_dict = |
| PropertyDictionary::Add(isolate, group_names_dict, name, |
| capture_indices, PropertyDetails::Empty()); |
| } |
| } |
| |
| // Convert group_names to a JSObject and store at the groups property of the |
| // result indices. |
| Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array(); |
| Handle<HeapObject> null = |
| Handle<HeapObject>::cast(isolate->factory()->null_value()); |
| Handle<JSObject> js_group_names = |
| isolate->factory()->NewSlowJSObjectWithPropertiesAndElements( |
| null, group_names, elements); |
| indices->FastPropertyAtPut(groups_index, *js_group_names); |
| return indices; |
| } |
| |
| uint32_t JSRegExp::backtrack_limit() const { |
| CHECK_EQ(type_tag(), IRREGEXP); |
| return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit))); |
| } |
| |
| // static |
| base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString( |
| Isolate* isolate, Handle<String> flags) { |
| const int length = flags->length(); |
| |
| // A longer flags string cannot be valid. |
| if (length > JSRegExp::kFlagCount) return {}; |
| |
| RegExpFlags value; |
| FlatStringReader reader(isolate, String::Flatten(isolate, flags)); |
| |
| for (int i = 0; i < length; i++) { |
| base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i)); |
| if (!flag.has_value()) return {}; |
| if (value & flag.value()) return {}; // Duplicate. |
| value |= flag.value(); |
| } |
| |
| return JSRegExp::AsJSRegExpFlags(value); |
| } |
| |
| // static |
| Handle<String> JSRegExp::StringFromFlags(Isolate* isolate, |
| JSRegExp::Flags flags) { |
| FlagsBuffer buffer; |
| return isolate->factory()->NewStringFromAsciiChecked( |
| FlagsToString(flags, &buffer)); |
| } |
| |
| // static |
| MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern, |
| Flags flags, uint32_t backtrack_limit) { |
| Handle<JSFunction> constructor = isolate->regexp_function(); |
| Handle<JSRegExp> regexp = |
| Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor)); |
| |
| return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit); |
| } |
| |
| Tagged<Object> JSRegExp::code(IsolateForSandbox isolate, bool is_latin1) const { |
| DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); |
| Tagged<Object> value = DataAt(code_index(is_latin1)); |
| DCHECK(IsSmi(value) || IsCodeWrapper(value)); |
| // TODO(saelo): it would be nice if we could directly use a code pointer to |
| // reference our Code rather than use the CodeWrapper object. However, this |
| // is currently not possible since we use essentially a FixedArray to store |
| // all our fields, and a code pointer isn't a tagged pointer. Instead, we |
| // should consider adding a trusted pointer field that references either the |
| // bytecode or the native code in a sandbox-compatible way. |
| if (IsCodeWrapper(value)) { |
| value = CodeWrapper::cast(value)->code(isolate); |
| } |
| DCHECK(IsSmi(value) || IsCode(value)); |
| return value; |
| } |
| |
| void JSRegExp::set_code(bool is_latin1, Handle<Code> code) { |
| SetDataAt(code_index(is_latin1), code->wrapper()); |
| } |
| |
| Tagged<Object> JSRegExp::bytecode(bool is_latin1) const { |
| DCHECK(type_tag() == JSRegExp::IRREGEXP || |
| type_tag() == JSRegExp::EXPERIMENTAL); |
| return DataAt(bytecode_index(is_latin1)); |
| } |
| |
| void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate, |
| Handle<ByteArray> bytecode) { |
| SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode); |
| SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode); |
| |
| Handle<Code> trampoline = BUILTIN_CODE(isolate, RegExpExperimentalTrampoline); |
| SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, trampoline->wrapper()); |
| SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, trampoline->wrapper()); |
| } |
| |
| bool JSRegExp::ShouldProduceBytecode() { |
| return v8_flags.regexp_interpret_all || |
| (v8_flags.regexp_tier_up && !MarkedForTierUp()); |
| } |
| |
| // Only irregexps are subject to tier-up. |
| bool JSRegExp::CanTierUp() { |
| return v8_flags.regexp_tier_up && type_tag() == JSRegExp::IRREGEXP; |
| } |
| |
| // An irregexp is considered to be marked for tier up if the tier-up ticks |
| // value reaches zero. |
| bool JSRegExp::MarkedForTierUp() { |
| DCHECK(IsFixedArray(data())); |
| |
| if (!CanTierUp()) { |
| return false; |
| } |
| |
| return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0; |
| } |
| |
| void JSRegExp::ResetLastTierUpTick() { |
| DCHECK(v8_flags.regexp_tier_up); |
| DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); |
| int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1; |
| FixedArray::cast(data())->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, |
| Smi::FromInt(tier_up_ticks)); |
| } |
| |
| void JSRegExp::TierUpTick() { |
| DCHECK(v8_flags.regexp_tier_up); |
| DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); |
| int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)); |
| if (tier_up_ticks == 0) { |
| return; |
| } |
| FixedArray::cast(data())->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, |
| Smi::FromInt(tier_up_ticks - 1)); |
| } |
| |
| void JSRegExp::MarkTierUpForNextExec() { |
| DCHECK(v8_flags.regexp_tier_up); |
| DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP); |
| FixedArray::cast(data())->set(JSRegExp::kIrregexpTicksUntilTierUpIndex, |
| Smi::zero()); |
| } |
| |
| // static |
| MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, |
| Handle<String> source, |
| Handle<String> flags_string) { |
| Isolate* isolate = regexp->GetIsolate(); |
| base::Optional<Flags> flags = |
| JSRegExp::FlagsFromString(isolate, flags_string); |
| if (!flags.has_value() || |
| !RegExp::VerifyFlags(JSRegExp::AsRegExpFlags(flags.value()))) { |
| THROW_NEW_ERROR( |
| isolate, |
| NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string), |
| JSRegExp); |
| } |
| return Initialize(regexp, source, flags.value()); |
| } |
| |
| namespace { |
| |
| bool IsLineTerminator(int c) { |
| // Expected to return true for '\n', '\r', 0x2028, and 0x2029. |
| return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c)); |
| } |
| |
| // TODO(jgruber): Consider merging CountAdditionalEscapeChars and |
| // WriteEscapedRegExpSource into a single function to deduplicate dispatch logic |
| // and move related code closer to each other. |
| template <typename Char> |
| int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) { |
| DisallowGarbageCollection no_gc; |
| int escapes = 0; |
| bool needs_escapes = false; |
| bool in_character_class = false; |
| base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); |
| for (int i = 0; i < src.length(); i++) { |
| const Char c = src[i]; |
| if (c == '\\') { |
| if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) { |
| // This '\' is ignored since the next character itself will be escaped. |
| escapes--; |
| } else { |
| // Escape. Skip next character, which will be copied verbatim; |
| i++; |
| } |
| } else if (c == '/' && !in_character_class) { |
| // Not escaped forward-slash needs escape. |
| needs_escapes = true; |
| escapes++; |
| } else if (c == '[') { |
| in_character_class = true; |
| } else if (c == ']') { |
| in_character_class = false; |
| } else if (c == '\n') { |
| needs_escapes = true; |
| escapes++; |
| } else if (c == '\r') { |
| needs_escapes = true; |
| escapes++; |
| } else if (static_cast<int>(c) == 0x2028) { |
| needs_escapes = true; |
| escapes += std::strlen("\\u2028") - 1; |
| } else if (static_cast<int>(c) == 0x2029) { |
| needs_escapes = true; |
| escapes += std::strlen("\\u2029") - 1; |
| } else { |
| DCHECK(!IsLineTerminator(c)); |
| } |
| } |
| DCHECK(!in_character_class); |
| DCHECK_GE(escapes, 0); |
| DCHECK_IMPLIES(escapes != 0, needs_escapes); |
| *needs_escapes_out = needs_escapes; |
| return escapes; |
| } |
| |
| template <typename Char> |
| void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) { |
| int s = 0; |
| while (string[s] != '\0') v[(*d)++] = string[s++]; |
| } |
| |
| template <typename Char, typename StringType> |
| Handle<StringType> WriteEscapedRegExpSource(Handle<String> source, |
| Handle<StringType> result) { |
| DisallowGarbageCollection no_gc; |
| base::Vector<const Char> src = source->GetCharVector<Char>(no_gc); |
| base::Vector<Char> dst(result->GetChars(no_gc), result->length()); |
| int s = 0; |
| int d = 0; |
| bool in_character_class = false; |
| while (s < src.length()) { |
| const Char c = src[s]; |
| if (c == '\\') { |
| if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) { |
| // This '\' is ignored since the next character itself will be escaped. |
| s++; |
| continue; |
| } else { |
| // Escape. Copy this and next character. |
| dst[d++] = src[s++]; |
| } |
| if (s == src.length()) break; |
| } else if (c == '/' && !in_character_class) { |
| // Not escaped forward-slash needs escape. |
| dst[d++] = '\\'; |
| } else if (c == '[') { |
| in_character_class = true; |
| } else if (c == ']') { |
| in_character_class = false; |
| } else if (c == '\n') { |
| WriteStringToCharVector(dst, &d, "\\n"); |
| s++; |
| continue; |
| } else if (c == '\r') { |
| WriteStringToCharVector(dst, &d, "\\r"); |
| s++; |
| continue; |
| } else if (static_cast<int>(c) == 0x2028) { |
| WriteStringToCharVector(dst, &d, "\\u2028"); |
| s++; |
| continue; |
| } else if (static_cast<int>(c) == 0x2029) { |
| WriteStringToCharVector(dst, &d, "\\u2029"); |
| s++; |
| continue; |
| } else { |
| DCHECK(!IsLineTerminator(c)); |
| } |
| dst[d++] = src[s++]; |
| } |
| DCHECK_EQ(result->length(), d); |
| DCHECK(!in_character_class); |
| return result; |
| } |
| |
| MaybeHandle<String> EscapeRegExpSource(Isolate* isolate, |
| Handle<String> source) { |
| DCHECK(source->IsFlat()); |
| if (source->length() == 0) return isolate->factory()->query_colon_string(); |
| bool one_byte = String::IsOneByteRepresentationUnderneath(*source); |
| bool needs_escapes = false; |
| int additional_escape_chars = |
| one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes) |
| : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes); |
| if (!needs_escapes) return source; |
| int length = source->length() + additional_escape_chars; |
| if (one_byte) { |
| Handle<SeqOneByteString> result; |
| ASSIGN_RETURN_ON_EXCEPTION(isolate, result, |
| isolate->factory()->NewRawOneByteString(length), |
| String); |
| return WriteEscapedRegExpSource<uint8_t>(source, result); |
| } else { |
| Handle<SeqTwoByteString> result; |
| ASSIGN_RETURN_ON_EXCEPTION(isolate, result, |
| isolate->factory()->NewRawTwoByteString(length), |
| String); |
| return WriteEscapedRegExpSource<base::uc16>(source, result); |
| } |
| } |
| |
| } // namespace |
| |
| // static |
| MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp, |
| Handle<String> source, Flags flags, |
| uint32_t backtrack_limit) { |
| Isolate* isolate = regexp->GetIsolate(); |
| Factory* factory = isolate->factory(); |
| // If source is the empty string we set it to "(?:)" instead as |
| // suggested by ECMA-262, 5th, section 15.10.4.1. |
| if (source->length() == 0) source = factory->query_colon_string(); |
| |
| source = String::Flatten(isolate, source); |
| |
| RETURN_ON_EXCEPTION( |
| isolate, |
| RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags), |
| backtrack_limit), |
| JSRegExp); |
| |
| Handle<String> escaped_source; |
| ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source, |
| EscapeRegExpSource(isolate, source), JSRegExp); |
| |
| regexp->set_source(*escaped_source); |
| regexp->set_flags(Smi::FromInt(flags)); |
| |
| Tagged<Map> map = regexp->map(); |
| Tagged<Object> constructor = map->GetConstructor(); |
| if (IsJSFunction(constructor) && |
| JSFunction::cast(constructor)->initial_map() == map) { |
| // If we still have the original map, set in-object properties directly. |
| regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex, |
| Smi::FromInt(kInitialLastIndexValue), |
| SKIP_WRITE_BARRIER); |
| } else { |
| // Map has changed, so use generic, but slower, method. |
| RETURN_ON_EXCEPTION( |
| isolate, |
| Object::SetProperty( |
| isolate, regexp, factory->lastIndex_string(), |
| Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)), |
| JSRegExp); |
| } |
| |
| return regexp; |
| } |
| |
| } // namespace internal |
| } // namespace v8 |