| // Copyright 2014 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef V8_UNICODE_DECODER_H_ |
| #define V8_UNICODE_DECODER_H_ |
| |
| #include <sys/types.h> |
| #include <algorithm> |
| #include "src/globals.h" |
| #include "src/memcopy.h" |
| #include "src/unicode.h" |
| #include "src/vector.h" |
| |
| namespace unibrow { |
| |
| class Utf8Iterator { |
| public: |
| explicit Utf8Iterator(const v8::internal::Vector<const char>& stream) |
| : Utf8Iterator(stream, 0, false) {} |
| Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset, |
| bool trailing) |
| : stream_(stream), |
| cursor_(offset), |
| offset_(0), |
| char_(0), |
| trailing_(false) { |
| DCHECK_LE(offset, stream.length()); |
| // Read the first char, setting offset_ to offset in the process. |
| ++*this; |
| |
| // This must be set after reading the first char, since the offset marks |
| // the start of the octet sequence that the trailing char is part of. |
| trailing_ = trailing; |
| if (trailing) { |
| DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode); |
| } |
| } |
| |
| uint16_t operator*(); |
| Utf8Iterator& operator++(); |
| Utf8Iterator operator++(int); |
| bool Done(); |
| bool Trailing() { return trailing_; } |
| size_t Offset() { return offset_; } |
| |
| private: |
| const v8::internal::Vector<const char>& stream_; |
| size_t cursor_; |
| size_t offset_; |
| uint32_t char_; |
| bool trailing_; |
| }; |
| |
| class V8_EXPORT_PRIVATE Utf8DecoderBase { |
| public: |
| // Initialization done in subclass. |
| inline Utf8DecoderBase(); |
| inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, |
| const v8::internal::Vector<const char>& stream); |
| inline size_t Utf16Length() const { return utf16_length_; } |
| |
| protected: |
| // This reads all characters and sets the utf16_length_. |
| // The first buffer_length utf16 chars are cached in the buffer. |
| void Reset(uint16_t* buffer, size_t buffer_length, |
| const v8::internal::Vector<const char>& vector); |
| static void WriteUtf16Slow(uint16_t* data, size_t length, |
| const v8::internal::Vector<const char>& stream, |
| size_t offset, bool trailing); |
| |
| size_t bytes_read_; |
| size_t chars_written_; |
| size_t utf16_length_; |
| bool trailing_; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
| }; |
| |
| template <size_t kBufferSize> |
| class Utf8Decoder : public Utf8DecoderBase { |
| public: |
| inline Utf8Decoder() = default; |
| explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream); |
| inline void Reset(const v8::internal::Vector<const char>& stream); |
| inline size_t WriteUtf16( |
| uint16_t* data, size_t length, |
| const v8::internal::Vector<const char>& stream) const; |
| |
| private: |
| uint16_t buffer_[kBufferSize]; |
| }; |
| |
| Utf8DecoderBase::Utf8DecoderBase() |
| : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {} |
| |
| Utf8DecoderBase::Utf8DecoderBase( |
| uint16_t* buffer, size_t buffer_length, |
| const v8::internal::Vector<const char>& stream) { |
| Reset(buffer, buffer_length, stream); |
| } |
| |
| template <size_t kBufferSize> |
| Utf8Decoder<kBufferSize>::Utf8Decoder( |
| const v8::internal::Vector<const char>& stream) |
| : Utf8DecoderBase(buffer_, kBufferSize, stream) {} |
| |
| template <size_t kBufferSize> |
| void Utf8Decoder<kBufferSize>::Reset( |
| const v8::internal::Vector<const char>& stream) { |
| Utf8DecoderBase::Reset(buffer_, kBufferSize, stream); |
| } |
| |
| template <size_t kBufferSize> |
| size_t Utf8Decoder<kBufferSize>::WriteUtf16( |
| uint16_t* data, size_t data_length, |
| const v8::internal::Vector<const char>& stream) const { |
| DCHECK_GT(data_length, 0); |
| data_length = std::min(data_length, utf16_length_); |
| |
| // memcpy everything in buffer. |
| size_t memcpy_length = std::min(data_length, chars_written_); |
| v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); |
| |
| if (data_length <= chars_written_) return data_length; |
| |
| // Copy the rest the slow way. |
| WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream, |
| bytes_read_, trailing_); |
| return data_length; |
| } |
| |
// Helpers for the Latin-1 range.
class Latin1 {
 public:
  // Largest Latin-1 character code.
  static const unsigned kMaxChar = 0xff;
  // Maps |c| to its Latin-1 case equivalent when there is one; every other
  // value is returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t);
};

uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  // U+039C and U+03BC (Greek capital/small mu) are equivalent to U+00B5
  // (micro sign).
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 lies outside Latin-1 but is the uppercase of the Latin-1
  // character U+00FF.
  if (c == 0x178) return 0xff;
  // No Latin-1 equivalent known: hand the input back untouched.
  return c;
}
| |
| |
| } // namespace unibrow |
| |
| #endif // V8_UNICODE_DECODER_H_ |