| /* |
| * Copyright 2023 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef parser_lexer_h |
| #define parser_lexer_h |
| |
| #include <cassert> |
| #include <cmath> |
| #include <cstddef> |
| #include <cstdint> |
| #include <cstring> |
| #include <optional> |
| #include <ostream> |
| #include <sstream> |
| #include <string_view> |
| #include <variant> |
| #include <vector> |
| |
| #include "support/bits.h" |
| #include "support/name.h" |
| #include "support/result.h" |
| #include "support/string.h" |
| |
| namespace wasm::WATParser { |
| |
// A location in the input text, expressed as a line and a column. Positions
// produced by Lexer::position start at line 1, column 0.
struct TextPos {
  size_t line;
  size_t col;

  // Two positions are equal exactly when both line and column agree.
  bool operator==(const TextPos& other) const {
    if (line != other.line) {
      return false;
    }
    return col == other.col;
  }
  bool operator!=(const TextPos& other) const {
    return line != other.line || col != other.col;
  }
};
| |
| inline std::ostream& operator<<(std::ostream& os, const TextPos& pos) { |
| return os << pos.line << ":" << pos.col; |
| } |
| |
| // =========== |
| // Annotations |
| // =========== |
| |
// A single annotation found by the lexer while skipping whitespace.
struct Annotation {
  // The annotation's kind: the identifier characters following "(@", or "src"
  // for a ";;@" line annotation.
  Name kind;
  // The annotation's text after the kind. This is a view into the lexer's
  // buffer, so it is only valid while that buffer is alive.
  std::string_view contents;
};
| |
| // ===== |
| // Lexer |
| // ===== |
| |
| struct Lexer { |
| private: |
| size_t pos = 0; |
| std::vector<Annotation> annotations; |
| std::optional<std::string> file; |
| |
| public: |
| std::string_view buffer; |
| |
| Lexer(std::string_view buffer, |
| std::optional<std::string> file = std::nullopt); |
| |
| size_t getPos() const { return pos; } |
| |
| void setPos(size_t i) { |
| pos = i; |
| advance(); |
| } |
| |
| // Consume the next `n` characters. |
| void take(size_t n) { pos += n; } |
| void takeAll() { pos = buffer.size(); } |
| |
| // Whether the unlexed input starts with prefix `sv`. |
| size_t startsWith(std::string_view sv) const { |
| return next().starts_with(sv); |
| } |
| |
| std::optional<char> peekChar() const; |
| |
| bool peekLParen() { return !empty() && peek() == '('; } |
| |
| bool takeLParen(); |
| |
| bool peekRParen() { return !empty() && peek() == ')'; } |
| |
| bool takeRParen(); |
| |
| bool takeUntilParen(); |
| |
| std::optional<Name> takeID(); |
| |
| std::optional<std::string_view> peekKeyword(); |
| |
| std::optional<std::string_view> takeKeyword(); |
| bool takeKeyword(std::string_view expected); |
| |
| std::optional<uint64_t> takeOffset(); |
| std::optional<uint32_t> takeAlign(); |
| |
| std::optional<uint64_t> takeU64() { return takeU<uint64_t>(); } |
| std::optional<uint64_t> takeI64() { return takeI<uint64_t>(); } |
| std::optional<uint32_t> takeU32() { return takeU<uint32_t>(); } |
| std::optional<uint32_t> takeI32() { return takeI<uint32_t>(); } |
| std::optional<uint16_t> takeI16() { return takeI<uint16_t>(); } |
| std::optional<uint8_t> takeU8() { return takeU<uint8_t>(); } |
| std::optional<uint8_t> takeI8() { return takeI<uint8_t>(); } |
| |
| std::optional<float> takeF32(); |
| std::optional<double> takeF64(); |
| |
| std::optional<std::string> takeString(); |
| |
| std::optional<Name> takeName(); |
| |
| bool takeSExprStart(std::string_view expected); |
| |
| bool peekSExprStart(std::string_view expected); |
| |
| std::string_view next() const { return buffer.substr(pos); } |
| |
| uint8_t peek() const { return buffer[pos]; } |
| |
| void advance() { |
| annotations.clear(); |
| skipSpace(); |
| } |
| |
| bool empty() const { return pos == buffer.size(); } |
| size_t remaining() const { return buffer.size() - pos; } |
| |
| TextPos position(const char* c) const; |
| |
| TextPos position(size_t i) const { return position(buffer.data() + i); } |
| TextPos position(std::string_view span) const { |
| return position(span.data()); |
| } |
| TextPos position() const { return position(getPos()); } |
| |
| [[nodiscard]] Err err(size_t pos, std::string reason); |
| |
| [[nodiscard]] Err err(std::string reason) { return err(getPos(), reason); } |
| |
| const std::vector<Annotation> getAnnotations() { return annotations; } |
| std::vector<Annotation> takeAnnotations() { return std::move(annotations); } |
| |
| void setAnnotations(std::vector<Annotation> annotations) { |
| this->annotations = std::move(annotations); |
| } |
| |
| private: |
| std::optional<int> getDigit(char c); |
| |
| std::optional<int> getHexDigit(char c); |
| |
| // Consume the prefix and return true if possible. |
| bool takePrefix(std::string_view sv); |
| |
| std::optional<int> takeDigit(); |
| |
| std::optional<int> takeHexdigit(); |
| |
| enum OverflowBehavior { DisallowOverflow, IgnoreOverflow }; |
| |
| std::optional<uint64_t> takeNum(OverflowBehavior behavior = DisallowOverflow); |
| |
| std::optional<uint64_t> |
| takeHexnum(OverflowBehavior behavior = DisallowOverflow); |
| |
| enum Sign { NoSign, Pos, Neg }; |
| |
| Sign takeSign(); |
| |
| struct LexedInteger { |
| uint64_t n; |
| Sign sign; |
| |
| template<typename T> bool isUnsigned(); |
| template<typename T> bool isSigned(); |
| }; |
| |
| std::optional<LexedInteger> takeInteger(); |
| |
| template<typename T> std::optional<T> takeU(); |
| |
| template<typename T> std::optional<T> takeS(); |
| |
| template<typename T> std::optional<T> takeI(); |
| |
| std::optional<std::string_view> takeDecfloat(); |
| |
| std::optional<std::string_view> takeHexfloat(); |
| |
| struct LexedFloat { |
| std::optional<uint64_t> nanPayload; |
| double d; |
| }; |
| |
| std::optional<LexedFloat> takeFloat(); |
| |
| struct StringOrView : std::variant<std::string, std::string_view> { |
| using std::variant<std::string, std::string_view>::variant; |
| std::string_view str() const { |
| return std::visit([](auto& s) -> std::string_view { return s; }, *this); |
| } |
| }; |
| |
| std::optional<StringOrView> takeStr(); |
| |
| bool idchar(); |
| |
| std::optional<StringOrView> takeIdent(); |
| |
| bool spacechar(); |
| |
| bool takeSpacechar(); |
| |
| bool takeComment(); |
| |
| bool takeSpace(); |
| |
| std::optional<Annotation> takeAnnotation(); |
| |
| void skipSpace(); |
| |
| bool canFinish(); |
| }; |
| |
| inline Lexer::Lexer(std::string_view buffer, std::optional<std::string> file) |
| : file(file), buffer(buffer) { |
| setPos(0); |
| } |
| |
| inline std::optional<char> Lexer::peekChar() const { |
| if (!empty()) { |
| return peek(); |
| } |
| return std::nullopt; |
| } |
| |
| inline bool Lexer::takeLParen() { |
| if (peekLParen()) { |
| take(1); |
| advance(); |
| return true; |
| } |
| return false; |
| } |
| |
| inline bool Lexer::takeRParen() { |
| if (peekRParen()) { |
| take(1); |
| advance(); |
| return true; |
| } |
| return false; |
| } |
| |
| inline bool Lexer::takeUntilParen() { |
| while (true) { |
| if (empty()) { |
| return false; |
| } |
| if (peekLParen() || peekRParen()) { |
| return true; |
| } |
| // Do not count the parentheses in strings. |
| if (takeString()) { |
| continue; |
| } |
| ++pos; |
| advance(); |
| } |
| } |
| |
| inline std::optional<Name> Lexer::takeID() { |
| if (auto result = takeIdent()) { |
| auto name = Name(result->str()); |
| advance(); |
| return name; |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<std::string_view> Lexer::peekKeyword() { |
| if (empty()) { |
| return std::nullopt; |
| } |
| auto startPos = pos; |
| uint8_t start = peek(); |
| if ('a' <= start && start <= 'z') { |
| take(1); |
| } else { |
| return std::nullopt; |
| } |
| while (idchar()) { |
| take(1); |
| } |
| auto ret = buffer.substr(startPos, pos - startPos); |
| pos = startPos; |
| return ret; |
| } |
| |
| inline std::optional<std::string_view> Lexer::takeKeyword() { |
| auto keyword = peekKeyword(); |
| if (keyword) { |
| take(keyword->size()); |
| advance(); |
| } |
| return keyword; |
| } |
| |
| inline bool Lexer::takeKeyword(std::string_view expected) { |
| if (!startsWith(expected)) { |
| return false; |
| } |
| auto startPos = pos; |
| take(expected.size()); |
| if (canFinish()) { |
| advance(); |
| return true; |
| } |
| pos = startPos; |
| return false; |
| } |
| |
| inline std::optional<uint64_t> Lexer::takeOffset() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (auto offset = takeKeyword()) { |
| if (!offset->starts_with("offset="sv)) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| Lexer subLexer(offset->substr(7)); |
| if (auto o = subLexer.takeU64()) { |
| advance(); |
| return o; |
| } |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| inline std::optional<uint32_t> Lexer::takeAlign() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (auto result = takeKeyword()) { |
| if (!result->starts_with("align="sv)) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| Lexer subLexer(result->substr(6)); |
| if (auto o = subLexer.takeU32()) { |
| if (Bits::popCount(*o) != 1) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| advance(); |
| return o; |
| } |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| inline std::optional<float> Lexer::takeF32() { |
| constexpr int signif = 23; |
| constexpr uint32_t payloadMask = (1u << signif) - 1; |
| constexpr uint64_t nanDefault = 1ull << (signif - 1); |
| auto startPos = pos; |
| if (auto result = takeFloat()) { |
| float f = result->d; |
| if (std::isnan(f)) { |
| // Validate and inject payload. |
| uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; |
| if (payload == 0 || payload > payloadMask) { |
| // TODO: Add error production for out-of-bounds payload. |
| pos = startPos; |
| return std::nullopt; |
| } |
| uint32_t bits; |
| static_assert(sizeof(bits) == sizeof(f)); |
| memcpy(&bits, &f, sizeof(bits)); |
| bits = (bits & ~payloadMask) | payload; |
| memcpy(&f, &bits, sizeof(bits)); |
| } |
| advance(); |
| return f; |
| } |
| if (auto result = takeInteger()) { |
| advance(); |
| if (result->sign == Neg) { |
| if (result->n == 0) { |
| return -0.0f; |
| } |
| return -static_cast<float>(result->n); |
| } |
| return static_cast<float>(result->n); |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<double> Lexer::takeF64() { |
| constexpr int signif = 52; |
| constexpr uint64_t payloadMask = (1ull << signif) - 1; |
| constexpr uint64_t nanDefault = 1ull << (signif - 1); |
| auto startPos = pos; |
| if (auto result = takeFloat()) { |
| double d = result->d; |
| if (std::isnan(d)) { |
| // Inject payload. |
| uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault; |
| if (payload == 0 || payload > payloadMask) { |
| // TODO: Add error production for out-of-bounds payload. |
| pos = startPos; |
| return std::nullopt; |
| } |
| uint64_t bits; |
| static_assert(sizeof(bits) == sizeof(d)); |
| memcpy(&bits, &d, sizeof(bits)); |
| bits = (bits & ~payloadMask) | payload; |
| memcpy(&d, &bits, sizeof(bits)); |
| } |
| advance(); |
| return d; |
| } |
| if (auto result = takeInteger()) { |
| advance(); |
| if (result->sign == Neg) { |
| if (result->n == 0) { |
| return -0.0; |
| } |
| return -static_cast<double>(result->n); |
| } |
| return static_cast<double>(result->n); |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<std::string> Lexer::takeString() { |
| if (auto str = takeStr()) { |
| advance(); |
| if (auto* s = std::get_if<std::string>(&*str)) { |
| return std::move(*s); |
| } |
| auto view = std::get<std::string_view>(*str); |
| return std::string(view); |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<Name> Lexer::takeName() { |
| auto str = takeString(); |
| if (!str || !String::isUTF8(*str)) { |
| return std::nullopt; |
| } |
| return Name(*str); |
| } |
| |
| inline bool Lexer::takeSExprStart(std::string_view expected) { |
| auto original = *this; |
| if (takeLParen() && takeKeyword(expected)) { |
| return true; |
| } |
| *this = original; |
| return false; |
| } |
| |
| inline bool Lexer::peekSExprStart(std::string_view expected) { |
| auto original = *this; |
| if (!takeLParen()) { |
| return false; |
| } |
| bool ret = takeKeyword(expected); |
| *this = original; |
| return ret; |
| } |
| |
| inline TextPos Lexer::position(const char* c) const { |
| assert(size_t(c - buffer.data()) <= buffer.size()); |
| TextPos pos{1, 0}; |
| for (const char* p = buffer.data(); p != c; ++p) { |
| if (*p == '\n') { |
| pos.line++; |
| pos.col = 0; |
| } else { |
| pos.col++; |
| } |
| } |
| return pos; |
| } |
| |
| inline Err Lexer::err(size_t pos, std::string reason) { |
| std::stringstream msg; |
| if (file) { |
| msg << *file << ":"; |
| } |
| msg << position(pos) << ": error: " << reason; |
| return Err{msg.str()}; |
| } |
| |
| inline std::optional<int> Lexer::getDigit(char c) { |
| if ('0' <= c && c <= '9') { |
| return c - '0'; |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<int> Lexer::getHexDigit(char c) { |
| if (auto d = getDigit(c)) { |
| return d; |
| } |
| if ('A' <= c && c <= 'F') { |
| return 10 + c - 'A'; |
| } |
| if ('a' <= c && c <= 'f') { |
| return 10 + c - 'a'; |
| } |
| return std::nullopt; |
| } |
| |
| inline bool Lexer::takePrefix(std::string_view sv) { |
| if (startsWith(sv)) { |
| take(sv.size()); |
| return true; |
| } |
| return false; |
| } |
| |
| inline std::optional<int> Lexer::takeDigit() { |
| if (empty()) { |
| return std::nullopt; |
| } |
| if (auto d = getDigit(peek())) { |
| take(1); |
| return d; |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<int> Lexer::takeHexdigit() { |
| if (empty()) { |
| return std::nullopt; |
| } |
| if (auto h = getHexDigit(peek())) { |
| take(1); |
| return h; |
| } |
| return std::nullopt; |
| } |
| |
| inline std::optional<uint64_t> Lexer::takeNum(OverflowBehavior behavior) { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| bool overflow = false; |
| uint64_t n = 0; |
| if (auto d = takeDigit()) { |
| n = *d; |
| } else { |
| return std::nullopt; |
| } |
| while (true) { |
| bool under = takePrefix("_"sv); |
| if (auto d = takeDigit()) { |
| uint64_t newN = n * 10 + *d; |
| if (newN < n) { |
| overflow = true; |
| } |
| n = newN; |
| continue; |
| } |
| if (!under && (!overflow || behavior == IgnoreOverflow)) { |
| return n; |
| } |
| // TODO: Add error productions for trailing underscore and overflow. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } |
| |
| inline std::optional<uint64_t> Lexer::takeHexnum(OverflowBehavior behavior) { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| bool overflow = false; |
| uint64_t n = 0; |
| if (auto d = takeHexdigit()) { |
| n = *d; |
| } else { |
| return std::nullopt; |
| } |
| while (true) { |
| bool under = takePrefix("_"sv); |
| if (auto d = takeHexdigit()) { |
| uint64_t newN = n * 16 + *d; |
| if (newN < n) { |
| overflow = true; |
| } |
| n = newN; |
| continue; |
| } |
| if (!under && (!overflow || behavior == IgnoreOverflow)) { |
| return n; |
| } |
| // TODO: Add error productions for trailing underscore and overflow. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } |
| |
| inline Lexer::Sign Lexer::takeSign() { |
| auto c = peek(); |
| if (c == '+') { |
| take(1); |
| return Pos; |
| } |
| if (c == '-') { |
| take(1); |
| return Neg; |
| } |
| return NoSign; |
| } |
| |
| template<typename T> bool Lexer::LexedInteger::isUnsigned() { |
| static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); |
| return sign == NoSign && n <= std::numeric_limits<T>::max(); |
| } |
| |
| template<typename T> bool Lexer::LexedInteger::isSigned() { |
| static_assert(std::is_integral_v<T> && std::is_signed_v<T>); |
| if (sign == Neg) { |
| // Absolute value of min() for two's complement integers is max() + 1. |
| uint64_t absMin = uint64_t(std::numeric_limits<T>::max()) + 1; |
| return n <= absMin; |
| } |
| return n <= uint64_t(std::numeric_limits<T>::max()); |
| } |
| |
| inline std::optional<Lexer::LexedInteger> Lexer::takeInteger() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| auto sign = takeSign(); |
| if (takePrefix("0x"sv)) { |
| if (auto n = takeHexnum()) { |
| if (canFinish()) { |
| return LexedInteger{*n, sign}; |
| } |
| } |
| // TODO: Add error production for unrecognized hexnum. |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (auto n = takeNum()) { |
| if (canFinish()) { |
| return LexedInteger{*n, sign}; |
| } |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| template<typename T> std::optional<T> Lexer::takeU() { |
| static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); |
| auto startPos = pos; |
| if (auto result = takeInteger(); result && result->isUnsigned<T>()) { |
| advance(); |
| return static_cast<T>(result->n); |
| } |
| // TODO: Add error production for unsigned overflow. |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| template<typename T> std::optional<T> Lexer::takeS() { |
| static_assert(std::is_integral_v<T> && std::is_signed_v<T>); |
| auto startPos = pos; |
| if (auto result = takeInteger(); result && result->isSigned<T>()) { |
| advance(); |
| if (result->sign == Neg) { |
| return static_cast<T>(-result->n); |
| } |
| return static_cast<T>(result->n); |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| template<typename T> std::optional<T> Lexer::takeI() { |
| static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>); |
| auto startPos = pos; |
| if (auto result = takeInteger()) { |
| if (result->isUnsigned<T>() || result->isSigned<std::make_signed_t<T>>()) { |
| advance(); |
| if (result->sign == Neg) { |
| return static_cast<T>(-result->n); |
| } |
| return static_cast<T>(result->n); |
| } |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| inline std::optional<std::string_view> Lexer::takeDecfloat() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (!takeNum(IgnoreOverflow)) { |
| return std::nullopt; |
| } |
| // Optional '.' followed by optional frac |
| if (takePrefix("."sv)) { |
| takeNum(IgnoreOverflow); |
| } |
| if (takePrefix("E"sv) || takePrefix("e"sv)) { |
| // Optional sign |
| takeSign(); |
| if (!takeNum(IgnoreOverflow)) { |
| // TODO: Add error production for missing exponent. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } |
| return buffer.substr(startPos, pos - startPos); |
| } |
| |
| inline std::optional<std::string_view> Lexer::takeHexfloat() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (!takePrefix("0x"sv)) { |
| return std::nullopt; |
| } |
| if (!takeHexnum(IgnoreOverflow)) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| // Optional '.' followed by optional hexfrac |
| if (takePrefix("."sv)) { |
| takeHexnum(IgnoreOverflow); |
| } |
| if (takePrefix("P"sv) || takePrefix("p"sv)) { |
| // Optional sign |
| takeSign(); |
| if (!takeNum(IgnoreOverflow)) { |
| // TODO: Add error production for missing exponent. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } |
| return buffer.substr(startPos, pos - startPos); |
| } |
| |
| inline std::optional<Lexer::LexedFloat> Lexer::takeFloat() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| std::optional<uint64_t> nanPayload; |
| bool isNan = false; |
| // Optional sign |
| auto sign = takeSign(); |
| if (takeHexfloat() || takeDecfloat() || takePrefix("inf"sv)) { |
| // nop. |
| } else if (takePrefix("nan"sv)) { |
| isNan = true; |
| if (takePrefix(":0x"sv)) { |
| if (auto n = takeHexnum()) { |
| nanPayload = n; |
| } else { |
| // TODO: Add error production for malformed NaN payload. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } else { |
| // No explicit payload necessary; we will inject the default payload |
| // later. |
| } |
| } else { |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (!canFinish()) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| // strtod does not return NaNs with the expected signs on all platforms. |
| if (isNan) { |
| if (sign == Neg) { |
| const double negNan = std::copysign(NAN, -1.0); |
| assert(std::signbit(negNan) && "expected negative NaN to be negative"); |
| return LexedFloat{nanPayload, negNan}; |
| } else { |
| const double posNan = std::copysign(NAN, 1.0); |
| assert(!std::signbit(posNan) && "expected positive NaN to be positive"); |
| return LexedFloat{nanPayload, posNan}; |
| } |
| } |
| // Do not try to implement fully general and precise float parsing |
| // ourselves. Instead, call out to std::strtod to do our parsing. This means |
| // we need to strip any underscores since `std::strtod` does not understand |
| // them. |
| std::stringstream ss; |
| for (const char *curr = &buffer[startPos], *end = &buffer[pos]; curr != end; |
| ++curr) { |
| if (*curr != '_') { |
| ss << *curr; |
| } |
| } |
| std::string str = ss.str(); |
| char* last; |
| double d = std::strtod(str.data(), &last); |
| assert(last == str.data() + str.size() && "could not parse float"); |
| return LexedFloat{std::nullopt, d}; |
| } |
| |
| inline std::optional<Lexer::StringOrView> Lexer::takeStr() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (!takePrefix("\""sv)) { |
| return std::nullopt; |
| } |
| // Used to build a string with resolved escape sequences. Only used when the |
| // parsed string contains escape sequences, otherwise we can just use the |
| // parsed string directly. |
| std::optional<std::stringstream> escapeBuilder; |
| auto ensureBuildingEscaped = [&]() { |
| if (escapeBuilder) { |
| return; |
| } |
| // Drop the opening '"'. |
| escapeBuilder = std::stringstream{}; |
| *escapeBuilder << buffer.substr(startPos + 1, pos - startPos - 1); |
| }; |
| while (!takePrefix("\""sv)) { |
| if (empty()) { |
| // TODO: Add error production for unterminated string. |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (startsWith("\\"sv)) { |
| // Escape sequences |
| ensureBuildingEscaped(); |
| take(1); |
| auto c = peek(); |
| take(1); |
| switch (c) { |
| case 't': |
| *escapeBuilder << '\t'; |
| break; |
| case 'n': |
| *escapeBuilder << '\n'; |
| break; |
| case 'r': |
| *escapeBuilder << '\r'; |
| break; |
| case '\\': |
| *escapeBuilder << '\\'; |
| break; |
| case '"': |
| *escapeBuilder << '"'; |
| break; |
| case '\'': |
| *escapeBuilder << '\''; |
| break; |
| case 'u': { |
| if (!takePrefix("{"sv)) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| auto code = takeHexnum(); |
| if (!code) { |
| // TODO: Add error production for malformed unicode escapes. |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (!takePrefix("}"sv)) { |
| // TODO: Add error production for malformed unicode escapes. |
| pos = startPos; |
| return std::nullopt; |
| } |
| if ((0xd800 <= *code && *code < 0xe000) || 0x110000 <= *code) { |
| // TODO: Add error production for invalid unicode values. |
| pos = startPos; |
| return std::nullopt; |
| } |
| String::writeWTF8CodePoint(*escapeBuilder, *code); |
| break; |
| } |
| default: { |
| // Byte escape: \hh |
| // We already took the first h as c. |
| auto first = getHexDigit(c); |
| auto second = takeHexdigit(); |
| if (!first || !second) { |
| // TODO: Add error production for unrecognized escape sequence. |
| pos = startPos; |
| return std::nullopt; |
| } |
| *escapeBuilder << char(*first * 16 + *second); |
| } |
| } |
| } else { |
| // Normal characters |
| if (uint8_t c = peek(); c >= 0x20 && c != 0x7F) { |
| if (escapeBuilder) { |
| *escapeBuilder << c; |
| } |
| take(1); |
| } else { |
| // TODO: Add error production for unescaped control characters. |
| pos = startPos; |
| return std::nullopt; |
| } |
| } |
| } |
| if (escapeBuilder) { |
| return escapeBuilder->str(); |
| } |
| // Drop the quotes. |
| return buffer.substr(startPos + 1, pos - startPos - 2); |
| } |
| |
| inline bool Lexer::idchar() { |
| if (empty()) { |
| return false; |
| } |
| uint8_t c = peek(); |
| // All the allowed characters lie in the range '!' to '~', and within that |
| // range the vast majority of characters are allowed, so it is significantly |
| // faster to check for the disallowed characters instead. |
| if (c < '!' || c > '~') { |
| return false; |
| } |
| switch (c) { |
| case '"': |
| case '(': |
| case ')': |
| case ',': |
| case ';': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| return false; |
| } |
| return true; |
| } |
| |
| inline std::optional<Lexer::StringOrView> Lexer::takeIdent() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| if (!takePrefix("$"sv)) { |
| return {}; |
| } |
| // Quoted identifier e.g. $"foo" |
| std::optional<StringOrView> str; |
| if ((str = takeStr())) { |
| if (str->str().empty() || !String::isUTF8(str->str())) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| } else if (idchar()) { |
| take(1); |
| while (idchar()) { |
| take(1); |
| } |
| } else { |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (canFinish()) { |
| if (str) { |
| return str; |
| } |
| // Drop the "$". |
| return buffer.substr(startPos + 1, pos - startPos - 1); |
| } |
| pos = startPos; |
| return std::nullopt; |
| } |
| |
| inline bool Lexer::spacechar() { |
| if (empty()) { |
| return false; |
| } |
| switch (peek()) { |
| case ' ': |
| case '\n': |
| case '\r': |
| case '\t': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| inline bool Lexer::takeSpacechar() { |
| if (spacechar()) { |
| take(1); |
| return true; |
| } |
| return false; |
| } |
| |
| inline bool Lexer::takeComment() { |
| using namespace std::string_view_literals; |
| |
| if (remaining() < 2) { |
| return false; |
| } |
| |
| // Line comment |
| if (!startsWith(";;@"sv) && takePrefix(";;"sv)) { |
| if (auto size = next().find('\n'); size != ""sv.npos) { |
| take(size); |
| } else { |
| takeAll(); |
| } |
| return true; |
| } |
| |
| // Block comment (possibly nested!) |
| if (takePrefix("(;"sv)) { |
| size_t depth = 1; |
| while (depth > 0 && remaining() >= 2) { |
| if (takePrefix("(;"sv)) { |
| ++depth; |
| } else if (takePrefix(";)"sv)) { |
| --depth; |
| } else { |
| take(1); |
| } |
| } |
| if (depth > 0) { |
| // TODO: Add error production for non-terminated block comment. |
| return false; |
| } |
| return true; |
| } |
| |
| return false; |
| } |
| |
| inline bool Lexer::takeSpace() { |
| bool taken = false; |
| while (remaining() && (takeSpacechar() || takeComment())) { |
| taken = true; |
| continue; |
| } |
| return taken; |
| } |
| |
| inline std::optional<Annotation> Lexer::takeAnnotation() { |
| using namespace std::string_view_literals; |
| auto startPos = pos; |
| std::string_view kind; |
| std::string_view contents; |
| if (takePrefix(";;@"sv)) { |
| kind = "src"sv; |
| auto contentPos = pos; |
| if (auto size = next().find('\n'); size != ""sv.npos) { |
| take(size); |
| } else { |
| takeAll(); |
| } |
| contents = buffer.substr(contentPos, pos - contentPos); |
| } else if (takePrefix("(@"sv)) { |
| auto kindPos = pos; |
| bool hasIdchar = false; |
| while (idchar()) { |
| take(1); |
| hasIdchar = true; |
| } |
| if (!hasIdchar) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| kind = buffer.substr(kindPos, pos - kindPos); |
| auto contentPos = pos; |
| size_t depth = 1; |
| while (true) { |
| if (empty()) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| if (takeSpace() || takeKeyword() || takeInteger() || takeFloat() || |
| takeStr() || takeIdent()) { |
| continue; |
| } |
| if (takePrefix("(@"sv)) { |
| bool hasIdchar = false; |
| while (idchar()) { |
| take(1); |
| hasIdchar = true; |
| } |
| if (!hasIdchar) { |
| pos = startPos; |
| return std::nullopt; |
| } |
| ++depth; |
| continue; |
| } |
| if (takeLParen()) { |
| ++depth; |
| continue; |
| } |
| if (takePrefix(")"sv)) { |
| --depth; |
| if (depth == 0) { |
| break; |
| } |
| continue; |
| } |
| // Unrecognized token. |
| pos = startPos; |
| return std::nullopt; |
| } |
| contents = buffer.substr(contentPos, pos - contentPos - 1); |
| } else { |
| return std::nullopt; |
| } |
| return Annotation{Name(kind), contents}; |
| } |
| |
| inline void Lexer::skipSpace() { |
| while (true) { |
| if (auto annotation = takeAnnotation()) { |
| annotations.emplace_back(*std::move(annotation)); |
| continue; |
| } |
| if (takeSpace()) { |
| continue; |
| } |
| break; |
| } |
| } |
| |
| inline bool Lexer::canFinish() { |
| // Logically we want to check for eof, parens, and space. But we don't |
| // actually want to parse more than a couple characters of space, so check |
| // for individual space chars or comment starts instead. |
| using namespace std::string_view_literals; |
| return empty() || spacechar() || peek() == '(' || peek() == ')' || |
| startsWith(";;"sv); |
| } |
| |
| } // namespace wasm::WATParser |
| |
| #endif // parser_lexer_h |