blob: 44aecdc2bc124714d5a4d79cab703f58f2c44082 [file] [log] [blame] [edit]
/*
* Copyright 2023 WebAssembly Community Group participants
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cassert>
#include <cctype>
#include <cmath>
#include <iostream>
#include <optional>
#include <sstream>
#include <variant>
#include "lexer.h"
#include "support/bits.h"
#include "support/string.h"
using namespace std::string_view_literals;
namespace wasm::WATParser {
Name srcAnnotationKind("src");
namespace {
// ================
// Lexical Analysis
// ================
// The result of lexing a token fragment.
struct LexResult {
std::string_view span;
};
// Lexing context that accumulates lexed input to produce a token fragment.
struct LexCtx {
private:
// The input we are lexing.
std::string_view input;
// How much of the input we have already lexed.
size_t lexedSize = 0;
public:
explicit LexCtx(std::string_view in) : input(in) {}
// Return the fragment that has been lexed so far.
std::optional<LexResult> lexed() const {
if (lexedSize > 0) {
return {LexResult{input.substr(0, lexedSize)}};
}
return {};
}
// The next input that has not already been lexed.
std::string_view next() const { return input.substr(lexedSize); }
// Get the next character without consuming it.
uint8_t peek() const { return next()[0]; }
// The size of the unlexed input.
size_t size() const { return input.size() - lexedSize; }
// Whether there is no more input.
bool empty() const { return size() == 0; }
// Tokens must be separated by spaces or parentheses.
bool canFinish() const;
// Whether the unlexed input starts with prefix `sv`.
size_t startsWith(std::string_view sv) const {
return next().substr(0, sv.size()) == sv;
}
// Consume the next `n` characters.
void take(size_t n) { lexedSize += n; }
// Consume an additional lexed fragment.
void take(const LexResult& res) { lexedSize += res.span.size(); }
// Consume the prefix and return true if possible.
bool takePrefix(std::string_view sv) {
if (startsWith(sv)) {
take(sv.size());
return true;
}
return false;
}
// Consume the rest of the input.
void takeAll() { lexedSize = input.size(); }
};
enum OverflowBehavior { DisallowOverflow, IgnoreOverflow };
std::optional<int> getDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
}
return {};
}
std::optional<int> getHexDigit(char c) {
if ('0' <= c && c <= '9') {
return c - '0';
}
if ('A' <= c && c <= 'F') {
return 10 + c - 'A';
}
if ('a' <= c && c <= 'f') {
return 10 + c - 'a';
}
return {};
}
enum Sign { NoSign, Pos, Neg };
// The result of lexing an integer token fragment.
struct LexIntResult : LexResult {
uint64_t n;
Sign sign;
template<typename T> bool isUnsigned() {
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
return sign == NoSign && n <= std::numeric_limits<T>::max();
}
template<typename T> bool isSigned() {
static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
if (sign == Neg) {
return uint64_t(std::numeric_limits<T>::min()) <= n || n == 0;
}
return n <= uint64_t(std::numeric_limits<T>::max());
}
};
// Lexing context that accumulates lexed input to produce an integer token
// fragment.
struct LexIntCtx : LexCtx {
using LexCtx::take;
private:
uint64_t n = 0;
Sign sign = NoSign;
bool overflow = false;
public:
explicit LexIntCtx(std::string_view in) : LexCtx(in) {}
// Lex only the underlying span, ignoring the overflow and value.
std::optional<LexIntResult> lexedRaw() {
if (auto basic = LexCtx::lexed()) {
return LexIntResult{*basic, 0, NoSign};
}
return {};
}
std::optional<LexIntResult> lexed() {
if (overflow) {
return {};
}
if (auto basic = LexCtx::lexed()) {
return LexIntResult{*basic, sign == Neg ? -n : n, sign};
}
return {};
}
void takeSign() {
if (takePrefix("+"sv)) {
sign = Pos;
} else if (takePrefix("-"sv)) {
sign = Neg;
} else {
sign = NoSign;
}
}
bool takeDigit() {
if (!empty()) {
if (auto d = getDigit(peek())) {
take(1);
uint64_t newN = n * 10 + *d;
if (newN < n) {
overflow = true;
}
n = newN;
return true;
}
}
return false;
}
bool takeHexdigit() {
if (!empty()) {
if (auto h = getHexDigit(peek())) {
take(1);
uint64_t newN = n * 16 + *h;
if (newN < n) {
overflow = true;
}
n = newN;
return true;
}
}
return false;
}
void take(const LexIntResult& res) {
LexCtx::take(res);
n = res.n;
}
};
struct LexFloatResult : LexResult {
// The payload if we lexed a nan with payload. We cannot store the payload
// directly in `d` because we do not know at this point whether we are parsing
// an f32 or f64 and therefore we do not know what the allowable payloads are.
// No payload with NaN means to use the default payload for the expected float
// width.
std::optional<uint64_t> nanPayload;
double d;
};
struct LexFloatCtx : LexCtx {
std::optional<uint64_t> nanPayload;
LexFloatCtx(std::string_view in) : LexCtx(in) {}
std::optional<LexFloatResult> lexed() {
const double posNan = std::copysign(NAN, 1.0);
const double negNan = std::copysign(NAN, -1.0);
assert(!std::signbit(posNan) && "expected positive NaN to be positive");
assert(std::signbit(negNan) && "expected negative NaN to be negative");
auto basic = LexCtx::lexed();
if (!basic) {
return {};
}
// strtod does not return NaNs with the expected signs on all platforms.
// TODO: use starts_with once we have C++20.
if (basic->span.substr(0, 3) == "nan"sv ||
basic->span.substr(0, 4) == "+nan"sv) {
return LexFloatResult{*basic, nanPayload, posNan};
}
if (basic->span.substr(0, 4) == "-nan"sv) {
return LexFloatResult{*basic, nanPayload, negNan};
}
// Do not try to implement fully general and precise float parsing
// ourselves. Instead, call out to std::strtod to do our parsing. This means
// we need to strip any underscores since `std::strtod` does not understand
// them.
std::stringstream ss;
for (const char *curr = basic->span.data(),
*end = curr + basic->span.size();
curr != end;
++curr) {
if (*curr != '_') {
ss << *curr;
}
}
std::string str = ss.str();
char* last;
double d = std::strtod(str.data(), &last);
assert(last == str.data() + str.size() && "could not parse float");
return LexFloatResult{*basic, {}, d};
}
};
struct LexStrResult : LexResult {
// Allocate a string only if there are escape sequences, otherwise just use
// the original string_view.
std::optional<std::string> str;
std::string_view getStr() {
if (str) {
return *str;
}
return span;
}
};
struct LexStrCtx : LexCtx {
private:
// Used to build a string with resolved escape sequences. Only used when the
// parsed string contains escape sequences, otherwise we can just use the
// parsed string directly.
std::optional<std::stringstream> escapeBuilder;
public:
LexStrCtx(std::string_view in) : LexCtx(in) {}
std::optional<LexStrResult> lexed() {
if (auto basic = LexCtx::lexed()) {
if (escapeBuilder) {
return LexStrResult{*basic, {escapeBuilder->str()}};
} else {
return LexStrResult{*basic, {}};
}
}
return {};
}
void takeChar() {
if (escapeBuilder) {
*escapeBuilder << peek();
}
LexCtx::take(1);
}
void ensureBuildingEscaped() {
if (escapeBuilder) {
return;
}
// Drop the opening '"'.
escapeBuilder = std::stringstream{};
*escapeBuilder << LexCtx::lexed()->span.substr(1);
}
void appendEscaped(char c) { *escapeBuilder << c; }
bool appendUnicode(uint64_t u) {
if ((0xd800 <= u && u < 0xe000) || 0x110000 <= u) {
return false;
}
String::writeWTF8CodePoint(*escapeBuilder, u);
return true;
}
};
struct LexIdResult : LexResult {
bool isStr = false;
std::optional<std::string> str;
};
struct LexIdCtx : LexCtx {
bool isStr = false;
std::optional<std::string> str;
LexIdCtx(std::string_view in) : LexCtx(in) {}
std::optional<LexIdResult> lexed() {
if (auto basic = LexCtx::lexed()) {
return LexIdResult{*basic, isStr, str};
}
return {};
}
};
struct LexAnnotationResult : LexResult {
Annotation annotation;
};
struct LexAnnotationCtx : LexCtx {
std::string_view kind;
size_t kindSize = 0;
std::string_view contents;
size_t contentsSize = 0;
explicit LexAnnotationCtx(std::string_view in) : LexCtx(in) {}
void startKind() { kind = next(); }
void takeKind(size_t size) {
kindSize += size;
take(size);
}
void setKind(std::string_view kind) {
this->kind = kind;
kindSize = kind.size();
}
void startContents() { contents = next(); }
void takeContents(size_t size) {
contentsSize += size;
take(size);
}
std::optional<LexAnnotationResult> lexed() {
if (auto basic = LexCtx::lexed()) {
return LexAnnotationResult{
*basic,
{Name(kind.substr(0, kindSize)), contents.substr(0, contentsSize)}};
}
return std::nullopt;
}
};
std::optional<LexResult> idchar(std::string_view);
std::optional<LexResult> space(std::string_view);
std::optional<LexResult> keyword(std::string_view);
std::optional<LexIntResult> integer(std::string_view);
std::optional<LexFloatResult> float_(std::string_view);
std::optional<LexStrResult> str(std::string_view);
std::optional<LexIdResult> ident(std::string_view);
// annotation ::= ';;@' [^\n]* | '(@'idchar+ annotelem* ')'
// annotelem ::= keyword | reserved | uN | sN | fN | string | id
// | '(' annotelem* ')' | '(@'idchar+ annotelem* ')'
std::optional<LexAnnotationResult> annotation(std::string_view in) {
LexAnnotationCtx ctx(in);
if (ctx.takePrefix(";;@"sv)) {
ctx.setKind(srcAnnotationKind.str);
ctx.startContents();
if (auto size = ctx.next().find('\n'); size != ""sv.npos) {
ctx.takeContents(size);
} else {
ctx.takeContents(ctx.next().size());
}
} else if (ctx.takePrefix("(@"sv)) {
ctx.startKind();
bool hasIdchar = false;
while (auto lexed = idchar(ctx.next())) {
ctx.takeKind(1);
hasIdchar = true;
}
if (!hasIdchar) {
return std::nullopt;
}
ctx.startContents();
size_t depth = 1;
while (true) {
if (ctx.empty()) {
return std::nullopt;
}
if (auto lexed = space(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (auto lexed = keyword(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (auto lexed = integer(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (auto lexed = float_(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (auto lexed = str(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (auto lexed = ident(ctx.next())) {
ctx.takeContents(lexed->span.size());
continue;
}
if (ctx.startsWith("(@"sv)) {
ctx.takeContents(2);
bool hasIdchar = false;
while (auto lexed = idchar(ctx.next())) {
ctx.takeContents(1);
hasIdchar = true;
}
if (!hasIdchar) {
return std::nullopt;
}
++depth;
continue;
}
if (ctx.startsWith("("sv)) {
ctx.takeContents(1);
++depth;
continue;
}
if (ctx.startsWith(")"sv)) {
--depth;
if (depth == 0) {
ctx.take(1);
break;
}
ctx.takeContents(1);
continue;
}
// Unrecognized token.
return std::nullopt;
}
}
return ctx.lexed();
}
// comment ::= linecomment | blockcomment
// linecomment ::= ';;' linechar* ('\n' | eof)
// linechar ::= c:char (if c != '\n')
// blockcomment ::= '(;' blockchar* ';)'
// blockchar ::= c:char (if c != ';' and c != '(')
// | ';' (if the next char is not ')')
// | '(' (if the next char is not ';')
// | blockcomment
std::optional<LexResult> comment(std::string_view in) {
LexCtx ctx(in);
if (ctx.size() < 2) {
return {};
}
// Line comment
if (!ctx.startsWith(";;@"sv) && ctx.takePrefix(";;"sv)) {
if (auto size = ctx.next().find('\n'); size != ""sv.npos) {
ctx.take(size);
} else {
ctx.takeAll();
}
return ctx.lexed();
}
// Block comment (possibly nested!)
if (ctx.takePrefix("(;"sv)) {
size_t depth = 1;
while (depth > 0 && ctx.size() >= 2) {
if (ctx.takePrefix("(;"sv)) {
++depth;
} else if (ctx.takePrefix(";)"sv)) {
--depth;
} else {
ctx.take(1);
}
}
if (depth > 0) {
// TODO: Add error production for non-terminated block comment.
return {};
}
return ctx.lexed();
}
return {};
}
std::optional<LexResult> spacechar(std::string_view in) {
LexCtx ctx(in);
ctx.takePrefix(" "sv) || ctx.takePrefix("\n"sv) || ctx.takePrefix("\r"sv) ||
ctx.takePrefix("\t"sv);
return ctx.lexed();
}
// space ::= (' ' | format | comment)*
// format ::= '\t' | '\n' | '\r'
std::optional<LexResult> space(std::string_view in) {
LexCtx ctx(in);
while (ctx.size()) {
if (auto lexed = spacechar(ctx.next())) {
ctx.take(*lexed);
} else if (auto lexed = comment(ctx.next())) {
ctx.take(*lexed);
} else {
break;
}
}
return ctx.lexed();
}
bool LexCtx::canFinish() const {
// Logically we want to check for eof, parens, and space. But we don't
// actually want to parse more than a couple characters of space, so check for
// individual space chars or comment starts instead.
return empty() || startsWith("("sv) || startsWith(")"sv) ||
spacechar(next()) || startsWith(";;"sv);
}
// num ::= d:digit => d
// | n:num '_'? d:digit => 10*n + d
// digit ::= '0' => 0 | ... | '9' => 9
std::optional<LexIntResult> num(std::string_view in,
OverflowBehavior overflow = DisallowOverflow) {
LexIntCtx ctx(in);
if (ctx.empty()) {
return {};
}
if (!ctx.takeDigit()) {
return {};
}
while (true) {
bool under = ctx.takePrefix("_"sv);
if (!ctx.takeDigit()) {
if (!under) {
return overflow == DisallowOverflow ? ctx.lexed() : ctx.lexedRaw();
}
// TODO: Add error production for trailing underscore.
return {};
}
}
}
// hexnum ::= h:hexdigit => h
// | n:hexnum '_'? h:hexdigit => 16*n + h
// hexdigit ::= d:digit => d
// | 'A' => 10 | ... | 'F' => 15
// | 'a' => 10 | ... | 'f' => 15
std::optional<LexIntResult>
hexnum(std::string_view in, OverflowBehavior overflow = DisallowOverflow) {
LexIntCtx ctx(in);
if (!ctx.takeHexdigit()) {
return {};
}
while (true) {
bool under = ctx.takePrefix("_"sv);
if (!ctx.takeHexdigit()) {
if (!under) {
return overflow == DisallowOverflow ? ctx.lexed() : ctx.lexedRaw();
}
// TODO: Add error production for trailing underscore.
return {};
}
}
}
// uN ::= n:num => n (if n < 2^N)
// | '0x' n:hexnum => n (if n < 2^N)
// sN ::= s:sign n:num => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
// | s:sign '0x' n:hexnum => [s]n (if -2^(N-1) <= [s]n < 2^(N-1))
// sign ::= {} => + | '+' => + | '-' => -
//
// Note: Defer bounds and sign checking until we know what kind of integer we
// expect.
std::optional<LexIntResult> integer(std::string_view in) {
LexIntCtx ctx(in);
ctx.takeSign();
if (ctx.takePrefix("0x"sv)) {
if (auto lexed = hexnum(ctx.next())) {
ctx.take(*lexed);
if (ctx.canFinish()) {
return ctx.lexed();
}
}
// TODO: Add error production for unrecognized hexnum.
return {};
}
if (auto lexed = num(ctx.next())) {
ctx.take(*lexed);
if (ctx.canFinish()) {
return ctx.lexed();
}
}
return {};
}
// float ::= p:num '.'? => p
// | p:num '.' q:frac => p + q
// | p:num '.'? ('E'|'e') s:sign e:num => p * 10^([s]e)
// | p:num '.' q:frac ('E'|'e') s:sign e:num => (p + q) * 10^([s]e)
// frac ::= d:digit => d/10
// | d:digit '_'? p:frac => (d + p/10) / 10
std::optional<LexResult> decfloat(std::string_view in) {
LexCtx ctx(in);
if (auto lexed = num(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
} else {
return {};
}
// Optional '.' followed by optional frac
if (ctx.takePrefix("."sv)) {
if (auto lexed = num(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
}
}
if (ctx.takePrefix("E"sv) || ctx.takePrefix("e"sv)) {
// Optional sign
ctx.takePrefix("+"sv) || ctx.takePrefix("-"sv);
if (auto lexed = num(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
} else {
// TODO: Add error production for missing exponent.
return {};
}
}
return ctx.lexed();
}
// hexfloat ::= '0x' p:hexnum '.'? => p
// | '0x' p:hexnum '.' q:hexfrac => p + q
// | '0x' p:hexnum '.'? ('P'|'p') s:sign e:num => p * 2^([s]e)
// | '0x' p:hexnum '.' q:hexfrac ('P'|'p') s:sign e:num
// => (p + q) * 2^([s]e)
// hexfrac ::= h:hexdigit => h/16
// | h:hexdigit '_'? p:hexfrac => (h + p/16) / 16
std::optional<LexResult> hexfloat(std::string_view in) {
LexCtx ctx(in);
if (!ctx.takePrefix("0x"sv)) {
return {};
}
if (auto lexed = hexnum(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
} else {
return {};
}
// Optional '.' followed by optional hexfrac
if (ctx.takePrefix("."sv)) {
if (auto lexed = hexnum(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
}
}
if (ctx.takePrefix("P"sv) || ctx.takePrefix("p"sv)) {
// Optional sign
ctx.takePrefix("+"sv) || ctx.takePrefix("-"sv);
if (auto lexed = num(ctx.next(), IgnoreOverflow)) {
ctx.take(*lexed);
} else {
// TODO: Add error production for missing exponent.
return {};
}
}
return ctx.lexed();
}
// fN ::= s:sign z:fNmag => [s]z
// fNmag ::= z:float => float_N(z) (if float_N(z) != +/-infinity)
// | z:hexfloat => float_N(z) (if float_N(z) != +/-infinity)
// | 'inf' => infinity
// | 'nan' => nan(2^(signif(N)-1))
// | 'nan:0x' n:hexnum => nan(n) (if 1 <= n < 2^signif(N))
std::optional<LexFloatResult> float_(std::string_view in) {
LexFloatCtx ctx(in);
// Optional sign
ctx.takePrefix("+"sv) || ctx.takePrefix("-"sv);
if (auto lexed = hexfloat(ctx.next())) {
ctx.take(*lexed);
} else if (auto lexed = decfloat(ctx.next())) {
ctx.take(*lexed);
} else if (ctx.takePrefix("inf"sv)) {
// nop
} else if (ctx.takePrefix("nan"sv)) {
if (ctx.takePrefix(":0x"sv)) {
if (auto lexed = hexnum(ctx.next())) {
ctx.take(*lexed);
ctx.nanPayload = lexed->n;
} else {
// TODO: Add error production for malformed NaN payload.
return {};
}
} else {
// No explicit payload necessary; we will inject the default payload
// later.
}
} else {
return {};
}
if (ctx.canFinish()) {
return ctx.lexed();
}
return {};
}
// idchar ::= '0' | ... | '9'
// | 'A' | ... | 'Z'
// | 'a' | ... | 'z'
// | '!' | '#' | '$' | '%' | '&' | ''' | '*' | '+'
// | '-' | '.' | '/' | ':' | '<' | '=' | '>' | '?'
// | '@' | '\' | '^' | '_' | '`' | '|' | '~'
std::optional<LexResult> idchar(std::string_view in) {
LexCtx ctx(in);
if (ctx.empty()) {
return {};
}
uint8_t c = ctx.peek();
// All the allowed characters lie in the range '!' to '~', and within that
// range the vast majority of characters are allowed, so it is significantly
// faster to check for the disallowed characters instead.
if (c < '!' || c > '~') {
return ctx.lexed();
}
switch (c) {
case '"':
case '(':
case ')':
case ',':
case ';':
case '[':
case ']':
case '{':
case '}':
return ctx.lexed();
}
ctx.take(1);
return ctx.lexed();
}
// string ::= '"' (b*:stringelem)* '"' => concat((b*)*)
// (if |concat((b*)*)| < 2^32)
// stringelem ::= c:stringchar => utf8(c)
// | '\' n:hexdigit m:hexdigit => 16*n + m
// stringchar ::= c:char => c
// (if c >= U+20 && c != U+7f && c != '"' && c != '\')
// | '\t' => \t | '\n' => \n | '\r' => \r
// | '\\' => \ | '\"' => " | '\'' => '
// | '\u{' n:hexnum '}' => U+(n)
// (if n < 0xD800 and 0xE000 <= n <= 0x110000)
std::optional<LexStrResult> str(std::string_view in) {
LexStrCtx ctx(in);
if (!ctx.takePrefix("\""sv)) {
return {};
}
while (!ctx.takePrefix("\""sv)) {
if (ctx.empty()) {
// TODO: Add error production for unterminated string.
return {};
}
if (ctx.startsWith("\\"sv)) {
// Escape sequences
ctx.ensureBuildingEscaped();
ctx.take(1);
if (ctx.takePrefix("t"sv)) {
ctx.appendEscaped('\t');
} else if (ctx.takePrefix("n"sv)) {
ctx.appendEscaped('\n');
} else if (ctx.takePrefix("r"sv)) {
ctx.appendEscaped('\r');
} else if (ctx.takePrefix("\\"sv)) {
ctx.appendEscaped('\\');
} else if (ctx.takePrefix("\""sv)) {
ctx.appendEscaped('"');
} else if (ctx.takePrefix("'"sv)) {
ctx.appendEscaped('\'');
} else if (ctx.takePrefix("u{"sv)) {
auto lexed = hexnum(ctx.next());
if (!lexed) {
// TODO: Add error production for malformed unicode escapes.
return {};
}
ctx.take(*lexed);
if (!ctx.takePrefix("}"sv)) {
// TODO: Add error production for malformed unicode escapes.
return {};
}
if (!ctx.appendUnicode(lexed->n)) {
// TODO: Add error production for invalid unicode values.
return {};
}
} else {
LexIntCtx ictx(ctx.next());
if (!ictx.takeHexdigit() || !ictx.takeHexdigit()) {
// TODO: Add error production for unrecognized escape sequence.
return {};
}
auto lexed = *ictx.lexed();
ctx.take(lexed);
ctx.appendEscaped(char(lexed.n));
}
} else {
// Normal characters
if (uint8_t c = ctx.peek(); c >= 0x20 && c != 0x7F) {
ctx.takeChar();
} else {
// TODO: Add error production for unescaped control characters.
return {};
}
}
}
return ctx.lexed();
}
// id ::= '$' idchar+ | '$' str
std::optional<LexIdResult> ident(std::string_view in) {
LexIdCtx ctx(in);
if (!ctx.takePrefix("$"sv)) {
return {};
}
if (auto s = str(ctx.next())) {
if (!String::isUTF8(s->getStr())) {
return {};
}
ctx.isStr = true;
ctx.str = s->str;
ctx.take(*s);
} else if (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
while (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
}
} else {
return {};
}
if (ctx.canFinish()) {
return ctx.lexed();
}
return {};
}
// keyword ::= ( 'a' | ... | 'z' ) idchar* (if literal terminal in grammar)
// reserved ::= idchar+
//
// The "keyword" token we lex here covers both keywords as well as any reserved
// tokens that match the keyword format. This saves us from having to enumerate
// all the valid keywords here. These invalid keywords will still produce
// errors, just at a higher level of the parser.
std::optional<LexResult> keyword(std::string_view in) {
LexCtx ctx(in);
if (ctx.empty()) {
return {};
}
uint8_t start = ctx.peek();
if ('a' <= start && start <= 'z') {
ctx.take(1);
} else {
return {};
}
while (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
}
return ctx.lexed();
}
} // anonymous namespace
void Lexer::skipSpace() {
while (true) {
if (auto ctx = annotation(next())) {
pos += ctx->span.size();
annotations.push_back(ctx->annotation);
continue;
}
if (auto ctx = space(next())) {
pos += ctx->span.size();
continue;
}
break;
}
}
bool Lexer::takeLParen() {
if (LexCtx(next()).startsWith("("sv)) {
++pos;
advance();
return true;
}
return false;
}
bool Lexer::takeRParen() {
if (LexCtx(next()).startsWith(")"sv)) {
++pos;
advance();
return true;
}
return false;
}
std::optional<std::string> Lexer::takeString() {
if (auto result = str(next())) {
pos += result->span.size();
advance();
if (result->str) {
return result->str;
}
// Remove quotes.
return std::string(result->span.substr(1, result->span.size() - 2));
}
return std::nullopt;
}
std::optional<Name> Lexer::takeID() {
if (auto result = ident(next())) {
pos += result->span.size();
advance();
if (result->str) {
return Name(*result->str);
}
if (result->isStr) {
// Remove '$' and quotes.
return Name(result->span.substr(2, result->span.size() - 3));
}
// Remove '$'.
return Name(result->span.substr(1));
}
return std::nullopt;
}
std::optional<std::string_view> Lexer::takeKeyword() {
if (auto result = keyword(next())) {
pos += result->span.size();
advance();
return result->span;
}
return std::nullopt;
}
bool Lexer::takeKeyword(std::string_view expected) {
if (auto result = keyword(next()); result && result->span == expected) {
pos += expected.size();
advance();
return true;
}
return false;
}
std::optional<uint64_t> Lexer::takeOffset() {
if (auto result = keyword(next())) {
if (result->span.substr(0, 7) != "offset="sv) {
return std::nullopt;
}
Lexer subLexer(result->span.substr(7));
if (auto o = subLexer.takeU64()) {
pos += result->span.size();
advance();
return o;
}
}
return std::nullopt;
}
std::optional<uint32_t> Lexer::takeAlign() {
if (auto result = keyword(next())) {
if (result->span.substr(0, 6) != "align="sv) {
return std::nullopt;
}
Lexer subLexer(result->span.substr(6));
if (auto o = subLexer.takeU32()) {
if (Bits::popCount(*o) != 1) {
return std::nullopt;
}
pos += result->span.size();
advance();
return o;
}
}
return std::nullopt;
}
template<typename T> std::optional<T> Lexer::takeU() {
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
if (auto result = integer(next()); result && result->isUnsigned<T>()) {
pos += result->span.size();
advance();
return T(result->n);
}
// TODO: Add error production for unsigned overflow.
return std::nullopt;
}
template<typename T> std::optional<T> Lexer::takeS() {
static_assert(std::is_integral_v<T> && std::is_signed_v<T>);
if (auto result = integer(next()); result && result->isSigned<T>()) {
pos += result->span.size();
advance();
return T(result->n);
}
return std::nullopt;
}
template<typename T> std::optional<T> Lexer::takeI() {
static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
if (auto result = integer(next())) {
if (result->isUnsigned<T>() || result->isSigned<std::make_signed_t<T>>()) {
pos += result->span.size();
advance();
return T(result->n);
}
}
return std::nullopt;
}
template std::optional<uint64_t> Lexer::takeU<uint64_t>();
template std::optional<int64_t> Lexer::takeS<int64_t>();
template std::optional<uint64_t> Lexer::takeI<uint64_t>();
template std::optional<uint32_t> Lexer::takeU<uint32_t>();
template std::optional<int32_t> Lexer::takeS<int32_t>();
template std::optional<uint32_t> Lexer::takeI<uint32_t>();
template std::optional<uint16_t> Lexer::takeU<uint16_t>();
template std::optional<int16_t> Lexer::takeS<int16_t>();
template std::optional<uint16_t> Lexer::takeI<uint16_t>();
template std::optional<uint8_t> Lexer::takeU<uint8_t>();
template std::optional<int8_t> Lexer::takeS<int8_t>();
template std::optional<uint8_t> Lexer::takeI<uint8_t>();
std::optional<double> Lexer::takeF64() {
constexpr int signif = 52;
constexpr uint64_t payloadMask = (1ull << signif) - 1;
constexpr uint64_t nanDefault = 1ull << (signif - 1);
if (auto result = float_(next())) {
double d = result->d;
if (std::isnan(d)) {
// Inject payload.
uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
if (payload == 0 || payload > payloadMask) {
// TODO: Add error production for out-of-bounds payload.
return std::nullopt;
}
uint64_t bits;
static_assert(sizeof(bits) == sizeof(d));
memcpy(&bits, &d, sizeof(bits));
bits = (bits & ~payloadMask) | payload;
memcpy(&d, &bits, sizeof(bits));
}
pos += result->span.size();
advance();
return d;
}
if (auto result = integer(next())) {
pos += result->span.size();
advance();
if (result->sign == Neg) {
if (result->n == 0) {
return -0.0;
}
return double(int64_t(result->n));
}
return double(result->n);
}
return std::nullopt;
}
std::optional<float> Lexer::takeF32() {
constexpr int signif = 23;
constexpr uint32_t payloadMask = (1u << signif) - 1;
constexpr uint64_t nanDefault = 1ull << (signif - 1);
if (auto result = float_(next())) {
float f = result->d;
if (std::isnan(f)) {
// Validate and inject payload.
uint64_t payload = result->nanPayload ? *result->nanPayload : nanDefault;
if (payload == 0 || payload > payloadMask) {
// TODO: Add error production for out-of-bounds payload.
return std::nullopt;
}
uint32_t bits;
static_assert(sizeof(bits) == sizeof(f));
memcpy(&bits, &f, sizeof(bits));
bits = (bits & ~payloadMask) | payload;
memcpy(&f, &bits, sizeof(bits));
}
pos += result->span.size();
advance();
return f;
}
if (auto result = integer(next())) {
pos += result->span.size();
advance();
if (result->sign == Neg) {
if (result->n == 0) {
return -0.0f;
}
return float(int64_t(result->n));
}
return float(result->n);
}
return std::nullopt;
}
TextPos Lexer::position(const char* c) const {
assert(size_t(c - buffer.data()) <= buffer.size());
TextPos pos{1, 0};
for (const char* p = buffer.data(); p != c; ++p) {
if (*p == '\n') {
pos.line++;
pos.col = 0;
} else {
pos.col++;
}
}
return pos;
}
bool TextPos::operator==(const TextPos& other) const {
return line == other.line && col == other.col;
}
std::ostream& operator<<(std::ostream& os, const TextPos& pos) {
return os << pos.line << ":" << pos.col;
}
} // namespace wasm::WATParser