blob: d0f365e8cede94d516c5190c586a447c607e4516 [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "json_parser.h"
#include <cassert>
#include <limits>
#include <string>
#include "status.h"
namespace inspector_protocol {
namespace {
const int kStackLimit = 1000;
enum Token {
ObjectBegin,
ObjectEnd,
ArrayBegin,
ArrayEnd,
StringLiteral,
Number,
BoolTrue,
BoolFalse,
NullToken,
ListSeparator,
ObjectPairSeparator,
InvalidToken,
NoInput
};
const char* const kNullString = "null";
const char* const kTrueString = "true";
const char* const kFalseString = "false";
template <typename Char>
class JsonParser {
public:
JsonParser(const Platform* platform, JSONParserHandler* handler)
: platform_(platform), handler_(handler) {}
void Parse(const Char* start, std::size_t length) {
start_pos_ = start;
const Char* end = start + length;
const Char* tokenEnd;
ParseValue(start, end, &tokenEnd, 0);
if (tokenEnd != end) {
HandleError(Error::JSON_PARSER_UNPROCESSED_INPUT_REMAINS, tokenEnd);
}
}
private:
bool CharsToDouble(const uint16_t* chars, std::size_t length,
double* result) {
std::string buffer;
buffer.reserve(length + 1);
for (std::size_t ii = 0; ii < length; ++ii) {
bool is_ascii = !(chars[ii] & ~0x7F);
if (!is_ascii) return false;
buffer.push_back(static_cast<char>(chars[ii]));
}
return platform_->StrToD(buffer.c_str(), result);
}
bool CharsToDouble(const uint8_t* chars, std::size_t length, double* result) {
std::string buffer(reinterpret_cast<const char*>(chars), length);
return platform_->StrToD(buffer.c_str(), result);
}
static bool ParseConstToken(const Char* start, const Char* end,
const Char** token_end, const char* token) {
// |token| is \0 terminated, it's one of the constants at top of the file.
while (start < end && *token != '\0' && *start++ == *token++) {
}
if (*token != '\0') return false;
*token_end = start;
return true;
}
static bool ReadInt(const Char* start, const Char* end,
const Char** token_end, bool allow_leading_zeros) {
if (start == end) return false;
bool has_leading_zero = '0' == *start;
int length = 0;
while (start < end && '0' <= *start && *start <= '9') {
++start;
++length;
}
if (!length) return false;
if (!allow_leading_zeros && length > 1 && has_leading_zero) return false;
*token_end = start;
return true;
}
static bool ParseNumberToken(const Char* start, const Char* end,
const Char** token_end) {
// We just grab the number here. We validate the size in DecodeNumber.
// According to RFC4627, a valid number is: [minus] int [frac] [exp]
if (start == end) return false;
Char c = *start;
if ('-' == c) ++start;
if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/false))
return false;
if (start == end) {
*token_end = start;
return true;
}
// Optional fraction part
c = *start;
if ('.' == c) {
++start;
if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
return false;
if (start == end) {
*token_end = start;
return true;
}
c = *start;
}
// Optional exponent part
if ('e' == c || 'E' == c) {
++start;
if (start == end) return false;
c = *start;
if ('-' == c || '+' == c) {
++start;
if (start == end) return false;
}
if (!ReadInt(start, end, &start, /*allow_leading_zeros=*/true))
return false;
}
*token_end = start;
return true;
}
static bool ReadHexDigits(const Char* start, const Char* end,
const Char** token_end, int digits) {
if (end - start < digits) return false;
for (int i = 0; i < digits; ++i) {
Char c = *start++;
if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
('A' <= c && c <= 'F')))
return false;
}
*token_end = start;
return true;
}
static bool ParseStringToken(const Char* start, const Char* end,
const Char** token_end) {
while (start < end) {
Char c = *start++;
if ('\\' == c) {
if (start == end) return false;
c = *start++;
// Make sure the escaped char is valid.
switch (c) {
case 'x':
if (!ReadHexDigits(start, end, &start, 2)) return false;
break;
case 'u':
if (!ReadHexDigits(start, end, &start, 4)) return false;
break;
case '\\':
case '/':
case 'b':
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
case '"':
break;
default:
return false;
}
} else if ('"' == c) {
*token_end = start;
return true;
}
}
return false;
}
static bool SkipComment(const Char* start, const Char* end,
const Char** comment_end) {
if (start == end) return false;
if (*start != '/' || start + 1 >= end) return false;
++start;
if (*start == '/') {
// Single line comment, read to newline.
for (++start; start < end; ++start) {
if (*start == '\n' || *start == '\r') {
*comment_end = start + 1;
return true;
}
}
*comment_end = end;
// Comment reaches end-of-input, which is fine.
return true;
}
if (*start == '*') {
Char previous = '\0';
// Block comment, read until end marker.
for (++start; start < end; previous = *start++) {
if (previous == '*' && *start == '/') {
*comment_end = start + 1;
return true;
}
}
// Block comment must close before end-of-input.
return false;
}
return false;
}
static bool IsSpaceOrNewLine(Char c) {
// \v = vertial tab; \f = form feed page break.
return c == ' ' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
c == '\t';
}
static void SkipWhitespaceAndComments(const Char* start, const Char* end,
const Char** whitespace_end) {
while (start < end) {
if (IsSpaceOrNewLine(*start)) {
++start;
} else if (*start == '/') {
const Char* comment_end;
if (!SkipComment(start, end, &comment_end)) break;
start = comment_end;
} else {
break;
}
}
*whitespace_end = start;
}
static Token ParseToken(const Char* start, const Char* end,
const Char** tokenStart, const Char** token_end) {
SkipWhitespaceAndComments(start, end, tokenStart);
start = *tokenStart;
if (start == end) return NoInput;
switch (*start) {
case 'n':
if (ParseConstToken(start, end, token_end, kNullString))
return NullToken;
break;
case 't':
if (ParseConstToken(start, end, token_end, kTrueString))
return BoolTrue;
break;
case 'f':
if (ParseConstToken(start, end, token_end, kFalseString))
return BoolFalse;
break;
case '[':
*token_end = start + 1;
return ArrayBegin;
case ']':
*token_end = start + 1;
return ArrayEnd;
case ',':
*token_end = start + 1;
return ListSeparator;
case '{':
*token_end = start + 1;
return ObjectBegin;
case '}':
*token_end = start + 1;
return ObjectEnd;
case ':':
*token_end = start + 1;
return ObjectPairSeparator;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
if (ParseNumberToken(start, end, token_end)) return Number;
break;
case '"':
if (ParseStringToken(start + 1, end, token_end)) return StringLiteral;
break;
}
return InvalidToken;
}
static int HexToInt(Char c) {
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
assert(false); // Unreachable.
return 0;
}
static bool DecodeString(const Char* start, const Char* end,
std::vector<uint16_t>* output) {
if (start == end) return true;
if (start > end) return false;
output->reserve(end - start);
while (start < end) {
uint16_t c = *start++;
// If the |Char| we're dealing with is really a byte, then
// we have utf8 here, and we need to check for multibyte characters
// and transcode them to utf16 (either one or two utf16 chars).
if (sizeof(Char) == sizeof(uint8_t) && c >= 0x7f) {
// Inspect the leading byte to figure out how long the utf8
// byte sequence is; while doing this initialize |codepoint|
// with the first few bits.
// See table in: https://en.wikipedia.org/wiki/UTF-8
// byte one is 110x xxxx -> 2 byte utf8 sequence
// byte one is 1110 xxxx -> 3 byte utf8 sequence
// byte one is 1111 0xxx -> 4 byte utf8 sequence
uint32_t codepoint;
int num_bytes_left;
if ((c & 0xe0) == 0xc0) { // 2 byte utf8 sequence
num_bytes_left = 1;
codepoint = c & 0x1f;
} else if ((c & 0xf0) == 0xe0) { // 3 byte utf8 sequence
num_bytes_left = 2;
codepoint = c & 0x0f;
} else if ((c & 0xf8) == 0xf0) { // 4 byte utf8 sequence
codepoint = c & 0x07;
num_bytes_left = 3;
} else {
return false; // invalid leading byte
}
// If we have enough bytes in our inpput, decode the remaining ones
// belonging to this Unicode character into |codepoint|.
if (start + num_bytes_left > end) return false;
while (num_bytes_left > 0) {
c = *start++;
--num_bytes_left;
// Check the next byte is a continuation byte, that is 10xx xxxx.
if ((c & 0xc0) != 0x80) return false;
codepoint = (codepoint << 6) | (c & 0x3f);
}
// Disallow overlong encodings for ascii characters, as these
// would include " and other characters significant to JSON
// string termination / control.
if (codepoint < 0x7f) return false;
// Invalid in UTF8, and can't be represented in UTF16 anyway.
if (codepoint > 0x10ffff) return false;
// So, now we transcode to UTF16,
// using the math described at https://en.wikipedia.org/wiki/UTF-16,
// for either one or two 16 bit characters.
if (codepoint < 0xffff) {
output->push_back(codepoint);
continue;
}
codepoint -= 0x10000;
output->push_back((codepoint >> 10) + 0xd800); // high surrogate
output->push_back((codepoint & 0x3ff) + 0xdc00); // low surrogate
continue;
}
if ('\\' != c) {
output->push_back(c);
continue;
}
if (start == end) return false;
c = *start++;
if (c == 'x') {
// \x is not supported.
return false;
}
switch (c) {
case '"':
case '/':
case '\\':
break;
case 'b':
c = '\b';
break;
case 'f':
c = '\f';
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
case 'v':
c = '\v';
break;
case 'u':
c = (HexToInt(*start) << 12) + (HexToInt(*(start + 1)) << 8) +
(HexToInt(*(start + 2)) << 4) + HexToInt(*(start + 3));
start += 4;
break;
default:
return false;
}
output->push_back(c);
}
return true;
}
void ParseValue(const Char* start, const Char* end,
const Char** value_token_end, int depth) {
if (depth > kStackLimit) {
HandleError(Error::JSON_PARSER_STACK_LIMIT_EXCEEDED, start);
return;
}
const Char* token_start;
const Char* token_end;
Token token = ParseToken(start, end, &token_start, &token_end);
switch (token) {
case NoInput:
HandleError(Error::JSON_PARSER_NO_INPUT, token_start);
return;
case InvalidToken:
HandleError(Error::JSON_PARSER_INVALID_TOKEN, token_start);
return;
case NullToken:
handler_->HandleNull();
break;
case BoolTrue:
handler_->HandleBool(true);
break;
case BoolFalse:
handler_->HandleBool(false);
break;
case Number: {
double value;
if (!CharsToDouble(token_start, token_end - token_start, &value)) {
HandleError(Error::JSON_PARSER_INVALID_NUMBER, token_start);
return;
}
if (value >= std::numeric_limits<int32_t>::min() &&
value <= std::numeric_limits<int32_t>::max() &&
static_cast<int32_t>(value) == value)
handler_->HandleInt32(static_cast<int32_t>(value));
else
handler_->HandleDouble(value);
break;
}
case StringLiteral: {
std::vector<uint16_t> value;
bool ok = DecodeString(token_start + 1, token_end - 1, &value);
if (!ok) {
HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
return;
}
handler_->HandleString16(span<uint16_t>(value.data(), value.size()));
break;
}
case ArrayBegin: {
handler_->HandleArrayBegin();
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
while (token != ArrayEnd) {
ParseValue(start, end, &token_end, depth + 1);
if (error_) return;
// After a list value, we expect a comma or the end of the list.
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
if (token == ListSeparator) {
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
if (token == ArrayEnd) {
HandleError(Error::JSON_PARSER_UNEXPECTED_ARRAY_END, token_start);
return;
}
} else if (token != ArrayEnd) {
// Unexpected value after list value. Bail out.
HandleError(Error::JSON_PARSER_COMMA_OR_ARRAY_END_EXPECTED,
token_start);
return;
}
}
handler_->HandleArrayEnd();
break;
}
case ObjectBegin: {
handler_->HandleObjectBegin();
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
while (token != ObjectEnd) {
if (token != StringLiteral) {
HandleError(Error::JSON_PARSER_STRING_LITERAL_EXPECTED,
token_start);
return;
}
std::vector<uint16_t> key;
if (!DecodeString(token_start + 1, token_end - 1, &key)) {
HandleError(Error::JSON_PARSER_INVALID_STRING, token_start);
return;
}
handler_->HandleString16(span<uint16_t>(key.data(), key.size()));
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
if (token != ObjectPairSeparator) {
HandleError(Error::JSON_PARSER_COLON_EXPECTED, token_start);
return;
}
start = token_end;
ParseValue(start, end, &token_end, depth + 1);
if (error_) return;
start = token_end;
// After a key/value pair, we expect a comma or the end of the
// object.
token = ParseToken(start, end, &token_start, &token_end);
if (token == ListSeparator) {
start = token_end;
token = ParseToken(start, end, &token_start, &token_end);
if (token == ObjectEnd) {
HandleError(Error::JSON_PARSER_UNEXPECTED_OBJECT_END,
token_start);
return;
}
} else if (token != ObjectEnd) {
// Unexpected value after last object value. Bail out.
HandleError(Error::JSON_PARSER_COMMA_OR_OBJECT_END_EXPECTED,
token_start);
return;
}
}
handler_->HandleObjectEnd();
break;
}
default:
// We got a token that's not a value.
HandleError(Error::JSON_PARSER_VALUE_EXPECTED, token_start);
return;
}
SkipWhitespaceAndComments(token_end, end, value_token_end);
}
void HandleError(Error error, const Char* pos) {
assert(error != Error::OK);
if (!error_) {
handler_->HandleError(Status{error, pos - start_pos_});
error_ = true;
}
}
const Char* start_pos_ = nullptr;
bool error_ = false;
const Platform* platform_;
JSONParserHandler* handler_;
};
} // namespace
void ParseJSONChars(const Platform* platform, span<uint8_t> chars,
JSONParserHandler* handler) {
JsonParser<uint8_t> parser(platform, handler);
parser.Parse(chars.data(), chars.size());
}
void ParseJSONChars(const Platform* platform, span<uint16_t> chars,
JSONParserHandler* handler) {
JsonParser<uint16_t> parser(platform, handler);
parser.Parse(chars.data(), chars.size());
}
} // namespace inspector_protocol