blob: 02469c8d145651a40a16e4bb7d187343df1ec0cb [file] [log] [blame]
// Copyright 2017 The Goma Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "cpp_tokenizer.h"
#ifndef NO_SSE2
#include <emmintrin.h>
#endif // NO_SSE2
#ifdef _WIN32
#include <intrin.h>
#endif
#include "absl/strings/ascii.h"
#include "compiler_specific.h"
#include "glog/logging.h"
namespace {
#ifdef _WIN32
static inline int CountZero(int v) {
unsigned long r;
_BitScanForward(&r, v);
return r;
}
#else
static inline int CountZero(int v) {
return __builtin_ctz(v);
}
#endif
// __popcnt (on MSVC) emits POPCNT. Some engineers are still using older
// machine that does not have POPCNT. So, we'd like to avoid __popcnt.
// clang-cl.exe must have __builtin_popcunt, so use it.
// For cl.exe, use this somewhat fast algorithm.
// See b/65465347
#if defined(_WIN32) && !defined(__clang__)
static inline int PopCount(int v) {
v = (v & 0x55555555) + (v >> 1 & 0x55555555);
v = (v & 0x33333333) + (v >> 2 & 0x33333333);
v = (v & 0x0f0f0f0f) + (v >> 4 & 0x0f0f0f0f);
v = (v & 0x00ff00ff) + (v >> 8 & 0x00ff00ff);
return (v & 0x0000ffff) + (v >>16 & 0x0000ffff);
}
#else
static inline int PopCount(int v) {
return __builtin_popcount(v);
}
#endif
#ifndef NO_SSE2
typedef ALIGNAS(16) char aligned_char16[16];
const aligned_char16 kNewlinePattern = {
0xA, 0xA, 0xA, 0xA, 0xA, 0xA, 0xA, 0xA,
0xA, 0xA, 0xA, 0xA, 0xA, 0xA, 0xA, 0xA,
};
const aligned_char16 kSlashPattern = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/',
};
const aligned_char16 kSharpPattern = {
'#', '#', '#', '#', '#', '#', '#', '#',
'#', '#', '#', '#', '#', '#', '#', '#',
};
#endif // NO_SSE2
} // anonymous namespace
namespace devtools_goma {
// static
bool CppTokenizer::TokenizeAll(const std::string& str,
bool skip_space,
ArrayTokenList* result) {
std::unique_ptr<Content> content = Content::CreateFromString(str);
CppInputStream stream(content.get());
string error_reason;
ArrayTokenList tokens;
while (true) {
CppToken token;
if (!NextTokenFrom(&stream, skip_space, &token, &error_reason)) {
break;
}
if (token.type == CppToken::END) {
break;
}
tokens.push_back(std::move(token));
}
if (!error_reason.empty()) {
LOG(ERROR) << "failed to tokenize:"
<< " input=" << str
<< " error=" << error_reason;
return false;
}
*result = std::move(tokens);
return true;
}
// static
bool CppTokenizer::NextTokenFrom(CppInputStream* stream,
bool skip_space,
CppToken* token,
std::string* error_reason) {
for (;;) {
const char* cur = stream->cur();
int c = stream->GetChar();
if (c == EOF) {
*token = CppToken(CppToken::END);
return true;
}
if (c >= 128) {
*token = CppToken(CppToken::PUNCTUATOR, static_cast<char>(c));
return true;
}
if (IsCppBlank(c)) {
if (skip_space) {
stream->SkipWhiteSpaces();
continue;
}
*token = CppToken(CppToken::SPACE, static_cast<char>(c));
return true;
}
int c1 = stream->PeekChar();
switch (c) {
case '/':
if (c1 == '/') {
SkipUntilLineBreakIgnoreComment(stream);
*token = CppToken(CppToken::NEWLINE);
return true;
}
if (c1 == '*') {
stream->Advance(1, 0);
if (!SkipComment(stream, error_reason)) {
*token = CppToken(CppToken::END);
return false;
}
*token = CppToken(CppToken::SPACE, ' ');
return true;
}
*token = CppToken(CppToken::DIV, '/');
return true;
case '%':
if (c1 == ':') {
stream->Advance(1, 0);
if (stream->PeekChar(0) == '%' &&
stream->PeekChar(1) == ':') {
stream->Advance(2, 0);
*token = CppToken(CppToken::DOUBLESHARP);
return true;
}
*token = CppToken(CppToken::SHARP, '#');
return true;
}
*token = CppToken(CppToken::MOD, '%');
return true;
case '.':
if (c1 >= '0' && c1 <= '9') {
*token = ReadNumber(stream, c, cur);
return true;
}
if (c1 == '.' && stream->PeekChar(1) == '.') {
stream->Advance(2, 0);
*token = CppToken(CppToken::TRIPLEDOT);
return true;
}
*token = CppToken(CppToken::PUNCTUATOR, '.');
return true;
case '\\':
c = stream->GetChar();
if (c != '\r' && c != '\n') {
*token = CppToken(CppToken::ESCAPED, static_cast<char>(c));
return true;
}
if (c == '\r' && stream->PeekChar() == '\n')
stream->Advance(1, 1);
break;
case '"': {
*token = CppToken(CppToken::STRING);
if (!ReadString(stream, token, error_reason)) {
return false;
}
return true;
}
case '\'':
if (ReadCharLiteral(stream, token)) {
return true;
}
// Non-ended single quotation is valid in preprocessor.
// e.g. 'A will be PUNCTUATOR '\'' and IDENTIFIER('A).
FALLTHROUGH_INTENDED;
default:
if (c == '_' || c == '$' || absl::ascii_isalpha(c)) {
*token = ReadIdentifier(stream, cur);
return true;
}
if (c >= '0' && c <= '9') {
*token = ReadNumber(stream, c, cur);
return true;
}
if (c1 == EOF) {
*token = CppToken(TypeFrom(c, 0), static_cast<char>(c));
return true;
}
if ((c1 & ~0x7f) == 0 && TypeFrom(c, c1) != CppToken::PUNCTUATOR) {
stream->Advance(1, 0);
*token = CppToken(TypeFrom(c, c1),
static_cast<char>(c), static_cast<char>(c1));
return true;
}
*token = CppToken(TypeFrom(c, 0), static_cast<char>(c));
return true;
}
}
}
// static
bool CppTokenizer::ReadStringUntilDelimiter(CppInputStream* stream,
std::string* result_str,
char delimiter,
std::string* error_reason) {
const char* begin = stream->cur();
for (;;) {
int c = stream->PeekChar();
if (c == EOF) {
return true;
}
if (c == delimiter) {
const char* cur = stream->cur() - 1;
stream->Advance(1, 0);
if (*cur != '\\') {
result_str->append(begin, stream->cur() - begin - 1);
return true;
}
} else if (c == '\n') {
const char* cur = stream->cur() - 1;
stream->Advance(1, 1);
cur -= (*cur == '\r');
if (*cur != '\\') {
*error_reason = "missing terminating character";
return false;
}
result_str->append(begin, stream->cur() - begin - 2);
begin = stream->cur();
} else {
stream->Advance(1, 0);
}
}
}
// static
CppToken CppTokenizer::ReadIdentifier(CppInputStream* stream,
const char* begin) {
CppToken token(CppToken::IDENTIFIER);
for (;;) {
int c = stream->GetChar();
if (absl::ascii_isalnum(c) || c == '_' || c == '$' ||
(c == '\\' && HandleLineFoldingWithToken(stream, &token, &begin))) {
continue;
}
token.Append(begin, stream->GetLengthToCurrentFrom(begin, c));
stream->UngetChar(c);
return token;
}
}
// (6.4.2) Preprocessing numbers
// pp-number :
// digit
// .digit
// pp-number digit
// pp-number nondigit
// pp-number [eEpP] sign ([pP] is new in C99)
// pp-number .
//
// static
CppToken CppTokenizer::ReadNumber(CppInputStream* stream, int c0,
const char* begin) {
CppToken token(CppToken::NUMBER);
bool maybe_int_constant = (c0 != '.');
int base = 10;
int value = 0;
std::string suffix;
int c;
// Handle base prefix.
if (c0 == '0') {
base = 8;
int c1 = stream->PeekChar();
if (c1 == 'x' || c1 == 'X') {
stream->Advance(1, 0);
base = 16;
}
} else {
value = c0 - '0';
}
if (maybe_int_constant) {
// Read the digits part.
c = absl::ascii_tolower(stream->GetChar());
while ((c >= '0' && c <= ('0' + std::min(9, base - 1))) ||
(base == 16 && c >= 'a' && c <= 'f')) {
value = value * base + ((c >= 'a') ? (c - 'a' + 10) : (c - '0'));
c = absl::ascii_tolower(stream->GetChar());
}
stream->UngetChar(c);
}
// (digit | [a-zA-Z_] | . | [eEpP][+-])*
for (;;) {
c = stream->GetChar();
if (c == '\\' && HandleLineFoldingWithToken(stream, &token, &begin)) {
continue;
}
if ((c >= '0' && c <= '9') || c == '.' || c == '_') {
maybe_int_constant = false;
continue;
}
c = absl::ascii_tolower(c);
if (c >= 'a' && c <= 'z') {
if (maybe_int_constant) {
suffix += static_cast<char>(c);
}
if (c == 'e' || c == 'p') {
int c1 = stream->PeekChar();
if (c1 == '+' || c1 == '-') {
maybe_int_constant = false;
stream->Advance(1, 0);
}
}
continue;
}
break;
}
token.Append(begin, stream->GetLengthToCurrentFrom(begin, c));
stream->UngetChar(c);
if (maybe_int_constant && (suffix.empty() || IsValidIntegerSuffix(suffix))) {
token.v.int_value = value;
}
return token;
}
// static
bool CppTokenizer::ReadString(CppInputStream* stream,
CppToken* result_token,
std::string* error_reason) {
CppToken token(CppToken::STRING);
if (!ReadStringUntilDelimiter(stream, &token.string_value,
'"', error_reason)) {
return false;
}
*result_token = std::move(token);
return true;
}
int hex2int(int ch) {
if (ch >= '0' && ch <= '9') {
return ch - '0';
}
if (ch >= 'a' && ch <= 'f') {
return ch - 'a' + 10;
}
return ch - 'A' + 10;
}
// http://www.iso-9899.info/n1256.html#6.4.4.4
// static
bool CppTokenizer::ReadCharLiteral(CppInputStream* stream,
CppToken* result_token) {
// TODO: preserve original literal in token.string_value?
CppToken token(CppToken::CHAR_LITERAL);
const char* cur = stream->cur();
const ptrdiff_t cur_len = stream->end() - cur;
if (cur_len >= 3 && cur[0] == '\\' && cur[2] == '\'') {
switch (cur[1]) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
// \ octal-digit
token.v.int_value = cur[1] - '0';
break;
case '\'':
token.v.int_value = '\'';
break;
case '"':
token.v.int_value = '"';
break;
case '?':
token.v.int_value = '?';
break;
case '\\':
token.v.int_value = '\\';
break;
case 'a':
token.v.int_value = '\a';
break;
case 'b':
token.v.int_value = '\b';
break;
case 'f':
token.v.int_value = '\f';
break;
case 'n':
token.v.int_value = '\n';
break;
case 'r':
token.v.int_value = '\r';
break;
case 't':
token.v.int_value = '\t';
break;
case 'v':
token.v.int_value = '\v';
break;
default:
LOG(ERROR) << "Unexpected escaped char literal?: " << cur[1]
<< " in line " << stream->line()
<< " of file: " << stream->filename();
return false;
}
stream->Advance(3, 0);
} else if (cur_len >= 2 &&
cur[0] != '\\' && cur[0] != '\'' &&
cur[0] != '\n' &&
cur[1] == '\'') {
// c-char
token.v.int_value = cur[0];
stream->Advance(2, 0);
} else if (cur_len >= 5 && cur[0] == '\\'
&& cur[1] == 'x' &&
absl::ascii_isxdigit(cur[2]) &&
absl::ascii_isxdigit(cur[3]) &&
cur[4] == '\'') {
// \x hexadecimal-digit hexadecimal-digit
token.v.int_value = hex2int(cur[2]) << 4 | hex2int(cur[3]);
stream->Advance(5, 0);
} else if (cur_len >= 4 && cur[0] == '\\' &&
cur[1] >= '0' && cur[1] < '8' &&
cur[2] >= '0' && cur[2] < '8' &&
cur[3] == '\'') {
// \ octal-digit octal-digit
token.v.int_value =
(cur[1] - '0') << 3 |
(cur[2] - '0');
stream->Advance(4, 0);
} else if (cur_len >= 5 && cur[0] == '\\' &&
cur[1] >= '0' && cur[1] < '8' &&
cur[2] >= '0' && cur[2] < '8' &&
cur[3] >= '0' && cur[3] < '8' &&
cur[4] == '\'') {
// \ octal-digit octal-digit octal-digit
token.v.int_value =
(cur[1] - '0') << 6 |
(cur[2] - '0') << 3 |
(cur[3] - '0');
stream->Advance(5, 0);
} else if (cur_len >= 3 &&
cur[0] != '\'' && cur[0] != '\\' &&
cur[1] != '\'' && cur[1] != '\\' &&
cur[2] == '\'') {
// c-char-sequence
// support only 2 char sequence here
// Windows winioctl.h uses such sequence. http://b/74048713
// winioctl.h in win_sdk
// #define IRP_EXT_TRACK_OFFSET_HEADER_VALIDATION_VALUE 'TO'
token.v.int_value =
(cur[0] << 8) |
cur[1];
stream->Advance(3, 0);
} else if (cur_len >= 5 &&
cur[0] != '\'' && cur[0] != '\\' &&
cur[1] != '\'' && cur[1] != '\\' &&
cur[2] != '\'' && cur[2] != '\\' &&
cur[3] != '\'' && cur[3] != '\\' &&
cur[4] == '\'') {
// c-char-sequence
// support only 4 char sequence here.
// MacOSX system header uses such sequence. http://b/74048713
// - PMPrintAETypes.h in PrintCore.framework
// #define kPMPrintSettingsAEType 'pset'
// etc
// - Debugging.h in CarbonCore.framework
// #define COMPONENT_SIGNATURE '?*?*'
//
// The value of an integer character constant containing more than
// one character (e.g., 'ab'), or containing a character or escape
// sequence that does not map to a single-byte execution character,
// is implementation-defined.
token.v.int_value =
(cur[0] << 24) |
(cur[1] << 16) |
(cur[2] << 8) |
cur[3];
stream->Advance(5, 0);
} else {
// TODO: Support other literal form if necessary.
LOG(ERROR) << "Unsupported char literal?: "
<< absl::string_view(cur, std::min<ptrdiff_t>(10, cur_len))
<< " in line " << stream->line()
<< " of file: " << stream->filename();
return false;
}
*result_token = std::move(token);
return true;
}
// static
bool CppTokenizer::HandleLineFoldingWithToken(CppInputStream* stream,
CppToken* token,
const char** begin) {
int c = stream->PeekChar();
if (c != '\r' && c != '\n')
return false;
stream->ConsumeChar();
token->Append(*begin, stream->cur() - *begin - 2);
if (c == '\r' && stream->PeekChar() == '\n')
stream->Advance(1, 1);
*begin = stream->cur();
return true;
}
// static
bool CppTokenizer::SkipComment(CppInputStream* stream,
std::string* error_reason) {
const char* begin = stream->cur();
#ifndef NO_SSE2
__m128i slash_pattern = *(__m128i*)kSlashPattern;
__m128i newline_pattern = *(__m128i*)kNewlinePattern;
while (stream->cur() + 16 < stream->end()) {
__m128i s = _mm_loadu_si128((__m128i const*)stream->cur());
__m128i slash_test = _mm_cmpeq_epi8(s, slash_pattern);
__m128i newline_test = _mm_cmpeq_epi8(s, newline_pattern);
int result = _mm_movemask_epi8(slash_test);
int newline_result = _mm_movemask_epi8(newline_test);
while (result) {
int index = CountZero(result);
unsigned int shift = (1 << index);
result &= ~shift;
const char* cur = stream->cur() + index - 1;
if (*cur == '*') {
unsigned int mask = shift - 1;
stream->Advance(index + 1, PopCount(newline_result & mask));
return true;
}
}
stream->Advance(16, PopCount(newline_result));
}
#endif // NO_SSE2
for (;;) {
int c = stream->PeekChar();
if (c == EOF) {
*error_reason = "missing terminating '*/' for comment";
return false;
}
if (c == '/' && stream->cur() != begin &&
*(stream->cur() - 1) == '*') {
stream->Advance(1, 0);
return true;
}
stream->ConsumeChar();
}
}
// static
bool CppTokenizer::SkipUntilDirective(CppInputStream* stream,
std::string* error_reason) {
const char* begin = stream->cur();
#ifndef NO_SSE2
// TODO: String index instruction (pcmpestri) would work better
// on sse4.2 enabled platforms.
__m128i slash_pattern = *(__m128i*)kSlashPattern;
__m128i sharp_pattern = *(__m128i*)kSharpPattern;
__m128i newline_pattern = *(__m128i*)kNewlinePattern;
while (stream->cur() + 16 < stream->end()) {
__m128i s = _mm_loadu_si128((__m128i const*)stream->cur());
__m128i slash_test = _mm_cmpeq_epi8(s, slash_pattern);
__m128i sharp_test = _mm_cmpeq_epi8(s, sharp_pattern);
__m128i newline_test = _mm_cmpeq_epi8(s, newline_pattern);
int slash_result = _mm_movemask_epi8(slash_test);
int sharp_result = _mm_movemask_epi8(sharp_test);
int newline_result = _mm_movemask_epi8(newline_test);
int result = slash_result | sharp_result;
while (result) {
int index = CountZero(result);
unsigned int shift = (1 << index);
result &= ~shift;
unsigned int mask = shift - 1;
const char* cur = stream->cur() + index;
if (*cur == '/') {
int c1 = *(cur + 1);
if (c1 == '/') {
stream->Advance(index + 2, PopCount(newline_result & mask));
SkipUntilLineBreakIgnoreComment(stream);
goto done;
} else if (c1 == '*') {
stream->Advance(index + 2, PopCount(newline_result & mask));
if (!SkipComment(stream, error_reason))
return false;
goto done;
}
} else if (*cur == '#') {
if (IsAfterEndOfLine(cur, stream->begin())) {
stream->Advance(index + 1, PopCount(newline_result & mask));
return true;
}
}
}
stream->Advance(16, PopCount(newline_result));
done:
continue;
}
#endif // NO_SSE2
for (;;) {
int c = stream->PeekChar();
if (c == EOF)
return false;
if (stream->cur() != begin) {
int c0 = *(stream->cur() - 1);
if (c0 == '/' && c == '/') {
stream->Advance(1, 0);
SkipUntilLineBreakIgnoreComment(stream);
continue;
}
if (c0 == '/' && c == '*') {
stream->Advance(1, 0);
if (!SkipComment(stream, error_reason))
return false;
}
}
if (c == '#') {
if (IsAfterEndOfLine(stream->cur(),
stream->begin())) {
stream->Advance(1, 0);
return true;
}
stream->Advance(1, 0);
continue;
}
stream->ConsumeChar();
}
return false;
}
// static
void CppTokenizer::SkipUntilLineBreakIgnoreComment(CppInputStream* stream) {
#ifndef NO_SSE2
__m128i newline_pattern = *(__m128i*)kNewlinePattern;
while (stream->cur() + 16 < stream->end()) {
__m128i s = _mm_loadu_si128((__m128i const*)stream->cur());
__m128i newline_test = _mm_cmpeq_epi8(s, newline_pattern);
int newline_result = _mm_movemask_epi8(newline_test);
int result = newline_result;
while (result) {
int index = CountZero(result);
unsigned int shift = (1 << index);
result &= ~shift;
unsigned int mask = shift - 1;
const char* cur = stream->cur() + index - 1;
cur -= (*cur == '\r');
if (*cur != '\\') {
stream->Advance(index + 1, PopCount(newline_result & mask));
return;
}
}
stream->Advance(16, PopCount(newline_result));
}
#endif // NO_SSE2
for (;;) {
int c = stream->PeekChar();
if (c == EOF)
return;
if (c == '\n') {
const char* cur = stream->cur() - 1;
stream->Advance(1, 1);
cur -= (*cur == '\r');
if (*cur != '\\')
return;
} else {
stream->Advance(1, 0);
}
}
}
// static
bool CppTokenizer::IsAfterEndOfLine(const char* cur, const char* begin) {
for (;;) {
if (cur == begin)
return true;
int c = *--cur;
if (!IsCppBlank(c))
break;
}
while (begin <= cur) {
int c = *cur;
if (c == '\n') {
if (--cur < begin)
return true;
cur -= (*cur == '\r');
if (cur < begin || *cur != '\\')
return true;
--cur;
continue;
}
if (c == '/') {
if (--cur < begin || *cur != '*')
return false;
--cur;
bool block_comment_start_found = false;
// Move backward until "/*" is found.
while (cur - 1 >= begin) {
if (*(cur - 1) == '/' && *cur == '*') {
cur -= 2;
block_comment_start_found = true;
break;
}
--cur;
}
if (block_comment_start_found)
continue;
// When '/*' is not found, it's not after end of line.
return false;
}
if (IsCppBlank(c)) {
--cur;
continue;
}
return false;
}
return true;
}
// static
bool CppTokenizer::IsValidIntegerSuffix(const std::string& s) {
switch (s.size()) {
case 1:
return s == "u" || s == "l";
case 2:
return s == "ul" || s == "lu" || s == "ll";
case 3:
return s == "ull" || s == "llu";
default:
return false;
}
}
// static
CppToken::Type CppTokenizer::TypeFrom(int c1, int c2) {
switch (c1) {
case '!':
if (c2 == '=') { return CppToken::NE; }
break;
case '#':
if (c2 == 0) { return CppToken::SHARP; }
if (c2 == '#') { return CppToken::DOUBLESHARP; }
break;
case '&':
if (c2 == 0) { return CppToken::AND; }
if (c2 == '&') { return CppToken::LAND; }
break;
case '*':
if (c2 == 0) { return CppToken::MUL; }
break;
case '+':
if (c2 == 0) { return CppToken::ADD; }
break;
case '-':
if (c2 == 0) { return CppToken::SUB; }
break;
case '<':
if (c2 == 0) { return CppToken::LT; }
if (c2 == '<') { return CppToken::LSHIFT; }
if (c2 == '=') { return CppToken::LE; }
break;
case '=':
if (c2 == '=') { return CppToken::EQ; }
break;
case '>':
if (c2 == 0) { return CppToken::GT; }
if (c2 == '=') { return CppToken::GE; }
if (c2 == '>') { return CppToken::RSHIFT; }
break;
case '\n':
if (c2 == 0) { return CppToken::NEWLINE; }
break;
case '\r':
if (c2 == '\n') { return CppToken::NEWLINE; }
break;
case '^':
if (c2 == 0) { return CppToken::XOR; }
break;
case '|':
if (c2 == 0) { return CppToken::OR; }
if (c2 == '|') { return CppToken::LOR; }
break;
}
return CppToken::PUNCTUATOR;
}
} // namespace devtools_goma