// tokeniser.hpp | |
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
// | |
// Distributed under the Boost Software License, Version 1.0. (See accompanying | |
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
#ifndef BOOST_LEXER_RE_TOKENISER_HPP | |
#define BOOST_LEXER_RE_TOKENISER_HPP | |
// memcpy() | |
#include <cstring> | |
#include <map> | |
#include "num_token.hpp" | |
#include "../../runtime_error.hpp" | |
#include "../../size_t.hpp" | |
#include <sstream> | |
#include "../../string_token.hpp" | |
#include "re_tokeniser_helper.hpp" | |
namespace boost | |
{ | |
namespace lexer | |
{ | |
namespace detail | |
{ | |
template<typename CharT> | |
class basic_re_tokeniser | |
{ | |
public: | |
typedef basic_num_token<CharT> num_token; | |
typedef basic_re_tokeniser_state<CharT> state; | |
typedef basic_string_token<CharT> string_token; | |
typedef typename string_token::string string; | |
typedef std::map<string_token, std::size_t> token_map; | |
typedef std::pair<string_token, std::size_t> token_pair; | |
// Reads the next token of the regex held by state_ into token_.
//
// Double quotes toggle state_._in_string and are otherwise discarded.
// A backslash always starts an escape sequence, even inside a quoted
// string. Any other character inside a quoted string is taken literally.
// Outside a string, meta characters are mapped to their token types.
// Character sets are registered in map_ and token_ receives their id.
//
// Throws runtime_error when the regex ends inside a quoted string or with
// unclosed parentheses, when a ')' has no matching '(', and when '/'
// (lookahead) is used.
static void next (state &state_, token_map &map_, num_token &token_)
{
    CharT curr_ = 0;
    bool at_end_ = state_.next (curr_);

    token_.min_max (0, false, 0);

    // Consume any run of quotes, flipping string mode once per quote.
    for (; !at_end_ && curr_ == '"'; at_end_ = state_.next (curr_))
    {
        state_._in_string ^= 1;
    }

    if (at_end_)
    {
        if (state_._in_string)
        {
            throw runtime_error ("Unexpected end of regex "
                "(missing '\"').");
        }

        if (state_._paren_count)
        {
            throw runtime_error ("Unexpected end of regex "
                "(missing ')').");
        }

        token_.set (num_token::END, null_token);
        return;
    }

    if (curr_ == '\\')
    {
        // Escape sequences are honoured inside strings as well.
        escape (state_, map_, token_);
        return;
    }

    if (state_._in_string)
    {
        // Inside a string every remaining meta character is a literal.
        create_charset_token (string (1, curr_), false, map_, token_);
        return;
    }

    // Neither escaped nor quoted: look for meta characters.
    switch (curr_)
    {
    case '(':
        token_.set (num_token::OPENPAREN, null_token);
        ++state_._paren_count;
        read_options (state_);
        break;
    case ')':
        --state_._paren_count;

        if (state_._paren_count < 0)
        {
            std::ostringstream msg_;

            msg_ << "Number of open parenthesis < 0 at index " <<
                state_.index () - 1 << '.';
            throw runtime_error (msg_.str ().c_str ());
        }

        token_.set (num_token::CLOSEPAREN, null_token);

        // Restore the flags that were saved when the group was opened.
        if (!state_._flags_stack.empty ())
        {
            state_._flags = state_._flags_stack.top ();
            state_._flags_stack.pop ();
        }

        break;
    case '?':
        // A '?' directly after a quantifier selects its lazy variant.
        if (state_.eos () || *state_._curr != '?')
        {
            token_.set (num_token::OPT, null_token);
        }
        else
        {
            token_.set (num_token::AOPT, null_token);
            state_.increment ();
        }

        break;
    case '*':
        if (state_.eos () || *state_._curr != '?')
        {
            token_.set (num_token::ZEROORMORE, null_token);
        }
        else
        {
            token_.set (num_token::AZEROORMORE, null_token);
            state_.increment ();
        }

        break;
    case '+':
        if (state_.eos () || *state_._curr != '?')
        {
            token_.set (num_token::ONEORMORE, null_token);
        }
        else
        {
            token_.set (num_token::AONEORMORE, null_token);
            state_.increment ();
        }

        break;
    case '{':
        open_curly (state_, token_);
        break;
    case '|':
        token_.set (num_token::OR, null_token);
        break;
    case '^':
        // Only a '^' that is the very first character is an assertion.
        if (state_._curr - 1 != state_._start)
        {
            create_charset_token (string (1, curr_), false, map_, token_);
        }
        else
        {
            token_.set (num_token::CHARSET, bol_token);
            state_._seen_BOL_assertion = true;
        }

        break;
    case '$':
        // Only a '$' that is the very last character is an assertion.
        if (state_._curr != state_._end)
        {
            create_charset_token (string (1, curr_), false, map_, token_);
        }
        else
        {
            token_.set (num_token::CHARSET, eol_token);
            state_._seen_EOL_assertion = true;
        }

        break;
    case '.':
    {
        // '.' is a negated set: empty, or just '\n' when the
        // dot_not_newline flag is on.
        string excluded_;

        if (state_._flags & dot_not_newline)
        {
            excluded_ = '\n';
        }

        create_charset_token (excluded_, true, map_, token_);
        break;
    }
    case '[':
        charset (state_, map_, token_);
        break;
    case '/':
        throw runtime_error("Lookahead ('/') is not supported yet.");
    default:
    {
        const bool fold_case_ = (state_._flags & icase) &&
            (std::isupper (curr_, state_._locale) ||
            std::islower (curr_, state_._locale));

        if (fold_case_)
        {
            // Case-insensitive letter: match both of its cases.
            string both_ (1, std::toupper (curr_, state_._locale));

            both_ += std::tolower (curr_, state_._locale);
            create_charset_token (both_, false, map_, token_);
        }
        else
        {
            create_charset_token (string (1, curr_), false, map_, token_);
        }

        break;
    }
    }
}
private: | |
typedef basic_re_tokeniser_helper<CharT> tokeniser_helper; | |
// Called right after a '(' has been consumed. Parses an optional
// "?<options>:" prefix and updates state_._flags accordingly.
//
// Recognised options: 'i' (sets icase), 's' (clears dot_not_newline) and
// '-', which inverts the effect of the next option letter. The flags in
// force before the group are pushed on state_._flags_stack when options
// are present, or when the stack is already non-empty.
//
// Throws runtime_error on an unknown option character. Running out of
// input is not reported here; the caller's end-of-regex check handles it.
static void read_options (state &state_)
{
    if (state_.eos () || *state_._curr != '?')
    {
        // No option prefix: save the flags only if the stack is in use.
        if (!state_._flags_stack.empty ())
        {
            state_._flags_stack.push (state_._flags);
        }

        return;
    }

    CharT opt_ = 0;
    bool invert_ = false;

    // Skip the '?' and fetch the first option character.
    state_.increment ();

    bool at_end_ = state_.next (opt_);

    state_._flags_stack.push (state_._flags);

    for (; !at_end_ && opt_ != ':'; at_end_ = state_.next (opt_))
    {
        switch (opt_)
        {
        case '-':
            invert_ = !invert_;
            break;
        case 'i':
            state_._flags = static_cast<regex_flags>
                (invert_ ? (state_._flags & ~icase) :
                (state_._flags | icase));
            invert_ = false;
            break;
        case 's':
            // 's' lets '.' match newline, so it clears dot_not_newline;
            // "-s" sets it again.
            state_._flags = static_cast<regex_flags>
                (invert_ ? (state_._flags | dot_not_newline) :
                (state_._flags & ~dot_not_newline));
            invert_ = false;
            break;
        default:
        {
            std::ostringstream msg_;

            msg_ << "Unknown option at index " <<
                state_.index () - 1 << '.';
            throw runtime_error (msg_.str ().c_str ());
        }
        }
    }
}
// Handles the escape sequence following a backslash.
//
// tokeniser_helper::escape_sequence() yields either a single character
// (null return) or a pointer to a string of str_len_ characters. In the
// latter case the string is parsed as a character set from its second
// character onwards (presumably skipping a leading '[' - confirm against
// the helper).
static void escape (state &state_, token_map &map_, num_token &token_)
{
    CharT literal_ = 0;
    std::size_t expansion_len_ = 0;
    const CharT *expansion_ = tokeniser_helper::escape_sequence (state_,
        literal_, expansion_len_);

    if (!expansion_)
    {
        create_charset_token (string (1, literal_), false, map_, token_);
        return;
    }

    // Tokenise the expansion using a temporary state that inherits the
    // current flags and locale.
    state nested_ (expansion_ + 1, expansion_ + expansion_len_,
        state_._flags, state_._locale);

    charset (nested_, map_, token_);
}
// Parses a character set via tokeniser_helper::charset() and turns the
// resulting characters and negation flag into a CHARSET token.
static void charset (state &state_, token_map &map_, num_token &token_)
{
    bool is_negated_ = false;
    string members_;

    tokeniser_helper::charset (state_, members_, is_negated_);
    create_charset_token (members_, is_negated_, map_, token_);
}
static void create_charset_token (const string &charset_, | |
const bool negated_, token_map &map_, num_token &token_) | |
{ | |
std::size_t id_ = null_token; | |
string_token stok_ (negated_, charset_); | |
stok_.remove_duplicates (); | |
stok_.normalise (); | |
typename token_map::const_iterator iter_ = map_.find (stok_); | |
if (iter_ == map_.end ()) | |
{ | |
id_ = map_.size (); | |
map_.insert (token_pair (stok_, id_)); | |
} | |
else | |
{ | |
id_ = iter_->second; | |
} | |
token_.set (num_token::CHARSET, id_); | |
} | |
// Called after a '{' has been consumed. A following digit starts a repeat
// specification ({n}, {n,} or {n,m}); anything else is treated as a
// {MACRO} reference.
//
// A '?' directly after the closing '}' of a repeat selects the lazy
// variant of whatever token repeat_n() produced. repeat_n() may already
// have simplified the repeat to OPT, ZEROORMORE or ONEORMORE; those are
// mapped to AOPT, AZEROORMORE and AONEORMORE, the same tokens next()
// produces for "??", "*?" and "+?". Only a genuine REPEATN becomes
// AREPEATN. This avoids an AREPEATN token with min == 0 and max == 0,
// which repeat_n() otherwise rules out.
//
// Throws runtime_error if the regex ends immediately after the '{'.
static void open_curly (state &state_, num_token &token_)
{
    if (state_.eos ())
    {
        throw runtime_error ("Unexpected end of regex "
            "(missing '}').");
    }
    else if (*state_._curr >= '0' && *state_._curr <= '9')
    {
        repeat_n (state_, token_);

        if (!state_.eos () && *state_._curr == '?')
        {
            switch (token_._type)
            {
            case num_token::OPT:
                token_._type = num_token::AOPT;
                break;
            case num_token::ZEROORMORE:
                token_._type = num_token::AZEROORMORE;
                break;
            case num_token::ONEORMORE:
                token_._type = num_token::AONEORMORE;
                break;
            default:
                token_._type = num_token::AREPEATN;
                break;
            }

            state_.increment ();
        }
    }
    else
    {
        macro (state_, token_);
    }
}
// Parses the body of a repeat specification; the opening '{' has already
// been consumed and the next character is known to be a digit.
//
// SYNTAX:
//   {n[,[n]]}
// SEMANTIC RULES:
//   {0}       - INVALID (throws)
//   {0,}      = *
//   {0,0}     - INVALID (throws)
//   {0,1}     = ?
//   {1,}      = +
//   {min,max} where min == max - treated as {min}
//   {min,max} where max < min  - INVALID (throws)
//
// On success token_ is ZEROORMORE, ONEORMORE, OPT or REPEATN, with _min,
// _max and _comma describing the repeat. Throws runtime_error when the
// regex ends early, when the '}' is missing, or when a semantic rule
// above is violated.
static void repeat_n (state &state_, num_token &token_)
{
    CharT curr_ = 0;
    bool at_end_ = state_.next (curr_);

    // Accumulate the decimal minimum.
    for (; !at_end_ && curr_ >= '0' && curr_ <= '9';
        at_end_ = state_.next (curr_))
    {
        token_._min *= 10;
        token_._min += curr_ - '0';
    }

    if (at_end_)
    {
        throw runtime_error ("Unexpected end of regex "
            "(missing '}').");
    }

    bool has_range_ = false;
    bool still_repeat_ = true;

    token_._comma = curr_ == ',';

    if (token_._comma)
    {
        at_end_ = state_.next (curr_);

        if (at_end_)
        {
            throw runtime_error ("Unexpected end of regex "
                "(missing '}').");
        }

        if (curr_ != '}')
        {
            // A maximum must follow the comma.
            if (curr_ < '0' || curr_ > '9')
            {
                std::ostringstream msg_;

                msg_ << "Missing '}' at index " <<
                    state_.index () - 1 << '.';
                throw runtime_error (msg_.str ().c_str ());
            }

            has_range_ = true;

            do
            {
                token_._max *= 10;
                token_._max += curr_ - '0';
                at_end_ = state_.next (curr_);
            } while (!at_end_ && curr_ >= '0' && curr_ <= '9');

            if (at_end_)
            {
                throw runtime_error ("Unexpected end of regex "
                    "(missing '}').");
            }

            if (token_._min == 0 && token_._max == 1)
            {
                // {0,1} is the same as '?'.
                token_.set (num_token::OPT, null_token);
                still_repeat_ = false;
            }
            else if (token_._min == token_._max)
            {
                // {n,n} collapses to {n}.
                token_._comma = false;
                has_range_ = false;
                token_._max = 0;
            }
        }
        else if (token_._min == 0)
        {
            // {0,} is the same as '*'.
            token_.set (num_token::ZEROORMORE, null_token);
            still_repeat_ = false;
        }
        else if (token_._min == 1)
        {
            // {1,} is the same as '+'.
            token_.set (num_token::ONEORMORE, null_token);
            still_repeat_ = false;
        }
    }

    if (curr_ != '}')
    {
        std::ostringstream msg_;

        msg_ << "Missing '}' at index " << state_.index () - 1 << '.';
        throw runtime_error (msg_.str ().c_str ());
    }

    if (!still_repeat_)
    {
        return;
    }

    // Semantic validation. {0,} has already become '*', so a zero
    // minimum with a zero maximum can only mean "exactly zero repeats".
    if (token_._min == 0 && token_._max == 0)
    {
        std::ostringstream msg_;

        msg_ << "Cannot have exactly zero repeats preceding index " <<
            state_.index () << '.';
        throw runtime_error (msg_.str ().c_str ());
    }

    if (has_range_ && token_._max < token_._min)
    {
        std::ostringstream msg_;

        msg_ << "Max less than min preceding index " <<
            state_.index () << '.';
        throw runtime_error (msg_.str ().c_str ());
    }

    token_.set (num_token::REPEATN, null_token);
}
// Parses a {MACRO} reference; the opening '{' has already been consumed
// and the caller has checked that input remains.
//
// The name must start with '_' or an ASCII letter and may continue with
// '_', '-', ASCII letters and digits, terminated by '}'. On success
// token_ becomes a MACRO token and the name is copied, zero-terminated,
// into token_._macro.
//
// Throws runtime_error for an invalid first character, a missing '}',
// premature end of the regex, or a name longer than max_macro_len.
static void macro (state &state_, num_token &token_)
{
    const CharT *name_ = state_._curr;
    CharT curr_ = 0;

    state_.next (curr_);

    const bool valid_first_ = curr_ == '_' ||
        (curr_ >= 'A' && curr_ <= 'Z') ||
        (curr_ >= 'a' && curr_ <= 'z');

    if (!valid_first_)
    {
        std::ostringstream msg_;

        msg_ << "Invalid MACRO name at index " <<
            state_.index () - 1 << '.';
        throw runtime_error (msg_.str ().c_str ());
    }

    // Skip the remaining name characters; curr_ ends up holding the
    // first character that is not part of the name.
    do
    {
        if (state_.next (curr_))
        {
            throw runtime_error ("Unexpected end of regex "
                "(missing '}').");
        }
    } while (curr_ == '_' || curr_ == '-' ||
        (curr_ >= 'A' && curr_ <= 'Z') ||
        (curr_ >= 'a' && curr_ <= 'z') ||
        (curr_ >= '0' && curr_ <= '9'));

    if (curr_ != '}')
    {
        std::ostringstream msg_;

        msg_ << "Missing '}' at index " << state_.index () - 1 << '.';
        throw runtime_error (msg_.str ().c_str ());
    }

    // state_._curr is one past the '}', so step back over it.
    std::size_t name_len_ = state_._curr - 1 - name_;

    if (name_len_ > max_macro_len)
    {
        // Used only for narrow(), to put the name into a char message.
        std::basic_stringstream<CharT> narrower_;
        std::ostringstream msg_;

        msg_ << "MACRO name '";

        for (const CharT *pos_ = name_; pos_ != name_ + name_len_; ++pos_)
        {
            msg_ << narrower_.narrow (*pos_, ' ');
        }

        msg_ << "' too long.";
        throw runtime_error (msg_.str ());
    }

    token_.set (num_token::MACRO, null_token);

    // Some systems have memcpy in namespace std.
    using namespace std;

    memcpy (token_._macro, name_, name_len_ * sizeof (CharT));
    token_._macro[name_len_] = 0;
}
}; | |
} | |
} | |
} | |
#endif |