blob: 88367ed416ceff342ad94d444eb882e4d5c37db6 [file] [log] [blame]
// tokeniser.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_RE_TOKENISER_HPP
#define BOOST_LEXER_RE_TOKENISER_HPP
// memcpy()
#include <cstring>
#include <map>
#include "num_token.hpp"
#include "../../runtime_error.hpp"
#include "../../size_t.hpp"
#include <sstream>
#include "../../string_token.hpp"
#include "re_tokeniser_helper.hpp"
namespace boost
{
namespace lexer
{
namespace detail
{
template<typename CharT>
class basic_re_tokeniser
{
public:
typedef basic_num_token<CharT> num_token;
typedef basic_re_tokeniser_state<CharT> state;
typedef basic_string_token<CharT> string_token;
typedef typename string_token::string string;
typedef std::map<string_token, std::size_t> token_map;
typedef std::pair<string_token, std::size_t> token_pair;
static void next (state &state_, token_map &map_, num_token &token_)
{
CharT ch_ = 0;
bool eos_ = state_.next (ch_);
token_.min_max (0, false, 0);
while (!eos_ && ch_ == '"')
{
state_._in_string ^= 1;
eos_ = state_.next (ch_);
}
if (eos_)
{
if (state_._in_string)
{
throw runtime_error ("Unexpected end of regex "
"(missing '\"').");
}
if (state_._paren_count)
{
throw runtime_error ("Unexpected end of regex "
"(missing ')').");
}
token_.set (num_token::END, null_token);
}
else
{
if (ch_ == '\\')
{
// Even if we are in a string, respect escape sequences...
escape (state_, map_, token_);
}
else if (state_._in_string)
{
// All other meta characters lose their special meaning
// inside a string.
create_charset_token (string (1, ch_), false, map_, token_);
}
else
{
// Not an escape sequence and not inside a string, so
// check for meta characters.
switch (ch_)
{
case '(':
token_.set (num_token::OPENPAREN, null_token);
++state_._paren_count;
read_options (state_);
break;
case ')':
--state_._paren_count;
if (state_._paren_count < 0)
{
std::ostringstream ss_;
ss_ << "Number of open parenthesis < 0 at index " <<
state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
token_.set (num_token::CLOSEPAREN, null_token);
if (!state_._flags_stack.empty ())
{
state_._flags = state_._flags_stack.top ();
state_._flags_stack.pop ();
}
break;
case '?':
if (!state_.eos () && *state_._curr == '?')
{
token_.set (num_token::AOPT, null_token);
state_.increment ();
}
else
{
token_.set (num_token::OPT, null_token);
}
break;
case '*':
if (!state_.eos () && *state_._curr == '?')
{
token_.set (num_token::AZEROORMORE, null_token);
state_.increment ();
}
else
{
token_.set (num_token::ZEROORMORE, null_token);
}
break;
case '+':
if (!state_.eos () && *state_._curr == '?')
{
token_.set (num_token::AONEORMORE, null_token);
state_.increment ();
}
else
{
token_.set (num_token::ONEORMORE, null_token);
}
break;
case '{':
open_curly (state_, token_);
break;
case '|':
token_.set (num_token::OR, null_token);
break;
case '^':
if (state_._curr - 1 == state_._start)
{
token_.set (num_token::CHARSET, bol_token);
state_._seen_BOL_assertion = true;
}
else
{
create_charset_token (string (1, ch_), false,
map_, token_);
}
break;
case '$':
if (state_._curr == state_._end)
{
token_.set (num_token::CHARSET, eol_token);
state_._seen_EOL_assertion = true;
}
else
{
create_charset_token (string (1, ch_), false,
map_, token_);
}
break;
case '.':
{
string dot_;
if (state_._flags & dot_not_newline)
{
dot_ = '\n';
}
create_charset_token (dot_, true, map_, token_);
break;
}
case '[':
{
charset (state_, map_, token_);
break;
}
case '/':
throw runtime_error("Lookahead ('/') is not supported yet.");
break;
default:
if ((state_._flags & icase) &&
(std::isupper (ch_, state_._locale) ||
std::islower (ch_, state_._locale)))
{
CharT upper_ = std::toupper (ch_, state_._locale);
CharT lower_ = std::tolower (ch_, state_._locale);
string str_ (1, upper_);
str_ += lower_;
create_charset_token (str_, false, map_, token_);
}
else
{
create_charset_token (string (1, ch_), false,
map_, token_);
}
break;
}
}
}
}
private:
typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
static void read_options (state &state_)
{
if (!state_.eos () && *state_._curr == '?')
{
CharT ch_ = 0;
bool eos_ = false;
bool negate_ = false;
state_.increment ();
eos_ = state_.next (ch_);
state_._flags_stack.push (state_._flags);
while (!eos_ && ch_ != ':')
{
switch (ch_)
{
case '-':
negate_ ^= 1;
break;
case 'i':
if (negate_)
{
state_._flags = static_cast<regex_flags>
(state_._flags & ~icase);
}
else
{
state_._flags = static_cast<regex_flags>
(state_._flags | icase);
}
negate_ = false;
break;
case 's':
if (negate_)
{
state_._flags = static_cast<regex_flags>
(state_._flags | dot_not_newline);
}
else
{
state_._flags = static_cast<regex_flags>
(state_._flags & ~dot_not_newline);
}
negate_ = false;
break;
default:
{
std::ostringstream ss_;
ss_ << "Unknown option at index " <<
state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
}
eos_ = state_.next (ch_);
}
// End of string handler will handle early termination
}
else if (!state_._flags_stack.empty ())
{
state_._flags_stack.push (state_._flags);
}
}
static void escape (state &state_, token_map &map_, num_token &token_)
{
CharT ch_ = 0;
std::size_t str_len_ = 0;
const CharT *str_ = tokeniser_helper::escape_sequence (state_,
ch_, str_len_);
if (str_)
{
state state2_ (str_ + 1, str_ + str_len_, state_._flags,
state_._locale);
charset (state2_, map_, token_);
}
else
{
create_charset_token (string (1, ch_), false, map_, token_);
}
}
static void charset (state &state_, token_map &map_, num_token &token_)
{
string chars_;
bool negated_ = false;
tokeniser_helper::charset (state_, chars_, negated_);
create_charset_token (chars_, negated_, map_, token_);
}
static void create_charset_token (const string &charset_,
const bool negated_, token_map &map_, num_token &token_)
{
std::size_t id_ = null_token;
string_token stok_ (negated_, charset_);
stok_.remove_duplicates ();
stok_.normalise ();
typename token_map::const_iterator iter_ = map_.find (stok_);
if (iter_ == map_.end ())
{
id_ = map_.size ();
map_.insert (token_pair (stok_, id_));
}
else
{
id_ = iter_->second;
}
token_.set (num_token::CHARSET, id_);
}
static void open_curly (state &state_, num_token &token_)
{
if (state_.eos ())
{
throw runtime_error ("Unexpected end of regex "
"(missing '}').");
}
else if (*state_._curr >= '0' && *state_._curr <= '9')
{
repeat_n (state_, token_);
if (!state_.eos () && *state_._curr == '?')
{
token_._type = num_token::AREPEATN;
state_.increment ();
}
}
else
{
macro (state_, token_);
}
}
// SYNTAX:
// {n[,[n]]}
// SEMANTIC RULES:
// {0} - INVALID (throw exception)
// {0,} = *
// {0,0} - INVALID (throw exception)
// {0,1} = ?
// {1,} = +
// {min,max} where min == max - {min}
// {min,max} where max < min - INVALID (throw exception)
static void repeat_n (state &state_, num_token &token_)
{
CharT ch_ = 0;
bool eos_ = state_.next (ch_);
while (!eos_ && ch_ >= '0' && ch_ <= '9')
{
token_._min *= 10;
token_._min += ch_ - '0';
eos_ = state_.next (ch_);
}
if (eos_)
{
throw runtime_error ("Unexpected end of regex "
"(missing '}').");
}
bool min_max_ = false;
bool repeatn_ = true;
token_._comma = ch_ == ',';
if (token_._comma)
{
eos_ = state_.next (ch_);
if (eos_)
{
throw runtime_error ("Unexpected end of regex "
"(missing '}').");
}
if (ch_ == '}')
{
// Small optimisation: Check for '*' equivalency.
if (token_._min == 0)
{
token_.set (num_token::ZEROORMORE, null_token);
repeatn_ = false;
}
// Small optimisation: Check for '+' equivalency.
else if (token_._min == 1)
{
token_.set (num_token::ONEORMORE, null_token);
repeatn_ = false;
}
}
else
{
if (ch_ < '0' || ch_ > '9')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " <<
state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
min_max_ = true;
do
{
token_._max *= 10;
token_._max += ch_ - '0';
eos_ = state_.next (ch_);
} while (!eos_ && ch_ >= '0' && ch_ <= '9');
if (eos_)
{
throw runtime_error ("Unexpected end of regex "
"(missing '}').");
}
// Small optimisation: Check for '?' equivalency.
if (token_._min == 0 && token_._max == 1)
{
token_.set (num_token::OPT, null_token);
repeatn_ = false;
}
// Small optimisation: if min == max, then min.
else if (token_._min == token_._max)
{
token_._comma = false;
min_max_ = false;
token_._max = 0;
}
}
}
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
if (repeatn_)
{
// SEMANTIC VALIDATION follows:
// NOTE: {0,} has already become *
// therefore we don't check for a comma.
if (token_._min == 0 && token_._max == 0)
{
std::ostringstream ss_;
ss_ << "Cannot have exactly zero repeats preceding index " <<
state_.index () << '.';
throw runtime_error (ss_.str ().c_str ());
}
if (min_max_ && token_._max < token_._min)
{
std::ostringstream ss_;
ss_ << "Max less than min preceding index " <<
state_.index () << '.';
throw runtime_error (ss_.str ().c_str ());
}
token_.set (num_token::REPEATN, null_token);
}
}
static void macro (state &state_, num_token &token_)
{
CharT ch_ = 0;
bool eos_ = false;
const CharT *start_ = state_._curr;
state_.next (ch_);
if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
!(ch_ >= 'a' && ch_ <= 'z'))
{
std::ostringstream ss_;
ss_ << "Invalid MACRO name at index " <<
state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
do
{
eos_ = state_.next (ch_);
if (eos_)
{
throw runtime_error ("Unexpected end of regex "
"(missing '}').");
}
} while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
(ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
if (ch_ != '}')
{
std::ostringstream ss_;
ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
std::size_t len_ = state_._curr - 1 - start_;
if (len_ > max_macro_len)
{
std::basic_stringstream<CharT> ss_;
std::ostringstream os_;
os_ << "MACRO name '";
while (len_)
{
os_ << ss_.narrow (*start_++, ' ');
--len_;
}
os_ << "' too long.";
throw runtime_error (os_.str ());
}
token_.set (num_token::MACRO, null_token);
// Some systems have memcpy in namespace std.
using namespace std;
memcpy (token_._macro, start_, len_ * sizeof (CharT));
token_._macro[len_] = 0;
}
};
}
}
}
#endif