// tokeniser_helper.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_RE_TOKENISER_HELPER_H | |
#define BOOST_LEXER_RE_TOKENISER_HELPER_H | |
#include "../../char_traits.hpp" | |
// strlen() | |
#include <cstring> | |
#include "../../size_t.hpp" | |
#include "re_tokeniser_state.hpp" | |
namespace boost | |
{ | |
namespace lexer | |
{ | |
namespace detail | |
{ | |
template<typename CharT, typename Traits = char_traits<CharT> > | |
class basic_re_tokeniser_helper | |
{ | |
public: | |
typedef basic_re_tokeniser_state<CharT> state; | |
typedef std::basic_string<CharT> string; | |
// Decodes the escape sequence found at the current position of state_.
// Callers in this class invoke it right after reading a '\\' from the regex.
//
// If the current character is a charset shortcut letter (d, D, s, S, w, W),
// that letter is stepped over and a pointer to the equivalent bracketed
// charset literal is returned, with its length stored in str_len_. ch_ is
// not modified in that case.
//
// Otherwise 0 is returned, str_len_ is set to 0 and ch_ receives the single
// character produced by chr ().
//
// Throws runtime_error if state_ is already at the end of the regex.
static const CharT *escape_sequence (state &state_, CharT &ch_,
    std::size_t &str_len_)
{
    if (state_.eos ())
    {
        throw runtime_error ("Unexpected end of regex "
            "following '\\'.");
    }

    const CharT *str_ = charset_shortcut (*state_._curr, str_len_);

    if (str_)
    {
        // Step over the shortcut letter itself.
        state_.increment ();
    }
    else
    {
        ch_ = chr (state_);
    }

    return str_;
}
// Parses the contents of a bracket expression up to and including the
// closing ']' (the error messages indicate the opening '[' was read by the
// caller).
//
// chars_   - receives (by appending) every character belonging to the set.
//            When the icase flag is set in state_._flags, characters that
//            the locale classifies as upper or lower case are added in both
//            cases.
// negated_ - set to true when the set starts with '^'.
//
// This function can call itself: a charset shortcut such as \d is expanded
// by running charset () over the shortcut's literal text (minus its leading
// '['). The negation of the shortcut must match the negation of the
// enclosing set, otherwise an error is raised.
//
// Throws runtime_error on premature end of regex, on a negation mismatch,
// on an invalid range (via charset_range ()) and when a non-negated set
// turns out to be empty.
static void charset (state &state_, string &chars_, bool &negated_)
{
    CharT ch_ = 0;
    bool eos_ = state_.next (ch_);

    if (eos_)
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex "
            "following '['.");
    }

    negated_ = ch_ == '^';

    if (negated_)
    {
        eos_ = state_.next (ch_);

        if (eos_)
        {
            // Pointless returning index if at end of string
            throw runtime_error ("Unexpected end of regex "
                "following '^'.");
        }
    }

    // chset_ records whether the item just parsed was a charset shortcut
    // (which cannot start a range); prev_ holds the last single character.
    bool chset_ = false;
    CharT prev_ = 0;

    while (ch_ != ']')
    {
        if (ch_ == '\\')
        {
            std::size_t str_len_ = 0;
            const CharT *str_ = escape_sequence (state_, prev_, str_len_);

            chset_ = str_ != 0;

            if (chset_)
            {
                // Skip the shortcut literal's leading '[' and parse the
                // remainder as a nested charset.
                state temp_state_ (str_ + 1, str_ + str_len_,
                    state_._flags, state_._locale);
                string temp_chars_;
                bool temp_negated_ = false;

                charset (temp_state_, temp_chars_, temp_negated_);

                if (negated_ != temp_negated_)
                {
                    std::ostringstream ss_;

                    ss_ << "Mismatch in charset negation preceding "
                        "index " << state_.index () << '.';
                    throw runtime_error (ss_.str ().c_str ());
                }

                chars_ += temp_chars_;
            }
        }
/*
        else if (ch_ == '[' && !state_.eos () && *state_._curr == ':')
        {
            // TODO: POSIX charsets
        }
*/
        else
        {
            chset_ = false;
            prev_ = ch_;
        }

        eos_ = state_.next (ch_);

        // Covers preceding if, else if and else
        if (eos_)
        {
            // Pointless returning index if at end of string
            throw runtime_error ("Unexpected end of regex "
                "(missing ']').");
        }

        if (ch_ == '-')
        {
            // prev_ is the start of a range; charset_range () consumes the
            // end of the range and leaves the following character in ch_.
            charset_range (chset_, state_, eos_, ch_, prev_, chars_);
        }
        else if (!chset_)
        {
            if ((state_._flags & icase) &&
                (std::isupper (prev_, state_._locale) ||
                std::islower (prev_, state_._locale)))
            {
                CharT upper_ = std::toupper (prev_, state_._locale);
                CharT lower_ = std::tolower (prev_, state_._locale);

                chars_ += upper_;
                chars_ += lower_;
            }
            else
            {
                chars_ += prev_;
            }
        }
    }

    if (!negated_ && chars_.empty ())
    {
        throw runtime_error ("Empty charsets not allowed.");
    }
}
// Decodes a single escaped character starting at the current position of
// state_ and returns it.
//
// Recognised forms: an octal digit sequence (0-7...), \a, \b, \cX (control
// character), \e (escape, value 27), \f, \n, \r, \t, \v and \xH... (hex).
// Any other character is returned as itself. In every case the characters
// making up the escape are consumed from state_.
static CharT chr (state &state_)
{
    CharT ch_ = 0;

    // eos_ has already been checked for.
    switch (*state_._curr)
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            ch_ = decode_octal (state_);
            break;
        case 'a':
            ch_ = '\a';
            state_.increment ();
            break;
        case 'b':
            ch_ = '\b';
            state_.increment ();
            break;
        case 'c':
            ch_ = decode_control_char (state_);
            break;
        case 'e':
            ch_ = 27; // '\e' not recognised by compiler
            state_.increment ();
            break;
        case 'f':
            ch_ = '\f';
            state_.increment ();
            break;
        case 'n':
            ch_ = '\n';
            state_.increment ();
            break;
        case 'r':
            ch_ = '\r';
            state_.increment ();
            break;
        case 't':
            ch_ = '\t';
            state_.increment ();
            break;
        case 'v':
            ch_ = '\v';
            state_.increment ();
            break;
        case 'x':
            ch_ = decode_hex (state_);
            break;
        default:
            // Not a special escape: the character stands for itself.
            ch_ = *state_._curr;
            state_.increment ();
            break;
    }

    return ch_;
}
private: | |
// Maps a charset shortcut letter to the equivalent bracketed charset
// (narrow character version).
//
// ch_      - the character following a '\\': one of d, D, s, S, w, W.
// str_len_ - receives the length of the returned string, or 0 when ch_ is
//            not a shortcut letter.
//
// Returns a pointer to a string literal such as "[0-9]", or 0 when ch_ is
// not a shortcut letter.
static const char *charset_shortcut (const char ch_,
    std::size_t &str_len_)
{
    const char *str_ = 0;

    switch (ch_)
    {
        case 'd':
            str_ = "[0-9]";
            break;
        case 'D':
            str_ = "[^0-9]";
            break;
        case 's':
            str_ = "[ \t\n\r\f\v]";
            break;
        case 'S':
            str_ = "[^ \t\n\r\f\v]";
            break;
        case 'w':
            str_ = "[_0-9A-Za-z]";
            break;
        case 'W':
            str_ = "[^_0-9A-Za-z]";
            break;
    }

    if (str_)
    {
        // Some systems have strlen in namespace std.
        using namespace std;

        str_len_ = strlen (str_);
    }
    else
    {
        str_len_ = 0;
    }

    return str_;
}
// Maps a charset shortcut letter to the equivalent bracketed charset
// (wide character version).
//
// ch_      - the character following a '\\': one of d, D, s, S, w, W.
// str_len_ - receives the length of the returned string, or 0 when ch_ is
//            not a shortcut letter.
//
// Returns a pointer to a wide string literal such as L"[0-9]", or 0 when
// ch_ is not a shortcut letter.
static const wchar_t *charset_shortcut (const wchar_t ch_,
    std::size_t &str_len_)
{
    const wchar_t *str_ = 0;

    switch (ch_)
    {
        case 'd':
            str_ = L"[0-9]";
            break;
        case 'D':
            str_ = L"[^0-9]";
            break;
        case 's':
            str_ = L"[ \t\n\r\f\v]";
            break;
        case 'S':
            str_ = L"[^ \t\n\r\f\v]";
            break;
        case 'w':
            str_ = L"[_0-9A-Za-z]";
            break;
        case 'W':
            str_ = L"[^_0-9A-Za-z]";
            break;
    }

    if (str_)
    {
        // Some systems have wcslen in namespace std.
        using namespace std;

        str_len_ = wcslen (str_);
    }
    else
    {
        str_len_ = 0;
    }

    return str_;
}
// Decodes an octal escape of one to three digits starting at the current
// position of state_ and returns its value converted to CharT.
//
// chr () only calls this when the current character is '0'..'7', so the
// first digit is accumulated without being validated here. Decoding stops
// after three digits, at the end of the regex, or at the first character
// that is not an octal digit; that character is left unconsumed.
static CharT decode_octal (state &state_)
{
    std::size_t accumulator_ = 0;
    CharT ch_ = *state_._curr;
    unsigned short count_ = 3;
    bool eos_ = false;

    for (;;)
    {
        accumulator_ *= 8;
        accumulator_ += ch_ - '0';
        --count_;
        state_.increment ();
        eos_ = state_.eos ();

        if (!count_ || eos_) break;

        ch_ = *state_._curr;

        // Don't consume invalid chars!
        if (ch_ < '0' || ch_ > '7')
        {
            break;
        }
    }

    return static_cast<CharT> (accumulator_);
}
// Decodes a \cX control character escape. On entry the current character
// of state_ is the 'c'.
//
// 'a'..'z' and 'A'..'Z' map to the values 1..26 and '@' maps to 0.
//
// Throws runtime_error if the regex ends after the 'c' or if the following
// character is not one of the above.
static CharT decode_control_char (state &state_)
{
    // Skip over 'c'
    state_.increment ();

    CharT ch_ = 0;
    bool eos_ = state_.next (ch_);

    if (eos_)
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\c.");
    }

    if (ch_ >= 'a' && ch_ <= 'z')
    {
        ch_ -= 'a' - 1;
    }
    else if (ch_ >= 'A' && ch_ <= 'Z')
    {
        ch_ -= 'A' - 1;
    }
    else if (ch_ == '@')
    {
        // Apparently...
        ch_ = 0;
    }
    else
    {
        std::ostringstream ss_;

        ss_ << "Invalid control char at index " <<
            state_.index () - 1 << '.';
        throw runtime_error (ss_.str ().c_str ());
    }

    return ch_;
}
// Decodes a \xH... hexadecimal escape. On entry the current character of
// state_ is the 'x'.
//
// At least one hex digit is required. Every consecutive hex digit that
// follows is consumed (there is no limit on the digit count) and the
// accumulated value is converted to CharT with a static_cast. The first
// non-hex character is left unconsumed.
//
// Throws runtime_error if the regex ends after the 'x' or if the character
// following the 'x' is not a hex digit.
static CharT decode_hex (state &state_)
{
    // Skip over 'x'
    state_.increment ();

    CharT ch_ = 0;
    bool eos_ = state_.next (ch_);

    if (eos_)
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\x.");
    }

    if (!((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') ||
        (ch_ >= 'A' && ch_ <= 'F')))
    {
        std::ostringstream ss_;

        ss_ << "Illegal char following \\x at index " <<
            state_.index () - 1 << '.';
        throw runtime_error (ss_.str ().c_str ());
    }

    std::size_t hex_ = 0;

    do
    {
        // ch_ is known to be a hex digit here: fold it into the total.
        hex_ *= 16;

        if (ch_ >= '0' && ch_ <= '9')
        {
            hex_ += ch_ - '0';
        }
        else if (ch_ >= 'a' && ch_ <= 'f')
        {
            hex_ += 10 + (ch_ - 'a');
        }
        else
        {
            hex_ += 10 + (ch_ - 'A');
        }

        eos_ = state_.eos ();

        if (!eos_)
        {
            ch_ = *state_._curr;

            // Don't consume invalid chars!
            if (((ch_ >= '0' && ch_ <= '9') ||
                (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F')))
            {
                state_.increment ();
            }
            else
            {
                // Not really end of string: reuse the flag to end the loop.
                eos_ = true;
            }
        }
    } while (!eos_);

    return static_cast<CharT> (hex_);
}
static void charset_range (const bool chset_, state &state_, bool &eos_, | |
CharT &ch_, const CharT prev_, string &chars_) | |
{ | |
if (chset_) | |
{ | |
std::ostringstream ss_; | |
ss_ << "Charset cannot form start of range preceding " | |
"index " << state_.index () - 1 << '.'; | |
throw runtime_error (ss_.str ().c_str ()); | |
} | |
eos_ = state_.next (ch_); | |
if (eos_) | |
{ | |
// Pointless returning index if at end of string | |
throw runtime_error ("Unexpected end of regex " | |
"following '-'."); | |
} | |
CharT curr_ = 0; | |
if (ch_ == '\\') | |
{ | |
std::size_t str_len_ = 0; | |
if (escape_sequence (state_, curr_, str_len_)) | |
{ | |
std::ostringstream ss_; | |
ss_ << "Charset cannot form end of range preceding index " | |
<< state_.index () << '.'; | |
throw runtime_error (ss_.str ().c_str ()); | |
} | |
} | |
/* | |
else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') | |
{ | |
std::ostringstream ss_; | |
ss_ << "POSIX char class cannot form end of range at " | |
"index " << state_.index () - 1 << '.'; | |
throw runtime_error (ss_.str ().c_str ()); | |
} | |
*/ | |
else | |
{ | |
curr_ = ch_; | |
} | |
eos_ = state_.next (ch_); | |
// Covers preceding if and else | |
if (eos_) | |
{ | |
// Pointless returning index if at end of string | |
throw runtime_error ("Unexpected end of regex " | |
"(missing ']')."); | |
} | |
std::size_t start_ = static_cast<typename Traits::index_type> (prev_); | |
std::size_t end_ = static_cast<typename Traits::index_type> (curr_); | |
// Semanic check | |
if (end_ < start_) | |
{ | |
std::ostringstream ss_; | |
ss_ << "Invalid range in charset preceding index " << | |
state_.index () - 1 << '.'; | |
throw runtime_error (ss_.str ().c_str ()); | |
} | |
chars_.reserve (chars_.size () + (end_ + 1 - start_)); | |
for (; start_ <= end_; ++start_) | |
{ | |
CharT ch_ = static_cast<CharT> (start_); | |
if ((state_._flags & icase) && | |
(std::isupper (ch_, state_._locale) || | |
std::islower (ch_, state_._locale))) | |
{ | |
CharT upper_ = std::toupper (ch_, state_._locale); | |
CharT lower_ = std::tolower (ch_, state_._locale); | |
chars_ += (upper_); | |
chars_ += (lower_); | |
} | |
else | |
{ | |
chars_ += (ch_); | |
} | |
} | |
} | |
}; | |
} | |
} | |
} | |
#endif |