// blob: 363bebcc1e4abd0f03ed7fa248640e343504c9f8 [file] [log] [blame]
// tokeniser_helper.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_RE_TOKENISER_HELPER_H
#define BOOST_LEXER_RE_TOKENISER_HELPER_H
#include "../../char_traits.hpp"
// strlen()
#include <cstring>
#include "../../size_t.hpp"
#include "re_tokeniser_state.hpp"
namespace boost
{
namespace lexer
{
namespace detail
{
template<typename CharT, typename Traits = char_traits<CharT> >
class basic_re_tokeniser_helper
{
public:
typedef basic_re_tokeniser_state<CharT> state;
typedef std::basic_string<CharT> string;
// Decodes whatever follows a '\' in the regex.
//
// state_   - tokeniser state, positioned on the character after the '\'.
// ch_      - receives the decoded character when the escape is an ordinary
//            single-character escape; left untouched for a shortcut.
// str_len_ - set by charset_shortcut(): length of the returned text, or 0.
//
// Returns the replacement charset text when the escaped character is a
// charset shortcut (the shortcut character is then stepped over), or 0
// when the escape was decoded into ch_ by chr().
// Throws runtime_error if the regex ends directly after the '\'.
static const CharT *escape_sequence (state &state_, CharT &ch_,
    std::size_t &str_len_)
{
    if (state_.eos ())
    {
        throw runtime_error ("Unexpected end of regex "
            "following '\\'.");
    }

    const CharT * const shortcut_ =
        charset_shortcut (*state_._curr, str_len_);

    if (shortcut_ == 0)
    {
        // Ordinary escape: chr() consumes it and yields one character.
        ch_ = chr (state_);
    }
    else
    {
        // Charset shortcut: step over the shortcut character itself.
        state_.increment ();
    }

    return shortcut_;
}
// Reads the body of a bracket expression from state_ until the closing ']'.
//
// state_   - tokeniser state; the first character read is the one that
//            follows the opening '['.
// chars_   - every character that belongs to the set is appended here.
// negated_ - set to true when the set starts with '^'.
//
// A charset shortcut inside the set is expanded by calling this function
// recursively on the shortcut's replacement text, so the function can call
// itself.
// Throws runtime_error on premature end of the regex, when a shortcut's
// negation differs from that of the enclosing set, or when a non-negated
// set ends up empty.
static void charset (state &state_, string &chars_, bool &negated_)
{
    CharT ch_ = 0;

    if (state_.next (ch_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex "
            "following '['.");
    }

    negated_ = (ch_ == '^');

    // Only step past the '^' when there actually was one.
    if (negated_ && state_.next (ch_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex "
            "following '^'.");
    }

    // True while the most recent item was a charset shortcut.
    bool is_shortcut_ = false;
    // Most recent single character; appended once we know that it does
    // not start a range.
    CharT pending_ = 0;

    for (; ch_ != ']';)
    {
        if (ch_ != '\\')
        {
            // TODO: POSIX charsets ("[:" ... ":]") are not handled.
            is_shortcut_ = false;
            pending_ = ch_;
        }
        else
        {
            std::size_t expansion_len_ = 0;
            const CharT *expansion_ =
                escape_sequence (state_, pending_, expansion_len_);

            is_shortcut_ = expansion_ != 0;

            if (is_shortcut_)
            {
                // Parse the replacement text as a nested charset,
                // skipping its leading '['.
                state sub_state_ (expansion_ + 1,
                    expansion_ + expansion_len_,
                    state_._flags, state_._locale);
                string sub_chars_;
                bool sub_negated_ = false;

                charset (sub_state_, sub_chars_, sub_negated_);

                if (sub_negated_ != negated_)
                {
                    std::ostringstream ss_;

                    ss_ << "Mismatch in charset negation preceding "
                        "index " << state_.index () << '.';
                    throw runtime_error (ss_.str ().c_str ());
                }

                chars_ += sub_chars_;
            }
        }

        // Look at the next character to find out whether a range follows.
        bool eos_ = state_.next (ch_);

        if (eos_)
        {
            // Pointless returning index if at end of string
            throw runtime_error ("Unexpected end of regex "
                "(missing ']').");
        }

        if (ch_ == '-')
        {
            charset_range (is_shortcut_, state_, eos_, ch_, pending_,
                chars_);
        }
        else if (!is_shortcut_)
        {
            const bool fold_case_ = (state_._flags & icase) &&
                (std::isupper (pending_, state_._locale) ||
                std::islower (pending_, state_._locale));

            if (fold_case_)
            {
                // Case-insensitive: store both forms of the letter.
                const CharT as_upper_ =
                    std::toupper (pending_, state_._locale);
                const CharT as_lower_ =
                    std::tolower (pending_, state_._locale);

                chars_ += as_upper_;
                chars_ += as_lower_;
            }
            else
            {
                chars_ += pending_;
            }
        }
    }

    if (chars_.empty () && !negated_)
    {
        throw runtime_error ("Empty charsets not allowed.");
    }
}
// Decodes a single escaped character and consumes it from state_.
//
// state_ must be positioned on the character that follows the '\'; the
// caller has already checked for end of input.
// Octal digits, 'c' and 'x' are handed to their dedicated decoders, the
// usual single-letter escapes map to their control characters, and any
// other character stands for itself.
static CharT chr (state &state_)
{
    const CharT escaped_ = *state_._curr;

    // Multi-character escapes: the decoders consume their own input.
    if (escaped_ >= '0' && escaped_ <= '7')
    {
        return decode_octal (state_);
    }

    if (escaped_ == 'c')
    {
        return decode_control_char (state_);
    }

    if (escaped_ == 'x')
    {
        return decode_hex (state_);
    }

    // Single-character escapes: default to the character itself.
    CharT result_ = escaped_;

    switch (escaped_)
    {
        case 'a': result_ = '\a'; break;
        case 'b': result_ = '\b'; break;
        case 'e': result_ = 27; break; // '\e' not recognised by compiler
        case 'f': result_ = '\f'; break;
        case 'n': result_ = '\n'; break;
        case 'r': result_ = '\r'; break;
        case 't': result_ = '\t'; break;
        case 'v': result_ = '\v'; break;
        default: break;
    }

    state_.increment ();
    return result_;
}
private:
// Maps a charset shortcut character (d, D, s, S, w or W) to the bracket
// expression it stands for (narrow character version).
//
// ch_      - the character that followed the '\'.
// str_len_ - receives the length of the returned text, or 0 if ch_ is
//            not a shortcut.
//
// Returns the replacement text, or 0 when ch_ is not a shortcut.
static const char *charset_shortcut (const char ch_,
    std::size_t &str_len_)
{
    // Some systems have strlen in namespace std.
    using namespace std;

    const char *expansion_ = 0;

    if (ch_ == 'd')
    {
        expansion_ = "[0-9]";
    }
    else if (ch_ == 'D')
    {
        expansion_ = "[^0-9]";
    }
    else if (ch_ == 's')
    {
        expansion_ = "[ \t\n\r\f\v]";
    }
    else if (ch_ == 'S')
    {
        expansion_ = "[^ \t\n\r\f\v]";
    }
    else if (ch_ == 'w')
    {
        expansion_ = "[_0-9A-Za-z]";
    }
    else if (ch_ == 'W')
    {
        expansion_ = "[^_0-9A-Za-z]";
    }

    str_len_ = expansion_ ? strlen (expansion_) : 0;
    return expansion_;
}
// Maps a charset shortcut character (d, D, s, S, w or W) to the bracket
// expression it stands for (wide character version).
//
// ch_      - the character that followed the '\'.
// str_len_ - receives the length of the returned text, or 0 if ch_ is
//            not a shortcut.
//
// Returns the replacement text, or 0 when ch_ is not a shortcut.
static const wchar_t *charset_shortcut (const wchar_t ch_,
    std::size_t &str_len_)
{
    // Some systems have wcslen in namespace std.
    using namespace std;

    const wchar_t *expansion_ = 0;

    if (ch_ == 'd')
    {
        expansion_ = L"[0-9]";
    }
    else if (ch_ == 'D')
    {
        expansion_ = L"[^0-9]";
    }
    else if (ch_ == 's')
    {
        expansion_ = L"[ \t\n\r\f\v]";
    }
    else if (ch_ == 'S')
    {
        expansion_ = L"[^ \t\n\r\f\v]";
    }
    else if (ch_ == 'w')
    {
        expansion_ = L"[_0-9A-Za-z]";
    }
    else if (ch_ == 'W')
    {
        expansion_ = L"[^_0-9A-Za-z]";
    }

    str_len_ = expansion_ ? wcslen (expansion_) : 0;
    return expansion_;
}
// Decodes an octal escape of at most three digits.
//
// state_ must be positioned on the first octal digit (chr() only calls
// this for '0'..'7'). Digits are consumed until three have been read, the
// input ends, or a non-octal character is met; that character is left
// unconsumed. The accumulated value is converted to CharT.
static CharT decode_octal (state &state_)
{
    std::size_t value_ = 0;
    unsigned short digits_left_ = 3;
    CharT digit_ = *state_._curr;

    do
    {
        value_ = value_ * 8 + (digit_ - '0');
        state_.increment ();
        --digits_left_;

        const bool at_end_ = state_.eos ();

        if (at_end_ || digits_left_ == 0)
        {
            break;
        }

        // Peek only: don't consume invalid chars!
        digit_ = *state_._curr;
    } while (digit_ >= '0' && digit_ <= '7');

    return static_cast<CharT> (value_);
}
// Decodes a control character escape: 'c' followed by a letter or '@'.
//
// state_ must be positioned on the 'c'. A letter of either case maps to
// 1..26 and '@' maps to 0.
// Throws runtime_error if the regex ends after the 'c' or if the
// following character is neither a letter nor '@'.
static CharT decode_control_char (state &state_)
{
    // Skip over 'c'
    state_.increment ();

    CharT ctrl_ = 0;

    if (state_.next (ctrl_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\c.");
    }

    if (ctrl_ >= 'a' && ctrl_ <= 'z')
    {
        ctrl_ -= 'a' - 1;
        return ctrl_;
    }

    if (ctrl_ >= 'A' && ctrl_ <= 'Z')
    {
        ctrl_ -= 'A' - 1;
        return ctrl_;
    }

    if (ctrl_ == '@')
    {
        // Apparently...
        return 0;
    }

    std::ostringstream ss_;

    ss_ << "Invalid control char at index " <<
        state_.index () - 1 << '.';
    throw runtime_error (ss_.str ().c_str ());
}
// Decodes a hexadecimal escape: 'x' followed by one or more hex digits.
//
// state_ must be positioned on the 'x'. Every consecutive hex digit is
// consumed; the first non-hex character is left unconsumed. The
// accumulated value is converted to CharT.
// Throws runtime_error if the regex ends after the 'x' or if the 'x' is
// not followed by a hex digit.
static CharT decode_hex (state &state_)
{
    // Skip over 'x'
    state_.increment ();

    CharT digit_ = 0;

    if (state_.next (digit_))
    {
        // Pointless returning index if at end of string
        throw runtime_error ("Unexpected end of regex following \\x.");
    }

    const bool first_is_hex_ = (digit_ >= '0' && digit_ <= '9') ||
        (digit_ >= 'a' && digit_ <= 'f') ||
        (digit_ >= 'A' && digit_ <= 'F');

    if (!first_is_hex_)
    {
        std::ostringstream ss_;

        ss_ << "Illegal char following \\x at index " <<
            state_.index () - 1 << '.';
        throw runtime_error (ss_.str ().c_str ());
    }

    std::size_t value_ = 0;

    for (;;)
    {
        // digit_ is known to be a hex digit at this point.
        value_ *= 16;

        if (digit_ >= '0' && digit_ <= '9')
        {
            value_ += digit_ - '0';
        }
        else if (digit_ >= 'a' && digit_ <= 'f')
        {
            value_ += 10 + (digit_ - 'a');
        }
        else
        {
            value_ += 10 + (digit_ - 'A');
        }

        if (state_.eos ())
        {
            break;
        }

        digit_ = *state_._curr;

        const bool next_is_hex_ = (digit_ >= '0' && digit_ <= '9') ||
            (digit_ >= 'a' && digit_ <= 'f') ||
            (digit_ >= 'A' && digit_ <= 'F');

        // Don't consume invalid chars!
        if (!next_is_hex_)
        {
            break;
        }

        state_.increment ();
    }

    return static_cast<CharT> (value_);
}
static void charset_range (const bool chset_, state &state_, bool &eos_,
CharT &ch_, const CharT prev_, string &chars_)
{
if (chset_)
{
std::ostringstream ss_;
ss_ << "Charset cannot form start of range preceding "
"index " << state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
eos_ = state_.next (ch_);
if (eos_)
{
// Pointless returning index if at end of string
throw runtime_error ("Unexpected end of regex "
"following '-'.");
}
CharT curr_ = 0;
if (ch_ == '\\')
{
std::size_t str_len_ = 0;
if (escape_sequence (state_, curr_, str_len_))
{
std::ostringstream ss_;
ss_ << "Charset cannot form end of range preceding index "
<< state_.index () << '.';
throw runtime_error (ss_.str ().c_str ());
}
}
/*
else if (ch_ == '[' && !state_.eos () && *state_._curr == ':')
{
std::ostringstream ss_;
ss_ << "POSIX char class cannot form end of range at "
"index " << state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
*/
else
{
curr_ = ch_;
}
eos_ = state_.next (ch_);
// Covers preceding if and else
if (eos_)
{
// Pointless returning index if at end of string
throw runtime_error ("Unexpected end of regex "
"(missing ']').");
}
std::size_t start_ = static_cast<typename Traits::index_type> (prev_);
std::size_t end_ = static_cast<typename Traits::index_type> (curr_);
// Semanic check
if (end_ < start_)
{
std::ostringstream ss_;
ss_ << "Invalid range in charset preceding index " <<
state_.index () - 1 << '.';
throw runtime_error (ss_.str ().c_str ());
}
chars_.reserve (chars_.size () + (end_ + 1 - start_));
for (; start_ <= end_; ++start_)
{
CharT ch_ = static_cast<CharT> (start_);
if ((state_._flags & icase) &&
(std::isupper (ch_, state_._locale) ||
std::islower (ch_, state_._locale)))
{
CharT upper_ = std::toupper (ch_, state_._locale);
CharT lower_ = std::tolower (ch_, state_._locale);
chars_ += (upper_);
chars_ += (lower_);
}
else
{
chars_ += (ch_);
}
}
}
};
}
}
}
#endif