/* | |
* | |
* Copyright (c) 2004 | |
* John Maddock | |
* | |
* Use, modification and distribution are subject to the | |
* Boost Software License, Version 1.0. (See accompanying file | |
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
* | |
*/ | |
/* | |
* LOCATION: see http://www.boost.org for most recent version. | |
* FILE basic_regex_parser.cpp | |
* VERSION see <boost/version.hpp> | |
* DESCRIPTION: Declares template class basic_regex_parser. | |
*/ | |
#ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP | |
#define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable: 4103) | |
#endif | |
#ifdef BOOST_HAS_ABI_HEADERS | |
# include BOOST_ABI_PREFIX | |
#endif | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
namespace boost{ | |
namespace re_detail{ | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable:4244 4800) | |
#endif | |
template <class charT, class traits> | |
class basic_regex_parser : public basic_regex_creator<charT, traits> | |
{ | |
public: | |
basic_regex_parser(regex_data<charT, traits>* data); | |
void parse(const charT* p1, const charT* p2, unsigned flags); | |
void fail(regex_constants::error_type error_code, std::ptrdiff_t position); | |
void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos); | |
void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message) | |
{ | |
fail(error_code, position, message, position); | |
} | |
bool parse_all(); | |
bool parse_basic(); | |
bool parse_extended(); | |
bool parse_literal(); | |
bool parse_open_paren(); | |
bool parse_basic_escape(); | |
bool parse_extended_escape(); | |
bool parse_match_any(); | |
bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)()); | |
bool parse_repeat_range(bool isbasic); | |
bool parse_alt(); | |
bool parse_set(); | |
bool parse_backref(); | |
void parse_set_literal(basic_char_set<charT, traits>& char_set); | |
bool parse_inner_set(basic_char_set<charT, traits>& char_set); | |
bool parse_QE(); | |
bool parse_perl_extension(); | |
bool add_emacs_code(bool negate); | |
bool unwind_alts(std::ptrdiff_t last_paren_start); | |
digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set); | |
charT unescape_character(); | |
regex_constants::syntax_option_type parse_options(); | |
private: | |
typedef bool (basic_regex_parser::*parser_proc_type)(); | |
typedef typename traits::string_type string_type; | |
typedef typename traits::char_class_type char_class_type; | |
parser_proc_type m_parser_proc; // the main parser to use | |
const charT* m_base; // the start of the string being parsed | |
const charT* m_end; // the end of the string being parsed | |
const charT* m_position; // our current parser position | |
unsigned m_mark_count; // how many sub-expressions we have | |
int m_mark_reset; // used to indicate that we're inside a (?|...) block. | |
unsigned m_max_mark; // largest mark count seen inside a (?|...) block. | |
std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted). | |
std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative | |
bool m_has_case_change; // true if somewhere in the current block the case has changed | |
#if defined(BOOST_MSVC) && defined(_M_IX86) | |
// This is an ugly warning suppression workaround (for warnings *inside* std::vector | |
// that can not otherwise be suppressed)... | |
BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*)); | |
std::vector<long> m_alt_jumps; // list of alternative in the current scope. | |
#else | |
std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope. | |
#endif | |
basic_regex_parser& operator=(const basic_regex_parser&); | |
basic_regex_parser(const basic_regex_parser&); | |
}; | |
template <class charT, class traits> | |
basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data) | |
: basic_regex_creator<charT, traits>(data), m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false) | |
{ | |
} | |
template <class charT, class traits> | |
void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags) | |
{ | |
// pass l_flags on to base class: | |
this->init(l_flags); | |
// set up pointers: | |
m_position = m_base = p1; | |
m_end = p2; | |
// empty strings are errors: | |
if((p1 == p2) && | |
( | |
((l_flags & regbase::main_option_type) != regbase::perl_syntax_group) | |
|| (l_flags & regbase::no_empty_expressions) | |
) | |
) | |
{ | |
fail(regex_constants::error_empty, 0); | |
return; | |
} | |
// select which parser to use: | |
switch(l_flags & regbase::main_option_type) | |
{ | |
case regbase::perl_syntax_group: | |
{ | |
m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended; | |
// | |
// Add a leading paren with index zero to give recursions a target: | |
// | |
re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace))); | |
br->index = 0; | |
br->icase = this->flags() & regbase::icase; | |
break; | |
} | |
case regbase::basic_syntax_group: | |
m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic; | |
break; | |
case regbase::literal: | |
m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal; | |
break; | |
default: | |
// Ooops, someone has managed to set more than one of the main option flags, | |
// so this must be an error: | |
fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used."); | |
return; | |
} | |
// parse all our characters: | |
bool result = parse_all(); | |
// | |
// Unwind our alternatives: | |
// | |
unwind_alts(-1); | |
// reset l_flags as a global scope (?imsx) may have altered them: | |
this->flags(l_flags); | |
// if we haven't gobbled up all the characters then we must | |
// have had an unexpected ')' : | |
if(!result) | |
{ | |
fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_position), "Found a closing ) with no corresponding openening parenthesis."); | |
return; | |
} | |
// if an error has been set then give up now: | |
if(this->m_pdata->m_status) | |
return; | |
// fill in our sub-expression count: | |
this->m_pdata->m_mark_count = 1 + m_mark_count; | |
this->finalize(p1, p2); | |
} | |
template <class charT, class traits> | |
void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position) | |
{ | |
// get the error message: | |
std::string message = this->m_pdata->m_ptraits->error_string(error_code); | |
fail(error_code, position, message); | |
} | |
template <class charT, class traits> | |
void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos) | |
{ | |
if(0 == this->m_pdata->m_status) // update the error code if not already set | |
this->m_pdata->m_status = error_code; | |
m_position = m_end; // don't bother parsing anything else | |
// | |
// Augment error message with the regular expression text: | |
// | |
if(start_pos == position) | |
start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10)); | |
std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base)); | |
if(error_code != regex_constants::error_empty) | |
{ | |
if((start_pos != 0) || (end_pos != (m_end - m_base))) | |
message += " The error occured while parsing the regular expression fragment: '"; | |
else | |
message += " The error occured while parsing the regular expression: '"; | |
if(start_pos != end_pos) | |
{ | |
message += std::string(m_base + start_pos, m_base + position); | |
message += ">>>HERE>>>"; | |
message += std::string(m_base + position, m_base + end_pos); | |
} | |
message += "'."; | |
} | |
#ifndef BOOST_NO_EXCEPTIONS | |
if(0 == (this->flags() & regex_constants::no_except)) | |
{ | |
boost::regex_error e(message, error_code, position); | |
e.raise(); | |
} | |
#else | |
(void)position; // suppress warnings. | |
#endif | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_all() | |
{ | |
bool result = true; | |
while(result && (m_position != m_end)) | |
{ | |
result = (this->*m_parser_proc)(); | |
} | |
return result; | |
} | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable:4702) | |
#endif | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_basic() | |
{ | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_escape: | |
return parse_basic_escape(); | |
case regex_constants::syntax_dot: | |
return parse_match_any(); | |
case regex_constants::syntax_caret: | |
++m_position; | |
this->append_state(syntax_element_start_line); | |
break; | |
case regex_constants::syntax_dollar: | |
++m_position; | |
this->append_state(syntax_element_end_line); | |
break; | |
case regex_constants::syntax_star: | |
if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line)) | |
return parse_literal(); | |
else | |
{ | |
++m_position; | |
return parse_repeat(); | |
} | |
case regex_constants::syntax_plus: | |
if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex)) | |
return parse_literal(); | |
else | |
{ | |
++m_position; | |
return parse_repeat(1); | |
} | |
case regex_constants::syntax_question: | |
if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex)) | |
return parse_literal(); | |
else | |
{ | |
++m_position; | |
return parse_repeat(0, 1); | |
} | |
case regex_constants::syntax_open_set: | |
return parse_set(); | |
case regex_constants::syntax_newline: | |
if(this->flags() & regbase::newline_alt) | |
return parse_alt(); | |
else | |
return parse_literal(); | |
default: | |
return parse_literal(); | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_extended() | |
{ | |
bool result = true; | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_open_mark: | |
return parse_open_paren(); | |
case regex_constants::syntax_close_mark: | |
return false; | |
case regex_constants::syntax_escape: | |
return parse_extended_escape(); | |
case regex_constants::syntax_dot: | |
return parse_match_any(); | |
case regex_constants::syntax_caret: | |
++m_position; | |
this->append_state( | |
(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line)); | |
break; | |
case regex_constants::syntax_dollar: | |
++m_position; | |
this->append_state( | |
(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line)); | |
break; | |
case regex_constants::syntax_star: | |
if(m_position == this->m_base) | |
{ | |
fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression."); | |
return false; | |
} | |
++m_position; | |
return parse_repeat(); | |
case regex_constants::syntax_question: | |
if(m_position == this->m_base) | |
{ | |
fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression."); | |
return false; | |
} | |
++m_position; | |
return parse_repeat(0,1); | |
case regex_constants::syntax_plus: | |
if(m_position == this->m_base) | |
{ | |
fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression."); | |
return false; | |
} | |
++m_position; | |
return parse_repeat(1); | |
case regex_constants::syntax_open_brace: | |
++m_position; | |
return parse_repeat_range(false); | |
case regex_constants::syntax_close_brace: | |
fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {."); | |
return false; | |
case regex_constants::syntax_or: | |
return parse_alt(); | |
case regex_constants::syntax_open_set: | |
return parse_set(); | |
case regex_constants::syntax_newline: | |
if(this->flags() & regbase::newline_alt) | |
return parse_alt(); | |
else | |
return parse_literal(); | |
case regex_constants::syntax_hash: | |
// | |
// If we have a mod_x flag set, then skip until | |
// we get to a newline character: | |
// | |
if((this->flags() | |
& (regbase::no_perl_ex|regbase::mod_x)) | |
== regbase::mod_x) | |
{ | |
while((m_position != m_end) && !is_separator(*m_position++)){} | |
return true; | |
} | |
// Otherwise fall through: | |
default: | |
result = parse_literal(); | |
break; | |
} | |
return result; | |
} | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_literal() | |
{ | |
// append this as a literal provided it's not a space character | |
// or the perl option regbase::mod_x is not set: | |
if( | |
((this->flags() | |
& (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex)) | |
!= regbase::mod_x) | |
|| !this->m_traits.isctype(*m_position, this->m_mask_space)) | |
this->append_literal(*m_position); | |
++m_position; | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_open_paren() | |
{ | |
// | |
// skip the '(' and error check: | |
// | |
if(++m_position == m_end) | |
{ | |
fail(regex_constants::error_paren, m_position - m_base); | |
return false; | |
} | |
// | |
// begin by checking for a perl-style (?...) extension: | |
// | |
if( | |
((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0) | |
|| ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex)) | |
) | |
{ | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question) | |
return parse_perl_extension(); | |
} | |
// | |
// update our mark count, and append the required state: | |
// | |
unsigned markid = 0; | |
if(0 == (this->flags() & regbase::nosubs)) | |
{ | |
markid = ++m_mark_count; | |
#ifndef BOOST_NO_STD_DISTANCE | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0)); | |
#else | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0)); | |
#endif | |
} | |
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace))); | |
pb->index = markid; | |
pb->icase = this->flags() & regbase::icase; | |
std::ptrdiff_t last_paren_start = this->getoffset(pb); | |
// back up insertion point for alternations, and set new point: | |
std::ptrdiff_t last_alt_point = m_alt_insert_point; | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
// | |
// back up the current flags in case we have a nested (?imsx) group: | |
// | |
regex_constants::syntax_option_type opts = this->flags(); | |
bool old_case_change = m_has_case_change; | |
m_has_case_change = false; // no changes to this scope as yet... | |
// | |
// Back up branch reset data in case we have a nested (?|...) | |
// | |
int mark_reset = m_mark_reset; | |
m_mark_reset = -1; | |
// | |
// now recursively add more states, this will terminate when we get to a | |
// matching ')' : | |
// | |
parse_all(); | |
// | |
// Unwind pushed alternatives: | |
// | |
if(0 == unwind_alts(last_paren_start)) | |
return false; | |
// | |
// restore flags: | |
// | |
if(m_has_case_change) | |
{ | |
// the case has changed in one or more of the alternatives | |
// within the scoped (...) block: we have to add a state | |
// to reset the case sensitivity: | |
static_cast<re_case*>( | |
this->append_state(syntax_element_toggle_case, sizeof(re_case)) | |
)->icase = opts & regbase::icase; | |
} | |
this->flags(opts); | |
m_has_case_change = old_case_change; | |
// | |
// restore branch reset: | |
// | |
m_mark_reset = mark_reset; | |
// | |
// we either have a ')' or we have run out of characters prematurely: | |
// | |
if(m_position == m_end) | |
{ | |
this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end)); | |
return false; | |
} | |
BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark); | |
#ifndef BOOST_NO_STD_DISTANCE | |
if(markid && (this->flags() & regbase::save_subexpression_location)) | |
this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position); | |
#else | |
if(markid && (this->flags() & regbase::save_subexpression_location)) | |
this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base); | |
#endif | |
++m_position; | |
// | |
// append closing parenthesis state: | |
// | |
pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace))); | |
pb->index = markid; | |
pb->icase = this->flags() & regbase::icase; | |
this->m_paren_start = last_paren_start; | |
// | |
// restore the alternate insertion point: | |
// | |
this->m_alt_insert_point = last_alt_point; | |
// | |
// allow backrefs to this mark: | |
// | |
if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT)) | |
this->m_backrefs |= 1u << (markid - 1); | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_basic_escape() | |
{ | |
++m_position; | |
bool result = true; | |
switch(this->m_traits.escape_syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_open_mark: | |
return parse_open_paren(); | |
case regex_constants::syntax_close_mark: | |
return false; | |
case regex_constants::syntax_plus: | |
if(this->flags() & regex_constants::bk_plus_qm) | |
{ | |
++m_position; | |
return parse_repeat(1); | |
} | |
else | |
return parse_literal(); | |
case regex_constants::syntax_question: | |
if(this->flags() & regex_constants::bk_plus_qm) | |
{ | |
++m_position; | |
return parse_repeat(0, 1); | |
} | |
else | |
return parse_literal(); | |
case regex_constants::syntax_open_brace: | |
if(this->flags() & regbase::no_intervals) | |
return parse_literal(); | |
++m_position; | |
return parse_repeat_range(true); | |
case regex_constants::syntax_close_brace: | |
if(this->flags() & regbase::no_intervals) | |
return parse_literal(); | |
fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {."); | |
return false; | |
case regex_constants::syntax_or: | |
if(this->flags() & regbase::bk_vbar) | |
return parse_alt(); | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::syntax_digit: | |
return parse_backref(); | |
case regex_constants::escape_type_start_buffer: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_buffer_start); | |
} | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::escape_type_end_buffer: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_buffer_end); | |
} | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::escape_type_word_assert: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_word_boundary); | |
} | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::escape_type_not_word_assert: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_within_word); | |
} | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::escape_type_left_word: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_word_start); | |
} | |
else | |
result = parse_literal(); | |
break; | |
case regex_constants::escape_type_right_word: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
++m_position; | |
this->append_state(syntax_element_word_end); | |
} | |
else | |
result = parse_literal(); | |
break; | |
default: | |
if(this->flags() & regbase::emacs_ex) | |
{ | |
bool negate = true; | |
switch(*m_position) | |
{ | |
case 'w': | |
negate = false; | |
// fall through: | |
case 'W': | |
{ | |
basic_char_set<charT, traits> char_set; | |
if(negate) | |
char_set.negate(); | |
char_set.add_class(this->m_word_mask); | |
if(0 == this->append_set(char_set)) | |
{ | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
++m_position; | |
return true; | |
} | |
case 's': | |
negate = false; | |
// fall through: | |
case 'S': | |
return add_emacs_code(negate); | |
case 'c': | |
case 'C': | |
// not supported yet: | |
fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead."); | |
return false; | |
default: | |
break; | |
} | |
} | |
result = parse_literal(); | |
break; | |
} | |
return result; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_extended_escape() | |
{ | |
++m_position; | |
bool negate = false; // in case this is a character class escape: \w \d etc | |
switch(this->m_traits.escape_syntax_type(*m_position)) | |
{ | |
case regex_constants::escape_type_not_class: | |
negate = true; | |
// fall through: | |
case regex_constants::escape_type_class: | |
{ | |
escape_type_class_jump: | |
typedef typename traits::char_class_type mask_type; | |
mask_type m = this->m_traits.lookup_classname(m_position, m_position+1); | |
if(m != 0) | |
{ | |
basic_char_set<charT, traits> char_set; | |
if(negate) | |
char_set.negate(); | |
char_set.add_class(m); | |
if(0 == this->append_set(char_set)) | |
{ | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
++m_position; | |
return true; | |
} | |
// | |
// not a class, just a regular unknown escape: | |
// | |
this->append_literal(unescape_character()); | |
break; | |
} | |
case regex_constants::syntax_digit: | |
return parse_backref(); | |
case regex_constants::escape_type_left_word: | |
++m_position; | |
this->append_state(syntax_element_word_start); | |
break; | |
case regex_constants::escape_type_right_word: | |
++m_position; | |
this->append_state(syntax_element_word_end); | |
break; | |
case regex_constants::escape_type_start_buffer: | |
++m_position; | |
this->append_state(syntax_element_buffer_start); | |
break; | |
case regex_constants::escape_type_end_buffer: | |
++m_position; | |
this->append_state(syntax_element_buffer_end); | |
break; | |
case regex_constants::escape_type_word_assert: | |
++m_position; | |
this->append_state(syntax_element_word_boundary); | |
break; | |
case regex_constants::escape_type_not_word_assert: | |
++m_position; | |
this->append_state(syntax_element_within_word); | |
break; | |
case regex_constants::escape_type_Z: | |
++m_position; | |
this->append_state(syntax_element_soft_buffer_end); | |
break; | |
case regex_constants::escape_type_Q: | |
return parse_QE(); | |
case regex_constants::escape_type_C: | |
return parse_match_any(); | |
case regex_constants::escape_type_X: | |
++m_position; | |
this->append_state(syntax_element_combining); | |
break; | |
case regex_constants::escape_type_G: | |
++m_position; | |
this->append_state(syntax_element_restart_continue); | |
break; | |
case regex_constants::escape_type_not_property: | |
negate = true; | |
// fall through: | |
case regex_constants::escape_type_property: | |
{ | |
++m_position; | |
char_class_type m; | |
if(m_position == m_end) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found."); | |
return false; | |
} | |
// maybe have \p{ddd} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) | |
{ | |
const charT* base = m_position; | |
// skip forward until we find enclosing brace: | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence."); | |
return false; | |
} | |
m = this->m_traits.lookup_classname(++base, m_position++); | |
} | |
else | |
{ | |
m = this->m_traits.lookup_classname(m_position, m_position+1); | |
++m_position; | |
} | |
if(m != 0) | |
{ | |
basic_char_set<charT, traits> char_set; | |
if(negate) | |
char_set.negate(); | |
char_set.add_class(m); | |
if(0 == this->append_set(char_set)) | |
{ | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
return true; | |
} | |
fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name."); | |
return false; | |
} | |
case regex_constants::escape_type_reset_start_mark: | |
if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) | |
{ | |
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace))); | |
pb->index = -5; | |
pb->icase = this->flags() & regbase::icase; | |
this->m_pdata->m_data.align(); | |
++m_position; | |
return true; | |
} | |
goto escape_type_class_jump; | |
case regex_constants::escape_type_line_ending: | |
if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) | |
{ | |
const charT* e = get_escape_R_string<charT>(); | |
const charT* old_position = m_position; | |
const charT* old_end = m_end; | |
const charT* old_base = m_base; | |
m_position = e; | |
m_base = e; | |
m_end = e + traits::length(e); | |
bool r = parse_all(); | |
m_position = ++old_position; | |
m_end = old_end; | |
m_base = old_base; | |
return r; | |
} | |
goto escape_type_class_jump; | |
case regex_constants::escape_type_extended_backref: | |
if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) | |
{ | |
bool have_brace = false; | |
bool negative = false; | |
static const char* incomplete_message = "Incomplete \\g escape found."; | |
if(++m_position == m_end) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, incomplete_message); | |
return false; | |
} | |
// maybe have \g{ddd} | |
regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position); | |
regex_constants::syntax_type syn_end = 0; | |
if((syn == regex_constants::syntax_open_brace) | |
|| (syn == regex_constants::escape_type_left_word) | |
|| (syn == regex_constants::escape_type_end_buffer)) | |
{ | |
if(++m_position == m_end) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, incomplete_message); | |
return false; | |
} | |
have_brace = true; | |
switch(syn) | |
{ | |
case regex_constants::syntax_open_brace: | |
syn_end = regex_constants::syntax_close_brace; | |
break; | |
case regex_constants::escape_type_left_word: | |
syn_end = regex_constants::escape_type_right_word; | |
break; | |
default: | |
syn_end = regex_constants::escape_type_end_buffer; | |
break; | |
} | |
} | |
negative = (*m_position == static_cast<charT>('-')); | |
if((negative) && (++m_position == m_end)) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, incomplete_message); | |
return false; | |
} | |
const charT* pc = m_position; | |
int i = this->m_traits.toi(pc, m_end, 10); | |
if((i < 0) && syn_end) | |
{ | |
// Check for a named capture, get the leftmost one if there is more than one: | |
const charT* base = m_position; | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end)) | |
{ | |
++m_position; | |
} | |
i = hash_value_from_capture_name(base, m_position); | |
pc = m_position; | |
} | |
if(negative) | |
i = 1 + m_mark_count - i; | |
if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1))))) | |
{ | |
m_position = pc; | |
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace))); | |
pb->index = i; | |
pb->icase = this->flags() & regbase::icase; | |
} | |
else | |
{ | |
fail(regex_constants::error_backref, m_position - m_base); | |
return false; | |
} | |
m_position = pc; | |
if(have_brace) | |
{ | |
if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end)) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, incomplete_message); | |
return false; | |
} | |
++m_position; | |
} | |
return true; | |
} | |
goto escape_type_class_jump; | |
case regex_constants::escape_type_control_v: | |
if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) | |
goto escape_type_class_jump; | |
// fallthrough: | |
default: | |
this->append_literal(unescape_character()); | |
break; | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_match_any() | |
{ | |
// | |
// we have a '.' that can match any character: | |
// | |
++m_position; | |
static_cast<re_dot*>( | |
this->append_state(syntax_element_wild, sizeof(re_dot)) | |
)->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s | |
? re_detail::force_not_newline | |
: this->flags() & regbase::mod_s ? | |
re_detail::force_newline : re_detail::dont_care); | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high) | |
{ | |
bool greedy = true; | |
bool pocessive = false; | |
std::size_t insert_point; | |
// | |
// when we get to here we may have a non-greedy ? mark still to come: | |
// | |
if((m_position != m_end) | |
&& ( | |
(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) | |
|| ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex))) | |
) | |
) | |
{ | |
// OK we have a perl or emacs regex, check for a '?': | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question) | |
{ | |
greedy = false; | |
++m_position; | |
} | |
// for perl regexes only check for pocessive ++ repeats. | |
if((0 == (this->flags() & regbase::main_option_type)) | |
&& (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus)) | |
{ | |
pocessive = true; | |
++m_position; | |
} | |
} | |
if(0 == this->m_last_state) | |
{ | |
fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position), "Nothing to repeat."); | |
return false; | |
} | |
if(this->m_last_state->type == syntax_element_endmark) | |
{ | |
// insert a repeat before the '(' matching the last ')': | |
insert_point = this->m_paren_start; | |
} | |
else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1)) | |
{ | |
// the last state was a literal with more than one character, split it in two: | |
re_literal* lit = static_cast<re_literal*>(this->m_last_state); | |
charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1]; | |
--(lit->length); | |
// now append new state: | |
lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT))); | |
lit->length = 1; | |
(static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c; | |
insert_point = this->getoffset(this->m_last_state); | |
} | |
else | |
{ | |
// repeat the last state whatever it was, need to add some error checking here: | |
switch(this->m_last_state->type) | |
{ | |
case syntax_element_start_line: | |
case syntax_element_end_line: | |
case syntax_element_word_boundary: | |
case syntax_element_within_word: | |
case syntax_element_word_start: | |
case syntax_element_word_end: | |
case syntax_element_buffer_start: | |
case syntax_element_buffer_end: | |
case syntax_element_alt: | |
case syntax_element_soft_buffer_end: | |
case syntax_element_restart_continue: | |
case syntax_element_jump: | |
case syntax_element_startmark: | |
case syntax_element_backstep: | |
// can't legally repeat any of the above: | |
fail(regex_constants::error_badrepeat, m_position - m_base); | |
return false; | |
default: | |
// do nothing... | |
break; | |
} | |
insert_point = this->getoffset(this->m_last_state); | |
} | |
// | |
// OK we now know what to repeat, so insert the repeat around it: | |
// | |
re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size)); | |
rep->min = low; | |
rep->max = high; | |
rep->greedy = greedy; | |
rep->leading = false; | |
// store our repeater position for later: | |
std::ptrdiff_t rep_off = this->getoffset(rep); | |
// and append a back jump to the repeat: | |
re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump))); | |
jmp->alt.i = rep_off - this->getoffset(jmp); | |
this->m_pdata->m_data.align(); | |
// now fill in the alt jump for the repeat: | |
rep = static_cast<re_repeat*>(this->getaddress(rep_off)); | |
rep->alt.i = this->m_pdata->m_data.size() - rep_off; | |
// | |
// If the repeat is pocessive then bracket the repeat with a (?>...) | |
// independent sub-expression construct: | |
// | |
if(pocessive) | |
{ | |
if(m_position != m_end) | |
{ | |
// | |
// Check for illegal following quantifier, we have to do this here, because | |
// the extra states we insert below circumvents are usual error checking :-( | |
// | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_star: | |
case regex_constants::syntax_plus: | |
case regex_constants::syntax_question: | |
fail(regex_constants::error_badrepeat, m_position - m_base); | |
return false; | |
} | |
} | |
re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace))); | |
pb->index = -3; | |
pb->icase = this->flags() & regbase::icase; | |
jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump))); | |
this->m_pdata->m_data.align(); | |
jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); | |
pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace))); | |
pb->index = -3; | |
pb->icase = this->flags() & regbase::icase; | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic) | |
{ | |
static const char* incomplete_message = "Missing } in quantified repetition."; | |
// | |
// parse a repeat-range: | |
// | |
std::size_t min, max; | |
int v; | |
// skip whitespace: | |
while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space)) | |
++m_position; | |
// fail if at end: | |
if(this->m_position == this->m_end) | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
// get min: | |
v = this->m_traits.toi(m_position, m_end, 10); | |
// skip whitespace: | |
while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space)) | |
++m_position; | |
if(v < 0) | |
{ | |
fail(regex_constants::error_badbrace, this->m_position - this->m_base); | |
return false; | |
} | |
else if(this->m_position == this->m_end) | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
min = v; | |
// see if we have a comma: | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma) | |
{ | |
// move on and error check: | |
++m_position; | |
// skip whitespace: | |
while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space)) | |
++m_position; | |
if(this->m_position == this->m_end) | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
// get the value if any: | |
v = this->m_traits.toi(m_position, m_end, 10); | |
max = (v >= 0) ? v : (std::numeric_limits<std::size_t>::max)(); | |
} | |
else | |
{ | |
// no comma, max = min: | |
max = min; | |
} | |
// skip whitespace: | |
while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space)) | |
++m_position; | |
// OK now check trailing }: | |
if(this->m_position == this->m_end) | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
if(isbasic) | |
{ | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape) | |
{ | |
++m_position; | |
if(this->m_position == this->m_end) | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
} | |
else | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace) | |
++m_position; | |
else | |
{ | |
fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message); | |
return false; | |
} | |
// | |
// finally go and add the repeat, unless error: | |
// | |
if(min > max) | |
{ | |
// Backtrack to error location: | |
m_position -= 2; | |
while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position; | |
++m_position; | |
fail(regex_constants::error_badbrace, m_position - m_base); | |
return false; | |
} | |
return parse_repeat(min, max); | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_alt() | |
{ | |
// | |
// error check: if there have been no previous states, | |
// or if the last state was a '(' then error: | |
// | |
if( | |
((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark)) | |
&& | |
!( | |
((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) | |
&& | |
((this->flags() & regbase::no_empty_expressions) == 0) | |
) | |
) | |
{ | |
fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression can start with the alternation operator |."); | |
return false; | |
} | |
// | |
// Reset mark count if required: | |
// | |
if(m_max_mark < m_mark_count) | |
m_max_mark = m_mark_count; | |
if(m_mark_reset >= 0) | |
m_mark_count = m_mark_reset; | |
++m_position; | |
// | |
// we need to append a trailing jump: | |
// | |
re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump)); | |
std::ptrdiff_t jump_offset = this->getoffset(pj); | |
// | |
// now insert the alternative: | |
// | |
re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size)); | |
jump_offset += re_alt_size; | |
this->m_pdata->m_data.align(); | |
palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt); | |
// | |
// update m_alt_insert_point so that the next alternate gets | |
// inserted at the start of the second of the two we've just created: | |
// | |
this->m_alt_insert_point = this->m_pdata->m_data.size(); | |
// | |
// the start of this alternative must have a case changes state | |
// if the current block has messed around with case changes: | |
// | |
if(m_has_case_change) | |
{ | |
static_cast<re_case*>( | |
this->append_state(syntax_element_toggle_case, sizeof(re_case)) | |
)->icase = this->m_icase; | |
} | |
// | |
// push the alternative onto our stack, a recursive | |
// implementation here is easier to understand (and faster | |
// as it happens), but causes all kinds of stack overflow problems | |
// on programs with small stacks (COM+). | |
// | |
m_alt_jumps.push_back(jump_offset); | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_set() | |
{ | |
static const char* incomplete_message = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content."; | |
++m_position; | |
if(m_position == m_end) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
basic_char_set<charT, traits> char_set; | |
const charT* base = m_position; // where the '[' was | |
const charT* item_base = m_position; // where the '[' or '^' was | |
while(m_position != m_end) | |
{ | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_caret: | |
if(m_position == base) | |
{ | |
char_set.negate(); | |
++m_position; | |
item_base = m_position; | |
} | |
else | |
parse_set_literal(char_set); | |
break; | |
case regex_constants::syntax_close_set: | |
if(m_position == item_base) | |
{ | |
parse_set_literal(char_set); | |
break; | |
} | |
else | |
{ | |
++m_position; | |
if(0 == this->append_set(char_set)) | |
{ | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
} | |
return true; | |
case regex_constants::syntax_open_set: | |
if(parse_inner_set(char_set)) | |
break; | |
return true; | |
case regex_constants::syntax_escape: | |
{ | |
// | |
// look ahead and see if this is a character class shortcut | |
// \d \w \s etc... | |
// | |
++m_position; | |
if(this->m_traits.escape_syntax_type(*m_position) | |
== regex_constants::escape_type_class) | |
{ | |
char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1); | |
if(m != 0) | |
{ | |
char_set.add_class(m); | |
++m_position; | |
break; | |
} | |
} | |
else if(this->m_traits.escape_syntax_type(*m_position) | |
== regex_constants::escape_type_not_class) | |
{ | |
// negated character class: | |
char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1); | |
if(m != 0) | |
{ | |
char_set.add_negated_class(m); | |
++m_position; | |
break; | |
} | |
} | |
// not a character class, just a regular escape: | |
--m_position; | |
parse_set_literal(char_set); | |
break; | |
} | |
default: | |
parse_set_literal(char_set); | |
break; | |
} | |
} | |
return m_position != m_end; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set) | |
{ | |
static const char* incomplete_message = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content."; | |
// | |
// we have either a character class [:name:] | |
// a collating element [.name.] | |
// or an equivalence class [=name=] | |
// | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_dot: | |
// | |
// a collating element is treated as a literal: | |
// | |
--m_position; | |
parse_set_literal(char_set); | |
return true; | |
case regex_constants::syntax_colon: | |
{ | |
// check that character classes are actually enabled: | |
if((this->flags() & (regbase::main_option_type | regbase::no_char_classes)) | |
== (regbase::basic_syntax_group | regbase::no_char_classes)) | |
{ | |
--m_position; | |
parse_set_literal(char_set); | |
return true; | |
} | |
// skip the ':' | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
const charT* name_first = m_position; | |
// skip at least one character, then find the matching ':]' | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
while((m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon)) | |
++m_position; | |
const charT* name_last = m_position; | |
if(m_end == m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
if((m_end == ++m_position) | |
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
// | |
// check for negated class: | |
// | |
bool negated = false; | |
if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret) | |
{ | |
++name_first; | |
negated = true; | |
} | |
typedef typename traits::char_class_type mask_type; | |
mask_type m = this->m_traits.lookup_classname(name_first, name_last); | |
if(m == 0) | |
{ | |
if(char_set.empty() && (name_last - name_first == 1)) | |
{ | |
// maybe a special case: | |
++m_position; | |
if( (m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position) | |
== regex_constants::syntax_close_set)) | |
{ | |
if(this->m_traits.escape_syntax_type(*name_first) | |
== regex_constants::escape_type_left_word) | |
{ | |
++m_position; | |
this->append_state(syntax_element_word_start); | |
return false; | |
} | |
if(this->m_traits.escape_syntax_type(*name_first) | |
== regex_constants::escape_type_right_word) | |
{ | |
++m_position; | |
this->append_state(syntax_element_word_end); | |
return false; | |
} | |
} | |
} | |
fail(regex_constants::error_ctype, name_first - m_base); | |
return false; | |
} | |
if(negated == false) | |
char_set.add_class(m); | |
else | |
char_set.add_negated_class(m); | |
++m_position; | |
break; | |
} | |
case regex_constants::syntax_equal: | |
{ | |
// skip the '=' | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
const charT* name_first = m_position; | |
// skip at least one character, then find the matching '=]' | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
while((m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)) | |
++m_position; | |
const charT* name_last = m_position; | |
if(m_end == m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
if((m_end == ++m_position) | |
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base, incomplete_message); | |
return false; | |
} | |
string_type m = this->m_traits.lookup_collatename(name_first, name_last); | |
if((0 == m.size()) || (m.size() > 2)) | |
{ | |
fail(regex_constants::error_collate, name_first - m_base); | |
return false; | |
} | |
digraph<charT> d; | |
d.first = m[0]; | |
if(m.size() > 1) | |
d.second = m[1]; | |
else | |
d.second = 0; | |
char_set.add_equivalent(d); | |
++m_position; | |
break; | |
} | |
default: | |
--m_position; | |
parse_set_literal(char_set); | |
break; | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set) | |
{ | |
digraph<charT> start_range(get_next_set_literal(char_set)); | |
if(m_end == m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base); | |
return; | |
} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash) | |
{ | |
// we have a range: | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base); | |
return; | |
} | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set) | |
{ | |
digraph<charT> end_range = get_next_set_literal(char_set); | |
char_set.add_range(start_range, end_range); | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash) | |
{ | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_brack, m_position - m_base); | |
return; | |
} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set) | |
{ | |
// trailing - : | |
--m_position; | |
return; | |
} | |
fail(regex_constants::error_range, m_position - m_base); | |
return; | |
} | |
return; | |
} | |
--m_position; | |
} | |
char_set.add_single(start_range); | |
} | |
template <class charT, class traits> | |
digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set) | |
{ | |
digraph<charT> result; | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_dash: | |
if(!char_set.empty()) | |
{ | |
// see if we are at the end of the set: | |
if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) | |
{ | |
fail(regex_constants::error_range, m_position - m_base); | |
return result; | |
} | |
--m_position; | |
} | |
result.first = *m_position++; | |
return result; | |
case regex_constants::syntax_escape: | |
// check to see if escapes are supported first: | |
if(this->flags() & regex_constants::no_escape_in_lists) | |
{ | |
result = *m_position++; | |
break; | |
} | |
++m_position; | |
result = unescape_character(); | |
break; | |
case regex_constants::syntax_open_set: | |
{ | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_collate, m_position - m_base); | |
return result; | |
} | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot) | |
{ | |
--m_position; | |
result.first = *m_position; | |
++m_position; | |
return result; | |
} | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_collate, m_position - m_base); | |
return result; | |
} | |
const charT* name_first = m_position; | |
// skip at least one character, then find the matching ':]' | |
if(m_end == ++m_position) | |
{ | |
fail(regex_constants::error_collate, name_first - m_base); | |
return result; | |
} | |
while((m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)) | |
++m_position; | |
const charT* name_last = m_position; | |
if(m_end == m_position) | |
{ | |
fail(regex_constants::error_collate, name_first - m_base); | |
return result; | |
} | |
if((m_end == ++m_position) | |
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) | |
{ | |
fail(regex_constants::error_collate, name_first - m_base); | |
return result; | |
} | |
++m_position; | |
string_type s = this->m_traits.lookup_collatename(name_first, name_last); | |
if(s.empty() || (s.size() > 2)) | |
{ | |
fail(regex_constants::error_collate, name_first - m_base); | |
return result; | |
} | |
result.first = s[0]; | |
if(s.size() > 1) | |
result.second = s[1]; | |
else | |
result.second = 0; | |
return result; | |
} | |
default: | |
result = *m_position++; | |
} | |
return result; | |
} | |
// | |
// does a value fit in the specified charT type? | |
// | |
template <class charT> | |
bool valid_value(charT, int v, const mpl::true_&) | |
{ | |
return (v >> (sizeof(charT) * CHAR_BIT)) == 0; | |
} | |
template <class charT> | |
bool valid_value(charT, int, const mpl::false_&) | |
{ | |
return true; // v will alsways fit in a charT | |
} | |
template <class charT> | |
bool valid_value(charT c, int v) | |
{ | |
return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(int))>()); | |
} | |
template <class charT, class traits> | |
charT basic_regex_parser<charT, traits>::unescape_character() | |
{ | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable:4127) | |
#endif | |
charT result(0); | |
if(m_position == m_end) | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely."); | |
return false; | |
} | |
switch(this->m_traits.escape_syntax_type(*m_position)) | |
{ | |
case regex_constants::escape_type_control_a: | |
result = charT('\a'); | |
break; | |
case regex_constants::escape_type_e: | |
result = charT(27); | |
break; | |
case regex_constants::escape_type_control_f: | |
result = charT('\f'); | |
break; | |
case regex_constants::escape_type_control_n: | |
result = charT('\n'); | |
break; | |
case regex_constants::escape_type_control_r: | |
result = charT('\r'); | |
break; | |
case regex_constants::escape_type_control_t: | |
result = charT('\t'); | |
break; | |
case regex_constants::escape_type_control_v: | |
result = charT('\v'); | |
break; | |
case regex_constants::escape_type_word_assert: | |
result = charT('\b'); | |
break; | |
case regex_constants::escape_type_ascii_control: | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely."); | |
return result; | |
} | |
result = static_cast<charT>(*m_position % 32); | |
break; | |
case regex_constants::escape_type_hex: | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely."); | |
return result; | |
} | |
// maybe have \x{ddd} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) | |
{ | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence."); | |
return result; | |
} | |
int i = this->m_traits.toi(m_position, m_end, 16); | |
if((m_position == m_end) | |
|| (i < 0) | |
|| ((std::numeric_limits<charT>::is_specialized) && (i > (int)(std::numeric_limits<charT>::max)())) | |
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid."); | |
return result; | |
} | |
++m_position; | |
result = charT(i); | |
} | |
else | |
{ | |
std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position)); | |
int i = this->m_traits.toi(m_position, m_position + len, 16); | |
if((i < 0) | |
|| !valid_value(charT(0), i)) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character."); | |
return result; | |
} | |
result = charT(i); | |
} | |
return result; | |
case regex_constants::syntax_digit: | |
{ | |
// an octal escape sequence, the first character must be a zero | |
// followed by up to 3 octal digits: | |
std::ptrdiff_t len = (std::min)(::boost::re_detail::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4)); | |
const charT* bp = m_position; | |
int val = this->m_traits.toi(bp, bp + 1, 8); | |
if(val != 0) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
// Oops not an octal escape after all: | |
fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence."); | |
return result; | |
} | |
val = this->m_traits.toi(m_position, m_position + len, 8); | |
if(val < 0) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid."); | |
return result; | |
} | |
return static_cast<charT>(val); | |
} | |
case regex_constants::escape_type_named_char: | |
{ | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base); | |
return false; | |
} | |
// maybe have \N{name} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) | |
{ | |
const charT* base = m_position; | |
// skip forward until we find enclosing brace: | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base); | |
return false; | |
} | |
string_type s = this->m_traits.lookup_collatename(++base, m_position++); | |
if(s.empty()) | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_collate, m_position - m_base); | |
return false; | |
} | |
if(s.size() == 1) | |
{ | |
return s[0]; | |
} | |
} | |
// fall through is a failure: | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base); | |
return false; | |
} | |
default: | |
result = *m_position; | |
break; | |
} | |
++m_position; | |
return result; | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_backref() | |
{ | |
BOOST_ASSERT(m_position != m_end); | |
const charT* pc = m_position; | |
int i = this->m_traits.toi(pc, pc + 1, 10); | |
if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs))) | |
{ | |
// not a backref at all but an octal escape sequence: | |
charT c = unescape_character(); | |
this->append_literal(c); | |
} | |
else if((i > 0) && (this->m_backrefs & (1u << (i-1)))) | |
{ | |
m_position = pc; | |
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace))); | |
pb->index = i; | |
pb->icase = this->flags() & regbase::icase; | |
} | |
else | |
{ | |
// Rewind to start of escape: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_backref, m_position - m_base); | |
return false; | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_QE() | |
{ | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable:4127) | |
#endif | |
// | |
// parse a \Q...\E sequence: | |
// | |
++m_position; // skip the Q | |
const charT* start = m_position; | |
const charT* end; | |
do | |
{ | |
while((m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// a \Q...\E sequence may terminate with the end of the expression: | |
end = m_position; | |
break; | |
} | |
if(++m_position == m_end) // skip the escape | |
{ | |
fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence."); | |
return false; | |
} | |
// check to see if it's a \E: | |
if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E) | |
{ | |
++m_position; | |
end = m_position - 2; | |
break; | |
} | |
// otherwise go round again: | |
}while(true); | |
// | |
// now add all the character between the two escapes as literals: | |
// | |
while(start != end) | |
{ | |
this->append_literal(*start); | |
++start; | |
} | |
return true; | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::parse_perl_extension() | |
{ | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
// | |
// treat comments as a special case, as these | |
// are the only ones that don't start with a leading | |
// startmark state: | |
// | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash) | |
{ | |
while((m_position != m_end) | |
&& (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) | |
{} | |
return true; | |
} | |
// | |
// backup some state, and prepare the way: | |
// | |
int markid = 0; | |
std::ptrdiff_t jump_offset = 0; | |
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace))); | |
pb->icase = this->flags() & regbase::icase; | |
std::ptrdiff_t last_paren_start = this->getoffset(pb); | |
// back up insertion point for alternations, and set new point: | |
std::ptrdiff_t last_alt_point = m_alt_insert_point; | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
std::ptrdiff_t expected_alt_point = m_alt_insert_point; | |
bool restore_flags = true; | |
regex_constants::syntax_option_type old_flags = this->flags(); | |
bool old_case_change = m_has_case_change; | |
m_has_case_change = false; | |
charT name_delim; | |
int mark_reset = m_mark_reset; | |
int max_mark = m_max_mark; | |
m_mark_reset = -1; | |
m_max_mark = m_mark_count; | |
int v; | |
// | |
// select the actual extension used: | |
// | |
switch(this->m_traits.syntax_type(*m_position)) | |
{ | |
case regex_constants::syntax_or: | |
m_mark_reset = m_mark_count; | |
// fall through: | |
case regex_constants::syntax_colon: | |
// | |
// a non-capturing mark: | |
// | |
pb->index = markid = 0; | |
++m_position; | |
break; | |
case regex_constants::syntax_digit: | |
{ | |
// | |
// a recursive subexpression: | |
// | |
v = this->m_traits.toi(m_position, m_end, 10); | |
if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated."); | |
return false; | |
} | |
insert_recursion: | |
pb->index = markid = 0; | |
re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse))); | |
pr->alt.i = v; | |
pr->state_id = 0; | |
static_cast<re_case*>( | |
this->append_state(syntax_element_toggle_case, sizeof(re_case)) | |
)->icase = this->flags() & regbase::icase; | |
break; | |
} | |
case regex_constants::syntax_plus: | |
// | |
// A forward-relative recursive subexpression: | |
// | |
++m_position; | |
v = this->m_traits.toi(m_position, m_end, 10); | |
if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression."); | |
return false; | |
} | |
v += m_mark_count; | |
goto insert_recursion; | |
case regex_constants::syntax_dash: | |
// | |
// Possibly a backward-relative recursive subexpression: | |
// | |
++m_position; | |
v = this->m_traits.toi(m_position, m_end, 10); | |
if(v <= 0) | |
{ | |
--m_position; | |
// Oops not a relative recursion at all, but a (?-imsx) group: | |
goto option_group_jump; | |
} | |
v = m_mark_count + 1 - v; | |
if(v <= 0) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression."); | |
return false; | |
} | |
goto insert_recursion; | |
case regex_constants::syntax_equal: | |
pb->index = markid = -1; | |
++m_position; | |
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
break; | |
case regex_constants::syntax_not: | |
pb->index = markid = -2; | |
++m_position; | |
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
break; | |
case regex_constants::escape_type_left_word: | |
{ | |
// a lookbehind assertion: | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position); | |
if(t == regex_constants::syntax_not) | |
pb->index = markid = -2; | |
else if(t == regex_constants::syntax_equal) | |
pb->index = markid = -1; | |
else | |
{ | |
// Probably a named capture which also starts (?< : | |
name_delim = '>'; | |
--m_position; | |
goto named_capture_jump; | |
} | |
++m_position; | |
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); | |
this->append_state(syntax_element_backstep, sizeof(re_brace)); | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
break; | |
} | |
case regex_constants::escape_type_right_word: | |
// | |
// an independent sub-expression: | |
// | |
pb->index = markid = -3; | |
++m_position; | |
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); | |
this->m_pdata->m_data.align(); | |
m_alt_insert_point = this->m_pdata->m_data.size(); | |
break; | |
case regex_constants::syntax_open_mark: | |
{ | |
// a conditional expression: | |
pb->index = markid = -4; | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
v = this->m_traits.toi(m_position, m_end, 10); | |
if(*m_position == charT('R')) | |
{ | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(*m_position == charT('&')) | |
{ | |
const charT* base = ++m_position; | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
v = -static_cast<int>(hash_value_from_capture_name(base, m_position)); | |
} | |
else | |
{ | |
v = -this->m_traits.toi(m_position, m_end, 10); | |
} | |
re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); | |
br->index = v < 0 ? (v - 1) : 0; | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
} | |
else if((*m_position == charT('\'')) || (*m_position == charT('<'))) | |
{ | |
const charT* base = ++m_position; | |
while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\''))) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
v = static_cast<int>(hash_value_from_capture_name(base, m_position)); | |
re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); | |
br->index = v; | |
if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture."); | |
return false; | |
} | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
} | |
else if(*m_position == charT('D')) | |
{ | |
const char* def = "DEFINE"; | |
while(*def && (m_position != m_end) && (*m_position == charT(*def))) | |
++m_position, ++def; | |
if((m_position == m_end) || *def) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); | |
br->index = 9999; // special magic value! | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
} | |
else if(v > 0) | |
{ | |
re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); | |
br->index = v; | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
} | |
else | |
{ | |
// verify that we have a lookahead or lookbehind assert: | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word) | |
{ | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
m_position -= 3; | |
} | |
else | |
{ | |
if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal) | |
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
m_position -= 2; | |
} | |
} | |
break; | |
} | |
case regex_constants::syntax_close_mark: | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
case regex_constants::escape_type_end_buffer: | |
{ | |
name_delim = *m_position; | |
named_capture_jump: | |
markid = 0; | |
if(0 == (this->flags() & regbase::nosubs)) | |
{ | |
markid = ++m_mark_count; | |
#ifndef BOOST_NO_STD_DISTANCE | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0)); | |
#else | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0)); | |
#endif | |
} | |
pb->index = markid; | |
const charT* base = ++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
while((m_position != m_end) && (*m_position != name_delim)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
this->m_pdata->set_name(base, m_position, markid); | |
++m_position; | |
break; | |
} | |
default: | |
if(*m_position == charT('R')) | |
{ | |
++m_position; | |
v = 0; | |
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
goto insert_recursion; | |
} | |
if(*m_position == charT('&')) | |
{ | |
++m_position; | |
const charT* base = m_position; | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
v = static_cast<int>(hash_value_from_capture_name(base, m_position)); | |
goto insert_recursion; | |
} | |
if(*m_position == charT('P')) | |
{ | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
if(*m_position == charT('>')) | |
{ | |
++m_position; | |
const charT* base = m_position; | |
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) | |
++m_position; | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
v = static_cast<int>(hash_value_from_capture_name(base, m_position)); | |
goto insert_recursion; | |
} | |
} | |
// | |
// lets assume that we have a (?imsx) group and try and parse it: | |
// | |
option_group_jump: | |
regex_constants::syntax_option_type opts = parse_options(); | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
// make a note of whether we have a case change: | |
m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase)); | |
pb->index = markid = 0; | |
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) | |
{ | |
// update flags and carry on as normal: | |
this->flags(opts); | |
restore_flags = false; | |
old_case_change |= m_has_case_change; // defer end of scope by one ')' | |
} | |
else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon) | |
{ | |
// update flags and carry on until the matching ')' is found: | |
this->flags(opts); | |
++m_position; | |
} | |
else | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base); | |
return false; | |
} | |
// finally append a case change state if we need it: | |
if(m_has_case_change) | |
{ | |
static_cast<re_case*>( | |
this->append_state(syntax_element_toggle_case, sizeof(re_case)) | |
)->icase = opts & regbase::icase; | |
} | |
} | |
// | |
// now recursively add more states, this will terminate when we get to a | |
// matching ')' : | |
// | |
parse_all(); | |
// | |
// Unwind alternatives: | |
// | |
if(0 == unwind_alts(last_paren_start)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block."); | |
return false; | |
} | |
// | |
// we either have a ')' or we have run out of characters prematurely: | |
// | |
if(m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end)); | |
return false; | |
} | |
BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark); | |
++m_position; | |
// | |
// restore the flags: | |
// | |
if(restore_flags) | |
{ | |
// append a case change state if we need it: | |
if(m_has_case_change) | |
{ | |
static_cast<re_case*>( | |
this->append_state(syntax_element_toggle_case, sizeof(re_case)) | |
)->icase = old_flags & regbase::icase; | |
} | |
this->flags(old_flags); | |
} | |
// | |
// set up the jump pointer if we have one: | |
// | |
if(jump_offset) | |
{ | |
this->m_pdata->m_data.align(); | |
re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset)); | |
jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); | |
if(this->m_last_state == jmp) | |
{ | |
// Oops... we didn't have anything inside the assertion: | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion."); | |
return false; | |
} | |
} | |
// | |
// verify that if this is conditional expression, that we do have | |
// an alternative, if not add one: | |
// | |
if(markid == -4) | |
{ | |
re_syntax_base* b = this->getaddress(expected_alt_point); | |
// Make sure we have exactly one alternative following this state: | |
if(b->type != syntax_element_alt) | |
{ | |
re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt))); | |
alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt); | |
} | |
else if(this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt) | |
{ | |
// Can't have seen more than one alternative: | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression."); | |
return false; | |
} | |
else | |
{ | |
// We must *not* have seen an alternative inside a (DEFINE) block: | |
b = this->getaddress(b->next.i, b); | |
if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block."); | |
return false; | |
} | |
} | |
// check for invalid repetition of next state: | |
b = this->getaddress(expected_alt_point); | |
b = this->getaddress(static_cast<re_alt*>(b)->next.i, b); | |
if((b->type != syntax_element_assert_backref) | |
&& (b->type != syntax_element_startmark)) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion."); | |
return false; | |
} | |
} | |
// | |
// append closing parenthesis state: | |
// | |
pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace))); | |
pb->index = markid; | |
pb->icase = this->flags() & regbase::icase; | |
this->m_paren_start = last_paren_start; | |
// | |
// restore the alternate insertion point: | |
// | |
this->m_alt_insert_point = last_alt_point; | |
// | |
// and the case change data: | |
// | |
m_has_case_change = old_case_change; | |
// | |
// And the mark_reset data: | |
// | |
if(m_max_mark > m_mark_count) | |
{ | |
m_mark_count = m_max_mark; | |
} | |
m_mark_reset = mark_reset; | |
m_max_mark = max_mark; | |
if(markid > 0) | |
{ | |
#ifndef BOOST_NO_STD_DISTANCE | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1; | |
#else | |
if(this->flags() & regbase::save_subexpression_location) | |
this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1; | |
#endif | |
// | |
// allow backrefs to this mark: | |
// | |
if((markid > 0) && (markid < (int)(sizeof(unsigned) * CHAR_BIT))) | |
this->m_backrefs |= 1u << (markid - 1); | |
} | |
return true; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate) | |
{ | |
// | |
// parses an emacs style \sx or \Sx construct. | |
// | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position; | |
fail(regex_constants::error_escape, m_position - m_base); | |
return false; | |
} | |
basic_char_set<charT, traits> char_set; | |
if(negate) | |
char_set.negate(); | |
static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', }; | |
switch(*m_position) | |
{ | |
case 's': | |
case ' ': | |
char_set.add_class(this->m_mask_space); | |
break; | |
case 'w': | |
char_set.add_class(this->m_word_mask); | |
break; | |
case '_': | |
char_set.add_single(digraph<charT>(charT('$'))); | |
char_set.add_single(digraph<charT>(charT('&'))); | |
char_set.add_single(digraph<charT>(charT('*'))); | |
char_set.add_single(digraph<charT>(charT('+'))); | |
char_set.add_single(digraph<charT>(charT('-'))); | |
char_set.add_single(digraph<charT>(charT('_'))); | |
char_set.add_single(digraph<charT>(charT('<'))); | |
char_set.add_single(digraph<charT>(charT('>'))); | |
break; | |
case '.': | |
char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5)); | |
break; | |
case '(': | |
char_set.add_single(digraph<charT>(charT('('))); | |
char_set.add_single(digraph<charT>(charT('['))); | |
char_set.add_single(digraph<charT>(charT('{'))); | |
break; | |
case ')': | |
char_set.add_single(digraph<charT>(charT(')'))); | |
char_set.add_single(digraph<charT>(charT(']'))); | |
char_set.add_single(digraph<charT>(charT('}'))); | |
break; | |
case '"': | |
char_set.add_single(digraph<charT>(charT('"'))); | |
char_set.add_single(digraph<charT>(charT('\''))); | |
char_set.add_single(digraph<charT>(charT('`'))); | |
break; | |
case '\'': | |
char_set.add_single(digraph<charT>(charT('\''))); | |
char_set.add_single(digraph<charT>(charT(','))); | |
char_set.add_single(digraph<charT>(charT('#'))); | |
break; | |
case '<': | |
char_set.add_single(digraph<charT>(charT(';'))); | |
break; | |
case '>': | |
char_set.add_single(digraph<charT>(charT('\n'))); | |
char_set.add_single(digraph<charT>(charT('\f'))); | |
break; | |
default: | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
if(0 == this->append_set(char_set)) | |
{ | |
fail(regex_constants::error_ctype, m_position - m_base); | |
return false; | |
} | |
++m_position; | |
return true; | |
} | |
template <class charT, class traits> | |
regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options() | |
{ | |
// we have a (?imsx-imsx) group, convert it into a set of flags: | |
regex_constants::syntax_option_type f = this->flags(); | |
bool breakout = false; | |
do | |
{ | |
switch(*m_position) | |
{ | |
case 's': | |
f |= regex_constants::mod_s; | |
f &= ~regex_constants::no_mod_s; | |
break; | |
case 'm': | |
f &= ~regex_constants::no_mod_m; | |
break; | |
case 'i': | |
f |= regex_constants::icase; | |
break; | |
case 'x': | |
f |= regex_constants::mod_x; | |
break; | |
default: | |
breakout = true; | |
continue; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_paren, m_position - m_base); | |
return false; | |
} | |
} | |
while(!breakout); | |
if(*m_position == static_cast<charT>('-')) | |
{ | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_paren, m_position - m_base); | |
return false; | |
} | |
do | |
{ | |
switch(*m_position) | |
{ | |
case 's': | |
f &= ~regex_constants::mod_s; | |
f |= regex_constants::no_mod_s; | |
break; | |
case 'm': | |
f |= regex_constants::no_mod_m; | |
break; | |
case 'i': | |
f &= ~regex_constants::icase; | |
break; | |
case 'x': | |
f &= ~regex_constants::mod_x; | |
break; | |
default: | |
breakout = true; | |
continue; | |
} | |
if(++m_position == m_end) | |
{ | |
// Rewind to start of (? sequence: | |
--m_position; | |
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; | |
fail(regex_constants::error_paren, m_position - m_base); | |
return false; | |
} | |
} | |
while(!breakout); | |
} | |
return f; | |
} | |
template <class charT, class traits> | |
bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start) | |
{ | |
// | |
// If we didn't actually add any states after the last | |
// alternative then that's an error: | |
// | |
if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size())) | |
&& m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start) | |
&& | |
!( | |
((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) | |
&& | |
((this->flags() & regbase::no_empty_expressions) == 0) | |
) | |
) | |
{ | |
fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |."); | |
return false; | |
} | |
// | |
// Fix up our alternatives: | |
// | |
while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)) | |
{ | |
// | |
// fix up the jump to point to the end of the states | |
// that we've just added: | |
// | |
std::ptrdiff_t jump_offset = m_alt_jumps.back(); | |
m_alt_jumps.pop_back(); | |
this->m_pdata->m_data.align(); | |
re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset)); | |
BOOST_ASSERT(jmp->type == syntax_element_jump); | |
jmp->alt.i = this->m_pdata->m_data.size() - jump_offset; | |
} | |
return true; | |
} | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
} // namespace re_detail | |
} // namespace boost | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable: 4103) | |
#endif | |
#ifdef BOOST_HAS_ABI_HEADERS | |
# include BOOST_ABI_SUFFIX | |
#endif | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
#endif |