/* | |
* | |
* Copyright (c) 1998-2002 | |
* John Maddock | |
* | |
* Use, modification and distribution are subject to the | |
* Boost Software License, Version 1.0. (See accompanying file | |
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
* | |
*/ | |
/* | |
* LOCATION: see http://www.boost.org for most recent version. | |
* FILE states.cpp | |
* VERSION see <boost/version.hpp> | |
* DESCRIPTION: Declares internal state machine structures. | |
*/ | |
#ifndef BOOST_REGEX_V4_STATES_HPP | |
#define BOOST_REGEX_V4_STATES_HPP | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable: 4103) | |
#endif | |
#ifdef BOOST_HAS_ABI_HEADERS | |
# include BOOST_ABI_PREFIX | |
#endif | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
namespace boost{ | |
namespace re_detail{ | |
/*** mask_type ******************************************************* | |
Whenever we have a choice of two alternatives, we use an array of bytes | |
to indicate which of the two alternatives it is possible to take for any | |
given input character. If mask_take is set, then we can take the next | |
state, and if mask_skip is set then we can take the alternative. | |
***********************************************************************/ | |
enum mask_type | |
{ | |
mask_take = 1, | |
mask_skip = 2, | |
mask_init = 4, | |
mask_any = mask_skip | mask_take, | |
mask_all = mask_any | |
}; | |
/*** helpers ********************************************************** | |
These helpers let us use function overload resolution to detect whether | |
we have narrow or wide character strings: | |
***********************************************************************/ | |
struct _narrow_type{}; | |
struct _wide_type{}; | |
template <class charT> struct is_byte; | |
template<> struct is_byte<char> { typedef _narrow_type width_type; }; | |
template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; }; | |
template<> struct is_byte<signed char> { typedef _narrow_type width_type; }; | |
template <class charT> struct is_byte { typedef _wide_type width_type; }; | |
/*** enum syntax_element_type ****************************************** | |
Every record in the state machine falls into one of the following types: | |
***********************************************************************/ | |
enum syntax_element_type | |
{ | |
// start of a marked sub-expression, or perl-style (?...) extension | |
syntax_element_startmark = 0, | |
// end of a marked sub-expression, or perl-style (?...) extension | |
syntax_element_endmark = syntax_element_startmark + 1, | |
// any sequence of literal characters | |
syntax_element_literal = syntax_element_endmark + 1, | |
// start of line assertion: ^ | |
syntax_element_start_line = syntax_element_literal + 1, | |
// end of line assertion $ | |
syntax_element_end_line = syntax_element_start_line + 1, | |
// match any character: . | |
syntax_element_wild = syntax_element_end_line + 1, | |
// end of expression: we have a match when we get here | |
syntax_element_match = syntax_element_wild + 1, | |
// perl style word boundary: \b | |
syntax_element_word_boundary = syntax_element_match + 1, | |
// perl style within word boundary: \B | |
syntax_element_within_word = syntax_element_word_boundary + 1, | |
// start of word assertion: \< | |
syntax_element_word_start = syntax_element_within_word + 1, | |
// end of word assertion: \> | |
syntax_element_word_end = syntax_element_word_start + 1, | |
// start of buffer assertion: \` | |
syntax_element_buffer_start = syntax_element_word_end + 1, | |
// end of buffer assertion: \' | |
syntax_element_buffer_end = syntax_element_buffer_start + 1, | |
// backreference to previously matched sub-expression | |
syntax_element_backref = syntax_element_buffer_end + 1, | |
// either a wide character set [..] or one with multicharacter collating elements: | |
syntax_element_long_set = syntax_element_backref + 1, | |
// narrow character set: [...] | |
syntax_element_set = syntax_element_long_set + 1, | |
// jump to a new state in the machine: | |
syntax_element_jump = syntax_element_set + 1, | |
// choose between two production states: | |
syntax_element_alt = syntax_element_jump + 1, | |
// a repeat | |
syntax_element_rep = syntax_element_alt + 1, | |
// match a combining character sequence | |
syntax_element_combining = syntax_element_rep + 1, | |
// perl style soft buffer end: \z | |
syntax_element_soft_buffer_end = syntax_element_combining + 1, | |
// perl style continuation: \G | |
syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, | |
// single character repeats: | |
syntax_element_dot_rep = syntax_element_restart_continue + 1, | |
syntax_element_char_rep = syntax_element_dot_rep + 1, | |
syntax_element_short_set_rep = syntax_element_char_rep + 1, | |
syntax_element_long_set_rep = syntax_element_short_set_rep + 1, | |
// a backstep for lookbehind repeats: | |
syntax_element_backstep = syntax_element_long_set_rep + 1, | |
// an assertion that a mark was matched: | |
syntax_element_assert_backref = syntax_element_backstep + 1, | |
syntax_element_toggle_case = syntax_element_assert_backref + 1, | |
// a recursive expression: | |
syntax_element_recurse = syntax_element_toggle_case + 1 | |
}; | |
#ifdef BOOST_REGEX_DEBUG | |
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion | |
std::ostream& operator<<(std::ostream&, syntax_element_type); | |
#endif | |
struct re_syntax_base; | |
/*** union offset_type ************************************************ | |
Points to another state in the machine. During machine construction | |
we use integral offsets, but these are converted to pointers before | |
execution of the machine. | |
***********************************************************************/ | |
union offset_type | |
{ | |
re_syntax_base* p; | |
std::ptrdiff_t i; | |
}; | |
/*** struct re_syntax_base ******************************************** | |
Base class for all states in the machine. | |
***********************************************************************/ | |
struct re_syntax_base | |
{ | |
syntax_element_type type; // what kind of state this is | |
offset_type next; // next state in the machine | |
}; | |
/*** struct re_brace ************************************************** | |
A marked parenthesis. | |
***********************************************************************/ | |
struct re_brace : public re_syntax_base | |
{ | |
// The index to match, can be zero (don't mark the sub-expression) | |
// or negative (for perl style (?...) extentions): | |
int index; | |
bool icase; | |
}; | |
/*** struct re_dot ************************************************** | |
Match anything. | |
***********************************************************************/ | |
enum | |
{ | |
dont_care = 1, | |
force_not_newline = 0, | |
force_newline = 2, | |
test_not_newline = 2, | |
test_newline = 3 | |
}; | |
struct re_dot : public re_syntax_base | |
{ | |
unsigned char mask; | |
}; | |
/*** struct re_literal ************************************************ | |
A string of literals, following this structure will be an | |
array of characters: charT[length] | |
***********************************************************************/ | |
struct re_literal : public re_syntax_base | |
{ | |
unsigned int length; | |
}; | |
/*** struct re_case ************************************************ | |
Indicates whether we are moving to a case insensive block or not | |
***********************************************************************/ | |
struct re_case : public re_syntax_base | |
{ | |
bool icase; | |
}; | |
/*** struct re_set_long *********************************************** | |
A wide character set of characters, following this structure will be | |
an array of type charT: | |
First csingles null-terminated strings | |
Then 2 * cranges NULL terminated strings | |
Then cequivalents NULL terminated strings | |
***********************************************************************/ | |
template <class mask_type> | |
struct re_set_long : public re_syntax_base | |
{ | |
unsigned int csingles, cranges, cequivalents; | |
mask_type cclasses; | |
mask_type cnclasses; | |
bool isnot; | |
bool singleton; | |
}; | |
/*** struct re_set **************************************************** | |
A set of narrow-characters, matches any of _map which is none-zero | |
***********************************************************************/ | |
struct re_set : public re_syntax_base | |
{ | |
unsigned char _map[1 << CHAR_BIT]; | |
}; | |
/*** struct re_jump *************************************************** | |
Jump to a new location in the machine (not next). | |
***********************************************************************/ | |
struct re_jump : public re_syntax_base | |
{ | |
offset_type alt; // location to jump to | |
}; | |
/*** struct re_alt *************************************************** | |
Jump to a new location in the machine (possibly next). | |
***********************************************************************/ | |
struct re_alt : public re_jump | |
{ | |
unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump | |
unsigned int can_be_null; // true if we match a NULL string | |
}; | |
/*** struct re_repeat ************************************************* | |
Repeat a section of the machine | |
***********************************************************************/ | |
struct re_repeat : public re_alt | |
{ | |
std::size_t min, max; // min and max allowable repeats | |
int state_id; // Unique identifier for this repeat | |
bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) | |
bool greedy; // True if this is a greedy repeat | |
}; | |
/*** struct re_recurse ************************************************ | |
Recurse to a particular subexpression. | |
**********************************************************************/ | |
struct re_recurse : public re_jump | |
{ | |
int state_id; // identifier of first nested repeat within the recursion. | |
}; | |
/*** enum re_jump_size_type ******************************************* | |
Provides compiled size of re_jump structure (allowing for trailing alignment). | |
We provide this so we know how manybytes to insert when constructing the machine | |
(The value of padding_mask is defined in regex_raw_buffer.hpp). | |
***********************************************************************/ | |
enum re_jump_size_type | |
{ | |
re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), | |
re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), | |
re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) | |
}; | |
/*** proc re_is_set_member ********************************************* | |
Forward declaration: we'll need this one later... | |
***********************************************************************/ | |
template<class charT, class traits> | |
struct regex_data; | |
template <class iterator, class charT, class traits_type, class char_classT> | |
iterator BOOST_REGEX_CALL re_is_set_member(iterator next, | |
iterator last, | |
const re_set_long<char_classT>* set_, | |
const regex_data<charT, traits_type>& e, bool icase); | |
} // namespace re_detail | |
} // namespace boost | |
#ifdef BOOST_MSVC | |
#pragma warning(push) | |
#pragma warning(disable: 4103) | |
#endif | |
#ifdef BOOST_HAS_ABI_HEADERS | |
# include BOOST_ABI_SUFFIX | |
#endif | |
#ifdef BOOST_MSVC | |
#pragma warning(pop) | |
#endif | |
#endif | |