blob: 3bbd8728a8266513cf4612a21bc0d5d3f886a047 [file] [log] [blame]
// parser.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_PARSER_HPP
#define BOOST_LEXER_PARSER_HPP
#include <boost/assert.hpp>
#include "tree/end_node.hpp"
#include "tree/iteration_node.hpp"
#include "tree/leaf_node.hpp"
#include "../runtime_error.hpp"
#include "tree/selection_node.hpp"
#include "tree/sequence_node.hpp"
#include "../size_t.hpp"
#include "tokeniser/re_tokeniser.hpp"
namespace boost
{
namespace lexer
{
namespace detail
{
template<typename CharT>
class basic_parser
{
public:
typedef basic_re_tokeniser<CharT> tokeniser;
typedef typename tokeniser::string string;
typedef std::map<string, const node *> macro_map;
typedef node::node_ptr_vector node_ptr_vector;
typedef typename tokeniser::num_token token;
/*
General principles of regex parsing:
- Every regex is a sequence of sub-regexes.
- Regexes consist of operands and operators
- All operators decompose to sequence, selection ('|') and iteration ('*')
- Regex tokens are stored on the stack.
- When a complete sequence of regex tokens is on the stack it is processed.
Grammar:
<REGEX> -> <OREXP>
<OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
<SEQUENCE> -> <SUB>
<SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
<EXPRESSION> -> <REPEAT>
<REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
<DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'
*/
static node *parse (const CharT *start_, const CharT * const end_,
const std::size_t id_, const std::size_t unique_id_,
const std::size_t dfa_state_, const regex_flags flags_,
const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
const macro_map &macromap_, typename tokeniser::token_map &map_,
bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
{
node *root_ = 0;
state state_ (start_, end_, flags_, locale_);
token lhs_token_;
token rhs_token_;
token_stack token_stack_;
tree_node_stack tree_node_stack_;
char action_ = 0;
token_stack_.push (rhs_token_);
tokeniser::next (state_, map_, rhs_token_);
do
{
lhs_token_ = token_stack_.top ();
action_ = lhs_token_.precedence (rhs_token_._type);
switch (action_)
{
case '<':
case '=':
token_stack_.push (rhs_token_);
tokeniser::next (state_, map_, rhs_token_);
break;
case '>':
reduce (token_stack_, macromap_, node_ptr_vector_,
tree_node_stack_);
break;
default:
std::ostringstream ss_;
ss_ << "A syntax error occurred: '" <<
lhs_token_.precedence_string () <<
"' against '" << rhs_token_.precedence_string () <<
"' at index " << state_.index () << ".";
throw runtime_error (ss_.str ().c_str ());
break;
}
} while (!token_stack_.empty ());
if (tree_node_stack_.empty ())
{
throw runtime_error ("Empty rules are not allowed.");
}
BOOST_ASSERT(tree_node_stack_.size () == 1);
node *lhs_node_ = tree_node_stack_.top ();
tree_node_stack_.pop ();
if (id_ == 0)
{
// Macros have no end state...
root_ = lhs_node_;
}
else
{
node_ptr_vector_->push_back (static_cast<end_node *>(0));
node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
node_ptr_vector_->back () = rhs_node_;
node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
node_ptr_vector_->back () = new sequence_node
(lhs_node_, rhs_node_);
root_ = node_ptr_vector_->back ();
}
// Done this way as bug in VC++ 6 prevents |= operator working
// properly!
if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
return root_;
}
private:
typedef typename tokeniser::state state;
typedef std::stack<token> token_stack;
typedef node::node_stack tree_node_stack;
static void reduce (token_stack &token_stack_,
const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
tree_node_stack &tree_node_stack_)
{
typename tokeniser::num_token lhs_;
typename tokeniser::num_token rhs_;
token_stack handle_;
char action_ = 0;
do
{
rhs_ = token_stack_.top ();
token_stack_.pop ();
handle_.push (rhs_);
if (!token_stack_.empty ())
{
lhs_ = token_stack_.top ();
action_ = lhs_.precedence (rhs_._type);
}
} while (!token_stack_.empty () && action_ == '=');
BOOST_ASSERT(token_stack_.empty () || action_ == '<');
switch (rhs_._type)
{
case token::BEGIN:
// finished processing so exit
break;
case token::REGEX:
// finished parsing, nothing to do
break;
case token::OREXP:
orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
break;
case token::SEQUENCE:
token_stack_.push (token::OREXP);
break;
case token::SUB:
sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
break;
case token::EXPRESSION:
token_stack_.push (token::SUB);
break;
case token::REPEAT:
repeat (handle_, token_stack_);
break;
case token::CHARSET:
charset (handle_, token_stack_, node_vector_ptr_,
tree_node_stack_);
break;
case token::MACRO:
macro (handle_, token_stack_, macromap_, node_vector_ptr_,
tree_node_stack_);
break;
case token::OPENPAREN:
openparen (handle_, token_stack_);
break;
case token::OPT:
case token::AOPT:
optional (rhs_._type == token::OPT, node_vector_ptr_,
tree_node_stack_);
token_stack_.push (token::DUP);
break;
case token::ZEROORMORE:
case token::AZEROORMORE:
zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
tree_node_stack_);
token_stack_.push (token::DUP);
break;
case token::ONEORMORE:
case token::AONEORMORE:
one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
tree_node_stack_);
token_stack_.push (token::DUP);
break;
case token::REPEATN:
case token::AREPEATN:
repeatn (rhs_._type == token::REPEATN, handle_.top (),
node_vector_ptr_, tree_node_stack_);
token_stack_.push (token::DUP);
break;
default:
throw runtime_error
("Internal error regex_parser::reduce");
break;
}
}
static void orexp (token_stack &handle_, token_stack &token_stack_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
(handle_.size () == 1 || handle_.size () == 3));
if (handle_.size () == 1)
{
token_stack_.push (token::REGEX);
}
else
{
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::OR);
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
perform_or (node_ptr_vector_, tree_node_stack_);
token_stack_.push (token::OREXP);
}
}
static void sub (token_stack &handle_, token_stack &token_stack_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
BOOST_ASSERT(handle_.top ()._type == token::SUB &&
(handle_.size () == 1 || handle_.size () == 2));
if (handle_.size () == 1)
{
token_stack_.push (token::SEQUENCE);
}
else
{
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
// perform join
sequence (node_ptr_vector_, tree_node_stack_);
token_stack_.push (token::SUB);
}
}
static void repeat (token_stack &handle_, token_stack &token_stack_)
{
BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
handle_.size () >= 1 && handle_.size () <= 3);
if (handle_.size () == 1)
{
token_stack_.push (token::EXPRESSION);
}
else
{
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::DUP);
token_stack_.push (token::REPEAT);
}
}
static void charset (token_stack &handle_, token_stack &token_stack_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
handle_.size () == 1);
// store charset
node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
const size_t id_ = handle_.top ()._id;
node_ptr_vector_->back () = new leaf_node (id_, true);
tree_node_stack_.push (node_ptr_vector_->back ());
token_stack_.push (token::REPEAT);
}
static void macro (token_stack &handle_, token_stack &token_stack_,
const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
tree_node_stack &tree_node_stack_)
{
token &top_ = handle_.top ();
BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
typename macro_map::const_iterator iter_ =
macromap_.find (top_._macro);
if (iter_ == macromap_.end ())
{
const CharT *name_ = top_._macro;
std::basic_stringstream<CharT> ss_;
std::ostringstream os_;
os_ << "Unknown MACRO name '";
while (*name_)
{
os_ << ss_.narrow (*name_++, ' ');
}
os_ << "'.";
throw runtime_error (os_.str ());
}
tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
token_stack_.push (token::REPEAT);
}
static void openparen (token_stack &handle_, token_stack &token_stack_)
{
BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
handle_.size () == 3);
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::REGEX);
handle_.pop ();
BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
token_stack_.push (token::REPEAT);
}
static void perform_or (node_ptr_vector &node_ptr_vector_,
tree_node_stack &tree_node_stack_)
{
// perform or
node *rhs_ = tree_node_stack_.top ();
tree_node_stack_.pop ();
node *lhs_ = tree_node_stack_.top ();
node_ptr_vector_->push_back (static_cast<selection_node *>(0));
node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
tree_node_stack_.top () = node_ptr_vector_->back ();
}
static void sequence (node_ptr_vector &node_ptr_vector_,
tree_node_stack &tree_node_stack_)
{
node *rhs_ = tree_node_stack_.top ();
tree_node_stack_.pop ();
node *lhs_ = tree_node_stack_.top ();
node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
tree_node_stack_.top () = node_ptr_vector_->back ();
}
static void optional (const bool greedy_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
// perform ?
node *lhs_ = tree_node_stack_.top ();
// You don't know if lhs_ is a leaf_node, so get firstpos.
node::node_vector &firstpos_ = lhs_->firstpos ();
for (node::node_vector::iterator iter_ = firstpos_.begin (),
end_ = firstpos_.end (); iter_ != end_; ++iter_)
{
// These are leaf_nodes!
(*iter_)->greedy (greedy_);
}
node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
node *rhs_ = new leaf_node (null_token, greedy_);
node_ptr_vector_->back () = rhs_;
node_ptr_vector_->push_back (static_cast<selection_node *>(0));
node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
tree_node_stack_.top () = node_ptr_vector_->back ();
}
static void zero_or_more (const bool greedy_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
// perform *
node *ptr_ = tree_node_stack_.top ();
node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
tree_node_stack_.top () = node_ptr_vector_->back ();
}
static void one_or_more (const bool greedy_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
// perform +
node *lhs_ = tree_node_stack_.top ();
node *copy_ = lhs_->copy (node_ptr_vector_);
node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
node *rhs_ = new iteration_node (copy_, greedy_);
node_ptr_vector_->back () = rhs_;
node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
tree_node_stack_.top () = node_ptr_vector_->back ();
}
// This is one of the most mind bending routines in this code...
static void repeatn (const bool greedy_, const token &token_,
node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
{
// perform {n[,[m]]}
// Semantic checks have already been performed.
// {0,} = *
// {0,1} = ?
// {1,} = +
// therefore we do not check for these cases.
if (!(token_._min == 1 && !token_._comma))
{
const std::size_t top_ = token_._min > 0 ?
token_._min : token_._max;
if (token_._min == 0)
{
optional (greedy_, node_ptr_vector_, tree_node_stack_);
}
node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
node *curr_ = 0;
for (std::size_t i_ = 2; i_ < top_; ++i_)
{
curr_ = prev_->copy (node_ptr_vector_);
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
sequence (node_ptr_vector_, tree_node_stack_);
prev_ = curr_;
}
if (token_._comma && token_._min > 0)
{
if (token_._min > 1)
{
curr_ = prev_->copy (node_ptr_vector_);
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
sequence (node_ptr_vector_, tree_node_stack_);
prev_ = curr_;
}
if (token_._comma && token_._max)
{
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
optional (greedy_, node_ptr_vector_, tree_node_stack_);
prev_ = tree_node_stack_.top ();
tree_node_stack_.pop ();
const std::size_t count_ = token_._max - token_._min;
for (std::size_t i_ = 1; i_ < count_; ++i_)
{
curr_ = prev_->copy (node_ptr_vector_);
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
sequence (node_ptr_vector_, tree_node_stack_);
prev_ = curr_;
}
}
else
{
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
prev_ = tree_node_stack_.top ();
tree_node_stack_.pop ();
}
}
tree_node_stack_.push (static_cast<node *>(0));
tree_node_stack_.top () = prev_;
sequence (node_ptr_vector_, tree_node_stack_);
}
}
};
}
}
}
#endif