// input.hpp | |
// Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) | |
// | |
// Distributed under the Boost Software License, Version 1.0. (See accompanying | |
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
#ifndef BOOST_LEXER_INPUT | |
#define BOOST_LEXER_INPUT | |
#include "char_traits.hpp" | |
#include <boost/detail/iterator.hpp> | |
#include "size_t.hpp" | |
#include "state_machine.hpp" | |
namespace boost | |
{ | |
namespace lexer | |
{ | |
template<typename FwdIter, typename Traits = | |
char_traits<typename boost::detail::iterator_traits<FwdIter>::value_type> > | |
class basic_input | |
{ | |
public: | |
class iterator | |
{ | |
public: | |
#if defined _MSC_VER && _MSC_VER <= 1200 | |
friend basic_input; | |
#else | |
friend class basic_input; | |
#endif | |
struct data | |
{ | |
std::size_t id; | |
std::size_t unique_id; | |
FwdIter start; | |
FwdIter end; | |
bool bol; | |
std::size_t state; | |
// Construct in end() state. | |
data () : | |
id (0), | |
unique_id (npos), | |
bol (false), | |
state (npos) | |
{ | |
} | |
bool operator == (const data &rhs_) const | |
{ | |
return id == rhs_.id && unique_id == rhs_.unique_id && | |
start == rhs_.start && end == rhs_.end && | |
bol == rhs_.bol && state == rhs_.state; | |
} | |
}; | |
iterator () : | |
_input (0) | |
{ | |
} | |
bool operator == (const iterator &rhs_) const | |
{ | |
return _data == rhs_._data; | |
} | |
bool operator != (const iterator &rhs_) const | |
{ | |
return !(*this == rhs_); | |
} | |
data &operator * () | |
{ | |
return _data; | |
} | |
data *operator -> () | |
{ | |
return &_data; | |
} | |
// Let compiler generate operator = (). | |
// prefix version | |
iterator &operator ++ () | |
{ | |
next_token (); | |
return *this; | |
} | |
// postfix version | |
iterator operator ++ (int) | |
{ | |
iterator iter_ = *this; | |
next_token (); | |
return iter_; | |
} | |
private: | |
// Not owner (obviously!) | |
const basic_input *_input; | |
data _data; | |
void next_token () | |
{ | |
const detail::internals &internals_ = | |
_input->_state_machine->data (); | |
_data.start = _data.end; | |
if (internals_._dfa->size () == 1) | |
{ | |
if (internals_._seen_BOL_assertion || | |
internals_._seen_EOL_assertion) | |
{ | |
_data.id = next | |
(&internals_._lookup->front ()->front (), | |
internals_._dfa_alphabet.front (), | |
&internals_._dfa->front ()->front (), | |
_data.bol, _data.end, _input->_end, _data.unique_id); | |
} | |
else | |
{ | |
_data.id = next (&internals_._lookup->front ()->front (), | |
internals_._dfa_alphabet.front (), &internals_. | |
_dfa->front ()->front (), _data.end, _input->_end, | |
_data.unique_id); | |
} | |
} | |
else | |
{ | |
if (internals_._seen_BOL_assertion || | |
internals_._seen_EOL_assertion) | |
{ | |
_data.id = next (internals_, _data.state, | |
_data.bol, _data.end, _input->_end, _data.unique_id); | |
} | |
else | |
{ | |
_data.id = next (internals_, _data.state, | |
_data.end, _input->_end, _data.unique_id); | |
} | |
} | |
if (_data.end == _input->_end && _data.start == _data.end) | |
{ | |
// Ensure current state matches that returned by end(). | |
_data.state = npos; | |
} | |
} | |
std::size_t next (const detail::internals &internals_, | |
std::size_t &start_state_, bool bol_, | |
FwdIter &start_token_, const FwdIter &end_, | |
std::size_t &unique_id_) | |
{ | |
if (start_token_ == end_) | |
{ | |
unique_id_ = npos; | |
return 0; | |
} | |
again: | |
const std::size_t * lookup_ = &internals_._lookup[start_state_]-> | |
front (); | |
std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; | |
const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); | |
const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
FwdIter curr_ = start_token_; | |
bool end_state_ = *ptr_ != 0; | |
std::size_t id_ = *(ptr_ + id_index); | |
std::size_t uid_ = *(ptr_ + unique_id_index); | |
std::size_t end_start_state_ = start_state_; | |
bool end_bol_ = bol_; | |
FwdIter end_token_ = start_token_; | |
while (curr_ != end_) | |
{ | |
const std::size_t BOL_state_ = ptr_[bol_index]; | |
const std::size_t EOL_state_ = ptr_[eol_index]; | |
if (BOL_state_ && bol_) | |
{ | |
ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; | |
} | |
else if (EOL_state_ && *curr_ == '\n') | |
{ | |
ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
} | |
else | |
{ | |
typename Traits::char_type prev_char_ = *curr_++; | |
bol_ = prev_char_ == '\n'; | |
const std::size_t state_ = | |
ptr_[lookup_[static_cast<typename Traits::index_type> | |
(prev_char_)]]; | |
if (state_ == 0) | |
{ | |
break; | |
} | |
ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
} | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_start_state_ = *(ptr_ + state_index); | |
end_bol_ = bol_; | |
end_token_ = curr_; | |
} | |
} | |
const std::size_t EOL_state_ = ptr_[eol_index]; | |
if (EOL_state_ && curr_ == end_) | |
{ | |
ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_start_state_ = *(ptr_ + state_index); | |
end_bol_ = bol_; | |
end_token_ = curr_; | |
} | |
} | |
if (end_state_) | |
{ | |
// return longest match | |
start_state_ = end_start_state_; | |
start_token_ = end_token_; | |
if (id_ == 0) | |
{ | |
bol_ = end_bol_; | |
goto again; | |
} | |
else | |
{ | |
_data.bol = end_bol_; | |
} | |
} | |
else | |
{ | |
// No match causes char to be skipped | |
_data.bol = *start_token_ == '\n'; | |
++start_token_; | |
id_ = npos; | |
uid_ = npos; | |
} | |
unique_id_ = uid_; | |
return id_; | |
} | |
std::size_t next (const detail::internals &internals_, | |
std::size_t &start_state_, FwdIter &start_token_, | |
FwdIter const &end_, std::size_t &unique_id_) | |
{ | |
if (start_token_ == end_) | |
{ | |
unique_id_ = npos; | |
return 0; | |
} | |
again: | |
const std::size_t * lookup_ = &internals_._lookup[start_state_]-> | |
front (); | |
std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; | |
const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); | |
const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
FwdIter curr_ = start_token_; | |
bool end_state_ = *ptr_ != 0; | |
std::size_t id_ = *(ptr_ + id_index); | |
std::size_t uid_ = *(ptr_ + unique_id_index); | |
std::size_t end_start_state_ = start_state_; | |
FwdIter end_token_ = start_token_; | |
while (curr_ != end_) | |
{ | |
const std::size_t state_ = ptr_[lookup_[static_cast | |
<typename Traits::index_type>(*curr_++)]]; | |
if (state_ == 0) | |
{ | |
break; | |
} | |
ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_start_state_ = *(ptr_ + state_index); | |
end_token_ = curr_; | |
} | |
} | |
if (end_state_) | |
{ | |
// return longest match | |
start_state_ = end_start_state_; | |
start_token_ = end_token_; | |
if (id_ == 0) goto again; | |
} | |
else | |
{ | |
// No match causes char to be skipped | |
++start_token_; | |
id_ = npos; | |
uid_ = npos; | |
} | |
unique_id_ = uid_; | |
return id_; | |
} | |
std::size_t next (const std::size_t * const lookup_, | |
const std::size_t dfa_alphabet_, const std::size_t * const dfa_, | |
bool bol_, FwdIter &start_token_, FwdIter const &end_, | |
std::size_t &unique_id_) | |
{ | |
if (start_token_ == end_) | |
{ | |
unique_id_ = npos; | |
return 0; | |
} | |
const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
FwdIter curr_ = start_token_; | |
bool end_state_ = *ptr_ != 0; | |
std::size_t id_ = *(ptr_ + id_index); | |
std::size_t uid_ = *(ptr_ + unique_id_index); | |
bool end_bol_ = bol_; | |
FwdIter end_token_ = start_token_; | |
while (curr_ != end_) | |
{ | |
const std::size_t BOL_state_ = ptr_[bol_index]; | |
const std::size_t EOL_state_ = ptr_[eol_index]; | |
if (BOL_state_ && bol_) | |
{ | |
ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; | |
} | |
else if (EOL_state_ && *curr_ == '\n') | |
{ | |
ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
} | |
else | |
{ | |
typename Traits::char_type prev_char_ = *curr_++; | |
bol_ = prev_char_ == '\n'; | |
const std::size_t state_ = | |
ptr_[lookup_[static_cast<typename Traits::index_type> | |
(prev_char_)]]; | |
if (state_ == 0) | |
{ | |
break; | |
} | |
ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
} | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_bol_ = bol_; | |
end_token_ = curr_; | |
} | |
} | |
const std::size_t EOL_state_ = ptr_[eol_index]; | |
if (EOL_state_ && curr_ == end_) | |
{ | |
ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_bol_ = bol_; | |
end_token_ = curr_; | |
} | |
} | |
if (end_state_) | |
{ | |
// return longest match | |
_data.bol = end_bol_; | |
start_token_ = end_token_; | |
} | |
else | |
{ | |
// No match causes char to be skipped | |
_data.bol = *start_token_ == '\n'; | |
++start_token_; | |
id_ = npos; | |
uid_ = npos; | |
} | |
unique_id_ = uid_; | |
return id_; | |
} | |
std::size_t next (const std::size_t * const lookup_, | |
const std::size_t dfa_alphabet_, const std::size_t * const dfa_, | |
FwdIter &start_token_, FwdIter const &end_, | |
std::size_t &unique_id_) | |
{ | |
if (start_token_ == end_) | |
{ | |
unique_id_ = npos; | |
return 0; | |
} | |
const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
FwdIter curr_ = start_token_; | |
bool end_state_ = *ptr_ != 0; | |
std::size_t id_ = *(ptr_ + id_index); | |
std::size_t uid_ = *(ptr_ + unique_id_index); | |
FwdIter end_token_ = start_token_; | |
while (curr_ != end_) | |
{ | |
const std::size_t state_ = ptr_[lookup_[static_cast | |
<typename Traits::index_type>(*curr_++)]]; | |
if (state_ == 0) | |
{ | |
break; | |
} | |
ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
if (*ptr_) | |
{ | |
end_state_ = true; | |
id_ = *(ptr_ + id_index); | |
uid_ = *(ptr_ + unique_id_index); | |
end_token_ = curr_; | |
} | |
} | |
if (end_state_) | |
{ | |
// return longest match | |
start_token_ = end_token_; | |
} | |
else | |
{ | |
// No match causes char to be skipped | |
++start_token_; | |
id_ = npos; | |
uid_ = npos; | |
} | |
unique_id_ = uid_; | |
return id_; | |
} | |
}; | |
#if defined _MSC_VER && _MSC_VER <= 1200 | |
friend iterator; | |
#else | |
friend class iterator; | |
#endif | |
// Make it explict that we are NOT taking a copy of state_machine_! | |
basic_input (const basic_state_machine<typename Traits::char_type> | |
*state_machine_, const FwdIter &begin_, const FwdIter &end_) : | |
_state_machine (state_machine_), | |
_begin (begin_), | |
_end (end_) | |
{ | |
} | |
iterator begin () const | |
{ | |
iterator iter_; | |
iter_._input = this; | |
// Over-ride default of 0 (EOI) | |
iter_._data.id = npos; | |
iter_._data.start = _begin; | |
iter_._data.end = _begin; | |
iter_._data.bol = _state_machine->data ()._seen_BOL_assertion; | |
iter_._data.state = 0; | |
++iter_; | |
return iter_; | |
} | |
iterator end () const | |
{ | |
iterator iter_; | |
iter_._input = this; | |
iter_._data.start = _end; | |
iter_._data.end = _end; | |
return iter_; | |
} | |
private: | |
const basic_state_machine<typename Traits::char_type> *_state_machine; | |
FwdIter _begin; | |
FwdIter _end; | |
}; | |
typedef basic_input<std::string::iterator> iter_input; | |
typedef basic_input<std::basic_string<wchar_t>::iterator> iter_winput; | |
typedef basic_input<const char *> ptr_input; | |
typedef basic_input<const wchar_t *> ptr_winput; | |
} | |
} | |
#endif |