/*============================================================================= Lexer Spirit V1.3 Copyright (c) 2001, Daniel C. Nuffer This software is provided 'as-is', without any express or implied warranty. In no event will the copyright holder be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Acknowledgements: Special thanks to Dan Nuffer, John (EBo) David, Chris Uzdavinis, and Doug Gregor. These people are most instrumental in steering Spirit in the right direction. Special thanks also to people who have contributed to the code base and sample code, ported Spirit to various platforms and compilers, gave suggestions, reported and provided bug fixes. Alexander Hirner, Andy Elvey, Bogdan Kushnir, Brett Calcott, Bruce Florman, Changzhe Han, Colin McPhail, Hakki Dogusan, Jan Bares, Joseph Smith, Martijn W. van der Lee, Raghavendra Satish, Remi Delcos, Tom Spilman, Vladimir Prus, W. Scott Dillman, David A. Greene, Bob Bailey, Hartmut Kaiser. Finally special thanks also to people who gave feedback and valuable comments, particularly members of Spirit's Source Forge mailing list and boost.org. URL: http://spirit.sourceforge.net/ TODO List: X callback objects (called when a match is made.) X callback passed first & last iterator, and a reference to a lexercontrol object that supports some operations on the lexer. set state terminate state stack (push, pop, top) set new token return value ignore the current token yymore get length of matched token get current lexer state lexer states. organize the file into hpp and ipp. arrange the classes in a logical order. use buffering - iterator only needs be an input iterator, lexer & callback are not templatized on iterator type, but char type. next_token is templatized on iterator type. beginning/ending contexts. ^ and $ DFA minimization. DFA table compression. DFA serialization to save recomputing the DFA. =============================================================================*/ #ifndef SPIRIT_LEXER_HPP #define SPIRIT_LEXER_HPP /////////////////////////////////////////////////////////////////////////////// #include "boost/spirit/core.hpp" #include "boost/spirit/symbols/symbols.hpp" #include "boost/spirit/utility/chset.hpp" #include "boost/spirit/utility/escape_char.hpp" #include <set> #include <map> #include <vector> #include <stack> #include <utility> // for pair #include <iostream> #include <fstream> #include "boost/limits.hpp" #if defined(BOOST_MSVC) #include "boost/spirit/core/impl/msvc.hpp" #endif #if defined(BOOST_MSVC) && (BOOST_MSVC <= 1300) #define SPIRIT_IT_NS impl #else #define SPIRIT_IT_NS std #endif /////////////////////////////////////////////////////////////////////////////// namespace spirit { typedef unsigned char uchar; typedef unsigned int node_id_t; const node_id_t invalid_node = node_id_t(-1); typedef std::set<node_id_t> node_set; typedef std::vector<uchar> uchar_vector; typedef std::map<node_id_t, node_set> followpos_t; typedef std::vector<uchar_vector> state_match_t; template <typename TokenT> class lexer_control; class bad_regex : public std::exception { }; namespace lexerimpl { class node { public: virtual ~node() {} virtual node* clone() const = 0; virtual bool nullable() const = 0; virtual node_set firstpos() const = 0; virtual node_set lastpos() const = 0; virtual void compute_followpos(followpos_t& followpos) const = 0; virtual void compute_state_match(state_match_t& state_match) const = 0; virtual void get_eof_ids(node_set& eof_set) const = 0; virtual void assign_node_ids(node_id_t& node_count) = 0; #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const = 0; #endif }; class char_node : public node { public: char_node(const uchar c); char_node(const char_node& x); virtual ~char_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: uchar m_char; node_id_t m_node_num; }; inline char_node::char_node(const uchar c) : node() , m_char(c) , m_node_num(0) { } inline char_node::char_node(const char_node& x) : node(x) , m_char(x.m_char) , m_node_num(x.m_node_num) { } inline node * char_node::clone() const { return new char_node(*this); } inline bool char_node::nullable() const { return false; } inline node_set char_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set char_node::lastpos() const { return firstpos(); } inline void char_node::compute_followpos(followpos_t&) const { return; } inline void char_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256); state_match[m_node_num][m_char] = 1; } inline void char_node::get_eof_ids(node_set&) const { return; } inline void char_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(SPIRIT_DEBUG) inline void char_node::dump(std::ostream& out) const { out << "\nchar_node m_char = " << m_char; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); } #endif class epsilon_node : public node { public: epsilon_node(); epsilon_node(const epsilon_node& x); virtual ~epsilon_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: node_id_t m_node_num; }; inline epsilon_node::epsilon_node() : node() , m_node_num(0) { } inline epsilon_node::epsilon_node(const epsilon_node& x) : node(x) , m_node_num(x.m_node_num) { } inline node * epsilon_node::clone() const { return new epsilon_node(*this); } inline bool epsilon_node::nullable() const { return true; } inline node_set epsilon_node::firstpos() const { return node_set(); } inline node_set epsilon_node::lastpos() const { return node_set(); } inline void epsilon_node::compute_followpos(followpos_t&) const { return; } inline void epsilon_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256, 1); } inline void epsilon_node::get_eof_ids(node_set&) const { return; } inline void epsilon_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(SPIRIT_DEBUG) inline void epsilon_node::dump(std::ostream& out) const { out << "\nepsilon_node"; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); } #endif class or_node : public node { public: or_node(node* left, node* right); or_node(const or_node& x); virtual ~or_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: std::auto_ptr<node> m_left; std::auto_ptr<node> m_right; }; inline or_node::or_node(node* left, node* right) : node() , m_left(left) , m_right(right) { } inline or_node::or_node(const or_node& x) : node(x) , m_left(x.m_left->clone()) , m_right(x.m_right->clone()) { } inline node * or_node::clone() const { return new or_node(m_left->clone(), m_right->clone()); } inline bool or_node::nullable() const { return m_left->nullable() || m_right->nullable(); } inline node_set or_node::firstpos() const { node_set rval; node_set l = m_left->firstpos(); node_set r = m_right->firstpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } inline node_set or_node::lastpos() const { node_set rval; node_set l = m_left->lastpos(); node_set r = m_right->lastpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } inline void or_node::compute_followpos(followpos_t& followpos) const { m_left->compute_followpos(followpos); m_right->compute_followpos(followpos); } inline void or_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); m_right->compute_state_match(state_match); } inline void or_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); m_right->get_eof_ids(eof_nodes); } inline void or_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); m_right->assign_node_ids(node_count); } #if defined(SPIRIT_DEBUG) inline void or_node::dump(std::ostream& out) const { m_left->dump(out); out << "\nor_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); m_right->dump(out); } #endif class cat_node : public node { public: cat_node(node* left, node* right); cat_node(const cat_node& x); virtual ~cat_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: std::auto_ptr<node> m_left; std::auto_ptr<node> m_right; }; inline cat_node::cat_node(node* left, node* right) : node() , m_left(left) , m_right(right) { } inline cat_node::cat_node(const cat_node& x) : node(x) , m_left(x.m_left->clone()) , m_right(x.m_right->clone()) { } inline node * cat_node::clone() const { return new cat_node(m_left->clone(), m_right->clone()); } inline bool cat_node::nullable() const { return m_left->nullable() && m_right->nullable(); } inline node_set cat_node::firstpos() const { if (m_left->nullable()) { node_set rval; node_set l = m_left->firstpos(); node_set r = m_right->firstpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } else { return m_left->firstpos(); } } inline node_set cat_node::lastpos() const { if (m_right->nullable()) { node_set rval; node_set l = m_left->lastpos(); node_set r = m_right->lastpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } else { return m_right->lastpos(); } } inline void cat_node::compute_followpos(followpos_t& followpos) const { node_set l = m_left->lastpos(); for (node_set::iterator i = l.begin(); i != l.end(); ++i) { node_set rf = m_right->firstpos(); followpos[*i].insert(rf.begin(), rf.end()); } m_left->compute_followpos(followpos); m_right->compute_followpos(followpos); } inline void cat_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); m_right->compute_state_match(state_match); } inline void cat_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); m_right->get_eof_ids(eof_nodes); } inline void cat_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); m_right->assign_node_ids(node_count); } #if defined(SPIRIT_DEBUG) inline void cat_node::dump(std::ostream& out) const { m_left->dump(out); out << "\ncat_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); m_right->dump(out); } #endif class star_node : public node { public: star_node(node* left); star_node(const star_node& x); virtual ~star_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: std::auto_ptr<node> m_left; }; inline star_node::star_node(node* left) : node() , m_left(left) { } inline star_node::star_node(const star_node& x) : node(x) , m_left(x.m_left->clone()) { } inline node * star_node::clone() const { return new star_node(m_left->clone()); } inline bool star_node::nullable() const { return true; } inline node_set star_node::firstpos() const { return m_left->firstpos(); } inline node_set star_node::lastpos() const { return m_left->lastpos(); } inline void star_node::compute_followpos(followpos_t& followpos) const { node_set lp = this->lastpos(); for (node_set::iterator i = lp.begin(); i != lp.end(); ++i) { node_set fp = this->firstpos(); followpos[*i].insert(fp.begin(), fp.end()); } m_left->compute_followpos(followpos); } inline void star_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); } inline void star_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); } inline void star_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); } #if defined(SPIRIT_DEBUG) inline void star_node::dump(std::ostream& out) const { m_left->dump(out); out << "\nstar_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); } #endif class eof_node : public node { public: eof_node(); eof_node(const eof_node& x); virtual ~eof_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: node_id_t m_node_num; }; inline eof_node::eof_node() : node() , m_node_num(0) { } inline eof_node::eof_node(const eof_node& x) : node(x) , m_node_num(x.m_node_num) { } inline node * eof_node::clone() const { return new eof_node(*this); } inline bool eof_node::nullable() const { return false; } inline node_set eof_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set eof_node::lastpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline void eof_node::compute_followpos(followpos_t&) const { return; } inline void eof_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256, 0); } inline void eof_node::get_eof_ids(node_set& eof_nodes) const { eof_nodes.insert(m_node_num); } inline void eof_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(SPIRIT_DEBUG) inline void eof_node::dump(std::ostream& out) const { out << "\neof_node"; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); } #endif class ccl_node : public node { public: ccl_node(const std::vector<uchar>& v); ccl_node(const uchar c1, const uchar c2); ccl_node(const ccl_node& x); virtual ~ccl_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(SPIRIT_DEBUG) virtual void dump(std::ostream& out) const; #endif private: std::vector<uchar> m_match; node_id_t m_node_num; }; inline ccl_node::ccl_node(const std::vector<uchar>& v) : node() , m_match(v) , m_node_num(0) { m_match.resize(256); // make sure it's the right size } inline ccl_node::ccl_node(const uchar c1, const uchar c2) : node() , m_match(256, 0) , m_node_num(0) { SPIRIT_ASSERT(c1 < c2); for (size_t i = c1; i <= size_t(c2); ++i) { m_match[i] = 1; } } inline ccl_node::ccl_node(const ccl_node& x) : node(x) , m_match(x.m_match) , m_node_num(x.m_node_num) { } inline node * ccl_node::clone() const { return new ccl_node(*this); } inline bool ccl_node::nullable() const { return false; } inline node_set ccl_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set ccl_node::lastpos() const { return firstpos(); } inline void ccl_node::compute_followpos(followpos_t&) const { return; } inline void ccl_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num] = m_match; } inline void ccl_node::get_eof_ids(node_set&) const { return; } inline void ccl_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(SPIRIT_DEBUG) inline void ccl_node::dump(std::ostream& out) const { out << "\nccl_node m_match = "; for (size_t i = 0; i < m_match.size(); ++i) { if (m_match[i]) out << i << ", "; } out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator<node_id_t>(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator<node_id_t>(out, ",")); } #endif template <typename ScannerT> class make_concat { typedef typename ScannerT::iterator_t iterator_t; public: make_concat(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const &, iterator_t const &) const { node* right = m_stack.top(); m_stack.pop(); node* left = m_stack.top(); m_stack.pop(); node* newnode = new cat_node(left, right); m_stack.push(newnode); } std::stack<node*>& m_stack; }; template <int CharTSize> struct get_byte_aux; template<> struct get_byte_aux<1> { template <typename CharT> unsigned char operator()(CharT c, unsigned int byte) { SPIRIT_ASSERT(byte == 0); return c; } }; template<> struct get_byte_aux<2> { template <typename CharT> unsigned char operator()(CharT c, unsigned int byte) { static unsigned long mask[] = { 0xFF00, 0x00FF }; SPIRIT_ASSERT(byte < 2); return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); } }; template<> struct get_byte_aux<4> { template <typename CharT> unsigned char operator()(CharT c, unsigned int byte) { static unsigned long mask[] = { 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF }; SPIRIT_ASSERT(byte < 4); return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); } }; template <typename CharT> inline unsigned char get_byte(CharT c, unsigned int byte) { return get_byte_aux<sizeof(c)>()(c, byte); } template <typename ScannerT> class make_star { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; make_star(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* newnode = new star_node(left); m_stack.push(newnode); } std::stack<node*>& m_stack; }; template <typename ScannerT> class make_or { typedef typename ScannerT::iterator_t iterator_t; public: make_or(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const&, iterator_t const&) const { node* right = m_stack.top(); m_stack.pop(); node* left = m_stack.top(); m_stack.pop(); node* newnode = new or_node(left, right); m_stack.push(newnode); } std::stack<node*>& m_stack; }; template <typename ScannerT> class make_plus { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; make_plus(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* copy = left->clone(); node* new_star = new star_node(copy); node* new_cat = new cat_node(left, new_star); m_stack.push(new_cat); } std::stack<node*>& m_stack; }; template <typename ScannerT> class make_optional { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; make_optional(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* new_or = new or_node(left, new epsilon_node()); m_stack.push(new_or); } std::stack<node*>& m_stack; }; /////////////////////////////////////////////////////////////////////////////// // utility function template <typename CharT> inline impl::range<CharT> const& full_range() { static impl::range<CharT> full(std::numeric_limits<CharT>::min(), std::numeric_limits<CharT>::max()); return full; } namespace ccl_utils { template <typename char_t> inline impl::range_run<char_t> negate_range_run( const impl::range_run<char_t>& rr) { impl::range_run<char_t> newrr; newrr.set(full_range<char_t>()); for (typename impl::range_run<char_t>::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) newrr.clear(*iter); return newrr; } template <typename char_t> inline node* create_mb_node_seq(char_t c) { node* newnode = new char_node(get_byte(c, 0)); for (unsigned int i = 1; i < sizeof(c); ++i) { node* cnode = new char_node(get_byte(c, i)); node* top_node = new cat_node(newnode, cnode); newnode = top_node; } return newnode; } template <typename char_t> inline void handle_mb_char(char_t c, bool first_time, std::stack<node*>& stack) { node* newnode = create_mb_node_seq(c); if (first_time) { stack.push(newnode); } else { node* top = stack.top(); stack.pop(); node* newtop = new or_node(top, newnode); stack.push(newtop); } } // forward decl only template <typename char_t> inline void handle_mb_range(char_t c1, char_t c2, bool first_time, std::stack<node*>& stack); template <typename char_t> inline void create_nodes(const impl::range_run<char_t>& rr, std::stack<node*>& stack) { if (sizeof(char_t) == 1) { std::vector<uchar> ccl; ccl.resize(256); for (typename impl::range_run<char_t>::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) { for (int i = iter->first; i <= iter->last; ++i) { SPIRIT_ASSERT(uchar(i) < 256 && ccl.size() == 256); ccl[uchar(i)] = 1; } } node* new_ccl = new ccl_node(ccl); stack.push(new_ccl); } else { bool mb_first_time = true; for (typename impl::range_run<char_t>::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) { if (iter->first == iter->last) { handle_mb_char(iter->first, mb_first_time, stack); } else { handle_mb_range(iter->first, iter->last, mb_first_time, stack); } mb_first_time = false; } } } template <typename char_t> inline size_t compute_differing_byte(char_t c1, char_t c2) { size_t rval = 0; while (rval < sizeof(c1) && get_byte(c1, rval) == get_byte(c2, rval)) ++rval; return rval; } template <typename char_t> inline node* create_mb_node_type1(size_t j, char_t c1, char_t c2) { size_t diff = get_byte(c2, j) - get_byte(c1, j); if (diff == 1) return 0; else if (diff == 2) return new char_node(get_byte(c1, j)+1); else return new ccl_node(get_byte(c1, j)+1, get_byte(c2, j)-1); } template <typename char_t> inline node * create_mb_node_for_byte(size_t i, size_t j, size_t sizem1, size_t differing_byte, char_t c1, char_t c2, node* newnode) { node* cnode; if (i == sizem1 && j == differing_byte && j != sizem1) { node* tmp = create_mb_node_type1(j, c1, c2); if (tmp == 0) { delete newnode; return 0; } else cnode = tmp; } else if (i == differing_byte && j == sizem1) { if (i != sizem1) cnode = new ccl_node(get_byte(c1, j), 0xFF); else cnode = new ccl_node(get_byte(c1, j), get_byte(c2, j)); } else if (i != differing_byte && i != sizem1 && j == (sizem1 - i + differing_byte)) cnode = new ccl_node(get_byte(c1, j)+1, 0xFF); else if (i + j - differing_byte > sizem1) cnode = new ccl_node(0, 0xFF); else //if (is plain) cnode = new char_node(get_byte(c1, j)); node* top_node = new cat_node(newnode, cnode); return top_node; } // On platforms, where wchar_t is a typedef for unsigned short, the // comparision for a negative value is pointless template <bool is_signed> struct correct_char_aux { }; template <> struct correct_char_aux<true> { template <typename char_t> static char_t correct(char_t c) { if (c < 0) c = 0; return c; } }; template <> struct correct_char_aux<false> { template <typename char_t> static char_t correct(char_t c) { return c; } }; template <typename char_t> struct correct_char { static char_t correct(char_t c) { return correct_char_aux<std::numeric_limits<char_t>::is_signed >:: correct(c); } }; template <typename char_t> inline void handle_mb_range(char_t c1, char_t c2, bool first_time, std::stack<node*>& stack) { // The algorithm can't handle negative value chars, which don't make // much sense anyway. This comparision is pointless for wchar_t's on // platforms, where wchar_t is a typedef for unsigned short c1 = correct_char<char_t>::correct(c1); //if (c1 < 0) // c1 = 0; SPIRIT_ASSERT(c1 < c2); node* newnode = 0; node* savednode = 0; const size_t differing_byte = compute_differing_byte(c1, c2); const size_t sizem1 = sizeof(c1) - 1; const size_t ndb = sizem1 - differing_byte; for (size_t i = differing_byte; i < sizeof(c1); ++i) { // generate node for the first byte if (differing_byte == 0 && i == ndb) { node* tmp = create_mb_node_type1(0, c1, c2); if (tmp == 0) continue; else newnode = tmp; } else { newnode = new char_node(get_byte(c1, 0)); } for (size_t j = 1; j < sizeof(c1); ++j) { newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, c1, c2, newnode); if (newnode == 0) goto end_outer_for; } // or together the various parts if (savednode) { node* top_node = new or_node(savednode, newnode); savednode = top_node; } else { savednode = newnode; } end_outer_for: continue; } for (size_t k = 0; k < ndb; ++k) { newnode = new char_node(get_byte(c2, 0)); for (size_t j = 1; j < sizeof(c2); ++j) { node* cnode; if (k == differing_byte && j == sizem1 && k != sizem1) cnode = new ccl_node(0, get_byte(c2, j)); else if (k != differing_byte && k != sizem1 && j == (sizem1 - k + differing_byte)) cnode = new ccl_node(0, get_byte(c2, j)-1); else if (k + j - differing_byte > sizem1) cnode = new ccl_node(0, 0xFF); else //if (is plain) cnode = new char_node(get_byte(c2, j)); node* top_node = new cat_node(newnode, cnode); newnode = top_node; } // or together the various parts if (savednode) { node* top_node = new or_node(savednode, newnode); savednode = top_node; } else { savednode = newnode; } } if (first_time) { stack.push(savednode); } else { node* top = stack.top(); stack.pop(); node* newtop = new or_node(top, savednode); stack.push(newtop); } } } // namespace ccl_utils template <typename ScannerT> class make_char { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; make_char(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const& first, iterator_t const& last) const { const escape_char_parser<lex_escapes, char_t> lex_escape_ch = escape_char_parser<lex_escapes, char_t>(); char_t the_char; iterator_t first_ = first; ScannerT scan(first_, last); lex_escape_ch[assign(the_char)].parse(scan); node* newnode = ccl_utils::create_mb_node_seq(the_char); m_stack.push(newnode); } std::stack<node*>& m_stack; }; template <typename ScannerT> class make_ccl { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; make_ccl(std::stack<node*>& the_stack) : m_stack(the_stack) {} static bool is_equal_to_string(iterator_t first, iterator_t const & last, const char* str) { while (first != last &&*str &&*first ==*str) { ++first; ++str; } return*str == 0; } template <typename ParserT> static void fill_ccl(impl::range_run<char_t>& rr, const ParserT& parser) { for (int i = 0; i < 256; ++i) { if (parser.test(uchar(i))) rr.set(impl::range<char_t>(i, i)); } } void operator()(iterator_t const& first_, iterator_t const& last) const { SPIRIT_ASSERT(*first_ == '['); iterator_t first = first_; ++first; // skip over '[' bool negated_ccl = false; if (*first == '^') { negated_ccl = true; ++first; } impl::range_run<char_t> rr; while (first != last &&*first != ']') { if (*first == '[') // it's a ccl_expr like [:space:] { // check for [:space:], etc. if (is_equal_to_string(first, last, "[:alnum:]")) { fill_ccl(rr, alnum_p); } else if (is_equal_to_string(first, last, "[:alpha:]")) { fill_ccl(rr, alpha_p); } else if (is_equal_to_string(first, last, "[:blank:]")) { fill_ccl(rr, blank_p); } else if (is_equal_to_string(first, last, "[:cntrl:]")) { fill_ccl(rr, cntrl_p); } else if (is_equal_to_string(first, last, "[:digit:]")) { fill_ccl(rr, digit_p); } else if (is_equal_to_string(first, last, "[:graph:]")) { fill_ccl(rr, graph_p); } else if (is_equal_to_string(first, last, "[:lower:]")) { fill_ccl(rr, lower_p); } else if (is_equal_to_string(first, last, "[:print:]")) { fill_ccl(rr, print_p); } else if (is_equal_to_string(first, last, "[:punct:]")) { fill_ccl(rr, punct_p); } else if (is_equal_to_string(first, last, "[:space:]")) { fill_ccl(rr, space_p); } else if (is_equal_to_string(first, last, "[:upper:]")) { fill_ccl(rr, upper_p); } else if (is_equal_to_string(first, last, "[:xdigit:]")) { fill_ccl(rr, xdigit_p); } // this can't happen, because it's parsed before we get here. //else // throw bad_regex(); // Advance past the character class expression while (first != last &&*first != ']') ++first; SPIRIT_ASSERT(*first == ']'); ++first; } else { const escape_char_parser<lex_escapes, char_t> lex_escape_ch = escape_char_parser<lex_escapes, char_t>(); char_t c1; ScannerT scan(first, last); lex_escape_ch[assign(c1)].parse(scan); if (*scan.first == '-') // insert a range { ++scan.first; char_t c2; lex_escape_ch[assign(c2)].parse(scan); SPIRIT_ASSERT(c1 < c2); // Throw exception? rr.set(impl::range<char_t>(c1, c2)); } else // insert 1 char { rr.set(impl::range<char_t>(c1, c1)); } } } if (negated_ccl) { rr = ccl_utils::negate_range_run(rr); } ccl_utils::create_nodes(rr, m_stack); } std::stack<node*>& m_stack; }; template <typename ScannerT> class make_any_char { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; std::stack<node*>& m_stack; make_any_char(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(const char_t c) const { SPIRIT_ASSERT(c == '.'); do_any_char(); } void do_any_char() const { static impl::range_run<char_t> rr; rr.set(full_range<char_t>()); char_t newline = '\n'; rr.clear(impl::range<char_t>(newline, newline)); ccl_utils::create_nodes(rr, m_stack); } }; template <typename ScannerT> class make_string { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; std::stack<node*>& m_stack; make_string(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const& first, iterator_t const& last) const { SPIRIT_ASSERT(*first == '"'); iterator_t first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '"' // empty string not allowed if (*scan.first == '"') { throw bad_regex(); } const escape_char_parser<lex_escapes, char_t> lex_escape_ch = escape_char_parser<lex_escapes, char_t>(); char_t c; lex_escape_ch[assign(c)].parse(scan); node* top_node = ccl_utils::create_mb_node_seq(c); while (*scan.first != '"' && scan.first != scan.last) { lex_escape_ch[assign(c)].parse(scan); node* cur_node = ccl_utils::create_mb_node_seq(c); top_node = new cat_node(top_node, cur_node); } m_stack.push(top_node); } }; inline node* repeat_node(node* n, int num) { node* list_of_nodes = n; for (int i = 1; i < num; ++i) { list_of_nodes = new cat_node(list_of_nodes, n->clone()); } return list_of_nodes; } inline node* optional_node(node* n) { return new or_node(n, new epsilon_node()); } template <typename ScannerT> class make_rep1 { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; std::stack<node*>& m_stack; make_rep1(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const& first, iterator_t const& last) const { SPIRIT_ASSERT(*first == '{'); iterator_t first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '{' unsigned int count; uint_p[assign(count)].parse(scan); if (count == 0) throw bad_regex(); node* top_node = m_stack.top(); m_stack.pop(); top_node = repeat_node(top_node, count); m_stack.push(top_node); } }; template <typename ScannerT> class make_rep2 { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; std::stack<node*>& m_stack; make_rep2(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const& first, iterator_t const& last) const { SPIRIT_ASSERT(*first == '{'); iterator_t first_ = first; ScannerT scan (first_, last); ++scan.first; // skip over '{' unsigned int count; uint_p[assign(count)].parse(scan); if (count == 0) throw bad_regex(); node* top_node = m_stack.top(); m_stack.pop(); top_node = new cat_node(repeat_node(top_node, count), new star_node(top_node->clone())); m_stack.push(top_node); } }; template <typename ScannerT> class make_rep3 { typedef typename ScannerT::iterator_t iterator_t; public: typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; std::stack<node*>& m_stack; make_rep3(std::stack<node*>& the_stack) : m_stack(the_stack) {} void operator()(iterator_t const& first, iterator_t const& last) const { SPIRIT_ASSERT(*first == '{'); iterator_t first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '{' unsigned int count1, count2; uint_p[assign(count1)].parse(scan); if (count1 == 0) throw bad_regex(); ++scan.first; // skip over ',' uint_p[assign(count2)].parse(scan); if (count2 <= count1) throw bad_regex(); node* top_node = m_stack.top(); m_stack.pop(); node* repeats = repeat_node(top_node, count1); top_node = new cat_node(repeats, repeat_node(optional_node(top_node->clone()), count2 - count1)); m_stack.push(top_node); } }; /////////////////////////////////////////////////////////////////////////////// // // Lexer grammar // // Defines the grammar, which mandates the syntax of the understood // lexeme definitions passed to lexer::register_regex. // /////////////////////////////////////////////////////////////////////////////// class lexer_grammar : public spirit::grammar<lexer_grammar> { public: lexer_grammar(std::stack<node*> &node_stack_) : node_stack(node_stack_) {} template <typename ScannerT> struct definition { typedef rule<ScannerT> rule_t; typedef typename ScannerT::iterator_t iterator_t; typedef typename SPIRIT_IT_NS::iterator_traits<iterator_t>::value_type char_t; rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, escseq, ccl_char; symbols<> ccl_expr; definition(lexer_grammar const &self) { regex = re >> !('/' >> re) >> !ch_p('$') ; re = series >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] ) ; series = singleton >>*( singleton[make_concat<ScannerT>(self.node_stack)] ) ; singleton = ch_p('.')[make_any_char<ScannerT>(self.node_stack)] >> singleton2 | fullccl >> singleton2 | ('"' >> string >> '"') [ make_string<ScannerT>(self.node_stack) ] >> singleton2 | '(' >> re >> ')' >> singleton2 | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) [ make_char<ScannerT>(self.node_stack) ] >> singleton2 ; singleton2 = ch_p('*')[make_star<ScannerT>(self.node_stack)] >> singleton2 | ch_p('+')[make_plus<ScannerT>(self.node_stack)] >> singleton2 | ch_p('?')[make_optional<ScannerT>(self.node_stack)] >> singleton2 | ('{' >> uint_p >> '}') [ make_rep1<ScannerT>(self.node_stack) ] >> singleton2 | ('{' >> uint_p >> ',' >> '}') [ make_rep2<ScannerT>(self.node_stack) ] >> singleton2 | ('{' >> uint_p >> ',' >> uint_p >> '}') [ make_rep3<ScannerT>(self.node_stack) ] >> singleton2 | epsilon_p ; fullccl = ('[' >> !ch_p('^') >> ccl >> ']') [ make_ccl<ScannerT>(self.node_stack) ] ; ccl = *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) ; ccl_char = ( (anychar_p - chset<>("\\\n]")) | escseq ) ; ccl_expr = "[:alnum:]", "[:alpha:]", "[:blank:]", "[:cntrl:]", "[:digit:]", "[:graph:]", "[:lower:]", "[:print:]", "[:punct:]", "[:space:]", "[:upper:]", "[:xdigit:]" ; string = +( (anychar_p - chset<>("\"\\")) | escseq ) ; typedef uint_parser<char_t, 8, 1, std::numeric_limits<char_t>::digits / 3 + 1 > oct_parser_t; typedef uint_parser<char_t, 16, 1, std::numeric_limits<char_t>::digits / 4 + 1 > hex_parser_t; escseq = ch_p('\\') >> ( oct_parser_t() | nocase_d['x'] >> hex_parser_t() | (anychar_p - chset<>('\n')) ) ; #if defined(SPIRIT_DEBUG) SPIRIT_DEBUG_RULE(regex); SPIRIT_DEBUG_RULE(re); SPIRIT_DEBUG_RULE(series); SPIRIT_DEBUG_RULE(singleton); SPIRIT_DEBUG_RULE(singleton2); SPIRIT_DEBUG_RULE(fullccl); SPIRIT_DEBUG_RULE(ccl); SPIRIT_DEBUG_RULE(string); SPIRIT_DEBUG_RULE(escseq); SPIRIT_DEBUG_RULE(ccl_char); #endif } rule<ScannerT> const& start() const { return regex; } }; std::stack<node*> &node_stack; }; // class lexer_grammar template <typename StringT> inline node * parse(lexer_grammar& g, StringT const& str) { typedef scanner<typename StringT::const_iterator, scanner_policies<> > scanner_t; typename StringT::const_iterator first = str.begin(); typename StringT::const_iterator last = str.end(); scanner_t scan(first, last); typename rule<scanner_t>::result_t hit = g.parse(scan); if (!hit || !scan.at_end()) { while (g.node_stack.size()) { delete g.node_stack.top(); g.node_stack.pop(); } return 0; } SPIRIT_ASSERT(g.node_stack.size() == 1); node* rval = g.node_stack.top(); g.node_stack.pop(); node* an_eof_node = new eof_node(); rval = new cat_node(rval, an_eof_node); return rval; } inline void make_case_insensitive(state_match_t& state_match) { // TODO: Fix this. // This doesn't take into account foreign languages, figure out how to // do that. Also this approach is broken for this implementation of // wide chars. for (state_match_t::iterator iter = state_match.begin(); iter != state_match.end(); ++iter) { int i, j; for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) { // if either is set, turn them both on (*iter)[i] = (*iter)[j] = (*iter)[i] | (*iter)[j]; } } } template<bool wide_char> struct regex_match_helper; template<> struct regex_match_helper<false> // single byte char { template <typename DfaT, typename IteratorT> static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index) { node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT p = first; IteratorT last_accepting_cpos = first; while (p != last) { s = dfa.transition_table[s][(uchar)*p]; if (s == invalid_node) break; ++p; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = p; } } if (last_accepting_index != invalid_node) { #if defined(SPIRIT_DEBUG) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n'; #endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; } }; template<> struct regex_match_helper<true> // wide char { template <typename DfaT, typename IteratorT> static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index) { typedef typename SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type char_t; node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT wp = first; IteratorT last_accepting_cpos = first; while (wp != last) { for (unsigned int i = 0; i < sizeof(char_t); ++i) { s = dfa.transition_table[s][get_byte(*wp,i)]; if (s == invalid_node) { goto break_while; } } ++wp; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = wp; } } break_while: if (last_accepting_index != invalid_node) { #if defined(SPIRIT_DEBUG) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n'; #endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; } }; template <typename DfaT, typename IteratorT, bool wide_char> struct regex_match { static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index) { return regex_match_helper<wide_char>::do_match( dfa, first, last, regex_index); } }; } // namespace lexerimpl /////////////////////////////////////////////////////////////////////////////// // template <typename IteratorT = char const*, typename TokenT = int, typename CallbackT = void(*)(IteratorT const &, IteratorT &, IteratorT const&, TokenT const&, lexer_control<TokenT> &)> class lexer { public: typedef CallbackT callback_t; typedef typename SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type char_t; struct regex_info { std::basic_string<char_t> str; TokenT token; CallbackT callback; regex_info(const std::basic_string<char_t>& _str, const TokenT& _token, const CallbackT& _callback) : str(_str) , token(_token) , callback(_callback) {} }; struct dfa_table { std::vector<std::vector<node_id_t> > transition_table; std::vector<node_id_t> acceptance_index; }; typedef std::vector<node_id_t> node_table_t; typedef std::vector<node_table_t> transition_table_t; typedef std::vector<dfa_table> dfa_t; lexer(unsigned int states = 1); void register_regex(const std::basic_string<char_t>& regex, const TokenT& id, const CallbackT& cb = CallbackT(), unsigned int state = 0); TokenT next_token(IteratorT &first, IteratorT const &last); void create_dfa(); void set_case_insensitive(bool insensitive); #if defined(SPIRIT_DEBUG) void dump(std::ostream& out); #endif typedef std::vector<std::vector<regex_info> > regex_list_t; bool load (std::ifstream &in, long unique_id = 0); bool save (std::ofstream &out, long unique_id = 0); enum { SLEX_SIGNATURE = 0x58454C53, // "SLEX" SLEX_VERSION_100 = 0x0100, // persistance version SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, SLEX_MINOR_VERSION_MASK = 0xFF }; private: void create_dfa_for_state(int state); static bool regex_match(const dfa_t& dfa, IteratorT& first, IteratorT& last, int& regex_index); mutable std::stack<lexerimpl::node*> node_stack; lexerimpl::lexer_grammar g; mutable bool m_compiled_dfa; mutable dfa_t m_dfa; regex_list_t m_regex_list; bool m_case_insensitive; unsigned int m_state; std::stack<unsigned int> m_state_stack; unsigned int m_num_states; }; template <typename IteratorT, typename TokenT, typename CallbackT> inline lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) : g(node_stack) , m_compiled_dfa(false) , m_regex_list(states) , m_case_insensitive(false) , m_state(0) , m_state_stack() , m_num_states(states) { } template <typename IteratorT, typename TokenT, typename CallbackT> inline void lexer<IteratorT, TokenT, CallbackT>::register_regex( const std::basic_string<char_t>& regex, const TokenT& id, const CallbackT& callback, unsigned int state) { if (state > m_num_states) { m_regex_list.resize(state); m_num_states = state; } m_regex_list[state].push_back(regex_info(regex, id, callback)); } template <typename IteratorT, typename TokenT, typename CallbackT> inline TokenT lexer<IteratorT, TokenT, CallbackT>::next_token( IteratorT &first, IteratorT const& last) { if (!m_compiled_dfa) { create_dfa(); } IteratorT saved = first; int regex_index; if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: do_match(m_dfa[m_state], first, last, regex_index)) return -1; // TODO: can't return -1, need to return some invalid token. // how to figure this out? We can use traits I guess. else { regex_info regex = m_regex_list[m_state][regex_index]; TokenT rval = regex.token; if (regex.callback) { // execute corresponding callback lexer_control<TokenT> controller(rval, m_state, m_state_stack); regex.callback(saved, first, last, regex.token, controller); if (controller.ignore_current_token_set()) return next_token(first, last); } return rval; } } namespace lexerimpl { inline bool find_acceptance_state(const node_set& eof_node_ids, const node_set& current_set, node_id_t& acceptance_node_id) { for(node_set::const_iterator nsi = eof_node_ids.begin(); nsi != eof_node_ids.end(); ++nsi) { node_id_t eof_node_id =*nsi; if (current_set.end() != current_set.find(eof_node_id)) { // store the first one we come to as the // matched pattern acceptance_node_id = eof_node_id; // don't bother searching for more return true; } } return false; } template <typename RegexListT, typename GrammarT> inline std::auto_ptr<node> parse_regexes(const RegexListT& regex_list, GrammarT& g) { // parse the expressions into a tree if (regex_list.begin() == regex_list.end()) throw bad_regex(); typename RegexListT::const_iterator ri = regex_list.begin(); std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); if (tree.get() == 0) throw bad_regex(); ++ri; for (/**/; ri != regex_list.end(); ++ri) { std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); if (next_tree.get() == 0) throw bad_regex(); std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release())); tree = newnode; } return tree; } } //namespace lexerimpl template <typename IteratorT, typename TokenT, typename CallbackT> inline void lexer<IteratorT, TokenT, CallbackT>::create_dfa() { m_dfa.resize(m_num_states); for (unsigned int i = 0; i < m_num_states; ++i) create_dfa_for_state(i); } // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 template <typename IteratorT, typename TokenT, typename CallbackT> inline void lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state) { using lexerimpl::node; std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); node_id_t dummy = 0; tree->assign_node_ids(dummy); #if defined(SPIRIT_DEBUG) tree->dump(std::cout); #endif // compute followpos(root) followpos_t followpos; tree->compute_followpos(followpos); // the dfa states <-> nfa state groups std::map<node_set, node_id_t> dstates1; std::map<node_id_t, node_set> dstates2; // the dfa transitions m_dfa[state].transition_table.push_back( std::vector<node_id_t>(256, invalid_node)); m_dfa[state].acceptance_index.push_back(invalid_node); // whether the dfa state has been processed yet std::vector<node_id_t> marked; // used to give a unique id to each dfa state node_id_t num_states = 0; // initially, the only unmarked state in Dstates is firstpos(root). marked.push_back(0); node_set fpr = tree->firstpos(); dstates1[fpr] = 0; dstates2[0] = fpr; state_match_t state_match; tree->compute_state_match(state_match); if (m_case_insensitive) lexerimpl::make_case_insensitive(state_match); node_set eof_node_ids; tree->get_eof_ids(eof_node_ids); // translate the eof_node_ids into a 0-based index std::map<node_id_t, node_id_t> eof_node_id_map; unsigned int x = 0; for (node_set::iterator node_id_it = eof_node_ids.begin(); node_id_it != eof_node_ids.end(); ++node_id_it) { eof_node_id_map[*node_id_it] = x++; } // figure out if this is an acceptance state node_id_t eof_node_id; if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) { m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; } std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(), node_id_t(0)); while (marked.end() != i) { *i = 1; node_id_t T = node_id_t(std::distance(marked.begin(), i)); SPIRIT_ASSERT(T < dstates2.size()); node_set Tstates = dstates2[T]; for (node_id_t j = 0; j < 256; ++j) { node_set U; for (node_set::iterator k = Tstates.begin(); k != Tstates.end(); ++k) { node_id_t p =*k; SPIRIT_ASSERT(p < state_match.size()); SPIRIT_ASSERT(j < state_match[p].size()); if (state_match[p][j]) { node_set fpp = followpos[p]; U.insert(fpp.begin(), fpp.end()); } } if (U.size() > 0) { std::map<node_set, node_id_t>::iterator l = dstates1.find(U); node_id_t target_state; if (l == dstates1.end()) // not in the states yet { ++num_states; dstates1[U] = target_state = num_states; dstates2[target_state] = U; marked.push_back(0); m_dfa[state].transition_table.push_back( std::vector<node_id_t>(256, invalid_node)); m_dfa[state].acceptance_index.push_back(invalid_node); // figure out if this is an acceptance state node_id_t eof_node_id; if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) { m_dfa[state].acceptance_index[target_state] = eof_node_id_map[eof_node_id]; } } else { target_state = dstates1[U]; } SPIRIT_ASSERT(T < m_dfa[state].transition_table.size()); SPIRIT_ASSERT(j < m_dfa[state].transition_table[T].size()); m_dfa[state].transition_table[T][j] = target_state; } } i = std::find(marked.begin(), marked.end(), node_id_t(0)); } m_compiled_dfa = true; } template <typename IteratorT, typename TokenT, typename CallbackT> inline void lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive) { m_case_insensitive = insensitive; } #if defined(SPIRIT_DEBUG) template <typename IteratorT, typename TokenT, typename CallbackT> inline void lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out) { for (unsigned x = 0; x < m_dfa.size(); ++x) { out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) { out << "state " << i << ":"; for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) { if (m_dfa[x].transition_table[i][j] != invalid_node) out << j << "->" << m_dfa[x].transition_table[i][j] << " "; } out << "\n"; } out << "acceptance states: "; for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) { if (m_dfa[x].acceptance_index[k] != invalid_node) out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; } out << endl; } } #endif /////////////////////////////////////////////////////////////////////////////// // load the lexer tables #define slex_in(strm, val) \ strm.read((char*)&val, sizeof(val)); \ if (std::ios::goodbit != strm.rdstate()) return false template <typename IteratorT, typename TokenT, typename CallbackT> inline bool lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id) { // ensure correct signature and version long signature = 0; slex_in (in, signature); if (signature != SLEX_SIGNATURE) return false; // not for us long version = 0; slex_in (in, version); if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) return false; // to new for us long uid = 0; slex_in (in, uid); if (uid != unique_id) return false; // not saved by us // load auxiliary members int num_states = 0; slex_in (in, num_states); // load the dfa tables dfa_t in_dfa; size_t dfa_size = 0; slex_in (in, dfa_size); in_dfa.resize(dfa_size); for (size_t dfa = 0; dfa < dfa_size; ++dfa) { // load the transition tables size_t tt_size = 0; transition_table_t &tt_table = in_dfa[dfa].transition_table; slex_in (in, tt_size); tt_table.resize(tt_size); for (size_t tt = 0; tt < tt_size; ++tt) { size_t nt_size = 0; node_table_t &nt_table = tt_table[tt]; slex_in (in, nt_size); nt_table.resize(nt_size); for (size_t nt = 0; nt < nt_size; ++nt) { slex_in (in, nt_table[nt]); } } // load the acceptance index table size_t ai_size = 0; node_table_t &ai_table = in_dfa[dfa].acceptance_index; slex_in (in, ai_size); ai_table.resize(ai_size); for (size_t ai = 0; ai < ai_size; ++ai) { slex_in (in, ai_table[ai]); } } m_dfa.swap(in_dfa); // success, swap in the read values m_num_states = num_states; m_compiled_dfa = true; return true; } #undef slex_in /////////////////////////////////////////////////////////////////////////////// // save the lexer tables #define slex_out(strm, val) \ strm.write((char*)&val, sizeof(val)); \ if (std::ios::goodbit != strm.rdstate()) return false template <typename IteratorT, typename TokenT, typename CallbackT> inline bool lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id) { // save signature and version information long out_long = SLEX_SIGNATURE; slex_out(out, out_long); out_long = SLEX_VERSION_100; slex_out(out, out_long); slex_out(out, unique_id); // save auxiliary members slex_out(out, m_num_states); // save the dfa tables typedef typename dfa_t::const_iterator dfa_iter_t; typedef transition_table_t::const_iterator transition_table_iter_t; typedef node_table_t::const_iterator node_table_iter_t; out_long = m_dfa.size(); slex_out(out, out_long); dfa_iter_t end = m_dfa.end(); for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) { // save the transition table out_long = (*it).transition_table.size(); slex_out(out, out_long); transition_table_iter_t tt_end = (*it).transition_table.end(); for (transition_table_iter_t tt_it = (*it).transition_table.begin(); tt_it != tt_end; ++tt_it) { out_long = (*tt_it).size(); slex_out(out, out_long); node_table_iter_t nt_end = (*tt_it).end(); for (node_table_iter_t nt_it = (*tt_it).begin(); nt_it != nt_end; ++nt_it) { slex_out(out, (*nt_it)); } } // save the acceptance index table out_long = (*it).acceptance_index.size(); slex_out(out, out_long); node_table_iter_t nt_end = (*it).acceptance_index.end(); for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); nt_it != nt_end; ++nt_it) { slex_out(out, (*nt_it)); } } return true; } #undef slex_out /* a lexer_control object supports some operations on the lexer. get current lexer state set state terminate state stack (push, pop, top) set new token return value ignore the current token yymore get length of matched token */ template <typename TokenT> class lexer_control { public: lexer_control(TokenT& token, unsigned int& current_state, std::stack<unsigned int>& state_stack); // functions dealing with the lexer state // set the state to state void set_state(unsigned int state); // get the current state unsigned int get_state(); // pushes the current state onto the top of the state stack and // switches to new_state void push_state(unsigned int new_state); // pops the top of the state stack and switches to it. void pop_state(); // returns the top of the stack without altering the stack's contents. unsigned int top_state(); // functions dealing with the token returned. // set a new TokenT return value, overriding that one that was // registered via. register_regex() void set_token(const TokenT& token); // tell the lexer to return an invalid token, signifying termination. void terminate(); // ignore the current token, and move on to the next one. The current // token will NOT be returned from next_token() void ignore_current_token(); // returns true if ignore_current_token() has been called, // false otherwise. bool ignore_current_token_set(); private: TokenT& m_token; bool m_ignore_current_token; unsigned int& m_current_state; std::stack<unsigned int>& m_state_stack; }; template <typename TokenT> inline lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state, std::stack<unsigned int>& state_stack) : m_token(token) , m_ignore_current_token(false) , m_current_state(current_state) , m_state_stack(state_stack) { } template <typename TokenT> inline void lexer_control<TokenT>::set_state(unsigned int state) { m_current_state = state; } template <typename TokenT> inline unsigned int lexer_control<TokenT>::get_state() { return m_current_state; } template <typename TokenT> inline void lexer_control<TokenT>::push_state(unsigned int new_state) { m_state_stack.push(m_current_state); m_current_state = new_state; } template <typename TokenT> inline void lexer_control<TokenT>::pop_state() { m_current_state = m_state_stack.top(); m_state_stack.pop(); } template <typename TokenT> inline unsigned int lexer_control<TokenT>::top_state() { return m_state_stack.top(); } template <typename TokenT> inline void lexer_control<TokenT>::set_token(const TokenT& token) { m_token = token; } template <typename TokenT> inline void lexer_control<TokenT>::terminate() { m_token = -1; // TOOD: fix this, need to be determined by traits } template <typename TokenT> inline void lexer_control<TokenT>::ignore_current_token() { m_ignore_current_token = true; } template <typename TokenT> inline bool lexer_control<TokenT>::ignore_current_token_set() { return m_ignore_current_token; } } // namespace spirit #undef SPIRIT_IT_NS #endif