diff options
Diffstat (limited to 'libstdc++-v3/include/bits/regex_executor.h')
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.h | 308 |
1 files changed, 236 insertions, 72 deletions
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 6d66d881584..b8e9266f910 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -28,7 +28,11 @@ * Do not attempt to use it directly. @headername{regex} */ -// TODO: convert comments to doxygen format. +// FIXME convert comments to doxygen format. + +// TODO Put _DFSExecutor and _BFSExecutor into one class. They are becoming +// much more similar. Also, make grouping seperated. The +// regex_constants::nosubs enables much more simpler execution. namespace std _GLIBCXX_VISIBILITY(default) { @@ -57,42 +61,107 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION class _Executor { public: + typedef basic_regex<_CharT, _TraitsT> _RegexT; typedef match_results<_BiIter, _Alloc> _ResultsT; typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec; typedef regex_constants::match_flag_type _FlagT; + typedef typename _TraitsT::char_class_type _ClassT; - virtual - ~_Executor() + public: + _Executor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __re, + _FlagT __flags) + : _M_begin(__begin), + _M_end(__end), + _M_results(__results), + _M_re(__re), + _M_flags(__flags) { } // Set matched when string exactly match the pattern. - virtual void - _M_match() = 0; + bool + _M_match() + { + _M_match_mode = true; + _M_init(_M_begin); + return _M_main(); + } // Set matched when some prefix of the string matches the pattern. - virtual void - _M_search_from_first() = 0; - - protected: - typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT; - _Executor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - _FlagT __flags, - _SizeT __size) - : _M_current(__begin), _M_end(__end), _M_results(__results), - _M_flags(__flags) + bool + _M_search_from_first() { - __size += 2; - _M_results.resize(__size); - for (auto __i = 0; __i < __size; __i++) - _M_results[__i].matched = false; + _M_match_mode = false; + _M_init(_M_begin); + return _M_main(); } - _BiIter _M_current; - _BiIter _M_end; - _ResultsVec& _M_results; - _FlagT _M_flags; + bool + _M_search() + { + if (_M_flags & regex_constants::match_continuous) + return _M_search_from_first(); + auto __cur = _M_begin; + do + { + _M_match_mode = false; + _M_init(__cur); + if (_M_main()) + return true; + } + // Continue when __cur == _M_end + while (__cur++ != _M_end); + return false; + } + + bool + _M_is_word(_CharT __ch) const + { + static const _CharT __s = 'w'; + return _M_re._M_traits.isctype + (__ch, _M_re._M_traits.lookup_classname(&__s, &__s+1)); + } + + bool + _M_at_begin() const + { + return _M_current == _M_begin + && !(_M_flags & (regex_constants::match_not_bol + | regex_constants::match_prev_avail)); + } + + bool + _M_at_end() const + { + return _M_current == _M_end + && !(_M_flags & regex_constants::match_not_eol); + } + + bool + _M_word_boundry(_State<_CharT, _TraitsT> __state) const; + + bool + _M_lookahead(_State<_CharT, _TraitsT> __state) const; + + public: + virtual void + _M_init(_BiIter __cur) = 0; + + virtual void + _M_set_start(_StateIdT __start) = 0; + + virtual bool + _M_main() = 0; + + _BiIter _M_current; + const _BiIter _M_begin; + const _BiIter _M_end; + const _RegexT& _M_re; + _ResultsT& _M_results; + const _FlagT _M_flags; + bool _M_match_mode; }; // A _DFSExecutor perform a DFS on given NFA and input string. At the very @@ -115,37 +184,47 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; - typedef _NFA<_CharT, _TraitsT> _RegexT; + typedef _NFA<_CharT, _TraitsT> _NFAT; + typedef typename _BaseT::_RegexT _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; typedef typename _BaseT::_ResultsVec _ResultsVec; - typedef regex_constants::match_flag_type _FlagT; + typedef typename _BaseT::_FlagT _FlagT; + public: _DFSExecutor(_BiIter __begin, _BiIter __end, _ResultsT& __results, - const _RegexT& __nfa, - const _TraitsT& __traits, + const _RegexT& __re, _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results) + : _BaseT(__begin, __end, __results, __re, __flags), + _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> + (__re._M_automaton)), + _M_start_state(_M_nfa._M_start()) { } + private: void - _M_match() - { _M_dfs<true>(_M_nfa._M_start()); } + _M_init(_BiIter __cur) + { + _M_cur_results.resize(_M_nfa._M_sub_count() + 2); + this->_M_current = __cur; + } void - _M_search_from_first() - { _M_dfs<false>(_M_nfa._M_start()); } + _M_set_start(_StateIdT __start) + { _M_start_state = __start; } - private: - template<bool __match_mode> - bool - _M_dfs(_StateIdT __i); + bool + _M_main() + { return _M_dfs(this->_M_start_state); } + + bool + _M_dfs(_StateIdT __start); - _ResultsVec _M_results_ret; - const _TraitsT& _M_traits; - const _RegexT& _M_nfa; + // To record current solution. + _ResultsVec _M_cur_results; + const _NFAT& _M_nfa; + _StateIdT _M_start_state; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, @@ -168,38 +247,114 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; - typedef _NFA<_CharT, _TraitsT> _RegexT; + typedef _NFA<_CharT, _TraitsT> _NFAT; + typedef typename _BaseT::_RegexT _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; typedef typename _BaseT::_ResultsVec _ResultsVec; - typedef std::unique_ptr<_ResultsVec> _ResultsPtr; - typedef regex_constants::match_flag_type _FlagT; - - _BFSExecutor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - const _RegexT& __nfa, - _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_nfa(__nfa) + typedef typename _BaseT::_FlagT _FlagT; + // Here's a solution for greedy/ungreedy mode in BFS approach. We need to + // carefully work out how to compare to conflict matching states. + // + // A matching state is a pair(where, when); `where` is a NFA node; `when` + // is a _BiIter, indicating which char is the next to be matched. Two + // matching states conflict if they have equivalent `where` and `when`. + // + // Now we need to drop one and keep another, because at most one of them + // could be the final optimal solution. This behavior is affected by + // greedy policy. + // + // The definition of `greedy`: + // For the sequence of quantifiers in NFA sorted by there start position, + // now maintain a vector in every matching state, with equal length to + // quantifier seq, recording repeating times of every quantifier. Now to + // compare two matching states, we just lexically compare these two + // vectors. To win the compare(to survive), one matching state needs to + // make its greedy quantifier count larger, and ungreedy quantifiers + // count smaller. + // + // In the implementation, we recorded negtive counts for greedy + // quantifiers and positive counts of ungreedy ones. Now the implicit + // operator<() for lexicographical_compare will emit the answer. + // + // When two vectors equal, it means the `where`, `when` and quantifier + // counts are identical, and indicates the same solution; so just return + // false. + struct _ResultsEntry + : private _ResultsVec { - if (_M_nfa._M_start() != _S_invalid_state_id) - _M_covered[_M_nfa._M_start()] = - _ResultsPtr(new _ResultsVec(this->_M_results)); - _M_e_closure(); - } + public: + _ResultsEntry(unsigned int __res_sz, unsigned int __sz) + : _ResultsVec(__res_sz), _M_quant_keys(__sz) + { } + + void + resize(unsigned int __n) + { _ResultsVec::resize(__n); } + + unsigned int + size() + { return _ResultsVec::size(); } + + sub_match<_BiIter>& + operator[](unsigned int __idx) + { return _ResultsVec::operator[](__idx); } + + bool + operator<(const _ResultsEntry& __rhs) const + { + _GLIBCXX_DEBUG_ASSERT(_M_quant_keys.size() + == __rhs._M_quant_keys.size()); + return lexicographical_compare(_M_quant_keys.begin(), + _M_quant_keys.end(), + __rhs._M_quant_keys.begin(), + __rhs._M_quant_keys.end()); + } + + void + _M_inc(unsigned int __idx, bool __neg) + { _M_quant_keys[__idx] += __neg ? 1 : -1; } + + _ResultsVec + _M_get() + { return *this; } + + public: + std::vector<int> _M_quant_keys; + }; + typedef std::unique_ptr<_ResultsEntry> _ResultsPtr; + + public: + _BFSExecutor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __re, + _FlagT __flags) + : _BaseT(__begin, __end, __results, __re, __flags), + _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> + (__re._M_automaton)), + _M_start_state(_M_nfa._M_start()) + { } + private: void - _M_match() - { _M_main_loop<true>(); } + _M_init(_BiIter __cur) + { + _GLIBCXX_DEBUG_ASSERT(this->_M_start_state != _S_invalid_state_id); + this->_M_current = __cur; + _M_covered.clear(); + _ResultsVec& __res(this->_M_results); + _M_covered[this->_M_start_state] = + _ResultsPtr(new _ResultsEntry(__res.size(), + _M_nfa._M_quant_count)); + _M_e_closure(); + } void - _M_search_from_first() - { _M_main_loop<false>(); } + _M_set_start(_StateIdT __start) + { _M_start_state = __start; } - private: - template<bool __match_mode> - void - _M_main_loop(); + bool + _M_main(); void _M_e_closure(); @@ -208,15 +363,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_move(); bool - _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const; + _M_includes_some(); - bool - _M_includes_some() const; - - std::map<_StateIdT, _ResultsPtr> _M_covered; - const _RegexT& _M_nfa; + std::map<_StateIdT, _ResultsPtr> _M_covered; + // To record global optimal solution. + _ResultsPtr _M_cur_results; + const _NFAT& _M_nfa; + _StateIdT _M_start_state; }; + template<typename _BiIter, typename _Alloc, + typename _CharT, typename _TraitsT> + std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>> + __get_executor(_BiIter __b, + _BiIter __e, + match_results<_BiIter, _Alloc>& __m, + const basic_regex<_CharT, _TraitsT>& __re, + regex_constants::match_flag_type __flags); + //@} regex-detail _GLIBCXX_END_NAMESPACE_VERSION } // namespace __detail |