Commit 7b86458e by Tim Shen Committed by Tim Shen

regex.h (regex_match<>, [...]): Change regex_executor caller.

2013-09-14  Tim Shen  <timshen91@gmail.com>

	* include/bits/regex.h (regex_match<>, regex_search<>):
	Change regex_executor caller. Now use their return value instead
	of checking __m[0].matched to find out if it's successful.
	(regex_search<>): Move the search logic to regex_executor.
	* include/bits/regex_automaton.h: Add some new _Opcode. Refactor
	_NFA::_M_insert_*.
	* include/bits/regex_automaton.tcc: Add DEBUG dump for new
	_Opcode. Refactor _NFA::_M_insert_*.
	* include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa):
	Use make_shared instead of construct by hand.
	* include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion.
	* include/bits/regex_constants.h: Fix indentation and line breaking.
	* include/bits/regex_executor.h: Add _ResultsEntry to support
	greedy/ungreedy mode. Move regex_search logic here.
	* include/bits/regex_executor.tcc: Implement assertions and
	greedy/ungreedy matching.
	* include/bits/regex_scanner.h: Add a new token _S_token_ungreedy.
	* include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy.
	* testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New.
	* testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New.
	* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc:
	Fix comment.

From-SVN: r202591
parent 492d1e0a
2013-09-14 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h (regex_match<>, regex_search<>):
Change regex_executor caller. Now use their return value instead
of checking __m[0].matched to find out if it's successful.
(regex_search<>): Move the search logic to regex_executor.
* include/bits/regex_automaton.h: Add some new _Opcode. Refactor
_NFA::_M_insert_*.
* include/bits/regex_automaton.tcc: Add DEBUG dump for new
_Opcode. Refactor _NFA::_M_insert_*.
* include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa):
Use make_shared instead of construct by hand.
* include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion.
* include/bits/regex_constants.h: Fix indentation and line breaking.
* include/bits/regex_executor.h: Add _ResultsEntry to support
greedy/ungreedy mode. Move regex_search logic here.
* include/bits/regex_executor.tcc: Implement assertions and
greedy/ungreedy matching.
* include/bits/regex_scanner.h: Add a new token _S_token_ungreedy.
* include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy.
* testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc:
Fix comment.
2013-09-13 Paolo Carlini <paolo.carlini@oracle.com>
PR libstdc++/58415
......
......@@ -2106,14 +2106,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename, typename, typename, typename>
friend class __detail::_BFSExecutor;
template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
template<typename _Bp, typename _Ap,
typename _Ch_type, typename _Rx_traits>
friend bool
regex_match(_Bp, _Bp, match_results<_Bp, _Ap>&,
const basic_regex<_Ch_type,
_Rx_traits>&,
regex_constants::match_flag_type);
template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
template<typename _Bp, typename _Ap,
typename _Ch_type, typename _Rx_traits>
friend bool
regex_search(_Bp, _Bp, match_results<_Bp, _Ap>&,
const basic_regex<_Ch_type,
......@@ -2213,8 +2215,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
if (__m.size() > 0 && __m[0].matched)
if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
{
for (auto __it : __m)
if (!__it.matched)
......@@ -2373,13 +2374,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
auto __cur = __first;
// Continue when __cur == __last
do
{
__detail::__get_executor(__cur, __last, __m, __re, __flags)
->_M_search_from_first();
if (__m.size() > 0 && __m[0].matched)
if (__detail::__get_executor(__first, __last, __m, __re, __flags)
->_M_search())
{
for (auto __it : __m)
if (!__it.matched)
......@@ -2394,8 +2390,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
(__m.suffix().first != __m.suffix().second);
return true;
}
}
while (__cur++ != __last);
return false;
}
......
......@@ -51,14 +51,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
/// that represents the regular expression.
enum _Opcode
{
_S_opcode_unknown = 0,
_S_opcode_alternative = 1,
_S_opcode_backref = 2,
_S_opcode_subexpr_begin = 4,
_S_opcode_subexpr_end = 5,
_S_opcode_dummy = 6,
_S_opcode_match = 100,
_S_opcode_accept = 255
_S_opcode_unknown,
_S_opcode_alternative,
_S_opcode_backref,
_S_opcode_line_begin_assertion,
_S_opcode_line_end_assertion,
_S_opcode_word_boundry,
_S_opcode_subexpr_lookahead,
_S_opcode_subexpr_begin,
_S_opcode_subexpr_end,
_S_opcode_dummy,
_S_opcode_match,
_S_opcode_accept,
};
template<typename _CharT, typename _TraitsT>
......@@ -72,9 +76,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateIdT _M_next; // outgoing transition
union // Since they are mutually exclusive.
{
_StateIdT _M_alt; // for _S_opcode_alternative
unsigned int _M_subexpr; // for _S_opcode_subexpr_*
unsigned int _M_backref_index; // for _S_opcode_backref
struct
{
// for _S_opcode_alternative.
_StateIdT _M_quant_index;
// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
_StateIdT _M_alt;
// for _S_opcode_word_boundry or _S_opcode_subexpr_lookahead or
// quantifiers(ungreedy if set true)
bool _M_neg;
};
};
_MatcherT _M_matches; // for _S_opcode_match
......@@ -82,25 +95,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{ }
_State(const _MatcherT& __m)
: _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id),
_M_matches(__m)
{ }
_State(_OpcodeT __opcode, unsigned __index)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{
if (__opcode == _S_opcode_subexpr_begin
|| __opcode == _S_opcode_subexpr_end)
_M_subexpr = __index;
else if (__opcode == _S_opcode_backref)
_M_backref_index = __index;
}
_State(_StateIdT __next, _StateIdT __alt)
: _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
{ }
#ifdef _GLIBCXX_DEBUG
std::ostream&
_M_print(std::ostream& ostr) const;
......@@ -141,7 +135,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_NFA(_FlagT __f)
: _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
_M_has_backref(false)
_M_has_backref(false), _M_quant_count(0)
{ }
_FlagT
......@@ -163,23 +157,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateIdT
_M_insert_accept()
{
this->push_back(_StateT(_S_opcode_accept));
_M_accepting_states.insert(this->size()-1);
return this->size()-1;
auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
_M_accepting_states.insert(__ret);
return __ret;
}
_StateIdT
_M_insert_alt(_StateIdT __next, _StateIdT __alt)
_M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg)
{
this->push_back(_StateT(__next, __alt));
return this->size()-1;
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
__tmp._M_quant_index = _M_quant_count++;
__tmp._M_next = __next;
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_matcher(_MatcherT __m)
{
this->push_back(_StateT(__m));
return this->size()-1;
_StateT __tmp(_S_opcode_match);
__tmp._M_matches = __m;
return _M_insert_state(__tmp);
}
_StateIdT
......@@ -187,29 +188,53 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
auto __id = _M_subexpr_count++;
_M_paren_stack.push_back(__id);
this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
return this->size()-1;
_StateT __tmp(_S_opcode_subexpr_begin);
__tmp._M_subexpr = __id;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_subexpr_end()
{
this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
_StateT __tmp(_S_opcode_subexpr_end);
__tmp._M_subexpr = _M_paren_stack.back();
_M_paren_stack.pop_back();
return this->size()-1;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_backref(unsigned int __index);
_StateIdT
_M_insert_dummy()
_M_insert_line_begin()
{ return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
_StateIdT
_M_insert_line_end()
{ return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
_StateIdT
_M_insert_word_bound(bool __neg)
{
this->push_back(_StateT(_S_opcode_dummy));
return this->size()-1;
_StateT __tmp(_S_opcode_word_boundry);
__tmp._M_neg = __neg;
return _M_insert_state(__tmp);
}
// Appends a _S_opcode_subexpr_lookahead state to the automaton.
//   __alt  stored in _M_alt; the compiler passes the start state of the
//          already-compiled lookahead sub-expression here.
//   __neg  stored in _M_neg; true for a negative lookahead.
// Returns whatever _M_insert_state returns for the new state.
_StateIdT
_M_insert_lookahead(_StateIdT __alt, bool __neg)
{
_StateT __tmp(_S_opcode_subexpr_lookahead);
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_dummy()
{ return _M_insert_state(_StateT(_S_opcode_dummy)); }
_StateIdT
_M_insert_state(_StateT __s)
{
this->push_back(__s);
......@@ -230,6 +255,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_FlagT _M_flags;
_StateIdT _M_start_state;
_SizeT _M_subexpr_count;
_SizeT _M_quant_count;
bool _M_has_backref;
};
......
......@@ -80,6 +80,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
<< __id << " -> " << _M_alt
<< " [label=\"epsilon\", tailport=\"n\"];\n";
break;
case _S_opcode_backref:
__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
break;
case _S_opcode_line_begin_assertion:
__ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_line_end_assertion:
__ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_word_boundry:
__ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
<< _M_neg << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_subexpr_lookahead:
__ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
<< __id << " -> " << _M_next
<< " [label=\"epsilon\", tailport=\"s\"];\n"
<< __id << " -> " << _M_alt
<< " [label=\"<assert>\", tailport=\"n\"];\n";
break;
case _S_opcode_subexpr_begin:
__ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
<< _M_subexpr << "\"];\n"
......@@ -90,10 +115,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_backref:
__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
case _S_opcode_dummy:
break;
case _S_opcode_match:
__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
......@@ -102,8 +124,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
case _S_opcode_accept:
__ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
break;
case _S_opcode_dummy:
break;
default:
_GLIBCXX_DEBUG_ASSERT(false);
break;
......@@ -141,8 +161,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (__index == __it)
__throw_regex_error(regex_constants::error_backref);
_M_has_backref = true;
this->push_back(_StateT(_S_opcode_backref, __index));
return this->size()-1;
_StateT __tmp(_S_opcode_backref);
__tmp._M_backref_index = __index;
return _M_insert_state(__tmp);
}
template<typename _CharT, typename _TraitsT>
......
......@@ -56,7 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
std::shared_ptr<_RegexT>
_M_get_nfa() const
{ return std::shared_ptr<_RegexT>(new _RegexT(_M_nfa)); }
{ return make_shared<_RegexT>(_M_nfa); }
private:
typedef _Scanner<_FwdIter> _ScannerT;
......
......@@ -96,7 +96,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__alt2._M_append(__end);
_M_stack.push(_StateSeqT(_M_nfa,
_M_nfa._M_insert_alt(__alt1._M_start,
__alt2._M_start),
__alt2._M_start, false),
__end));
}
}
......@@ -132,25 +132,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
// TODO Implement it.
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_assertion()
{
// temporary place holders.
if (_M_match_token(_ScannerT::_S_token_line_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_line_begin()));
else if (_M_match_token(_ScannerT::_S_token_line_end))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_line_end()));
else if (_M_match_token(_ScannerT::_S_token_word_bound))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
else if (_M_match_token(_ScannerT::_S_token_neg_word_bound))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
// _M_value[0] == 'n' means it's negative, i.e. "not a word boundary".
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_word_bound(_M_value[0] == 'n')));
else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
else if (_M_match_token(_ScannerT::_S_token_subexpr_neg_lookahead_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
{
auto __neg = _M_value[0] == 'n';
this->_M_disjunction();
if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
__throw_regex_error(regex_constants::error_paren);
auto __tmp = _M_pop();
__tmp._M_append(_M_nfa._M_insert_accept());
_M_stack.push(
_StateSeqT(
_M_nfa,
_M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
}
else
return false;
return true;
......@@ -161,40 +170,44 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_quantifier()
{
if (_M_match_token(_ScannerT::_S_token_closure0))
bool __neg = regex_constants::ECMAScript;
auto __init = [this, &__neg]()
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
};
if (_M_match_token(_ScannerT::_S_token_closure0))
{
__init();
auto __e = _M_pop();
_StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__e._M_start));
__e._M_start, __neg));
__e._M_append(__r);
_M_stack.push(__r);
}
else if (_M_match_token(_ScannerT::_S_token_closure1))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
auto __e = _M_pop();
__e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start));
__e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
__neg));
_M_stack.push(__e);
}
else if (_M_match_token(_ScannerT::_S_token_opt))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
auto __e = _M_pop();
auto __end = _M_nfa._M_insert_dummy();
_StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__e._M_start));
__e._M_start, __neg));
__e._M_append(__end);
__r._M_append(__end);
_M_stack.push(__r);
}
else if (_M_match_token(_ScannerT::_S_token_interval_begin))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
if (!_M_match_token(_ScannerT::_S_token_dup_count))
__throw_regex_error(regex_constants::error_badbrace);
_StateSeqT __r(_M_pop());
......@@ -213,16 +226,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
for (int __i = 0; __i < __n; ++__i)
{
auto __tmp = __r._M_clone();
__e._M_append(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_alt(__tmp._M_start, __end), __tmp._M_end));
__e._M_append
(_StateSeqT(_M_nfa,
_M_nfa._M_insert_alt(__tmp._M_start,
__end, __neg),
__tmp._M_end));
}
__e._M_append(__end);
}
else // {3,}
{
auto __tmp = __r._M_clone();
_StateSeqT __s(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__tmp._M_start));
_StateSeqT __s(_M_nfa,
_M_nfa._M_insert_alt(_S_invalid_state_id,
__tmp._M_start, __neg));
__tmp._M_append(__s);
__e._M_append(__s);
}
......
......@@ -128,8 +128,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX extended regular expressions in IEEE Std 1003.1-2001,
* Portable Operating System Interface (POSIX), Base Definitions and Headers,
* Section 9, Regular Expressions.
* Portable Operating System Interface (POSIX), Base Definitions and
* Headers, Section 9, Regular Expressions.
*/
extended = 1 << _S_extended,
......
......@@ -66,32 +66,45 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ }
// Set matched when the string exactly matches the pattern.
virtual void
virtual bool
_M_match() = 0;
// Set matched when some prefix of the string matches the pattern.
virtual void
_M_search_from_first() = 0;
virtual bool
_M_search() = 0;
protected:
typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT;
typedef typename _TraitsT::char_class_type _ClassT;
_Executor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
_FlagT __flags,
_SizeT __size)
: _M_current(__begin), _M_end(__end), _M_results(__results),
_M_flags(__flags)
_SizeT __size,
const _TraitsT& __traits)
: _M_current(__begin), _M_begin(__begin), _M_end(__end),
_M_results(__results), _M_flags(__flags), _M_traits(__traits)
{
__size += 2;
_M_results.resize(__size);
for (auto __i = 0; __i < __size; __i++)
for (_SizeT __i = 0; __i < __size; ++__i)
_M_results[__i].matched = false;
}
// Tells whether __ch belongs to the character class that the traits object
// returns for the one-character class name "w". The word-boundary opcode
// handling in the executors calls this on the characters around the
// current position.
bool
_M_is_word(_CharT __ch)
{
// Single-character class name; [&__s, &__s+1) is the range handed to
// lookup_classname.
static const _CharT __s = 'w';
return _M_traits.isctype(__ch,
_M_traits.lookup_classname(&__s, &__s+1));
}
_BiIter _M_current;
_BiIter _M_end;
const _BiIter _M_begin;
const _BiIter _M_end;
_ResultsVec& _M_results;
const _TraitsT& _M_traits;
_FlagT _M_flags;
};
......@@ -126,26 +139,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
const _RegexT& __nfa,
const _TraitsT& __traits,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
_M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
__traits),
_M_traits(__traits), _M_nfa(__nfa), _M_cur_results(this->_M_results),
_M_start_state(__nfa._M_start())
{ }
void
bool
_M_match()
{ _M_dfs<true>(_M_nfa._M_start()); }
{
this->_M_current = this->_M_begin;
return _M_dfs<true>(_M_start_state);
}
void
bool
_M_search_from_first()
{ _M_dfs<false>(_M_nfa._M_start()); }
{
this->_M_current = this->_M_begin;
return _M_dfs<false>(_M_start_state);
}
// Searches for a match starting at any position of the input.
//
// Runs the prefix matcher (_M_dfs<false>) from the start state once for
// every starting position in [_M_begin, _M_end], _M_end included so that
// patterns able to match the empty string can still succeed there.
// Returns true as soon as one starting position matches, false if none do.
bool
_M_search()
{
  auto __cur = this->_M_begin;
  for (;;)
    {
      this->_M_current = __cur;
      if (_M_dfs<false>(_M_start_state))
	return true;
      // Stop after the attempt at _M_end. The check comes before the
      // increment so that __cur is never advanced past the end, which
      // would be undefined for a past-the-end iterator.
      if (__cur == this->_M_end)
	return false;
      ++__cur;
    }
}
private:
template<bool __match_mode>
bool
_M_dfs(_StateIdT __i);
_ResultsVec _M_results_ret;
// To record current solution.
_ResultsVec _M_cur_results;
const _TraitsT& _M_traits;
const _RegexT& _M_nfa;
_StateIdT _M_start_state;
};
// Like the DFS approach, it tries every possible state transition; unlike DFS,
......@@ -170,35 +208,129 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef typename _BaseT::_ResultsT _ResultsT;
typedef typename _BaseT::_ResultsVec _ResultsVec;
typedef std::unique_ptr<_ResultsVec> _ResultsPtr;
// Here's a solution for greedy/ungreedy mode in the BFS approach. We need to
// carefully work out how to compare two conflicting matching states.
//
// A matching state is a pair (where, when); `where` is an NFA node; `when`
// is a _BiIter, indicating which char is the next one to be matched. Two
// matching states conflict when they have equivalent `where` and
// `when`.
//
// When that happens we need to drop one and keep the other, because at most
// one of them can be the final optimal solution. Which one is kept is
// decided by the greedy policy.
//
// The definition of `greedy`:
// Take the sequence of quantifiers in the NFA, sorted by their start
// position, and maintain in each matching state a vector of the same
// length as that sequence, recording the repeat count of every
// quantifier. To compare two matching states, we just lexicographically
// compare these two vectors. To win the comparison (to survive), a
// matching state needs to make its greedy quantifier counts larger and
// its ungreedy quantifier counts smaller.
//
// In the implementation, we record negative numbers for greedy
// quantifiers and positive numbers for ungreedy ones. Now a simple
// operator<() using lexicographical_compare will emit the answer.
//
// When the two vectors are equal, the `where`, `when` and quantifier
// counts are all identical, which indicates the same answer, so just
// return false.
// One candidate solution tracked by the BFS executor: the sub-match results
// (the privately inherited _ResultsVec) plus one signed counter per
// quantifier, used to rank conflicting candidates.
struct _ResultsEntry
: private _BaseT::_ResultsVec
{
public:
// __res_sz: number of sub_match slots in the inherited results vector.
// __sz: number of quantifier counters; each starts at zero.
_ResultsEntry(unsigned int __res_sz, unsigned int __sz)
: _BaseT::_ResultsVec(__res_sz), _M_quant_keys(__sz)
{ }
// Access to the __idx-th sub_match of the inherited results vector.
sub_match<_BiIter>&
operator[](unsigned int __idx)
{ return this->_BaseT::_ResultsVec::operator[](__idx); }
// Orders entries by lexicographic comparison of their quantifier counters
// only; the sub-match results take no part in the comparison. Both entries
// are expected to have the same number of counters (checked in debug mode).
bool
operator<(const _ResultsEntry& __rhs) const
{
_GLIBCXX_DEBUG_ASSERT(_M_quant_keys.size()
== __rhs._M_quant_keys.size());
return lexicographical_compare(_M_quant_keys.begin(),
_M_quant_keys.end(),
__rhs._M_quant_keys.begin(),
__rhs._M_quant_keys.end());
}
// Records one more repetition of quantifier __idx: the counter goes up by
// one when __neg is true and down by one otherwise, so that operator<
// can compare both kinds of quantifier the same way.
void
_M_inc(unsigned int __idx, bool __neg)
{ _M_quant_keys[__idx] += __neg ? 1 : -1; }
// Returns a copy of the inherited results vector, without the counters.
typename _BaseT::_ResultsVec
_M_get()
{ return *this; }
public:
// One counter per quantifier, indexed by the quantifier's index.
std::vector<int> _M_quant_keys;
};
typedef std::unique_ptr<_ResultsEntry> _ResultsPtr;
typedef regex_constants::match_flag_type _FlagT;
_BFSExecutor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
const _RegexT& __nfa,
const _TraitsT& __traits,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
_M_nfa(__nfa)
{
if (_M_nfa._M_start() != _S_invalid_state_id)
_M_covered[_M_nfa._M_start()] =
_ResultsPtr(new _ResultsVec(this->_M_results));
_M_e_closure();
}
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
__traits),
_M_nfa(__nfa),
_M_cur_results(nullptr),
_M_start_state(__nfa._M_start())
{ }
void
bool
_M_match()
{ _M_main_loop<true>(); }
{
_M_init(this->_M_begin);
return _M_main_loop<true>();
}
void
bool
_M_search_from_first()
{ _M_main_loop<false>(); }
{
_M_init(this->_M_begin);
return _M_main_loop<false>();
}
// Searches for a match starting at any position of the input.
//
// For every starting position in [_M_begin, _M_end] (_M_end included, so
// that patterns able to match the empty string can still succeed there)
// the executor state is reset with _M_init and the prefix matcher
// (_M_main_loop<false>) is run. Returns true as soon as one starting
// position matches, false if none do.
bool
_M_search()
{
  auto __cur = this->_M_begin;
  for (;;)
    {
      _M_init(__cur);
      if (_M_main_loop<false>())
	return true;
      // Stop after the attempt at _M_end. The check comes before the
      // increment so that __cur is never advanced past the end, which
      // would be undefined for a past-the-end iterator.
      if (__cur == this->_M_end)
	return false;
      ++__cur;
    }
}
private:
template<bool __match_mode>
void
_M_init(_BiIter __cur)
{
_GLIBCXX_DEBUG_ASSERT(_M_start_state != _S_invalid_state_id);
this->_M_current = __cur;
_M_covered.clear();
_M_covered[_M_start_state] =
_ResultsPtr(new _ResultsEntry(this->_M_results.size(),
_M_nfa._M_quant_count));
_M_e_closure();
}
template<bool __match_mode>
bool
_M_main_loop();
void
......@@ -208,13 +340,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_move();
bool
_M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
bool
_M_includes_some() const;
_M_includes_some();
std::map<_StateIdT, _ResultsPtr> _M_covered;
// To record global optimal solution.
_ResultsPtr _M_cur_results;
const _RegexT& _M_nfa;
_StateIdT _M_start_state;
};
//@} regex-detail
......
......@@ -44,18 +44,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// This is not that certain. Needs deeper investigation.
return false;
auto& __current = this->_M_current;
auto& __begin = this->_M_begin;
auto& __end = this->_M_end;
auto& __results = _M_results_ret;
auto& __results = _M_cur_results;
const auto& __state = _M_nfa[__i];
bool __ret = false;
switch (__state._M_opcode)
{
case _S_opcode_alternative:
// Greedy mode by default. For non-greedy mode,
// swap _M_alt and _M_next.
// TODO: Add greedy mode option.
// Greedy or not, this is a question ;)
if (!__state._M_neg)
__ret = _M_dfs<__match_mode>(__state._M_alt)
|| _M_dfs<__match_mode>(__state._M_next);
else
__ret = _M_dfs<__match_mode>(__state._M_next)
|| _M_dfs<__match_mode>(__state._M_alt);
break;
case _S_opcode_subexpr_begin:
// Here's the critical part: if there's nothing changed since last
......@@ -86,6 +89,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
else
__ret = _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_line_begin_assertion:
if (__current == __begin)
__ret = _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_line_end_assertion:
if (__current == __end)
__ret = _M_dfs<__match_mode>(__state._M_next);
break;
// By definition.
case _S_opcode_word_boundry:
{
bool __ans = false;
if (__current == __begin && this->_M_is_word(*__current))
__ans = true;
else if (__current == __end && this->_M_is_word(*__current))
__ans = true;
else
{
auto __pre = __current;
--__pre;
if (this->_M_is_word(*__current)
!= this->_M_is_word(*__pre))
__ans = true;
}
if (__ans == !__state._M_neg)
__ret = _M_dfs<__match_mode>(__state._M_next);
}
break;
// Here __state._M_alt offers a single start node for a sub-NFA.
// We recursively invoke our algorithm to match the sub-NFA.
case _S_opcode_subexpr_lookahead:
{
_ResultsT __m;
// FIXME Here's not necessarily a DFSExecutor. But we need to
// refactor the whole NFA to a recursive tree structure first.
_DFSExecutor __sub(this->_M_current,
this->_M_end,
__m,
this->_M_nfa,
this->_M_traits,
this->_M_flags);
__sub._M_start_state = __state._M_alt;
if (__sub._M_search_from_first() == !__state._M_neg)
__ret = _M_dfs<__match_mode>(__state._M_next);
}
break;
case _S_opcode_match:
if (__current != __end && __state._M_matches(*__current))
{
......@@ -138,19 +187,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
template<bool __match_mode>
void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
_M_main_loop()
{
bool __ret = false;
while (this->_M_current != this->_M_end)
{
if (!__match_mode)
if (_M_includes_some())
return;
// To keep regex_search greedy, no "return true" here.
__ret = _M_includes_some() || __ret;
_M_move();
++this->_M_current;
_M_e_closure();
}
_M_includes_some();
__ret = _M_includes_some() || __ret;
if (__ret)
this->_M_results = _M_cur_results->_M_get();
return __ret;
}
template<typename _BiIter, typename _Alloc,
......@@ -161,6 +214,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto& __current = this->_M_current;
std::queue<_StateIdT> __q;
std::vector<bool> __in_q(_M_nfa.size(), false);
auto& __begin = this->_M_begin;
auto& __end = this->_M_end;
for (auto& __it : _M_covered)
{
__in_q[__it.first] = true;
......@@ -173,18 +229,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__in_q[__u] = false;
const auto& __state = _M_nfa[__u];
// Can be implemented using method, but there're too much arguments.
// I would use macro function before C++11, but lambda is a better
// choice, since hopefully compiler can inline it.
// Can be implemented using method, but there will be too many
// arguments. I would use macro function before C++11, but lambda is
// a better choice, since hopefully compiler can inline it.
auto __add_visited_state = [&](_StateIdT __v)
{
if (__v == _S_invalid_state_id)
return;
if (_M_covered.count(__u) != 0
&& (_M_covered.count(__v) == 0
|| _M_match_less_than(*_M_covered[__u], *_M_covered[__v])))
|| *_M_covered[__u] < *_M_covered[__v]))
{
_M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u]));
_M_covered[__v] =
_ResultsPtr(new _ResultsEntry(*_M_covered[__u]));
// If a state is updated, its outgoing neighbors should be
// reconsidered too. Push them to the queue.
if (!__in_q[__v])
......@@ -195,19 +252,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
};
// Identical to DFS's switch part.
switch (__state._M_opcode)
{
// Needs to maintain the quantifier count vector here. A quantifier
// must be associated with an alt node.
case _S_opcode_alternative:
{
__add_visited_state(__state._M_next);
auto __back =
_M_covered[__u]->_M_quant_keys[__state._M_quant_index];
_M_covered[__u]->_M_inc(__state._M_quant_index,
__state._M_neg);
__add_visited_state(__state._M_alt);
_M_covered[__u]->_M_quant_keys[__state._M_quant_index]
= __back;
}
break;
case _S_opcode_subexpr_begin:
{
auto& __cu = *_M_covered[__u];
auto __back = __cu[__state._M_subexpr].first;
__cu[__state._M_subexpr].first = __current;
auto& __sub = (*_M_covered[__u])[__state._M_subexpr];
if (!__sub.matched || __sub.first != __current)
{
auto __back = __sub.first;
__sub.first = __current;
__add_visited_state(__state._M_next);
__cu[__state._M_subexpr].first = __back;
__sub.first = __back;
}
}
break;
case _S_opcode_subexpr_end:
......@@ -220,10 +291,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__cu[__state._M_subexpr] = __back;
}
break;
case _S_opcode_line_begin_assertion:
if (__current == __begin)
__add_visited_state(__state._M_next);
break;
case _S_opcode_line_end_assertion:
if (__current == __end)
__add_visited_state(__state._M_next);
break;
case _S_opcode_word_boundry:
{
bool __ans = false;
if (__current == __begin && this->_M_is_word(*__current))
__ans = true;
else if (__current == __end && this->_M_is_word(*__current))
__ans = true;
else
{
auto __pre = __current;
--__pre;
if (this->_M_is_word(*__current)
!= this->_M_is_word(*__pre))
__ans = true;
}
if (__ans == !__state._M_neg)
__add_visited_state(__state._M_next);
}
break;
case _S_opcode_subexpr_lookahead:
{
_ResultsT __m;
// Same comment as in DFS.
_BFSExecutor __sub(this->_M_current,
this->_M_end,
__m,
this->_M_nfa,
this->_M_traits,
this->_M_flags);
__sub._M_start_state = __state._M_alt;
if (__sub._M_search_from_first() == !__state._M_neg)
__add_visited_state(__state._M_next);
}
break;
case _S_opcode_match:
break;
case _S_opcode_accept:
__add_visited_state(__state._M_next);
break;
default:
_GLIBCXX_DEBUG_ASSERT(false);
......@@ -244,7 +356,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
&& __state._M_matches(*this->_M_current))
if (__state._M_next != _S_invalid_state_id)
if (__next.count(__state._M_next) == 0
|| _M_match_less_than(*__it.second, *__next[__state._M_next]))
|| *__it.second < *__next[__state._M_next])
__next[__state._M_next] = move(__it.second);
}
_M_covered = move(__next);
......@@ -253,37 +365,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
_M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const
{
// TODO: Greedy and Non-greedy support
_GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size());
auto __size = __u.size();
for (auto __i = 0; __i < __size; __i++)
{
auto __uit = __u[__i], __vit = __v[__i];
if (__uit.matched && !__vit.matched)
return true;
if (!__uit.matched && __vit.matched)
return false;
if (__uit.matched && __vit.matched)
{
// GREEDY
if (__uit.first != __vit.first)
return __uit.first < __vit.first;
if (__uit.second != __vit.second)
return __uit.second > __vit.second;
}
}
return false;
}
template<typename _BiIter, typename _Alloc,
typename _CharT, typename _TraitsT>
bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
_M_includes_some() const
_M_includes_some()
{
auto& __s = _M_nfa._M_final_states();
auto& __t = _M_covered;
bool __succ = false;
if (__s.size() > 0 && __t.size() > 0)
{
auto __first = __s.begin();
......@@ -292,16 +378,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (*__first < __second->first)
++__first;
else if (__second->first < *__first)
else if (*__first > __second->first)
++__second;
else
{
this->_M_results = *__second->second;
return true;
if (_M_cur_results == nullptr
|| *__second->second < *_M_cur_results)
_M_cur_results =
_ResultsPtr(new _ResultsEntry(*__second->second));
__succ = true;
++__first;
++__second;
}
}
}
return false;
return __succ;
}
template<typename _BiIter, typename _Alloc,
......@@ -322,7 +413,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (__p->_M_has_backref)
return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p,
__re._M_traits, __flags));
return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags));
return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p,
__re._M_traits, __flags));
}
_GLIBCXX_END_NAMESPACE_VERSION
......
......@@ -69,7 +69,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_S_token_subexpr_begin,
_S_token_subexpr_no_group_begin,
_S_token_subexpr_lookahead_begin,
_S_token_subexpr_neg_lookahead_begin,
_S_token_subexpr_end,
_S_token_bracket_begin,
_S_token_bracket_neg_begin,
......@@ -84,10 +83,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_S_token_or,
_S_token_closure0,
_S_token_closure1,
_S_token_ungreedy,
_S_token_line_begin,
_S_token_line_end,
_S_token_word_bound,
_S_token_neg_word_bound,
_S_token_comma,
_S_token_dup_count,
_S_token_eof,
......
......@@ -210,11 +210,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
++_M_current;
_M_token = _S_token_subexpr_lookahead_begin;
_M_value.assign(1, 'p');
}
else if (*_M_current == '!')
{
++_M_current;
_M_token = _S_token_subexpr_neg_lookahead_begin;
_M_token = _S_token_subexpr_lookahead_begin;
_M_value.assign(1, 'n');
}
else
__throw_regex_error(regex_constants::error_paren);
......@@ -371,9 +373,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_value.assign(1, _M_escape_map.at(__c));
}
else if (__c == 'b')
{
_M_token = _S_token_word_bound;
_M_value.assign(1, 'p');
}
else if (__c == 'B')
_M_token = _S_token_neg_word_bound;
{
_M_token = _S_token_word_bound;
_M_value.assign(1, 'n');
}
// N3376 28.13
else if (__c == 'd'
|| __c == 'D'
......@@ -581,9 +589,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
case _S_token_subexpr_lookahead_begin:
ostr << "lookahead subexpr begin\n";
break;
case _S_token_subexpr_neg_lookahead_begin:
ostr << "neg lookahead subexpr begin\n";
break;
case _S_token_subexpr_end:
ostr << "subexpr end\n";
break;
......
// { dg-options "-std=gnu++11" }
// { dg-do run { xfail *-*-* } }
//
// 2013-09-14 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests ECMAScript assertion.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
void
test01()
{
bool test __attribute__((unused)) = true;
VERIFY(!regex_search("2123456", regex("^1234")));
VERIFY(regex_search("123456", regex("^1234")));
VERIFY(regex_search("123456", regex("(5|^)1234")));
VERIFY(regex_search("5123456", regex("(5|^)1234")));
VERIFY(!regex_search("1234562", regex("3456$")));
VERIFY(regex_search("123456", regex("3456$")));
VERIFY(!regex_search("123456", regex("(?=1234)56")));
VERIFY(regex_search("123456", regex("(?=1234)123456")));
VERIFY(regex_search("123456", regex("(?!1234)56")));
VERIFY(!regex_search("123456", regex("(?!1234)123456")));
VERIFY(regex_search("a-", regex("a\\b-")));
VERIFY(!regex_search("ab", regex("a\\bb")));
VERIFY(!regex_search("a-", regex("a\\B-")));
VERIFY(regex_search("ab", regex("a\\Bb")));
string s("This is a regular expression");
string sol[] =
{
"This",
"is",
"a",
"regular",
"expression",
};
regex re("\\b\\w*\\b");
int i = 0;
for (auto it = sregex_iterator(s.begin(), s.end(), re);
it != sregex_iterator() && i < 5;
++it)
{
string s((*it)[0].first, (*it)[0].second);
VERIFY(s == sol[i++]);
}
VERIFY(i == 5);
}
// Test entry point: runs test01().  Falling off the end of main is
// equivalent to returning 0.
int
main()
{
  test01();
}
// { dg-options "-std=gnu++11" }
//
// 2013-09-14 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests ECMAScript greedy and ungreedy quantifiers.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks greedy (*, +, ?) against non-greedy (*?, +?, ??) ECMAScript
// quantifiers by inspecting the sub-matches that regex_search reports.
void
test01()
{
  bool test __attribute__((unused)) = true;

  cmatch m;
// Sub-match i must have participated in the match and must equal s.
#define TEST(i, s) \
  VERIFY(m[(i)].matched && string(m[(i)].first, m[(i)].second) == (s))

  // A single quantifier: greedy takes as much as possible, non-greedy
  // as little as possible.
  VERIFY(regex_search("aaaa", m, regex("a*")));
  TEST(0, "aaaa");
  VERIFY(regex_search("aaaa", m, regex("a*?")));
  TEST(0, "");
  VERIFY(regex_search("aaaa", m, regex("a+")));
  TEST(0, "aaaa");
  VERIFY(regex_search("aaaa", m, regex("a+?")));
  TEST(0, "a");
  VERIFY(regex_search("a", m, regex("a?")));
  TEST(0, "a");
  VERIFY(regex_search("a", m, regex("a??")));
  TEST(0, "");
  VERIFY(regex_search("", m, regex("a??")));
  TEST(0, "");

  // Two adjacent groups, covering all four greedy/non-greedy pairings.
  VERIFY(regex_search("aaaa", m, regex("(a+)(a+)")));
  TEST(1, "aaa");
  TEST(2, "a");
  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)")));
  TEST(1, "a");
  TEST(2, "aaa");
  // Greedy first group gives back exactly one 'a', which is all the
  // non-greedy second group needs.
  VERIFY(regex_search("aaaa", m, regex("(a+)(a+?)")));
  TEST(1, "aaa");
  TEST(2, "a");
  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+?)")));
  TEST(1, "a");
  TEST(2, "a");

#undef TEST
}
// Test entry point: runs test01().  Falling off the end of main is
// equivalent to returning 0.
int
main()
{
  test01();
}
......@@ -21,7 +21,7 @@
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests BRE against a std::string target.
// Tests ECMAScript against a std::string target.
#include <regex>
#include <testsuite_hooks.h>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment