Commit 7b86458e by Tim Shen Committed by Tim Shen

regex.h (regex_match<>, [...]): Change regex_executor caller.

2013-09-14  Tim Shen  <timshen91@gmail.com>

	* include/bits/regex.h (regex_match<>, regex_search<>):
	Change regex_executor caller. Now use their return value instead
	of checking __m[0].matched to find out if it's successful.
	(regex_search<>): Move the search logic to regex_executor.
	* include/bits/regex_automaton.h: Add some new _Opcode. Refactor
	_NFA::_M_insert_*.
	* include/bits/regex_automaton.tcc: Add DEBUG dump for new
	_Opcode. Refactor _NFA::_M_insert_*.
	* include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa):
	Use make_shared instead of construct by hand.
	* include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion.
	* include/bits/regex_constants.h: Fix indentation and line breaking.
	* include/bits/regex_executor.h: Add _ResultsEntry to support
	greedy/ungreedy mode. Move regex_search logic here.
	* include/bits/regex_executor.tcc: Implement assertions and
	greedy/ungreedy matching.
	* include/bits/regex_scanner.h: Add a new token _S_token_ungreedy.
	* include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy.
	* testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New.
	* testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New.
	* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc:
	Fix comment.

From-SVN: r202591
parent 492d1e0a
2013-09-14 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h (regex_match<>, regex_search<>):
Change regex_executor caller. Now use their return value instead
of checking __m[0].matched to find out if it's successful.
(regex_search<>): Move the search logic to regex_executor.
* include/bits/regex_automaton.h: Add some new _Opcode. Refactor
_NFA::_M_insert_*.
* include/bits/regex_automaton.tcc: Add DEBUG dump for new
_Opcode. Refactor _NFA::_M_insert_*.
* include/bits/regex_compiler.h (_Compiler<>::_M_get_nfa):
Use make_shared instead of construct by hand.
* include/bits/regex_compiler.tcc: Implement _Compiler<>::_M_assertion.
* include/bits/regex_constants.h: Fix indentation and line breaking.
* include/bits/regex_executor.h: Add _ResultsEntry to support
greedy/ungreedy mode. Move regex_search logic here.
* include/bits/regex_executor.tcc: Implement assertions and
greedy/ungreedy matching.
* include/bits/regex_scanner.h: Add a new token _S_token_ungreedy.
* include/bits/regex_scanner.tcc: Parse a new token _S_token_ungreedy.
* testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/greedy.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc:
Fix comment.
2013-09-13 Paolo Carlini <paolo.carlini@oracle.com>
PR libstdc++/58415
......
......@@ -2106,14 +2106,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
template<typename, typename, typename, typename>
friend class __detail::_BFSExecutor;
template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
template<typename _Bp, typename _Ap,
typename _Ch_type, typename _Rx_traits>
friend bool
regex_match(_Bp, _Bp, match_results<_Bp, _Ap>&,
const basic_regex<_Ch_type,
_Rx_traits>&,
regex_constants::match_flag_type);
template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits>
template<typename _Bp, typename _Ap,
typename _Ch_type, typename _Rx_traits>
friend bool
regex_search(_Bp, _Bp, match_results<_Bp, _Ap>&,
const basic_regex<_Ch_type,
......@@ -2213,8 +2215,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
if (__m.size() > 0 && __m[0].matched)
if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
{
for (auto __it : __m)
if (!__it.matched)
......@@ -2373,29 +2374,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
auto __cur = __first;
// Continue when __cur == __last
do
if (__detail::__get_executor(__first, __last, __m, __re, __flags)
->_M_search())
{
__detail::__get_executor(__cur, __last, __m, __re, __flags)
->_M_search_from_first();
if (__m.size() > 0 && __m[0].matched)
{
for (auto __it : __m)
if (!__it.matched)
__it.first = __it.second = __last;
__m.at(__m.size()).first = __first;
__m.at(__m.size()).second = __m[0].first;
__m.at(__m.size()+1).first = __m[0].second;
__m.at(__m.size()+1).second = __last;
__m.at(__m.size()).matched =
(__m.prefix().first != __m.prefix().second);
__m.at(__m.size()+1).matched =
(__m.suffix().first != __m.suffix().second);
return true;
}
for (auto __it : __m)
if (!__it.matched)
__it.first = __it.second = __last;
__m.at(__m.size()).first = __first;
__m.at(__m.size()).second = __m[0].first;
__m.at(__m.size()+1).first = __m[0].second;
__m.at(__m.size()+1).second = __last;
__m.at(__m.size()).matched =
(__m.prefix().first != __m.prefix().second);
__m.at(__m.size()+1).matched =
(__m.suffix().first != __m.suffix().second);
return true;
}
while (__cur++ != __last);
return false;
}
......
......@@ -51,14 +51,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
/// that represents the regular expression.
enum _Opcode
{
_S_opcode_unknown = 0,
_S_opcode_alternative = 1,
_S_opcode_backref = 2,
_S_opcode_subexpr_begin = 4,
_S_opcode_subexpr_end = 5,
_S_opcode_dummy = 6,
_S_opcode_match = 100,
_S_opcode_accept = 255
_S_opcode_unknown,
_S_opcode_alternative,
_S_opcode_backref,
_S_opcode_line_begin_assertion,
_S_opcode_line_end_assertion,
_S_opcode_word_boundry,
_S_opcode_subexpr_lookahead,
_S_opcode_subexpr_begin,
_S_opcode_subexpr_end,
_S_opcode_dummy,
_S_opcode_match,
_S_opcode_accept,
};
template<typename _CharT, typename _TraitsT>
......@@ -72,35 +76,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateIdT _M_next; // outgoing transition
union // Since they are mutually exclusive.
{
_StateIdT _M_alt; // for _S_opcode_alternative
unsigned int _M_subexpr; // for _S_opcode_subexpr_*
unsigned int _M_backref_index; // for _S_opcode_backref
struct
{
// for _S_opcode_alternative.
_StateIdT _M_quant_index;
// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
_StateIdT _M_alt;
// for _S_opcode_word_boundry or _S_opcode_subexpr_lookahead or
// quantifiers(ungreedy if set true)
bool _M_neg;
};
};
_MatcherT _M_matches; // for _S_opcode_match
_MatcherT _M_matches; // for _S_opcode_match
explicit _State(_OpcodeT __opcode)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{ }
_State(const _MatcherT& __m)
: _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id),
_M_matches(__m)
{ }
_State(_OpcodeT __opcode, unsigned __index)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{
if (__opcode == _S_opcode_subexpr_begin
|| __opcode == _S_opcode_subexpr_end)
_M_subexpr = __index;
else if (__opcode == _S_opcode_backref)
_M_backref_index = __index;
}
_State(_StateIdT __next, _StateIdT __alt)
: _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
{ }
#ifdef _GLIBCXX_DEBUG
std::ostream&
_M_print(std::ostream& ostr) const;
......@@ -141,7 +135,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_NFA(_FlagT __f)
: _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
_M_has_backref(false)
_M_has_backref(false), _M_quant_count(0)
{ }
_FlagT
......@@ -163,23 +157,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateIdT
_M_insert_accept()
{
this->push_back(_StateT(_S_opcode_accept));
_M_accepting_states.insert(this->size()-1);
return this->size()-1;
auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
_M_accepting_states.insert(__ret);
return __ret;
}
_StateIdT
_M_insert_alt(_StateIdT __next, _StateIdT __alt)
_M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg)
{
this->push_back(_StateT(__next, __alt));
return this->size()-1;
_StateT __tmp(_S_opcode_alternative);
// It labels every quantifier to make greedy comparison easier in BFS
// approach.
__tmp._M_quant_index = _M_quant_count++;
__tmp._M_next = __next;
__tmp._M_alt = __alt;
__tmp._M_neg = __neg;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_matcher(_MatcherT __m)
{
this->push_back(_StateT(__m));
return this->size()-1;
_StateT __tmp(_S_opcode_match);
__tmp._M_matches = __m;
return _M_insert_state(__tmp);
}
_StateIdT
......@@ -187,29 +188,53 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
auto __id = _M_subexpr_count++;
_M_paren_stack.push_back(__id);
this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
return this->size()-1;
_StateT __tmp(_S_opcode_subexpr_begin);
__tmp._M_subexpr = __id;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_subexpr_end()
{
this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
_StateT __tmp(_S_opcode_subexpr_end);
__tmp._M_subexpr = _M_paren_stack.back();
_M_paren_stack.pop_back();
return this->size()-1;
return _M_insert_state(__tmp);
}
_StateIdT
_M_insert_backref(unsigned int __index);
_StateIdT
_M_insert_dummy()
_M_insert_line_begin()
{ return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
// Appends a state with opcode _S_opcode_line_end_assertion and returns
// whatever _M_insert_state reports for it.
_StateIdT
_M_insert_line_end()
{
  _StateT __line_end(_S_opcode_line_end_assertion);
  return _M_insert_state(__line_end);
}
_StateIdT
_M_insert_word_bound(bool __neg)
{
this->push_back(_StateT(_S_opcode_dummy));
return this->size()-1;
_StateT __tmp(_S_opcode_word_boundry);
__tmp._M_neg = __neg;
return _M_insert_state(__tmp);
}
// Appends a _S_opcode_subexpr_lookahead state.
//   __alt  stored in the state's _M_alt field.
//   __neg  stored in the state's _M_neg field.
// Returns the value produced by _M_insert_state for the new state.
_StateIdT
_M_insert_lookahead(_StateIdT __alt, bool __neg)
{
  _StateT __lookahead(_S_opcode_subexpr_lookahead);
  __lookahead._M_neg = __neg;
  __lookahead._M_alt = __alt;
  return _M_insert_state(__lookahead);
}
// Appends a state with opcode _S_opcode_dummy and returns whatever
// _M_insert_state reports for it.
_StateIdT
_M_insert_dummy()
{
  _StateT __placeholder(_S_opcode_dummy);
  return _M_insert_state(__placeholder);
}
_StateIdT
_M_insert_state(_StateT __s)
{
this->push_back(__s);
......@@ -230,6 +255,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_FlagT _M_flags;
_StateIdT _M_start_state;
_SizeT _M_subexpr_count;
_SizeT _M_quant_count;
bool _M_has_backref;
};
......
......@@ -80,6 +80,31 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
<< __id << " -> " << _M_alt
<< " [label=\"epsilon\", tailport=\"n\"];\n";
break;
case _S_opcode_backref:
__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
break;
case _S_opcode_line_begin_assertion:
__ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_line_end_assertion:
__ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_word_boundry:
__ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
<< _M_neg << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_subexpr_lookahead:
__ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
<< __id << " -> " << _M_next
<< " [label=\"epsilon\", tailport=\"s\"];\n"
<< __id << " -> " << _M_alt
<< " [label=\"<assert>\", tailport=\"n\"];\n";
break;
case _S_opcode_subexpr_begin:
__ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
<< _M_subexpr << "\"];\n"
......@@ -90,10 +115,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_backref:
__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
case _S_opcode_dummy:
break;
case _S_opcode_match:
__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
......@@ -102,8 +124,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
case _S_opcode_accept:
__ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
break;
case _S_opcode_dummy:
break;
default:
_GLIBCXX_DEBUG_ASSERT(false);
break;
......@@ -141,8 +161,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (__index == __it)
__throw_regex_error(regex_constants::error_backref);
_M_has_backref = true;
this->push_back(_StateT(_S_opcode_backref, __index));
return this->size()-1;
_StateT __tmp(_S_opcode_backref);
__tmp._M_backref_index = __index;
return _M_insert_state(__tmp);
}
template<typename _CharT, typename _TraitsT>
......@@ -152,7 +173,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
for (auto& __it : *this)
{
while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode
== _S_opcode_dummy)
== _S_opcode_dummy)
__it._M_next = (*this)[__it._M_next]._M_next;
if (__it._M_opcode == _S_opcode_alternative)
while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode
......
......@@ -56,7 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
std::shared_ptr<_RegexT>
_M_get_nfa() const
{ return std::shared_ptr<_RegexT>(new _RegexT(_M_nfa)); }
{ return make_shared<_RegexT>(_M_nfa); }
private:
typedef _Scanner<_FwdIter> _ScannerT;
......
......@@ -96,7 +96,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__alt2._M_append(__end);
_M_stack.push(_StateSeqT(_M_nfa,
_M_nfa._M_insert_alt(__alt1._M_start,
__alt2._M_start),
__alt2._M_start, false),
__end));
}
}
......@@ -132,25 +132,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
// TODO Implement it.
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_assertion()
{
// temporary place holders.
if (_M_match_token(_ScannerT::_S_token_line_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_line_begin()));
else if (_M_match_token(_ScannerT::_S_token_line_end))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_line_end()));
else if (_M_match_token(_ScannerT::_S_token_word_bound))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
else if (_M_match_token(_ScannerT::_S_token_neg_word_bound))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
// _M_value[0] == 'n' means it's negative, say "not word boundary".
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_word_bound(_M_value[0] == 'n')));
else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
else if (_M_match_token(_ScannerT::_S_token_subexpr_neg_lookahead_begin))
_M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
{
auto __neg = _M_value[0] == 'n';
this->_M_disjunction();
if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
__throw_regex_error(regex_constants::error_paren);
auto __tmp = _M_pop();
__tmp._M_append(_M_nfa._M_insert_accept());
_M_stack.push(
_StateSeqT(
_M_nfa,
_M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
}
else
return false;
return true;
......@@ -161,40 +170,44 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_quantifier()
{
if (_M_match_token(_ScannerT::_S_token_closure0))
bool __neg = regex_constants::ECMAScript;
auto __init = [this, &__neg]()
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
};
if (_M_match_token(_ScannerT::_S_token_closure0))
{
__init();
auto __e = _M_pop();
_StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__e._M_start));
__e._M_start, __neg));
__e._M_append(__r);
_M_stack.push(__r);
}
else if (_M_match_token(_ScannerT::_S_token_closure1))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
auto __e = _M_pop();
__e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start));
__e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
__neg));
_M_stack.push(__e);
}
else if (_M_match_token(_ScannerT::_S_token_opt))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
auto __e = _M_pop();
auto __end = _M_nfa._M_insert_dummy();
_StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__e._M_start));
__e._M_start, __neg));
__e._M_append(__end);
__r._M_append(__end);
_M_stack.push(__r);
}
else if (_M_match_token(_ScannerT::_S_token_interval_begin))
{
if (_M_stack.empty())
__throw_regex_error(regex_constants::error_badrepeat);
__init();
if (!_M_match_token(_ScannerT::_S_token_dup_count))
__throw_regex_error(regex_constants::error_badbrace);
_StateSeqT __r(_M_pop());
......@@ -206,23 +219,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (_M_match_token(_ScannerT::_S_token_comma))
if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
{
int __n = _M_cur_int_value(10) - __min_rep;
if (__n < 0)
__throw_regex_error(regex_constants::error_badbrace);
auto __end = _M_nfa._M_insert_dummy();
for (int __i = 0; __i < __n; ++__i)
{
int __n = _M_cur_int_value(10) - __min_rep;
if (__n < 0)
__throw_regex_error(regex_constants::error_badbrace);
auto __end = _M_nfa._M_insert_dummy();
for (int __i = 0; __i < __n; ++__i)
{
auto __tmp = __r._M_clone();
__e._M_append(_StateSeqT(_M_nfa, _M_nfa.
_M_insert_alt(__tmp._M_start, __end), __tmp._M_end));
}
__e._M_append
(_StateSeqT(_M_nfa,
_M_nfa._M_insert_alt(__tmp._M_start,
__end, __neg),
__tmp._M_end));
}
__e._M_append(__end);
}
else // {3,}
{
auto __tmp = __r._M_clone();
_StateSeqT __s(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
__tmp._M_start));
_StateSeqT __s(_M_nfa,
_M_nfa._M_insert_alt(_S_invalid_state_id,
__tmp._M_start, __neg));
__tmp._M_append(__s);
__e._M_append(__s);
}
......
......@@ -78,87 +78,87 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* %set.
*/
enum syntax_option_type : unsigned int
{
/**
* Specifies that the matching of regular expressions against a character
* sequence shall be performed without regard to case.
*/
icase = 1 << _S_icase,
/**
* Specifies that when a regular expression is matched against a character
* container sequence, no sub-expression matches are to be stored in the
* supplied match_results structure.
*/
nosubs = 1 << _S_nosubs,
/**
* Specifies that the regular expression engine should pay more attention to
* the speed with which regular expressions are matched, and less to the
* speed with which regular expression objects are constructed. Otherwise
* it has no detectable effect on the program output.
*/
optimize = 1 << _S_optimize,
/**
* Specifies that character ranges of the form [a-b] should be locale
* sensitive.
*/
collate = 1 << _S_collate,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript
* Language Specification, Standard Ecma-262, third edition, 1999], as
* modified in section [28.13]. This grammar is similar to that defined
* in the PERL scripting language but extended with elements found in the
* POSIX regular expression grammar.
*/
ECMAScript = 1 << _S_ECMAScript,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX basic regular expressions in IEEE Std 1003.1-2001,
* Portable Operating System Interface (POSIX), Base Definitions and
* Headers, Section 9, Regular Expressions [IEEE, Information Technology --
* Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001].
*/
basic = 1 << _S_basic,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX extended regular expressions in IEEE Std 1003.1-2001,
* Portable Operating System Interface (POSIX), Base Definitions and Headers,
* Section 9, Regular Expressions.
*/
extended = 1 << _S_extended,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility awk in IEEE Std 1003.1-2001. This option is
* identical to syntax_option_type extended, except that C-style escape
* sequences are supported. These sequences are:
* \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,,
* and \\ddd (where ddd is one, two, or three octal digits).
*/
awk = 1 << _S_awk,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility grep in IEEE Std 1003.1-2001. This option is
* identical to syntax_option_type basic, except that newlines are treated
* as whitespace.
*/
grep = 1 << _S_grep,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility grep when given the -E option in
* IEEE Std 1003.1-2001. This option is identical to syntax_option_type
* extended, except that newlines are treated as whitespace.
*/
egrep = 1 << _S_egrep,
};
{
/**
* Specifies that the matching of regular expressions against a character
* sequence shall be performed without regard to case.
*/
icase = 1 << _S_icase,
/**
* Specifies that when a regular expression is matched against a character
* container sequence, no sub-expression matches are to be stored in the
* supplied match_results structure.
*/
nosubs = 1 << _S_nosubs,
/**
* Specifies that the regular expression engine should pay more attention to
* the speed with which regular expressions are matched, and less to the
* speed with which regular expression objects are constructed. Otherwise
* it has no detectable effect on the program output.
*/
optimize = 1 << _S_optimize,
/**
* Specifies that character ranges of the form [a-b] should be locale
* sensitive.
*/
collate = 1 << _S_collate,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by ECMAScript in ECMA-262 [Ecma International, ECMAScript
* Language Specification, Standard Ecma-262, third edition, 1999], as
* modified in section [28.13]. This grammar is similar to that defined
* in the PERL scripting language but extended with elements found in the
* POSIX regular expression grammar.
*/
ECMAScript = 1 << _S_ECMAScript,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX basic regular expressions in IEEE Std 1003.1-2001,
* Portable Operating System Interface (POSIX), Base Definitions and
* Headers, Section 9, Regular Expressions [IEEE, Information Technology --
* Portable Operating System Interface (POSIX), IEEE Standard 1003.1-2001].
*/
basic = 1 << _S_basic,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX extended regular expressions in IEEE Std 1003.1-2001,
* Portable Operating System Interface (POSIX), Base Definitions and
* Headers, Section 9, Regular Expressions.
*/
extended = 1 << _S_extended,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility awk in IEEE Std 1003.1-2001. This option is
* identical to syntax_option_type extended, except that C-style escape
* sequences are supported. These sequences are:
* \\\\, \\a, \\b, \\f, \\n, \\r, \\t , \\v, \\&apos,, &apos,,
* and \\ddd (where ddd is one, two, or three octal digits).
*/
awk = 1 << _S_awk,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility grep in IEEE Std 1003.1-2001. This option is
* identical to syntax_option_type basic, except that newlines are treated
* as whitespace.
*/
grep = 1 << _S_grep,
/**
* Specifies that the grammar recognized by the regular expression engine is
* that used by POSIX utility grep when given the -E option in
* IEEE Std 1003.1-2001. This option is identical to syntax_option_type
* extended, except that newlines are treated as whitespace.
*/
egrep = 1 << _S_egrep,
};
constexpr inline syntax_option_type
operator&(syntax_option_type __a, syntax_option_type __b)
......
......@@ -66,33 +66,46 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ }
// Set matched when string exactly match the pattern.
virtual void
virtual bool
_M_match() = 0;
// Set matched when some prefix of the string matches the pattern.
virtual void
_M_search_from_first() = 0;
virtual bool
_M_search() = 0;
protected:
typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT;
_Executor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
_FlagT __flags,
_SizeT __size)
: _M_current(__begin), _M_end(__end), _M_results(__results),
_M_flags(__flags)
typedef typename _TraitsT::char_class_type _ClassT;
_Executor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
_FlagT __flags,
_SizeT __size,
const _TraitsT& __traits)
: _M_current(__begin), _M_begin(__begin), _M_end(__end),
_M_results(__results), _M_flags(__flags), _M_traits(__traits)
{
__size += 2;
_M_results.resize(__size);
for (auto __i = 0; __i < __size; __i++)
for (_SizeT __i = 0; __i < __size; ++__i)
_M_results[__i].matched = false;
}
_BiIter _M_current;
_BiIter _M_end;
_ResultsVec& _M_results;
_FlagT _M_flags;
// Returns the result of asking the traits object whether __ch belongs
// to the character class it looks up under the one-character name "w".
bool
_M_is_word(_CharT __ch)
{
  static const _CharT __w = 'w';
  // The class name is passed as the half-open range [&__w, &__w + 1).
  const auto __word_class = _M_traits.lookup_classname(&__w, &__w + 1);
  return _M_traits.isctype(__ch, __word_class);
}
_BiIter _M_current;
const _BiIter _M_begin;
const _BiIter _M_end;
_ResultsVec& _M_results;
const _TraitsT& _M_traits;
_FlagT _M_flags;
};
// A _DFSExecutor perform a DFS on given NFA and input string. At the very
......@@ -126,26 +139,51 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
const _RegexT& __nfa,
const _TraitsT& __traits,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
_M_traits(__traits), _M_nfa(__nfa), _M_results_ret(this->_M_results)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
__traits),
_M_traits(__traits), _M_nfa(__nfa), _M_cur_results(this->_M_results),
_M_start_state(__nfa._M_start())
{ }
void
bool
_M_match()
{ _M_dfs<true>(_M_nfa._M_start()); }
{
this->_M_current = this->_M_begin;
return _M_dfs<true>(_M_start_state);
}
void
bool
_M_search_from_first()
{ _M_dfs<false>(_M_nfa._M_start()); }
{
this->_M_current = this->_M_begin;
return _M_dfs<false>(_M_start_state);
}
// Runs _M_dfs<false> from _M_start_state at every start position in
// [_M_begin, _M_end], in order, and returns true as soon as one attempt
// succeeds.  Returns false if no start position succeeds.
bool
_M_search()
{
  auto __cur = this->_M_begin;
  while (true)
    {
      this->_M_current = __cur;
      if (_M_dfs<false>(_M_start_state))
	return true;
      // _M_end itself is still tried as a start position (the attempt
      // above has already run for it), but it must never be incremented:
      // advancing a past-the-end iterator is undefined behaviour.
      if (__cur == this->_M_end)
	return false;
      ++__cur;
    }
}
private:
template<bool __match_mode>
bool
_M_dfs(_StateIdT __i);
_ResultsVec _M_results_ret;
// To record current solution.
_ResultsVec _M_cur_results;
const _TraitsT& _M_traits;
const _RegexT& _M_nfa;
_StateIdT _M_start_state;
};
// Like the DFS approach, it try every possible state transition; Unlike DFS,
......@@ -170,35 +208,129 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef typename _BaseT::_ResultsT _ResultsT;
typedef typename _BaseT::_ResultsVec _ResultsVec;
typedef std::unique_ptr<_ResultsVec> _ResultsPtr;
// Here's a solution for greedy/ungreedy mode in BFS approach. We need to
// carefully work out how to compare two conflicting matching states.
//
// A matching state is a pair(where, when); `where` is a NFA node; `when`
// is a _BiIter, indicating which char is the next to be matched. Two
// matching states conflict when they have equivalent `where` and
// `when`.
//
// Now since we need to drop one and keep another, because at most one of
// them could be the final optimal solution. This behavior is affected by
// greedy policy.
//
// The definition of `greedy`:
// For the sequence of quantifiers in NFA sorted by their start position,
// now maintain a vector in a matching state, with equal length to
// quantifier seq, recording repeating times of every quantifier. Now to
// compare two matching states, we just lexically compare these two
// vectors. To win the compare(to survive), one matching state needs to
// make its greedy quantifier count larger, and ungreedy quantifiers
// count smaller.
//
// In the implementation, we recorded negative numbers for greedy
// quantifiers and positive numbers for ungreedy ones. Now a simple
// operator<() for lexicographical_compare will emit the answer.
//
// When two vectors equal, it means the `where`, `when` and quantifier
// counts are identical, it indicates the same answer, so just return
// false.
// A results vector (privately inherited) bundled with one integer key
// per quantifier.  Entries are ordered by comparing their key vectors
// lexicographically; the sub_match contents take no part in the ordering.
struct _ResultsEntry
: private _BaseT::_ResultsVec
{
public:
  // __res_sz: number of sub_match slots in the underlying results vector.
  // __sz: number of quantifier keys (value-initialised, i.e. all zero).
  _ResultsEntry(unsigned int __res_sz, unsigned int __sz)
  : _BaseT::_ResultsVec(__res_sz), _M_quant_keys(__sz)
  { }

  // Mutable access to the __idx-th sub_match of the underlying vector.
  sub_match<_BiIter>&
  operator[](unsigned int __idx)
  {
    typename _BaseT::_ResultsVec& __base = *this;
    return __base[__idx];
  }

  // Lexicographical comparison of the two quantifier key vectors.
  bool
  operator<(const _ResultsEntry& __rhs) const
  {
    const std::vector<int>& __mine = _M_quant_keys;
    const std::vector<int>& __theirs = __rhs._M_quant_keys;
    _GLIBCXX_DEBUG_ASSERT(__mine.size() == __theirs.size());
    return lexicographical_compare(__mine.begin(), __mine.end(),
				   __theirs.begin(), __theirs.end());
  }

  // Adjusts the key of quantifier __idx by one: up when __neg is true,
  // down otherwise.
  void
  _M_inc(unsigned int __idx, bool __neg)
  {
    if (__neg)
      _M_quant_keys[__idx] += 1;
    else
      _M_quant_keys[__idx] += -1;
  }

  // Returns a copy of the underlying results vector (keys are dropped).
  typename _BaseT::_ResultsVec
  _M_get()
  { return *this; }

public:
  std::vector<int> _M_quant_keys;
};
typedef std::unique_ptr<_ResultsEntry> _ResultsPtr;
typedef regex_constants::match_flag_type _FlagT;
_BFSExecutor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
const _RegexT& __nfa,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
_M_nfa(__nfa)
{
if (_M_nfa._M_start() != _S_invalid_state_id)
_M_covered[_M_nfa._M_start()] =
_ResultsPtr(new _ResultsVec(this->_M_results));
_M_e_closure();
}
_BFSExecutor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
const _RegexT& __nfa,
const _TraitsT& __traits,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(),
__traits),
_M_nfa(__nfa),
_M_cur_results(nullptr),
_M_start_state(__nfa._M_start())
{ }
void
bool
_M_match()
{ _M_main_loop<true>(); }
{
_M_init(this->_M_begin);
return _M_main_loop<true>();
}
void
bool
_M_search_from_first()
{ _M_main_loop<false>(); }
{
_M_init(this->_M_begin);
return _M_main_loop<false>();
}
// Re-initialises the executor with _M_init at every start position in
// [_M_begin, _M_end], in order, runs _M_main_loop<false> for each, and
// returns true as soon as one run succeeds.  Returns false otherwise.
bool
_M_search()
{
  auto __cur = this->_M_begin;
  while (true)
    {
      _M_init(__cur);
      if (_M_main_loop<false>())
	return true;
      // _M_end itself is still tried as a start position (the run above
      // has already covered it), but it must never be incremented:
      // advancing a past-the-end iterator is undefined behaviour.
      if (__cur == this->_M_end)
	return false;
      ++__cur;
    }
}
private:
// Resets the executor so that matching starts at __cur: sets _M_current,
// empties _M_covered, seeds it with a fresh entry for _M_start_state and
// then calls _M_e_closure().
void
_M_init(_BiIter __cur)
{
  _GLIBCXX_DEBUG_ASSERT(_M_start_state != _S_invalid_state_id);
  this->_M_current = __cur;
  _M_covered.clear();
  // The seed entry has one sub_match slot per element of _M_results and
  // one quantifier key per quantifier counted by the NFA.
  _ResultsPtr __seed(new _ResultsEntry(this->_M_results.size(),
				       _M_nfa._M_quant_count));
  _M_covered[_M_start_state] = std::move(__seed);
  _M_e_closure();
}
template<bool __match_mode>
void
bool
_M_main_loop();
void
......@@ -208,13 +340,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_move();
bool
_M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
bool
_M_includes_some() const;
_M_includes_some();
std::map<_StateIdT, _ResultsPtr> _M_covered;
const _RegexT& _M_nfa;
std::map<_StateIdT, _ResultsPtr> _M_covered;
// To record global optimal solution.
_ResultsPtr _M_cur_results;
const _RegexT& _M_nfa;
_StateIdT _M_start_state;
};
//@} regex-detail
......
......@@ -69,7 +69,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_S_token_subexpr_begin,
_S_token_subexpr_no_group_begin,
_S_token_subexpr_lookahead_begin,
_S_token_subexpr_neg_lookahead_begin,
_S_token_subexpr_end,
_S_token_bracket_begin,
_S_token_bracket_neg_begin,
......@@ -84,10 +83,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_S_token_or,
_S_token_closure0,
_S_token_closure1,
_S_token_ungreedy,
_S_token_line_begin,
_S_token_line_end,
_S_token_word_bound,
_S_token_neg_word_bound,
_S_token_comma,
_S_token_dup_count,
_S_token_eof,
......
......@@ -210,11 +210,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
++_M_current;
_M_token = _S_token_subexpr_lookahead_begin;
_M_value.assign(1, 'p');
}
else if (*_M_current == '!')
{
++_M_current;
_M_token = _S_token_subexpr_neg_lookahead_begin;
_M_token = _S_token_subexpr_lookahead_begin;
_M_value.assign(1, 'n');
}
else
__throw_regex_error(regex_constants::error_paren);
......@@ -371,9 +373,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_value.assign(1, _M_escape_map.at(__c));
}
else if (__c == 'b')
_M_token = _S_token_word_bound;
{
_M_token = _S_token_word_bound;
_M_value.assign(1, 'p');
}
else if (__c == 'B')
_M_token = _S_token_neg_word_bound;
{
_M_token = _S_token_word_bound;
_M_value.assign(1, 'n');
}
// N3376 28.13
else if (__c == 'd'
|| __c == 'D'
......@@ -581,9 +589,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
case _S_token_subexpr_lookahead_begin:
ostr << "lookahead subexpr begin\n";
break;
case _S_token_subexpr_neg_lookahead_begin:
ostr << "neg lookahead subexpr begin\n";
break;
case _S_token_subexpr_end:
ostr << "subexpr end\n";
break;
......
// { dg-options "-std=gnu++11" }
// { dg-do run { xfail *-*-* } }
//
// 2013-09-14 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests ECMAScript assertion.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
void
test01()
{
bool test __attribute__((unused)) = true;
VERIFY(!regex_search("2123456", regex("^1234")));
VERIFY(regex_search("123456", regex("^1234")));
VERIFY(regex_search("123456", regex("(5|^)1234")));
VERIFY(regex_search("5123456", regex("(5|^)1234")));
VERIFY(!regex_search("1234562", regex("3456$")));
VERIFY(regex_search("123456", regex("3456$")));
VERIFY(!regex_search("123456", regex("(?=1234)56")));
VERIFY(regex_search("123456", regex("(?=1234)123456")));
VERIFY(regex_search("123456", regex("(?!1234)56")));
VERIFY(!regex_search("123456", regex("(?!1234)123456")));
VERIFY(regex_search("a-", regex("a\\b-")));
VERIFY(!regex_search("ab", regex("a\\bb")));
VERIFY(!regex_search("a-", regex("a\\B-")));
VERIFY(regex_search("ab", regex("a\\Bb")));
string s("This is a regular expression");
string sol[] =
{
"This",
"is",
"a",
"regular",
"expression",
};
regex re("\\b\\w*\\b");
int i = 0;
for (auto it = sregex_iterator(s.begin(), s.end(), re);
it != sregex_iterator() && i < 5;
++it)
{
string s((*it)[0].first, (*it)[0].second);
VERIFY(s == sol[i++]);
}
VERIFY(i == 5);
}
// Program entry point: runs the single test case and exits with status 0
// once test01() has returned.
int
main()
{
  test01();
  return 0;
}
// { dg-options "-std=gnu++11" }
//
// 2013-09-14 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests ECMAScript greedy and ungreedy quantifiers.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks greedy (*, +, ?) against ungreedy (*?, +?, ??) ECMAScript
// quantifiers by inspecting the sub-matches regex_search stores in a cmatch.
void
test01()
{
  bool test __attribute__((unused)) = true;

  cmatch m;
// Verifies that sub-match i of m participated in the match and equals s.
#define TEST(i, s) \
  VERIFY(m[(i)].matched && string(m[(i)].first, m[(i)].second) == (s))

  // Single quantifier: the greedy form takes as much as it can, the
  // ungreedy form as little as it can.
  VERIFY(regex_search("aaaa", m, regex("a*")));
  TEST(0, "aaaa");
  VERIFY(regex_search("aaaa", m, regex("a*?")));
  TEST(0, "");
  VERIFY(regex_search("aaaa", m, regex("a+")));
  TEST(0, "aaaa");
  VERIFY(regex_search("aaaa", m, regex("a+?")));
  TEST(0, "a");
  VERIFY(regex_search("a", m, regex("a?")));
  TEST(0, "a");
  VERIFY(regex_search("a", m, regex("a??")));
  TEST(0, "");
  VERIFY(regex_search("", m, regex("a??")));
  TEST(0, "");

  // Two adjacent groups, covering all four greedy/ungreedy combinations.
  VERIFY(regex_search("aaaa", m, regex("(a+)(a+)")));
  TEST(1, "aaa");
  TEST(2, "a");
  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+)")));
  TEST(1, "a");
  TEST(2, "aaa");
  // Greedy first group followed by an ungreedy one: the first group keeps
  // everything except the single character the second group requires.
  VERIFY(regex_search("aaaa", m, regex("(a+)(a+?)")));
  TEST(1, "aaa");
  TEST(2, "a");
  VERIFY(regex_search("aaaa", m, regex("(a+?)(a+?)")));
  TEST(1, "a");
  TEST(2, "a");

#undef TEST
}
// Program entry point: runs the single test case and exits with status 0
// once test01() has returned.
int
main()
{
  test01();
  return 0;
}
......@@ -21,7 +21,7 @@
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests BRE against a std::string target.
// Tests ECMAScript against a std::string target.
#include <regex>
#include <testsuite_hooks.h>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment