WebSVN – Kolibri OS – Blame – /contrib/toolchain/gcc/5x/libstdc++-v3/include/bits/regex_scanner.tcc

Rev	Author	Line No.	Line
6554	serge	1	// class template regex -- C++ --
		2
		3	// Copyright (C) 2013-2015 Free Software Foundation, Inc.
		4	//
		5	// This file is part of the GNU ISO C++ Library. This library is free
		6	// software; you can redistribute it and/or modify it under the
		7	// terms of the GNU General Public License as published by the
		8	// Free Software Foundation; either version 3, or (at your option)
		9	// any later version.
		10
		11	// This library is distributed in the hope that it will be useful,
		12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
		13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		14	// GNU General Public License for more details.
		15
		16	// Under Section 7 of GPL version 3, you are granted additional
		17	// permissions described in the GCC Runtime Library Exception, version
		18	// 3.1, as published by the Free Software Foundation.
		19
		20	// You should have received a copy of the GNU General Public License and
		21	// a copy of the GCC Runtime Library Exception along with this program;
		22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
		23	// .
		24
		25	/**
		26	* @file bits/regex_scanner.tcc
		27	* This is an internal header file, included by other library headers.
		28	* Do not attempt to use it directly. @headername{regex}
		29	*/
		30
		31	// FIXME make comments doxygen format.
		32
		33	// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
		34	// and awk
		35	// 1) grep is basic except '\n' is treated as '\|'
		36	// 2) egrep is extended except '\n' is treated as '\|'
		37	// 3) awk is extended except special escaping rules, and there's no
		38	// back-reference.
		39	//
		40	// References:
		41	//
		42	// ECMAScript: ECMA-262 15.10
		43	//
		44	// basic, extended:
		45	// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
		46	//
		47	// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
		48
		49	namespace std _GLIBCXX_VISIBILITY(default)
		50	{
		51	namespace __detail
		52	{
		53	_GLIBCXX_BEGIN_NAMESPACE_VERSION
		54
		55	template
		56	_Scanner<_CharT>::
		57	_Scanner(typename _Scanner::_IterT __begin,
		58	typename _Scanner::_IterT __end,
		59	_FlagT __flags, std::locale __loc)
		60	: _ScannerBase(__flags),
		61	_M_current(__begin), _M_end(__end),
		62	_M_ctype(std::use_facet<_CtypeT>(__loc)),
		63	_M_eat_escape(_M_is_ecma()
		64	? &_Scanner::_M_eat_escape_ecma
		65	: &_Scanner::_M_eat_escape_posix)
		66	{ _M_advance(); }
		67
		68	template
		69	void
		70	_Scanner<_CharT>::
		71	_M_advance()
		72	{
		73	if (_M_current == _M_end)
		74	{
		75	_M_token = _S_token_eof;
		76	return;
		77	}
		78
		79	if (_M_state == _S_state_normal)
		80	_M_scan_normal();
		81	else if (_M_state == _S_state_in_bracket)
		82	_M_scan_in_bracket();
		83	else if (_M_state == _S_state_in_brace)
		84	_M_scan_in_brace();
		85	else
		86	{
		87	_GLIBCXX_DEBUG_ASSERT(false);
		88	}
		89	}
		90
		91	// Differences between styles:
		92	// 1) "\(", "\)", "\{" in basic. It's not escaping.
		93	// 2) "(?:", "(?=", "(?!" in ECMAScript.
		94	template
		95	void
		96	_Scanner<_CharT>::
		97	_M_scan_normal()
		98	{
		99	auto __c = *_M_current++;
		100
		101	if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
		102	{
		103	_M_token = _S_token_ord_char;
		104	_M_value.assign(1, __c);
		105	return;
		106	}
		107	if (__c == '\\')
		108	{
		109	if (_M_current == _M_end)
		110	__throw_regex_error(regex_constants::error_escape);
		111
		112	if (!_M_is_basic()
		113	\|\| (*_M_current != '('
		114	&& *_M_current != ')'
		115	&& *_M_current != '{'))
		116	{
		117	(this->*_M_eat_escape)();
		118	return;
		119	}
		120	__c = *_M_current++;
		121	}
		122	if (__c == '(')
		123	{
		124	if (_M_is_ecma() && *_M_current == '?')
		125	{
		126	if (++_M_current == _M_end)
		127	__throw_regex_error(regex_constants::error_paren);
		128
		129	if (*_M_current == ':')
		130	{
		131	++_M_current;
		132	_M_token = _S_token_subexpr_no_group_begin;
		133	}
		134	else if (*_M_current == '=')
		135	{
		136	++_M_current;
		137	_M_token = _S_token_subexpr_lookahead_begin;
		138	_M_value.assign(1, 'p');
		139	}
		140	else if (*_M_current == '!')
		141	{
		142	++_M_current;
		143	_M_token = _S_token_subexpr_lookahead_begin;
		144	_M_value.assign(1, 'n');
		145	}
		146	else
		147	__throw_regex_error(regex_constants::error_paren);
		148	}
		149	else if (_M_flags & regex_constants::nosubs)
		150	_M_token = _S_token_subexpr_no_group_begin;
		151	else
		152	_M_token = _S_token_subexpr_begin;
		153	}
		154	else if (__c == ')')
		155	_M_token = _S_token_subexpr_end;
		156	else if (__c == '[')
		157	{
		158	_M_state = _S_state_in_bracket;
		159	_M_at_bracket_start = true;
		160	if (_M_current != _M_end && *_M_current == '^')
		161	{
		162	_M_token = _S_token_bracket_neg_begin;
		163	++_M_current;
		164	}
		165	else
		166	_M_token = _S_token_bracket_begin;
		167	}
		168	else if (__c == '{')
		169	{
		170	_M_state = _S_state_in_brace;
		171	_M_token = _S_token_interval_begin;
		172	}
		173	else if (__c != ']' && __c != '}')
		174	{
		175	auto __it = _M_token_tbl;
		176	auto __narrowc = _M_ctype.narrow(__c, '\0');
		177	for (; __it->first != '\0'; ++__it)
		178	if (__it->first == __narrowc)
		179	{
		180	_M_token = __it->second;
		181	return;
		182	}
		183	_GLIBCXX_DEBUG_ASSERT(false);
		184	}
		185	else
		186	{
		187	_M_token = _S_token_ord_char;
		188	_M_value.assign(1, __c);
		189	}
		190	}
		191
		192	// Differences between styles:
		193	// 1) different semantics of "[]" and "[^]".
		194	// 2) Escaping in bracket expr.
		195	template
		196	void
		197	_Scanner<_CharT>::
		198	_M_scan_in_bracket()
		199	{
		200	if (_M_current == _M_end)
		201	__throw_regex_error(regex_constants::error_brack);
		202
		203	auto __c = *_M_current++;
		204
		205	if (__c == '[')
		206	{
		207	if (_M_current == _M_end)
		208	__throw_regex_error(regex_constants::error_brack);
		209
		210	if (*_M_current == '.')
		211	{
		212	_M_token = _S_token_collsymbol;
		213	_M_eat_class(*_M_current++);
		214	}
		215	else if (*_M_current == ':')
		216	{
		217	_M_token = _S_token_char_class_name;
		218	_M_eat_class(*_M_current++);
		219	}
		220	else if (*_M_current == '=')
		221	{
		222	_M_token = _S_token_equiv_class_name;
		223	_M_eat_class(*_M_current++);
		224	}
		225	else
		226	{
		227	_M_token = _S_token_ord_char;
		228	_M_value.assign(1, __c);
		229	}
		230	}
		231	// In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
		232	// literally. So "[]]" and "[^]]" are valid regexes. See the testcases
		233	// `*/empty_range.cc`.
		234	else if (__c == ']' && (_M_is_ecma() \|\| !_M_at_bracket_start))
		235	{
		236	_M_token = _S_token_bracket_end;
		237	_M_state = _S_state_normal;
		238	}
		239	// ECMAScript and awk permits escaping in bracket.
		240	else if (__c == '\\' && (_M_is_ecma() \|\| _M_is_awk()))
		241	(this->*_M_eat_escape)();
		242	else
		243	{
		244	_M_token = _S_token_ord_char;
		245	_M_value.assign(1, __c);
		246	}
		247	_M_at_bracket_start = false;
		248	}
		249
		250	// Differences between styles:
		251	// 1) "\}" in basic style.
		252	template
		253	void
		254	_Scanner<_CharT>::
		255	_M_scan_in_brace()
		256	{
		257	if (_M_current == _M_end)
		258	__throw_regex_error(regex_constants::error_brace);
		259
		260	auto __c = *_M_current++;
		261
		262	if (_M_ctype.is(_CtypeT::digit, __c))
		263	{
		264	_M_token = _S_token_dup_count;
		265	_M_value.assign(1, __c);
		266	while (_M_current != _M_end
		267	&& _M_ctype.is(_CtypeT::digit, *_M_current))
		268	_M_value += *_M_current++;
		269	}
		270	else if (__c == ',')
		271	_M_token = _S_token_comma;
		272	// basic use \}.
		273	else if (_M_is_basic())
		274	{
		275	if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
		276	{
		277	_M_state = _S_state_normal;
		278	_M_token = _S_token_interval_end;
		279	++_M_current;
		280	}
		281	else
		282	__throw_regex_error(regex_constants::error_badbrace);
		283	}
		284	else if (__c == '}')
		285	{
		286	_M_state = _S_state_normal;
		287	_M_token = _S_token_interval_end;
		288	}
		289	else
		290	__throw_regex_error(regex_constants::error_badbrace);
		291	}
		292
		293	template
		294	void
		295	_Scanner<_CharT>::
		296	_M_eat_escape_ecma()
		297	{
		298	if (_M_current == _M_end)
		299	__throw_regex_error(regex_constants::error_escape);
		300
		301	auto __c = *_M_current++;
		302	auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
		303
		304	if (__pos != nullptr && (__c != 'b' \|\| _M_state == _S_state_in_bracket))
		305	{
		306	_M_token = _S_token_ord_char;
		307	_M_value.assign(1, *__pos);
		308	}
		309	else if (__c == 'b')
		310	{
		311	_M_token = _S_token_word_bound;
		312	_M_value.assign(1, 'p');
		313	}
		314	else if (__c == 'B')
		315	{
		316	_M_token = _S_token_word_bound;
		317	_M_value.assign(1, 'n');
		318	}
		319	// N3376 28.13
		320	else if (__c == 'd'
		321	\|\| __c == 'D'
		322	\|\| __c == 's'
		323	\|\| __c == 'S'
		324	\|\| __c == 'w'
		325	\|\| __c == 'W')
		326	{
		327	_M_token = _S_token_quoted_class;
		328	_M_value.assign(1, __c);
		329	}
		330	else if (__c == 'c')
		331	{
		332	if (_M_current == _M_end)
		333	__throw_regex_error(regex_constants::error_escape);
		334	_M_token = _S_token_ord_char;
		335	_M_value.assign(1, *_M_current++);
		336	}
		337	else if (__c == 'x' \|\| __c == 'u')
		338	{
		339	_M_value.erase();
		340	for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
		341	{
		342	if (_M_current == _M_end
		343	\|\| !_M_ctype.is(_CtypeT::xdigit, *_M_current))
		344	__throw_regex_error(regex_constants::error_escape);
		345	_M_value += *_M_current++;
		346	}
		347	_M_token = _S_token_hex_num;
		348	}
		349	// ECMAScript recognizes multi-digit back-references.
		350	else if (_M_ctype.is(_CtypeT::digit, __c))
		351	{
		352	_M_value.assign(1, __c);
		353	while (_M_current != _M_end
		354	&& _M_ctype.is(_CtypeT::digit, *_M_current))
		355	_M_value += *_M_current++;
		356	_M_token = _S_token_backref;
		357	}
		358	else
		359	{
		360	_M_token = _S_token_ord_char;
		361	_M_value.assign(1, __c);
		362	}
		363	}
		364
		365	// Differences between styles:
		366	// 1) Extended doesn't support backref, but basic does.
		367	template
		368	void
		369	_Scanner<_CharT>::
		370	_M_eat_escape_posix()
		371	{
		372	if (_M_current == _M_end)
		373	__throw_regex_error(regex_constants::error_escape);
		374
		375	auto __c = *_M_current;
		376	auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
		377
		378	if (__pos != nullptr && *__pos != '\0')
		379	{
		380	_M_token = _S_token_ord_char;
		381	_M_value.assign(1, __c);
		382	}
		383	// We MUST judge awk before handling backrefs. There's no backref in awk.
		384	else if (_M_is_awk())
		385	{
		386	_M_eat_escape_awk();
		387	return;
		388	}
		389	else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
		390	{
		391	_M_token = _S_token_backref;
		392	_M_value.assign(1, __c);
		393	}
		394	else
		395	{
		396	#ifdef __STRICT_ANSI__
		397	// POSIX says it is undefined to escape ordinary characters
		398	__throw_regex_error(regex_constants::error_escape);
		399	#else
		400	_M_token = _S_token_ord_char;
		401	_M_value.assign(1, __c);
		402	#endif
		403	}
		404	++_M_current;
		405	}
		406
		407	template
		408	void
		409	_Scanner<_CharT>::
		410	_M_eat_escape_awk()
		411	{
		412	auto __c = *_M_current++;
		413	auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
		414
		415	if (__pos != nullptr)
		416	{
		417	_M_token = _S_token_ord_char;
		418	_M_value.assign(1, *__pos);
		419	}
		420	// \ddd for oct representation
		421	else if (_M_ctype.is(_CtypeT::digit, __c)
		422	&& __c != '8'
		423	&& __c != '9')
		424	{
		425	_M_value.assign(1, __c);
		426	for (int __i = 0;
		427	__i < 2
		428	&& _M_current != _M_end
		429	&& _M_ctype.is(_CtypeT::digit, *_M_current)
		430	&& *_M_current != '8'
		431	&& *_M_current != '9';
		432	__i++)
		433	_M_value += *_M_current++;
		434	_M_token = _S_token_oct_num;
		435	return;
		436	}
		437	else
		438	__throw_regex_error(regex_constants::error_escape);
		439	}
		440
		441	// Eats a character class or throws an exception.
		442	// __ch could be ':', '.' or '=', _M_current is the char after ']' when
		443	// returning.
		444	template
		445	void
		446	_Scanner<_CharT>::
		447	_M_eat_class(char __ch)
		448	{
		449	for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
		450	_M_value += *_M_current++;
		451	if (_M_current == _M_end
		452	\|\| *_M_current++ != __ch
		453	\|\| _M_current == _M_end // skip __ch
		454	\|\| *_M_current++ != ']') // skip ']'
		455	{
		456	if (__ch == ':')
		457	__throw_regex_error(regex_constants::error_ctype);
		458	else
		459	__throw_regex_error(regex_constants::error_collate);
		460	}
		461	}
		462
		463	#ifdef _GLIBCXX_DEBUG
		464	template
		465	std::ostream&
		466	_Scanner<_CharT>::
		467	_M_print(std::ostream& ostr)
		468	{
		469	switch (_M_token)
		470	{
		471	case _S_token_anychar:
		472	ostr << "any-character\n";
		473	break;
		474	case _S_token_backref:
		475	ostr << "backref\n";
		476	break;
		477	case _S_token_bracket_begin:
		478	ostr << "bracket-begin\n";
		479	break;
		480	case _S_token_bracket_neg_begin:
		481	ostr << "bracket-neg-begin\n";
		482	break;
		483	case _S_token_bracket_end:
		484	ostr << "bracket-end\n";
		485	break;
		486	case _S_token_char_class_name:
		487	ostr << "char-class-name \"" << _M_value << "\"\n";
		488	break;
		489	case _S_token_closure0:
		490	ostr << "closure0\n";
		491	break;
		492	case _S_token_closure1:
		493	ostr << "closure1\n";
		494	break;
		495	case _S_token_collsymbol:
		496	ostr << "collsymbol \"" << _M_value << "\"\n";
		497	break;
		498	case _S_token_comma:
		499	ostr << "comma\n";
		500	break;
		501	case _S_token_dup_count:
		502	ostr << "dup count: " << _M_value << "\n";
		503	break;
		504	case _S_token_eof:
		505	ostr << "EOF\n";
		506	break;
		507	case _S_token_equiv_class_name:
		508	ostr << "equiv-class-name \"" << _M_value << "\"\n";
		509	break;
		510	case _S_token_interval_begin:
		511	ostr << "interval begin\n";
		512	break;
		513	case _S_token_interval_end:
		514	ostr << "interval end\n";
		515	break;
		516	case _S_token_line_begin:
		517	ostr << "line begin\n";
		518	break;
		519	case _S_token_line_end:
		520	ostr << "line end\n";
		521	break;
		522	case _S_token_opt:
		523	ostr << "opt\n";
		524	break;
		525	case _S_token_or:
		526	ostr << "or\n";
		527	break;
		528	case _S_token_ord_char:
		529	ostr << "ordinary character: \"" << _M_value << "\"\n";
		530	break;
		531	case _S_token_subexpr_begin:
		532	ostr << "subexpr begin\n";
		533	break;
		534	case _S_token_subexpr_no_group_begin:
		535	ostr << "no grouping subexpr begin\n";
		536	break;
		537	case _S_token_subexpr_lookahead_begin:
		538	ostr << "lookahead subexpr begin\n";
		539	break;
		540	case _S_token_subexpr_end:
		541	ostr << "subexpr end\n";
		542	break;
		543	case _S_token_unknown:
		544	ostr << "-- unknown token --\n";
		545	break;
		546	case _S_token_oct_num:
		547	ostr << "oct number " << _M_value << "\n";
		548	break;
		549	case _S_token_hex_num:
		550	ostr << "hex number " << _M_value << "\n";
		551	break;
		552	case _S_token_quoted_class:
		553	ostr << "quoted class " << "\\" << _M_value << "\n";
		554	break;
		555	default:
		556	_GLIBCXX_DEBUG_ASSERT(false);
		557	}
		558	return ostr;
		559	}
		560	#endif
		561
		562	_GLIBCXX_END_NAMESPACE_VERSION
		563	} // namespace __detail
		564	} // namespace

Subversion Repositories Kolibri OS

(root)/contrib/toolchain/gcc/5x/libstdc++-v3/include/bits/regex_scanner.tcc – Rev 6555