Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6554 serge 1
// class template regex -*- C++ -*-
2
 
3
// Copyright (C) 2013-2015 Free Software Foundation, Inc.
4
//
5
// This file is part of the GNU ISO C++ Library.  This library is free
6
// software; you can redistribute it and/or modify it under the
7
// terms of the GNU General Public License as published by the
8
// Free Software Foundation; either version 3, or (at your option)
9
// any later version.
10
 
11
// This library is distributed in the hope that it will be useful,
12
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
// GNU General Public License for more details.
15
 
16
// Under Section 7 of GPL version 3, you are granted additional
17
// permissions described in the GCC Runtime Library Exception, version
18
// 3.1, as published by the Free Software Foundation.
19
 
20
// You should have received a copy of the GNU General Public License and
21
// a copy of the GCC Runtime Library Exception along with this program;
22
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23
// .
24
 
25
/**
26
 *  @file bits/regex_compiler.tcc
27
 *  This is an internal header file, included by other library headers.
28
 *  Do not attempt to use it directly. @headername{regex}
29
 */
30
 
31
// FIXME make comments doxygen format.
32
 
33
// This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
34
// (http://swtch.com/~rsc/regexp/regexp1.html"),
35
// but doesn't strictly follow it.
36
//
37
// When compiling, states are *chained* instead of tree- or graph-constructed.
38
// It's more like structured programs: there's if statement and loop statement.
39
//
40
// For alternative structure (say "a|b"), aka "if statement", two branches
41
// should be constructed. However, these two shall merge to an "end_tag" at
42
// the end of this operator:
43
//
44
//                branch1
45
//              /        \
46
// => begin_tag            end_tag =>
47
//              \        /
48
//                branch2
49
//
50
// This is the difference between this implementation and that in Russ's
51
// article.
52
//
53
// That's why we introduced dummy node here ------ "end_tag" is a dummy node.
54
// All dummy node will be eliminated at the end of compiling process.
55
 
56
namespace std _GLIBCXX_VISIBILITY(default)
57
{
58
namespace __detail
59
{
60
_GLIBCXX_BEGIN_NAMESPACE_VERSION
61
 
62
  template
63
    _Compiler<_TraitsT>::
64
    _Compiler(_IterT __b, _IterT __e,
65
	      const typename _TraitsT::locale_type& __loc, _FlagT __flags)
66
    : _M_flags((__flags
67
		& (regex_constants::ECMAScript
68
		   | regex_constants::basic
69
		   | regex_constants::extended
70
		   | regex_constants::grep
71
		   | regex_constants::egrep
72
		   | regex_constants::awk))
73
	       ? __flags
74
	       : __flags | regex_constants::ECMAScript),
75
      _M_scanner(__b, __e, _M_flags, __loc),
76
      _M_nfa(make_shared<_RegexT>(__loc, _M_flags)),
77
      _M_traits(_M_nfa->_M_traits),
78
      _M_ctype(std::use_facet<_CtypeT>(__loc))
79
    {
80
      _StateSeqT __r(*_M_nfa, _M_nfa->_M_start());
81
      __r._M_append(_M_nfa->_M_insert_subexpr_begin());
82
      this->_M_disjunction();
83
      if (!_M_match_token(_ScannerT::_S_token_eof))
84
	__throw_regex_error(regex_constants::error_paren);
85
      __r._M_append(_M_pop());
86
      _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
87
      __r._M_append(_M_nfa->_M_insert_subexpr_end());
88
      __r._M_append(_M_nfa->_M_insert_accept());
89
      _M_nfa->_M_eliminate_dummy();
90
    }
91
 
92
  template
93
    void
94
    _Compiler<_TraitsT>::
95
    _M_disjunction()
96
    {
97
      this->_M_alternative();
98
      while (_M_match_token(_ScannerT::_S_token_or))
99
	{
100
	  _StateSeqT __alt1 = _M_pop();
101
	  this->_M_alternative();
102
	  _StateSeqT __alt2 = _M_pop();
103
	  auto __end = _M_nfa->_M_insert_dummy();
104
	  __alt1._M_append(__end);
105
	  __alt2._M_append(__end);
106
	  // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
107
	  // executes _M_alt before _M_next, as well as executing left
108
	  // alternative before right one.
109
	  _M_stack.push(_StateSeqT(*_M_nfa,
110
				   _M_nfa->_M_insert_alt(
111
				     __alt2._M_start, __alt1._M_start, false),
112
				   __end));
113
	}
114
    }
115
 
116
  template
117
    void
118
    _Compiler<_TraitsT>::
119
    _M_alternative()
120
    {
121
      if (this->_M_term())
122
	{
123
	  _StateSeqT __re = _M_pop();
124
	  this->_M_alternative();
125
	  __re._M_append(_M_pop());
126
	  _M_stack.push(__re);
127
	}
128
      else
129
	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy()));
130
    }
131
 
132
  template
133
    bool
134
    _Compiler<_TraitsT>::
135
    _M_term()
136
    {
137
      if (this->_M_assertion())
138
	return true;
139
      if (this->_M_atom())
140
	{
141
	  while (this->_M_quantifier());
142
	  return true;
143
	}
144
      return false;
145
    }
146
 
147
  template
148
    bool
149
    _Compiler<_TraitsT>::
150
    _M_assertion()
151
    {
152
      if (_M_match_token(_ScannerT::_S_token_line_begin))
153
	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin()));
154
      else if (_M_match_token(_ScannerT::_S_token_line_end))
155
	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end()));
156
      else if (_M_match_token(_ScannerT::_S_token_word_bound))
157
	// _M_value[0] == 'n' means it's negative, say "not word boundary".
158
	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
159
	      _M_insert_word_bound(_M_value[0] == 'n')));
160
      else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
161
	{
162
	  auto __neg = _M_value[0] == 'n';
163
	  this->_M_disjunction();
164
	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
165
	    __throw_regex_error(regex_constants::error_paren);
166
	  auto __tmp = _M_pop();
167
	  __tmp._M_append(_M_nfa->_M_insert_accept());
168
	  _M_stack.push(
169
	      _StateSeqT(
170
		*_M_nfa,
171
		_M_nfa->_M_insert_lookahead(__tmp._M_start, __neg)));
172
	}
173
      else
174
	return false;
175
      return true;
176
    }
177
 
178
  template
179
    bool
180
    _Compiler<_TraitsT>::
181
    _M_quantifier()
182
    {
183
      bool __neg = (_M_flags & regex_constants::ECMAScript);
184
      auto __init = [this, &__neg]()
185
	{
186
	  if (_M_stack.empty())
187
	    __throw_regex_error(regex_constants::error_badrepeat);
188
	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
189
	};
190
      if (_M_match_token(_ScannerT::_S_token_closure0))
191
	{
192
	  __init();
193
	  auto __e = _M_pop();
194
	  _StateSeqT __r(*_M_nfa,
195
			 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
196
						  __e._M_start, __neg));
197
	  __e._M_append(__r);
198
	  _M_stack.push(__r);
199
	}
200
      else if (_M_match_token(_ScannerT::_S_token_closure1))
201
	{
202
	  __init();
203
	  auto __e = _M_pop();
204
	  __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id,
205
						 __e._M_start, __neg));
206
	  _M_stack.push(__e);
207
	}
208
      else if (_M_match_token(_ScannerT::_S_token_opt))
209
	{
210
	  __init();
211
	  auto __e = _M_pop();
212
	  auto __end = _M_nfa->_M_insert_dummy();
213
	  _StateSeqT __r(*_M_nfa,
214
			 _M_nfa->_M_insert_repeat(_S_invalid_state_id,
215
						  __e._M_start, __neg));
216
	  __e._M_append(__end);
217
	  __r._M_append(__end);
218
	  _M_stack.push(__r);
219
	}
220
      else if (_M_match_token(_ScannerT::_S_token_interval_begin))
221
	{
222
	  if (_M_stack.empty())
223
	    __throw_regex_error(regex_constants::error_badrepeat);
224
	  if (!_M_match_token(_ScannerT::_S_token_dup_count))
225
	    __throw_regex_error(regex_constants::error_badbrace);
226
	  _StateSeqT __r(_M_pop());
227
	  _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy());
228
	  long __min_rep = _M_cur_int_value(10);
229
	  bool __infi = false;
230
	  long __n;
231
 
232
	  // {3
233
	  if (_M_match_token(_ScannerT::_S_token_comma))
234
	    if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
235
	      __n = _M_cur_int_value(10) - __min_rep;
236
	    else
237
	      __infi = true;
238
	  else
239
	    __n = 0;
240
	  if (!_M_match_token(_ScannerT::_S_token_interval_end))
241
	    __throw_regex_error(regex_constants::error_brace);
242
 
243
	  __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
244
 
245
	  for (long __i = 0; __i < __min_rep; ++__i)
246
	    __e._M_append(__r._M_clone());
247
 
248
	  if (__infi)
249
	    {
250
	      auto __tmp = __r._M_clone();
251
	      _StateSeqT __s(*_M_nfa,
252
			     _M_nfa->_M_insert_repeat(_S_invalid_state_id,
253
						      __tmp._M_start, __neg));
254
	      __tmp._M_append(__s);
255
	      __e._M_append(__s);
256
	    }
257
	  else
258
	    {
259
	      if (__n < 0)
260
		__throw_regex_error(regex_constants::error_badbrace);
261
	      auto __end = _M_nfa->_M_insert_dummy();
262
	      // _M_alt is the "match more" branch, and _M_next is the
263
	      // "match less" one. Switch _M_alt and _M_next of all created
264
	      // nodes. This is a hack but IMO works well.
265
	      std::stack<_StateIdT> __stack;
266
	      for (long __i = 0; __i < __n; ++__i)
267
		{
268
		  auto __tmp = __r._M_clone();
269
		  auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start,
270
							__end, __neg);
271
		  __stack.push(__alt);
272
		  __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end));
273
		}
274
	      __e._M_append(__end);
275
	      while (!__stack.empty())
276
		{
277
		  auto& __tmp = (*_M_nfa)[__stack.top()];
278
		  __stack.pop();
279
		  std::swap(__tmp._M_next, __tmp._M_alt);
280
		}
281
	    }
282
	  _M_stack.push(__e);
283
	}
284
      else
285
	return false;
286
      return true;
287
    }
288
 
289
#define __INSERT_REGEX_MATCHER(__func, args...)\
290
	do\
291
	  if (!(_M_flags & regex_constants::icase))\
292
	    if (!(_M_flags & regex_constants::collate))\
293
	      __func(args);\
294
	    else\
295
	      __func(args);\
296
	  else\
297
	    if (!(_M_flags & regex_constants::collate))\
298
	      __func(args);\
299
	    else\
300
	      __func(args);\
301
	while (false)
302
 
303
  template
304
    bool
305
    _Compiler<_TraitsT>::
306
    _M_atom()
307
    {
308
      if (_M_match_token(_ScannerT::_S_token_anychar))
309
	{
310
	  if (!(_M_flags & regex_constants::ECMAScript))
311
	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
312
	  else
313
	    __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
314
	}
315
      else if (_M_try_char())
316
	__INSERT_REGEX_MATCHER(_M_insert_char_matcher);
317
      else if (_M_match_token(_ScannerT::_S_token_backref))
318
	_M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->
319
				 _M_insert_backref(_M_cur_int_value(10))));
320
      else if (_M_match_token(_ScannerT::_S_token_quoted_class))
321
	__INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
322
      else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
323
	{
324
	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy());
325
	  this->_M_disjunction();
326
	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
327
	    __throw_regex_error(regex_constants::error_paren);
328
	  __r._M_append(_M_pop());
329
	  _M_stack.push(__r);
330
	}
331
      else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
332
	{
333
	  _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin());
334
	  this->_M_disjunction();
335
	  if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
336
	    __throw_regex_error(regex_constants::error_paren);
337
	  __r._M_append(_M_pop());
338
	  __r._M_append(_M_nfa->_M_insert_subexpr_end());
339
	  _M_stack.push(__r);
340
	}
341
      else if (!_M_bracket_expression())
342
	return false;
343
      return true;
344
    }
345
 
346
  template
347
    bool
348
    _Compiler<_TraitsT>::
349
    _M_bracket_expression()
350
    {
351
      bool __neg =
352
	_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
353
      if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
354
	return false;
355
      __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
356
      return true;
357
    }
358
#undef __INSERT_REGEX_MATCHER
359
 
360
  template
361
  template
362
    void
363
    _Compiler<_TraitsT>::
364
    _M_insert_any_matcher_ecma()
365
    {
366
      _M_stack.push(_StateSeqT(*_M_nfa,
367
	_M_nfa->_M_insert_matcher
368
	  (_AnyMatcher<_TraitsT, true, __icase, __collate>
369
	    (_M_traits))));
370
    }
371
 
372
  template
373
  template
374
    void
375
    _Compiler<_TraitsT>::
376
    _M_insert_any_matcher_posix()
377
    {
378
      _M_stack.push(_StateSeqT(*_M_nfa,
379
	_M_nfa->_M_insert_matcher
380
	  (_AnyMatcher<_TraitsT, false, __icase, __collate>
381
	    (_M_traits))));
382
    }
383
 
384
  template
385
  template
386
    void
387
    _Compiler<_TraitsT>::
388
    _M_insert_char_matcher()
389
    {
390
      _M_stack.push(_StateSeqT(*_M_nfa,
391
	_M_nfa->_M_insert_matcher
392
	  (_CharMatcher<_TraitsT, __icase, __collate>
393
	    (_M_value[0], _M_traits))));
394
    }
395
 
396
  template
397
  template
398
    void
399
    _Compiler<_TraitsT>::
400
    _M_insert_character_class_matcher()
401
    {
402
      _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
403
      _BracketMatcher<_TraitsT, __icase, __collate> __matcher
404
	(_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
405
      __matcher._M_add_character_class(_M_value, false);
406
      __matcher._M_ready();
407
      _M_stack.push(_StateSeqT(*_M_nfa,
408
	_M_nfa->_M_insert_matcher(std::move(__matcher))));
409
    }
410
 
411
  template
412
  template
413
    void
414
    _Compiler<_TraitsT>::
415
    _M_insert_bracket_matcher(bool __neg)
416
    {
417
      _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
418
      pair __last_char; // Optional<_CharT>
419
      __last_char.first = false;
420
      if (!(_M_flags & regex_constants::ECMAScript))
421
	if (_M_try_char())
422
	  {
423
	    __matcher._M_add_char(_M_value[0]);
424
	    __last_char.first = true;
425
	    __last_char.second = _M_value[0];
426
	  }
427
      while (_M_expression_term(__last_char, __matcher));
428
      __matcher._M_ready();
429
      _M_stack.push(_StateSeqT(
430
		      *_M_nfa,
431
		      _M_nfa->_M_insert_matcher(std::move(__matcher))));
432
    }
433
 
434
  template
435
  template
436
    bool
437
    _Compiler<_TraitsT>::
438
    _M_expression_term(pair& __last_char,
439
		       _BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
440
    {
441
      if (_M_match_token(_ScannerT::_S_token_bracket_end))
442
	return false;
443
 
444
      if (_M_match_token(_ScannerT::_S_token_collsymbol))
445
	{
446
	  auto __symbol = __matcher._M_add_collate_element(_M_value);
447
	  if (__symbol.size() == 1)
448
	    {
449
	      __last_char.first = true;
450
	      __last_char.second = __symbol[0];
451
	    }
452
	}
453
      else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
454
	__matcher._M_add_equivalence_class(_M_value);
455
      else if (_M_match_token(_ScannerT::_S_token_char_class_name))
456
	__matcher._M_add_character_class(_M_value, false);
457
      // POSIX doesn't allow '-' as a start-range char (say [a-z--0]),
458
      // except when the '-' is the first or last character in the bracket
459
      // expression ([--0]). ECMAScript treats all '-' after a range as a
460
      // normal character. Also see above, where _M_expression_term gets called.
461
      //
462
      // As a result, POSIX rejects [-----], but ECMAScript doesn't.
463
      // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax.
464
      // Clang (3.5) always uses ECMAScript style even in its POSIX syntax.
465
      //
466
      // It turns out that no one reads BNFs ;)
467
      else if (_M_try_char())
468
	{
469
	  if (!__last_char.first)
470
	    {
471
	      __matcher._M_add_char(_M_value[0]);
472
	      if (_M_value[0] == '-'
473
		  && !(_M_flags & regex_constants::ECMAScript))
474
		{
475
		  if (_M_match_token(_ScannerT::_S_token_bracket_end))
476
		    return false;
477
		  __throw_regex_error(regex_constants::error_range);
478
		}
479
	      __last_char.first = true;
480
	      __last_char.second = _M_value[0];
481
	    }
482
	  else
483
	    {
484
	      if (_M_value[0] == '-')
485
		{
486
		  if (_M_try_char())
487
		    {
488
		      __matcher._M_make_range(__last_char.second , _M_value[0]);
489
		      __last_char.first = false;
490
		    }
491
		  else
492
		    {
493
		      if (_M_scanner._M_get_token()
494
			  != _ScannerT::_S_token_bracket_end)
495
			__throw_regex_error(regex_constants::error_range);
496
		      __matcher._M_add_char(_M_value[0]);
497
		    }
498
		}
499
	      else
500
		{
501
		  __matcher._M_add_char(_M_value[0]);
502
		  __last_char.second = _M_value[0];
503
		}
504
	    }
505
	}
506
      else if (_M_match_token(_ScannerT::_S_token_quoted_class))
507
	__matcher._M_add_character_class(_M_value,
508
					 _M_ctype.is(_CtypeT::upper,
509
						     _M_value[0]));
510
      else
511
	__throw_regex_error(regex_constants::error_brack);
512
 
513
      return true;
514
    }
515
 
516
  template
517
    bool
518
    _Compiler<_TraitsT>::
519
    _M_try_char()
520
    {
521
      bool __is_char = false;
522
      if (_M_match_token(_ScannerT::_S_token_oct_num))
523
	{
524
	  __is_char = true;
525
	  _M_value.assign(1, _M_cur_int_value(8));
526
	}
527
      else if (_M_match_token(_ScannerT::_S_token_hex_num))
528
	{
529
	  __is_char = true;
530
	  _M_value.assign(1, _M_cur_int_value(16));
531
	}
532
      else if (_M_match_token(_ScannerT::_S_token_ord_char))
533
	__is_char = true;
534
      return __is_char;
535
    }
536
 
537
  template
538
    bool
539
    _Compiler<_TraitsT>::
540
    _M_match_token(_TokenT token)
541
    {
542
      if (token == _M_scanner._M_get_token())
543
	{
544
	  _M_value = _M_scanner._M_get_value();
545
	  _M_scanner._M_advance();
546
	  return true;
547
	}
548
      return false;
549
    }
550
 
551
  template
552
    int
553
    _Compiler<_TraitsT>::
554
    _M_cur_int_value(int __radix)
555
    {
556
      long __v = 0;
557
      for (typename _StringT::size_type __i = 0;
558
	   __i < _M_value.length(); ++__i)
559
	__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
560
      return __v;
561
    }
562
 
563
  template
564
    bool
565
    _BracketMatcher<_TraitsT, __icase, __collate>::
566
    _M_apply(_CharT __ch, false_type) const
567
    {
568
      bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(),
569
				      _M_translator._M_translate(__ch));
570
      if (!__ret)
571
	{
572
	  auto __s = _M_translator._M_transform(__ch);
573
	  for (auto& __it : _M_range_set)
574
	    if (__it.first <= __s && __s <= __it.second)
575
	      {
576
		__ret = true;
577
		break;
578
	      }
579
	  if (_M_traits.isctype(__ch, _M_class_set))
580
	    __ret = true;
581
	  else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
582
			     _M_traits.transform_primary(&__ch, &__ch+1))
583
		   != _M_equiv_set.end())
584
	    __ret = true;
585
	  else
586
	    {
587
	      for (auto& __it : _M_neg_class_set)
588
		if (!_M_traits.isctype(__ch, __it))
589
		  {
590
		    __ret = true;
591
		    break;
592
		  }
593
	    }
594
	}
595
      if (_M_is_non_matching)
596
	return !__ret;
597
      else
598
	return __ret;
599
    }
600
 
601
_GLIBCXX_END_NAMESPACE_VERSION
602
} // namespace __detail
603
} // namespace