Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6554 serge 1
// class template regex -*- C++ -*-
2
 
3
// Copyright (C) 2013-2015 Free Software Foundation, Inc.
4
//
5
// This file is part of the GNU ISO C++ Library.  This library is free
6
// software; you can redistribute it and/or modify it under the
7
// terms of the GNU General Public License as published by the
8
// Free Software Foundation; either version 3, or (at your option)
9
// any later version.
10
 
11
// This library is distributed in the hope that it will be useful,
12
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
// GNU General Public License for more details.
15
 
16
// Under Section 7 of GPL version 3, you are granted additional
17
// permissions described in the GCC Runtime Library Exception, version
18
// 3.1, as published by the Free Software Foundation.
19
 
20
// You should have received a copy of the GNU General Public License and
21
// a copy of the GCC Runtime Library Exception along with this program;
22
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23
// .
24
 
25
/**
26
 *  @file bits/regex_scanner.tcc
27
 *  This is an internal header file, included by other library headers.
28
 *  Do not attempt to use it directly. @headername{regex}
29
 */
30
 
31
// FIXME make comments doxygen format.
32
 
33
// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34
// and awk
35
// 1) grep is basic except '\n' is treated as '|'
36
// 2) egrep is extended except '\n' is treated as '|'
37
// 3) awk is extended except special escaping rules, and there's no
38
//    back-reference.
39
//
40
// References:
41
//
42
// ECMAScript: ECMA-262 15.10
43
//
44
// basic, extended:
45
// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46
//
47
// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48
 
49
namespace std _GLIBCXX_VISIBILITY(default)
50
{
51
namespace __detail
52
{
53
_GLIBCXX_BEGIN_NAMESPACE_VERSION
54
 
55
  template
56
    _Scanner<_CharT>::
57
    _Scanner(typename _Scanner::_IterT __begin,
58
	     typename _Scanner::_IterT __end,
59
	     _FlagT __flags, std::locale __loc)
60
    : _ScannerBase(__flags),
61
      _M_current(__begin), _M_end(__end),
62
      _M_ctype(std::use_facet<_CtypeT>(__loc)),
63
      _M_eat_escape(_M_is_ecma()
64
		    ? &_Scanner::_M_eat_escape_ecma
65
		    : &_Scanner::_M_eat_escape_posix)
66
    { _M_advance(); }
67
 
68
  template
69
    void
70
    _Scanner<_CharT>::
71
    _M_advance()
72
    {
73
      if (_M_current == _M_end)
74
	{
75
	  _M_token = _S_token_eof;
76
	  return;
77
	}
78
 
79
      if (_M_state == _S_state_normal)
80
	_M_scan_normal();
81
      else if (_M_state == _S_state_in_bracket)
82
	_M_scan_in_bracket();
83
      else if (_M_state == _S_state_in_brace)
84
	_M_scan_in_brace();
85
      else
86
	{
87
	  _GLIBCXX_DEBUG_ASSERT(false);
88
	}
89
    }
90
 
91
  // Differences between styles:
92
  // 1) "\(", "\)", "\{" in basic. It's not escaping.
93
  // 2) "(?:", "(?=", "(?!" in ECMAScript.
94
  template
95
    void
96
    _Scanner<_CharT>::
97
    _M_scan_normal()
98
    {
99
      auto __c = *_M_current++;
100
 
101
      if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
102
	{
103
	  _M_token = _S_token_ord_char;
104
	  _M_value.assign(1, __c);
105
	  return;
106
	}
107
      if (__c == '\\')
108
	{
109
	  if (_M_current == _M_end)
110
	    __throw_regex_error(regex_constants::error_escape);
111
 
112
	  if (!_M_is_basic()
113
	      || (*_M_current != '('
114
		  && *_M_current != ')'
115
		  && *_M_current != '{'))
116
	    {
117
	      (this->*_M_eat_escape)();
118
	      return;
119
	    }
120
	  __c = *_M_current++;
121
	}
122
      if (__c == '(')
123
	{
124
	  if (_M_is_ecma() && *_M_current == '?')
125
	    {
126
	      if (++_M_current == _M_end)
127
		__throw_regex_error(regex_constants::error_paren);
128
 
129
	      if (*_M_current == ':')
130
		{
131
		  ++_M_current;
132
		  _M_token = _S_token_subexpr_no_group_begin;
133
		}
134
	      else if (*_M_current == '=')
135
		{
136
		  ++_M_current;
137
		  _M_token = _S_token_subexpr_lookahead_begin;
138
		  _M_value.assign(1, 'p');
139
		}
140
	      else if (*_M_current == '!')
141
		{
142
		  ++_M_current;
143
		  _M_token = _S_token_subexpr_lookahead_begin;
144
		  _M_value.assign(1, 'n');
145
		}
146
	      else
147
		__throw_regex_error(regex_constants::error_paren);
148
	    }
149
	  else if (_M_flags & regex_constants::nosubs)
150
	    _M_token = _S_token_subexpr_no_group_begin;
151
	  else
152
	    _M_token = _S_token_subexpr_begin;
153
	}
154
      else if (__c == ')')
155
	_M_token = _S_token_subexpr_end;
156
      else if (__c == '[')
157
	{
158
	  _M_state = _S_state_in_bracket;
159
	  _M_at_bracket_start = true;
160
	  if (_M_current != _M_end && *_M_current == '^')
161
	    {
162
	      _M_token = _S_token_bracket_neg_begin;
163
	      ++_M_current;
164
	    }
165
	  else
166
	    _M_token = _S_token_bracket_begin;
167
	}
168
      else if (__c == '{')
169
	{
170
	  _M_state = _S_state_in_brace;
171
	  _M_token = _S_token_interval_begin;
172
	}
173
      else if (__c != ']' && __c != '}')
174
	{
175
	  auto __it = _M_token_tbl;
176
	  auto __narrowc = _M_ctype.narrow(__c, '\0');
177
	  for (; __it->first != '\0'; ++__it)
178
	    if (__it->first == __narrowc)
179
	      {
180
		_M_token = __it->second;
181
		return;
182
	      }
183
	  _GLIBCXX_DEBUG_ASSERT(false);
184
	}
185
      else
186
	{
187
	  _M_token = _S_token_ord_char;
188
	  _M_value.assign(1, __c);
189
	}
190
    }
191
 
192
  // Differences between styles:
193
  // 1) different semantics of "[]" and "[^]".
194
  // 2) Escaping in bracket expr.
195
  template
196
    void
197
    _Scanner<_CharT>::
198
    _M_scan_in_bracket()
199
    {
200
      if (_M_current == _M_end)
201
	__throw_regex_error(regex_constants::error_brack);
202
 
203
      auto __c = *_M_current++;
204
 
205
      if (__c == '[')
206
	{
207
	  if (_M_current == _M_end)
208
	    __throw_regex_error(regex_constants::error_brack);
209
 
210
	  if (*_M_current == '.')
211
	    {
212
	      _M_token = _S_token_collsymbol;
213
	      _M_eat_class(*_M_current++);
214
	    }
215
	  else if (*_M_current == ':')
216
	    {
217
	      _M_token = _S_token_char_class_name;
218
	      _M_eat_class(*_M_current++);
219
	    }
220
	  else if (*_M_current == '=')
221
	    {
222
	      _M_token = _S_token_equiv_class_name;
223
	      _M_eat_class(*_M_current++);
224
	    }
225
	  else
226
	    {
227
	      _M_token = _S_token_ord_char;
228
	      _M_value.assign(1, __c);
229
	    }
230
	}
231
      // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
232
      // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
233
      // `*/empty_range.cc`.
234
      else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
235
	{
236
	  _M_token = _S_token_bracket_end;
237
	  _M_state = _S_state_normal;
238
	}
239
      // ECMAScript and awk permits escaping in bracket.
240
      else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
241
	(this->*_M_eat_escape)();
242
      else
243
	{
244
	  _M_token = _S_token_ord_char;
245
	  _M_value.assign(1, __c);
246
	}
247
      _M_at_bracket_start = false;
248
    }
249
 
250
  // Differences between styles:
251
  // 1) "\}" in basic style.
252
  template
253
    void
254
    _Scanner<_CharT>::
255
    _M_scan_in_brace()
256
    {
257
      if (_M_current == _M_end)
258
	__throw_regex_error(regex_constants::error_brace);
259
 
260
      auto __c = *_M_current++;
261
 
262
      if (_M_ctype.is(_CtypeT::digit, __c))
263
	{
264
	  _M_token = _S_token_dup_count;
265
	  _M_value.assign(1, __c);
266
	  while (_M_current != _M_end
267
		 && _M_ctype.is(_CtypeT::digit, *_M_current))
268
	    _M_value += *_M_current++;
269
	}
270
      else if (__c == ',')
271
	_M_token = _S_token_comma;
272
      // basic use \}.
273
      else if (_M_is_basic())
274
	{
275
	  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
276
	    {
277
	      _M_state = _S_state_normal;
278
	      _M_token = _S_token_interval_end;
279
	      ++_M_current;
280
	    }
281
	  else
282
	    __throw_regex_error(regex_constants::error_badbrace);
283
	}
284
      else if (__c == '}')
285
	{
286
	  _M_state = _S_state_normal;
287
	  _M_token = _S_token_interval_end;
288
	}
289
      else
290
	__throw_regex_error(regex_constants::error_badbrace);
291
    }
292
 
293
  template
294
    void
295
    _Scanner<_CharT>::
296
    _M_eat_escape_ecma()
297
    {
298
      if (_M_current == _M_end)
299
	__throw_regex_error(regex_constants::error_escape);
300
 
301
      auto __c = *_M_current++;
302
      auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
303
 
304
      if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
305
	{
306
	  _M_token = _S_token_ord_char;
307
	  _M_value.assign(1, *__pos);
308
	}
309
      else if (__c == 'b')
310
	{
311
	  _M_token = _S_token_word_bound;
312
	  _M_value.assign(1, 'p');
313
	}
314
      else if (__c == 'B')
315
	{
316
	  _M_token = _S_token_word_bound;
317
	  _M_value.assign(1, 'n');
318
	}
319
      // N3376 28.13
320
      else if (__c == 'd'
321
	       || __c == 'D'
322
	       || __c == 's'
323
	       || __c == 'S'
324
	       || __c == 'w'
325
	       || __c == 'W')
326
	{
327
	  _M_token = _S_token_quoted_class;
328
	  _M_value.assign(1, __c);
329
	}
330
      else if (__c == 'c')
331
	{
332
	  if (_M_current == _M_end)
333
	    __throw_regex_error(regex_constants::error_escape);
334
	  _M_token = _S_token_ord_char;
335
	  _M_value.assign(1, *_M_current++);
336
	}
337
      else if (__c == 'x' || __c == 'u')
338
	{
339
	  _M_value.erase();
340
	  for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
341
	    {
342
	      if (_M_current == _M_end
343
		  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
344
		__throw_regex_error(regex_constants::error_escape);
345
	      _M_value += *_M_current++;
346
	    }
347
	  _M_token = _S_token_hex_num;
348
	}
349
      // ECMAScript recognizes multi-digit back-references.
350
      else if (_M_ctype.is(_CtypeT::digit, __c))
351
	{
352
	  _M_value.assign(1, __c);
353
	  while (_M_current != _M_end
354
		 && _M_ctype.is(_CtypeT::digit, *_M_current))
355
	    _M_value += *_M_current++;
356
	  _M_token = _S_token_backref;
357
	}
358
      else
359
	{
360
	  _M_token = _S_token_ord_char;
361
	  _M_value.assign(1, __c);
362
	}
363
    }
364
 
365
  // Differences between styles:
366
  // 1) Extended doesn't support backref, but basic does.
367
  template
368
    void
369
    _Scanner<_CharT>::
370
    _M_eat_escape_posix()
371
    {
372
      if (_M_current == _M_end)
373
	__throw_regex_error(regex_constants::error_escape);
374
 
375
      auto __c = *_M_current;
376
      auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
377
 
378
      if (__pos != nullptr && *__pos != '\0')
379
	{
380
	  _M_token = _S_token_ord_char;
381
	  _M_value.assign(1, __c);
382
	}
383
      // We MUST judge awk before handling backrefs. There's no backref in awk.
384
      else if (_M_is_awk())
385
	{
386
	  _M_eat_escape_awk();
387
	  return;
388
	}
389
      else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
390
	{
391
	  _M_token = _S_token_backref;
392
	  _M_value.assign(1, __c);
393
	}
394
      else
395
	{
396
#ifdef __STRICT_ANSI__
397
	  // POSIX says it is undefined to escape ordinary characters
398
	  __throw_regex_error(regex_constants::error_escape);
399
#else
400
	  _M_token = _S_token_ord_char;
401
	  _M_value.assign(1, __c);
402
#endif
403
	}
404
      ++_M_current;
405
    }
406
 
407
  template
408
    void
409
    _Scanner<_CharT>::
410
    _M_eat_escape_awk()
411
    {
412
      auto __c = *_M_current++;
413
      auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
414
 
415
      if (__pos != nullptr)
416
	{
417
	  _M_token = _S_token_ord_char;
418
	  _M_value.assign(1, *__pos);
419
	}
420
      // \ddd for oct representation
421
      else if (_M_ctype.is(_CtypeT::digit, __c)
422
	       && __c != '8'
423
	       && __c != '9')
424
	{
425
	  _M_value.assign(1,  __c);
426
	  for (int __i = 0;
427
	       __i < 2
428
	       && _M_current != _M_end
429
	       && _M_ctype.is(_CtypeT::digit, *_M_current)
430
	       && *_M_current != '8'
431
	       && *_M_current != '9';
432
	       __i++)
433
	    _M_value += *_M_current++;
434
	  _M_token = _S_token_oct_num;
435
	  return;
436
	}
437
      else
438
	__throw_regex_error(regex_constants::error_escape);
439
    }
440
 
441
  // Eats a character class or throws an exception.
442
  // __ch could be ':', '.' or '=', _M_current is the char after ']' when
443
  // returning.
444
  template
445
    void
446
    _Scanner<_CharT>::
447
    _M_eat_class(char __ch)
448
    {
449
      for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
450
	_M_value += *_M_current++;
451
      if (_M_current == _M_end
452
	  || *_M_current++ != __ch
453
	  || _M_current == _M_end // skip __ch
454
	  || *_M_current++ != ']') // skip ']'
455
	{
456
	  if (__ch == ':')
457
	    __throw_regex_error(regex_constants::error_ctype);
458
	  else
459
	    __throw_regex_error(regex_constants::error_collate);
460
	}
461
    }
462
 
463
#ifdef _GLIBCXX_DEBUG
464
  template
465
    std::ostream&
466
    _Scanner<_CharT>::
467
    _M_print(std::ostream& ostr)
468
    {
469
      switch (_M_token)
470
      {
471
      case _S_token_anychar:
472
	ostr << "any-character\n";
473
	break;
474
      case _S_token_backref:
475
	ostr << "backref\n";
476
	break;
477
      case _S_token_bracket_begin:
478
	ostr << "bracket-begin\n";
479
	break;
480
      case _S_token_bracket_neg_begin:
481
	ostr << "bracket-neg-begin\n";
482
	break;
483
      case _S_token_bracket_end:
484
	ostr << "bracket-end\n";
485
	break;
486
      case _S_token_char_class_name:
487
	ostr << "char-class-name \"" << _M_value << "\"\n";
488
	break;
489
      case _S_token_closure0:
490
	ostr << "closure0\n";
491
	break;
492
      case _S_token_closure1:
493
	ostr << "closure1\n";
494
	break;
495
      case _S_token_collsymbol:
496
	ostr << "collsymbol \"" << _M_value << "\"\n";
497
	break;
498
      case _S_token_comma:
499
	ostr << "comma\n";
500
	break;
501
      case _S_token_dup_count:
502
	ostr << "dup count: " << _M_value << "\n";
503
	break;
504
      case _S_token_eof:
505
	ostr << "EOF\n";
506
	break;
507
      case _S_token_equiv_class_name:
508
	ostr << "equiv-class-name \"" << _M_value << "\"\n";
509
	break;
510
      case _S_token_interval_begin:
511
	ostr << "interval begin\n";
512
	break;
513
      case _S_token_interval_end:
514
	ostr << "interval end\n";
515
	break;
516
      case _S_token_line_begin:
517
	ostr << "line begin\n";
518
	break;
519
      case _S_token_line_end:
520
	ostr << "line end\n";
521
	break;
522
      case _S_token_opt:
523
	ostr << "opt\n";
524
	break;
525
      case _S_token_or:
526
	ostr << "or\n";
527
	break;
528
      case _S_token_ord_char:
529
	ostr << "ordinary character: \"" << _M_value << "\"\n";
530
	break;
531
      case _S_token_subexpr_begin:
532
	ostr << "subexpr begin\n";
533
	break;
534
      case _S_token_subexpr_no_group_begin:
535
	ostr << "no grouping subexpr begin\n";
536
	break;
537
      case _S_token_subexpr_lookahead_begin:
538
	ostr << "lookahead subexpr begin\n";
539
	break;
540
      case _S_token_subexpr_end:
541
	ostr << "subexpr end\n";
542
	break;
543
      case _S_token_unknown:
544
	ostr << "-- unknown token --\n";
545
	break;
546
      case _S_token_oct_num:
547
	ostr << "oct number " << _M_value << "\n";
548
	break;
549
      case _S_token_hex_num:
550
	ostr << "hex number " << _M_value << "\n";
551
	break;
552
      case _S_token_quoted_class:
553
	ostr << "quoted class " << "\\" << _M_value << "\n";
554
	break;
555
      default:
556
	_GLIBCXX_DEBUG_ASSERT(false);
557
      }
558
      return ostr;
559
    }
560
#endif
561
 
562
_GLIBCXX_END_NAMESPACE_VERSION
563
} // namespace __detail
564
} // namespace