1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library.  This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  *  @file bits/regex_scanner.tcc
27  *  This is an internal header file, included by other library headers.
28  *  Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 //    back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55   template<typename _CharT>
56     _Scanner<_CharT>::
_Scanner(typename _Scanner::_IterT __begin,typename _Scanner::_IterT __end,_FlagT __flags,std::locale __loc)57     _Scanner(typename _Scanner::_IterT __begin,
58 	     typename _Scanner::_IterT __end,
59 	     _FlagT __flags, std::locale __loc)
60     : _ScannerBase(__flags),
61       _M_current(__begin), _M_end(__end),
62       _M_ctype(std::use_facet<_CtypeT>(__loc)),
63       _M_eat_escape(_M_is_ecma()
64 		    ? &_Scanner::_M_eat_escape_ecma
65 		    : &_Scanner::_M_eat_escape_posix)
66     { _M_advance(); }
67 
68   template<typename _CharT>
69     void
70     _Scanner<_CharT>::
_M_advance()71     _M_advance()
72     {
73       if (_M_current == _M_end)
74 	{
75 	  _M_token = _S_token_eof;
76 	  return;
77 	}
78 
79       if (_M_state == _S_state_normal)
80 	_M_scan_normal();
81       else if (_M_state == _S_state_in_bracket)
82 	_M_scan_in_bracket();
83       else if (_M_state == _S_state_in_brace)
84 	_M_scan_in_brace();
85       else
86 	_GLIBCXX_DEBUG_ASSERT(false);
87     }
88 
89   // Differences between styles:
90   // 1) "\(", "\)", "\{" in basic. It's not escaping.
91   // 2) "(?:", "(?=", "(?!" in ECMAScript.
92   template<typename _CharT>
93     void
94     _Scanner<_CharT>::
_M_scan_normal()95     _M_scan_normal()
96     {
97       auto __c = *_M_current++;
98       const char* __pos;
99 
100       if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr)
101 	{
102 	  _M_token = _S_token_ord_char;
103 	  _M_value.assign(1, __c);
104 	  return;
105 	}
106       if (__c == '\\')
107 	{
108 	  if (_M_current == _M_end)
109 	    __throw_regex_error(regex_constants::error_escape);
110 
111 	  if (!_M_is_basic()
112 	      || (*_M_current != '('
113 		  && *_M_current != ')'
114 		  && *_M_current != '{'))
115 	    {
116 	      (this->*_M_eat_escape)();
117 	      return;
118 	    }
119 	  __c = *_M_current++;
120 	}
121       if (__c == '(')
122 	{
123 	  if (_M_is_ecma() && *_M_current == '?')
124 	    {
125 	      if (++_M_current == _M_end)
126 		__throw_regex_error(regex_constants::error_paren);
127 
128 	      if (*_M_current == ':')
129 		{
130 		  ++_M_current;
131 		  _M_token = _S_token_subexpr_no_group_begin;
132 		}
133 	      else if (*_M_current == '=')
134 		{
135 		  ++_M_current;
136 		  _M_token = _S_token_subexpr_lookahead_begin;
137 		  _M_value.assign(1, 'p');
138 		}
139 	      else if (*_M_current == '!')
140 		{
141 		  ++_M_current;
142 		  _M_token = _S_token_subexpr_lookahead_begin;
143 		  _M_value.assign(1, 'n');
144 		}
145 	      else
146 		__throw_regex_error(regex_constants::error_paren);
147 	    }
148 	  else if (_M_flags & regex_constants::nosubs)
149 	    _M_token = _S_token_subexpr_no_group_begin;
150 	  else
151 	    _M_token = _S_token_subexpr_begin;
152 	}
153       else if (__c == ')')
154 	_M_token = _S_token_subexpr_end;
155       else if (__c == '[')
156 	{
157 	  _M_state = _S_state_in_bracket;
158 	  _M_at_bracket_start = true;
159 	  if (_M_current != _M_end && *_M_current == '^')
160 	    {
161 	      _M_token = _S_token_bracket_neg_begin;
162 	      ++_M_current;
163 	    }
164 	  else
165 	    _M_token = _S_token_bracket_begin;
166 	}
167       else if (__c == '{')
168 	{
169 	  _M_state = _S_state_in_brace;
170 	  _M_token = _S_token_interval_begin;
171 	}
172       else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
173 		  != nullptr
174 		&& *__pos != '\0'
175 		&& __c != ']'
176 		&& __c != '}')
177 	       || (_M_is_grep() && __c == '\n'))
178 	{
179 	  auto __it = _M_token_tbl;
180 	  auto __narrowc = _M_ctype.narrow(__c, '\0');
181 	  for (; __it->first != '\0'; ++__it)
182 	    if (__it->first == __narrowc)
183 	      {
184 		_M_token = __it->second;
185 		return;
186 	      }
187 	  _GLIBCXX_DEBUG_ASSERT(false);
188 	}
189       else
190 	{
191 	  _M_token = _S_token_ord_char;
192 	  _M_value.assign(1, __c);
193 	}
194     }
195 
196   // Differences between styles:
197   // 1) different semantics of "[]" and "[^]".
198   // 2) Escaping in bracket expr.
199   template<typename _CharT>
200     void
201     _Scanner<_CharT>::
_M_scan_in_bracket()202     _M_scan_in_bracket()
203     {
204       if (_M_current == _M_end)
205 	__throw_regex_error(regex_constants::error_brack);
206 
207       auto __c = *_M_current++;
208 
209       if (__c == '[')
210 	{
211 	  if (_M_current == _M_end)
212 	    __throw_regex_error(regex_constants::error_brack);
213 
214 	  if (*_M_current == '.')
215 	    {
216 	      _M_token = _S_token_collsymbol;
217 	      _M_eat_class(*_M_current++);
218 	    }
219 	  else if (*_M_current == ':')
220 	    {
221 	      _M_token = _S_token_char_class_name;
222 	      _M_eat_class(*_M_current++);
223 	    }
224 	  else if (*_M_current == '=')
225 	    {
226 	      _M_token = _S_token_equiv_class_name;
227 	      _M_eat_class(*_M_current++);
228 	    }
229 	  else
230 	    {
231 	      _M_token = _S_token_ord_char;
232 	      _M_value.assign(1, __c);
233 	    }
234 	}
235       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
236       // literally. So "[]]" or "[^]]" is valid regex. See the testcases
237       // `*/empty_range.cc`.
238       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
239 	{
240 	  _M_token = _S_token_bracket_end;
241 	  _M_state = _S_state_normal;
242 	}
243       // ECMAScirpt and awk permmits escaping in bracket.
244       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
245 	(this->*_M_eat_escape)();
246       else
247 	{
248 	  _M_token = _S_token_ord_char;
249 	  _M_value.assign(1, __c);
250 	}
251       _M_at_bracket_start = false;
252     }
253 
254   // Differences between styles:
255   // 1) "\}" in basic style.
256   template<typename _CharT>
257     void
258     _Scanner<_CharT>::
_M_scan_in_brace()259     _M_scan_in_brace()
260     {
261       if (_M_current == _M_end)
262 	__throw_regex_error(regex_constants::error_brace);
263 
264       auto __c = *_M_current++;
265 
266       if (_M_ctype.is(_CtypeT::digit, __c))
267 	{
268 	  _M_token = _S_token_dup_count;
269 	  _M_value.assign(1, __c);
270 	  while (_M_current != _M_end
271 		 && _M_ctype.is(_CtypeT::digit, *_M_current))
272 	    _M_value += *_M_current++;
273 	}
274       else if (__c == ',')
275 	_M_token = _S_token_comma;
276       // basic use \}.
277       else if (_M_is_basic())
278 	{
279 	  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
280 	    {
281 	      _M_state = _S_state_normal;
282 	      _M_token = _S_token_interval_end;
283 	      ++_M_current;
284 	    }
285 	  else
286 	    __throw_regex_error(regex_constants::error_badbrace);
287 	}
288       else if (__c == '}')
289 	{
290 	  _M_state = _S_state_normal;
291 	  _M_token = _S_token_interval_end;
292 	}
293       else
294 	__throw_regex_error(regex_constants::error_badbrace);
295     }
296 
297   template<typename _CharT>
298     void
299     _Scanner<_CharT>::
_M_eat_escape_ecma()300     _M_eat_escape_ecma()
301     {
302       if (_M_current == _M_end)
303 	__throw_regex_error(regex_constants::error_escape);
304 
305       auto __c = *_M_current++;
306       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
307 
308       if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
309 	{
310 	  _M_token = _S_token_ord_char;
311 	  _M_value.assign(1, *__pos);
312 	}
313       else if (__c == 'b')
314 	{
315 	  _M_token = _S_token_word_bound;
316 	  _M_value.assign(1, 'p');
317 	}
318       else if (__c == 'B')
319 	{
320 	  _M_token = _S_token_word_bound;
321 	  _M_value.assign(1, 'n');
322 	}
323       // N3376 28.13
324       else if (__c == 'd'
325 	       || __c == 'D'
326 	       || __c == 's'
327 	       || __c == 'S'
328 	       || __c == 'w'
329 	       || __c == 'W')
330 	{
331 	  _M_token = _S_token_quoted_class;
332 	  _M_value.assign(1, __c);
333 	}
334       else if (__c == 'c')
335 	{
336 	  if (_M_current == _M_end)
337 	    __throw_regex_error(regex_constants::error_escape);
338 	  _M_token = _S_token_ord_char;
339 	  _M_value.assign(1, *_M_current++);
340 	}
341       else if (__c == 'x' || __c == 'u')
342 	{
343 	  _M_value.erase();
344 	  for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
345 	    {
346 	      if (_M_current == _M_end
347 		  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
348 		__throw_regex_error(regex_constants::error_escape);
349 	      _M_value += *_M_current++;
350 	    }
351 	  _M_token = _S_token_hex_num;
352 	}
353       // ECMAScript recongnizes multi-digit back-references.
354       else if (_M_ctype.is(_CtypeT::digit, __c))
355 	{
356 	  _M_value.assign(1, __c);
357 	  while (_M_current != _M_end
358 		 && _M_ctype.is(_CtypeT::digit, *_M_current))
359 	    _M_value += *_M_current++;
360 	  _M_token = _S_token_backref;
361 	}
362       else
363 	{
364 	  _M_token = _S_token_ord_char;
365 	  _M_value.assign(1, __c);
366 	}
367     }
368 
369   // Differences between styles:
370   // 1) Extended doesn't support backref, but basic does.
371   template<typename _CharT>
372     void
373     _Scanner<_CharT>::
_M_eat_escape_posix()374     _M_eat_escape_posix()
375     {
376       if (_M_current == _M_end)
377 	__throw_regex_error(regex_constants::error_escape);
378 
379       auto __c = *_M_current;
380       auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
381 
382       if (__pos != nullptr && *__pos != '\0')
383 	{
384 	  _M_token = _S_token_ord_char;
385 	  _M_value.assign(1, __c);
386 	}
387       // We MUST judge awk before handling backrefs. There's no backref in awk.
388       else if (_M_is_awk())
389 	{
390 	  _M_eat_escape_awk();
391 	  return;
392 	}
393       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
394 	{
395 	  _M_token = _S_token_backref;
396 	  _M_value.assign(1, __c);
397 	}
398       else
399 	{
400 #ifdef __STRICT_ANSI__
401 	  __throw_regex_error(regex_constants::error_escape);
402 #else
403 	  _M_token = _S_token_ord_char;
404 	  _M_value.assign(1, __c);
405 #endif
406 	}
407       ++_M_current;
408     }
409 
410   template<typename _CharT>
411     void
412     _Scanner<_CharT>::
_M_eat_escape_awk()413     _M_eat_escape_awk()
414     {
415       auto __c = *_M_current++;
416       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
417 
418       if (__pos != nullptr)
419 	{
420 	  _M_token = _S_token_ord_char;
421 	  _M_value.assign(1, *__pos);
422 	}
423       // \ddd for oct representation
424       else if (_M_ctype.is(_CtypeT::digit, __c)
425 	       && __c != '8'
426 	       && __c != '9')
427 	{
428 	  _M_value.assign(1,  __c);
429 	  for (int __i = 0;
430 	       __i < 2
431 	       && _M_current != _M_end
432 	       && _M_ctype.is(_CtypeT::digit, *_M_current)
433 	       && *_M_current != '8'
434 	       && *_M_current != '9';
435 	       __i++)
436 	    _M_value += *_M_current++;
437 	  _M_token = _S_token_oct_num;
438 	  return;
439 	}
440       else
441 	__throw_regex_error(regex_constants::error_escape);
442     }
443 
444   // Eats a character class or throwns an exception.
445   // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
446   // returning.
447   template<typename _CharT>
448     void
449     _Scanner<_CharT>::
_M_eat_class(char __ch)450     _M_eat_class(char __ch)
451     {
452       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
453 	_M_value += *_M_current++;
454       if (_M_current == _M_end
455 	  || *_M_current++ != __ch
456 	  || _M_current == _M_end // skip __ch
457 	  || *_M_current++ != ']') // skip ']'
458 	{
459 	  if (__ch == ':')
460 	    __throw_regex_error(regex_constants::error_ctype);
461 	  else
462 	    __throw_regex_error(regex_constants::error_collate);
463 	}
464     }
465 
466 #ifdef _GLIBCXX_DEBUG
467   template<typename _CharT>
468     std::ostream&
469     _Scanner<_CharT>::
_M_print(std::ostream & ostr)470     _M_print(std::ostream& ostr)
471     {
472       switch (_M_token)
473       {
474       case _S_token_anychar:
475 	ostr << "any-character\n";
476 	break;
477       case _S_token_backref:
478 	ostr << "backref\n";
479 	break;
480       case _S_token_bracket_begin:
481 	ostr << "bracket-begin\n";
482 	break;
483       case _S_token_bracket_neg_begin:
484 	ostr << "bracket-neg-begin\n";
485 	break;
486       case _S_token_bracket_end:
487 	ostr << "bracket-end\n";
488 	break;
489       case _S_token_char_class_name:
490 	ostr << "char-class-name \"" << _M_value << "\"\n";
491 	break;
492       case _S_token_closure0:
493 	ostr << "closure0\n";
494 	break;
495       case _S_token_closure1:
496 	ostr << "closure1\n";
497 	break;
498       case _S_token_collsymbol:
499 	ostr << "collsymbol \"" << _M_value << "\"\n";
500 	break;
501       case _S_token_comma:
502 	ostr << "comma\n";
503 	break;
504       case _S_token_dup_count:
505 	ostr << "dup count: " << _M_value << "\n";
506 	break;
507       case _S_token_eof:
508 	ostr << "EOF\n";
509 	break;
510       case _S_token_equiv_class_name:
511 	ostr << "equiv-class-name \"" << _M_value << "\"\n";
512 	break;
513       case _S_token_interval_begin:
514 	ostr << "interval begin\n";
515 	break;
516       case _S_token_interval_end:
517 	ostr << "interval end\n";
518 	break;
519       case _S_token_line_begin:
520 	ostr << "line begin\n";
521 	break;
522       case _S_token_line_end:
523 	ostr << "line end\n";
524 	break;
525       case _S_token_opt:
526 	ostr << "opt\n";
527 	break;
528       case _S_token_or:
529 	ostr << "or\n";
530 	break;
531       case _S_token_ord_char:
532 	ostr << "ordinary character: \"" << _M_value << "\"\n";
533 	break;
534       case _S_token_subexpr_begin:
535 	ostr << "subexpr begin\n";
536 	break;
537       case _S_token_subexpr_no_group_begin:
538 	ostr << "no grouping subexpr begin\n";
539 	break;
540       case _S_token_subexpr_lookahead_begin:
541 	ostr << "lookahead subexpr begin\n";
542 	break;
543       case _S_token_subexpr_end:
544 	ostr << "subexpr end\n";
545 	break;
546       case _S_token_unknown:
547 	ostr << "-- unknown token --\n";
548 	break;
549       case _S_token_oct_num:
550 	ostr << "oct number " << _M_value << "\n";
551 	break;
552       case _S_token_hex_num:
553 	ostr << "hex number " << _M_value << "\n";
554 	break;
555       case _S_token_quoted_class:
556 	ostr << "quoted class " << "\\" << _M_value << "\n";
557 	break;
558       default:
559 	_GLIBCXX_DEBUG_ASSERT(false);
560       }
561       return ostr;
562     }
563 #endif
564 
565 _GLIBCXX_END_NAMESPACE_VERSION
566 } // namespace __detail
567 } // namespace
568