1 // class template regex -*- C++ -*- 2 3 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 // <http://www.gnu.org/licenses/>. 24 25 /** 26 * @file bits/regex_scanner.tcc 27 * This is an internal header file, included by other library headers. 28 * Do not attempt to use it directly. @headername{regex} 29 */ 30 31 // FIXME make comments doxygen format. 32 33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 34 // and awk 35 // 1) grep is basic except '\n' is treated as '|' 36 // 2) egrep is extended except '\n' is treated as '|' 37 // 3) awk is extended except special escaping rules, and there's no 38 // back-reference. 39 // 40 // References: 41 // 42 // ECMAScript: ECMA-262 15.10 43 // 44 // basic, extended: 45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 46 // 47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 48 49 namespace std _GLIBCXX_VISIBILITY(default) 50 { 51 namespace __detail 52 { 53 _GLIBCXX_BEGIN_NAMESPACE_VERSION 54 55 template<typename _CharT> 56 _Scanner<_CharT>:: _Scanner(typename _Scanner::_IterT __begin,typename _Scanner::_IterT __end,_FlagT __flags,std::locale __loc)57 _Scanner(typename _Scanner::_IterT __begin, 58 typename _Scanner::_IterT __end, 59 _FlagT __flags, std::locale __loc) 60 : _ScannerBase(__flags), 61 _M_current(__begin), _M_end(__end), 62 _M_ctype(std::use_facet<_CtypeT>(__loc)), 63 _M_eat_escape(_M_is_ecma() 64 ? &_Scanner::_M_eat_escape_ecma 65 : &_Scanner::_M_eat_escape_posix) 66 { _M_advance(); } 67 68 template<typename _CharT> 69 void 70 _Scanner<_CharT>:: _M_advance()71 _M_advance() 72 { 73 if (_M_current == _M_end) 74 { 75 _M_token = _S_token_eof; 76 return; 77 } 78 79 if (_M_state == _S_state_normal) 80 _M_scan_normal(); 81 else if (_M_state == _S_state_in_bracket) 82 _M_scan_in_bracket(); 83 else if (_M_state == _S_state_in_brace) 84 _M_scan_in_brace(); 85 else 86 _GLIBCXX_DEBUG_ASSERT(false); 87 } 88 89 // Differences between styles: 90 // 1) "\(", "\)", "\{" in basic. It's not escaping. 91 // 2) "(?:", "(?=", "(?!" in ECMAScript. 92 template<typename _CharT> 93 void 94 _Scanner<_CharT>:: _M_scan_normal()95 _M_scan_normal() 96 { 97 auto __c = *_M_current++; 98 const char* __pos; 99 100 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')) == nullptr) 101 { 102 _M_token = _S_token_ord_char; 103 _M_value.assign(1, __c); 104 return; 105 } 106 if (__c == '\\') 107 { 108 if (_M_current == _M_end) 109 __throw_regex_error(regex_constants::error_escape); 110 111 if (!_M_is_basic() 112 || (*_M_current != '(' 113 && *_M_current != ')' 114 && *_M_current != '{')) 115 { 116 (this->*_M_eat_escape)(); 117 return; 118 } 119 __c = *_M_current++; 120 } 121 if (__c == '(') 122 { 123 if (_M_is_ecma() && *_M_current == '?') 124 { 125 if (++_M_current == _M_end) 126 __throw_regex_error(regex_constants::error_paren); 127 128 if (*_M_current == ':') 129 { 130 ++_M_current; 131 _M_token = _S_token_subexpr_no_group_begin; 132 } 133 else if (*_M_current == '=') 134 { 135 ++_M_current; 136 _M_token = _S_token_subexpr_lookahead_begin; 137 _M_value.assign(1, 'p'); 138 } 139 else if (*_M_current == '!') 140 { 141 ++_M_current; 142 _M_token = _S_token_subexpr_lookahead_begin; 143 _M_value.assign(1, 'n'); 144 } 145 else 146 __throw_regex_error(regex_constants::error_paren); 147 } 148 else if (_M_flags & regex_constants::nosubs) 149 _M_token = _S_token_subexpr_no_group_begin; 150 else 151 _M_token = _S_token_subexpr_begin; 152 } 153 else if (__c == ')') 154 _M_token = _S_token_subexpr_end; 155 else if (__c == '[') 156 { 157 _M_state = _S_state_in_bracket; 158 _M_at_bracket_start = true; 159 if (_M_current != _M_end && *_M_current == '^') 160 { 161 _M_token = _S_token_bracket_neg_begin; 162 ++_M_current; 163 } 164 else 165 _M_token = _S_token_bracket_begin; 166 } 167 else if (__c == '{') 168 { 169 _M_state = _S_state_in_brace; 170 _M_token = _S_token_interval_begin; 171 } 172 else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'))) 173 != nullptr 174 && *__pos != '\0' 175 && __c != ']' 176 && __c != '}') 177 || (_M_is_grep() && __c == '\n')) 178 { 179 auto __it = _M_token_tbl; 180 auto __narrowc = _M_ctype.narrow(__c, '\0'); 181 for (; __it->first != '\0'; ++__it) 182 if (__it->first == __narrowc) 183 { 184 _M_token = __it->second; 185 return; 186 } 187 _GLIBCXX_DEBUG_ASSERT(false); 188 } 189 else 190 { 191 _M_token = _S_token_ord_char; 192 _M_value.assign(1, __c); 193 } 194 } 195 196 // Differences between styles: 197 // 1) different semantics of "[]" and "[^]". 198 // 2) Escaping in bracket expr. 199 template<typename _CharT> 200 void 201 _Scanner<_CharT>:: _M_scan_in_bracket()202 _M_scan_in_bracket() 203 { 204 if (_M_current == _M_end) 205 __throw_regex_error(regex_constants::error_brack); 206 207 auto __c = *_M_current++; 208 209 if (__c == '[') 210 { 211 if (_M_current == _M_end) 212 __throw_regex_error(regex_constants::error_brack); 213 214 if (*_M_current == '.') 215 { 216 _M_token = _S_token_collsymbol; 217 _M_eat_class(*_M_current++); 218 } 219 else if (*_M_current == ':') 220 { 221 _M_token = _S_token_char_class_name; 222 _M_eat_class(*_M_current++); 223 } 224 else if (*_M_current == '=') 225 { 226 _M_token = _S_token_equiv_class_name; 227 _M_eat_class(*_M_current++); 228 } 229 else 230 { 231 _M_token = _S_token_ord_char; 232 _M_value.assign(1, __c); 233 } 234 } 235 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 236 // literally. So "[]]" or "[^]]" is valid regex. See the testcases 237 // `*/empty_range.cc`. 238 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 239 { 240 _M_token = _S_token_bracket_end; 241 _M_state = _S_state_normal; 242 } 243 // ECMAScirpt and awk permmits escaping in bracket. 244 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 245 (this->*_M_eat_escape)(); 246 else 247 { 248 _M_token = _S_token_ord_char; 249 _M_value.assign(1, __c); 250 } 251 _M_at_bracket_start = false; 252 } 253 254 // Differences between styles: 255 // 1) "\}" in basic style. 256 template<typename _CharT> 257 void 258 _Scanner<_CharT>:: _M_scan_in_brace()259 _M_scan_in_brace() 260 { 261 if (_M_current == _M_end) 262 __throw_regex_error(regex_constants::error_brace); 263 264 auto __c = *_M_current++; 265 266 if (_M_ctype.is(_CtypeT::digit, __c)) 267 { 268 _M_token = _S_token_dup_count; 269 _M_value.assign(1, __c); 270 while (_M_current != _M_end 271 && _M_ctype.is(_CtypeT::digit, *_M_current)) 272 _M_value += *_M_current++; 273 } 274 else if (__c == ',') 275 _M_token = _S_token_comma; 276 // basic use \}. 277 else if (_M_is_basic()) 278 { 279 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 280 { 281 _M_state = _S_state_normal; 282 _M_token = _S_token_interval_end; 283 ++_M_current; 284 } 285 else 286 __throw_regex_error(regex_constants::error_badbrace); 287 } 288 else if (__c == '}') 289 { 290 _M_state = _S_state_normal; 291 _M_token = _S_token_interval_end; 292 } 293 else 294 __throw_regex_error(regex_constants::error_badbrace); 295 } 296 297 template<typename _CharT> 298 void 299 _Scanner<_CharT>:: _M_eat_escape_ecma()300 _M_eat_escape_ecma() 301 { 302 if (_M_current == _M_end) 303 __throw_regex_error(regex_constants::error_escape); 304 305 auto __c = *_M_current++; 306 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 307 308 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 309 { 310 _M_token = _S_token_ord_char; 311 _M_value.assign(1, *__pos); 312 } 313 else if (__c == 'b') 314 { 315 _M_token = _S_token_word_bound; 316 _M_value.assign(1, 'p'); 317 } 318 else if (__c == 'B') 319 { 320 _M_token = _S_token_word_bound; 321 _M_value.assign(1, 'n'); 322 } 323 // N3376 28.13 324 else if (__c == 'd' 325 || __c == 'D' 326 || __c == 's' 327 || __c == 'S' 328 || __c == 'w' 329 || __c == 'W') 330 { 331 _M_token = _S_token_quoted_class; 332 _M_value.assign(1, __c); 333 } 334 else if (__c == 'c') 335 { 336 if (_M_current == _M_end) 337 __throw_regex_error(regex_constants::error_escape); 338 _M_token = _S_token_ord_char; 339 _M_value.assign(1, *_M_current++); 340 } 341 else if (__c == 'x' || __c == 'u') 342 { 343 _M_value.erase(); 344 for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) 345 { 346 if (_M_current == _M_end 347 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 348 __throw_regex_error(regex_constants::error_escape); 349 _M_value += *_M_current++; 350 } 351 _M_token = _S_token_hex_num; 352 } 353 // ECMAScript recongnizes multi-digit back-references. 354 else if (_M_ctype.is(_CtypeT::digit, __c)) 355 { 356 _M_value.assign(1, __c); 357 while (_M_current != _M_end 358 && _M_ctype.is(_CtypeT::digit, *_M_current)) 359 _M_value += *_M_current++; 360 _M_token = _S_token_backref; 361 } 362 else 363 { 364 _M_token = _S_token_ord_char; 365 _M_value.assign(1, __c); 366 } 367 } 368 369 // Differences between styles: 370 // 1) Extended doesn't support backref, but basic does. 371 template<typename _CharT> 372 void 373 _Scanner<_CharT>:: _M_eat_escape_posix()374 _M_eat_escape_posix() 375 { 376 if (_M_current == _M_end) 377 __throw_regex_error(regex_constants::error_escape); 378 379 auto __c = *_M_current; 380 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 381 382 if (__pos != nullptr && *__pos != '\0') 383 { 384 _M_token = _S_token_ord_char; 385 _M_value.assign(1, __c); 386 } 387 // We MUST judge awk before handling backrefs. There's no backref in awk. 388 else if (_M_is_awk()) 389 { 390 _M_eat_escape_awk(); 391 return; 392 } 393 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 394 { 395 _M_token = _S_token_backref; 396 _M_value.assign(1, __c); 397 } 398 else 399 { 400 #ifdef __STRICT_ANSI__ 401 __throw_regex_error(regex_constants::error_escape); 402 #else 403 _M_token = _S_token_ord_char; 404 _M_value.assign(1, __c); 405 #endif 406 } 407 ++_M_current; 408 } 409 410 template<typename _CharT> 411 void 412 _Scanner<_CharT>:: _M_eat_escape_awk()413 _M_eat_escape_awk() 414 { 415 auto __c = *_M_current++; 416 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 417 418 if (__pos != nullptr) 419 { 420 _M_token = _S_token_ord_char; 421 _M_value.assign(1, *__pos); 422 } 423 // \ddd for oct representation 424 else if (_M_ctype.is(_CtypeT::digit, __c) 425 && __c != '8' 426 && __c != '9') 427 { 428 _M_value.assign(1, __c); 429 for (int __i = 0; 430 __i < 2 431 && _M_current != _M_end 432 && _M_ctype.is(_CtypeT::digit, *_M_current) 433 && *_M_current != '8' 434 && *_M_current != '9'; 435 __i++) 436 _M_value += *_M_current++; 437 _M_token = _S_token_oct_num; 438 return; 439 } 440 else 441 __throw_regex_error(regex_constants::error_escape); 442 } 443 444 // Eats a character class or throwns an exception. 445 // __ch cound be ':', '.' or '=', _M_current is the char after ']' when 446 // returning. 447 template<typename _CharT> 448 void 449 _Scanner<_CharT>:: _M_eat_class(char __ch)450 _M_eat_class(char __ch) 451 { 452 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 453 _M_value += *_M_current++; 454 if (_M_current == _M_end 455 || *_M_current++ != __ch 456 || _M_current == _M_end // skip __ch 457 || *_M_current++ != ']') // skip ']' 458 { 459 if (__ch == ':') 460 __throw_regex_error(regex_constants::error_ctype); 461 else 462 __throw_regex_error(regex_constants::error_collate); 463 } 464 } 465 466 #ifdef _GLIBCXX_DEBUG 467 template<typename _CharT> 468 std::ostream& 469 _Scanner<_CharT>:: _M_print(std::ostream & ostr)470 _M_print(std::ostream& ostr) 471 { 472 switch (_M_token) 473 { 474 case _S_token_anychar: 475 ostr << "any-character\n"; 476 break; 477 case _S_token_backref: 478 ostr << "backref\n"; 479 break; 480 case _S_token_bracket_begin: 481 ostr << "bracket-begin\n"; 482 break; 483 case _S_token_bracket_neg_begin: 484 ostr << "bracket-neg-begin\n"; 485 break; 486 case _S_token_bracket_end: 487 ostr << "bracket-end\n"; 488 break; 489 case _S_token_char_class_name: 490 ostr << "char-class-name \"" << _M_value << "\"\n"; 491 break; 492 case _S_token_closure0: 493 ostr << "closure0\n"; 494 break; 495 case _S_token_closure1: 496 ostr << "closure1\n"; 497 break; 498 case _S_token_collsymbol: 499 ostr << "collsymbol \"" << _M_value << "\"\n"; 500 break; 501 case _S_token_comma: 502 ostr << "comma\n"; 503 break; 504 case _S_token_dup_count: 505 ostr << "dup count: " << _M_value << "\n"; 506 break; 507 case _S_token_eof: 508 ostr << "EOF\n"; 509 break; 510 case _S_token_equiv_class_name: 511 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 512 break; 513 case _S_token_interval_begin: 514 ostr << "interval begin\n"; 515 break; 516 case _S_token_interval_end: 517 ostr << "interval end\n"; 518 break; 519 case _S_token_line_begin: 520 ostr << "line begin\n"; 521 break; 522 case _S_token_line_end: 523 ostr << "line end\n"; 524 break; 525 case _S_token_opt: 526 ostr << "opt\n"; 527 break; 528 case _S_token_or: 529 ostr << "or\n"; 530 break; 531 case _S_token_ord_char: 532 ostr << "ordinary character: \"" << _M_value << "\"\n"; 533 break; 534 case _S_token_subexpr_begin: 535 ostr << "subexpr begin\n"; 536 break; 537 case _S_token_subexpr_no_group_begin: 538 ostr << "no grouping subexpr begin\n"; 539 break; 540 case _S_token_subexpr_lookahead_begin: 541 ostr << "lookahead subexpr begin\n"; 542 break; 543 case _S_token_subexpr_end: 544 ostr << "subexpr end\n"; 545 break; 546 case _S_token_unknown: 547 ostr << "-- unknown token --\n"; 548 break; 549 case _S_token_oct_num: 550 ostr << "oct number " << _M_value << "\n"; 551 break; 552 case _S_token_hex_num: 553 ostr << "hex number " << _M_value << "\n"; 554 break; 555 case _S_token_quoted_class: 556 ostr << "quoted class " << "\\" << _M_value << "\n"; 557 break; 558 default: 559 _GLIBCXX_DEBUG_ASSERT(false); 560 } 561 return ostr; 562 } 563 #endif 564 565 _GLIBCXX_END_NAMESPACE_VERSION 566 } // namespace __detail 567 } // namespace 568