1 /*
2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3 * All rights reserved
4 *
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * Sergey Lyubka wrote this file. As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.
9 */
10
11 /*
12 * Downloaded Sat Nov 5 17:43:06 CET 2011 at
13 * http://slre.sourceforge.net/1.0/slre.c
14 */
15
16 #ifdef SLRE_TEST
17 #include <stdio.h>
18 #include <assert.h>
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #else
23 #include <log.h>
24 #include <linux/ctype.h>
25 #include <linux/string.h>
26 #endif /* SLRE_TEST */
27
28 #include <errno.h>
29
30 #include <slre.h>
31
32 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT, RANGE};
34
35 #ifdef SLRE_TEST
36 static struct {
37 const char *name;
38 int narg;
39 const char *flags;
40 } opcodes[] = {
41 {"END", 0, ""}, /* End of code block or program */
42 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
43 {"ANY", 0, ""}, /* Match any character, "." */
44 {"EXACT", 2, "d"}, /* Match exact string */
45 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
46 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
47 {"OPEN ", 1, "i"}, /* Capture start, "(" */
48 {"CLOSE", 1, "i"}, /* Capture end, ")" */
49 {"BOL", 0, ""}, /* Beginning of string, "^" */
50 {"EOL", 0, ""}, /* End of string, "$" */
51 {"STAR", 1, "o"}, /* Match zero or more times "*" */
52 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
53 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
54 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
55 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
56 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
57 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
58 {"DIGIT", 0, ""}, /* Match digit, "\d" */
59 {"RANGE", 0, ""}, /* Range separator - */
60 };
61 #endif /* SLRE_TEST */
62
63 /*
64 * Commands and operands are all unsigned char (1 byte long). All code offsets
65 * are relative to current address, and positive (always point forward). Data
66 * offsets are absolute. Commands with operands:
67 *
68 * BRANCH offset1 offset2
69 * Try to match the code block that follows the BRANCH instruction
70 * (code block ends with END). If no match, try to match code block that
71 * starts at offset1. If either of these match, jump to offset2.
72 *
73 * EXACT data_offset data_length
74 * Try to match exact string. String is recorded in data section from
75 * data_offset, and has length data_length.
76 *
77 * OPEN capture_number
78 * CLOSE capture_number
79 * If the user have passed 'struct cap' array for captures, OPEN
80 * records the beginning of the matched substring (cap->ptr), CLOSE
81 * sets the length (cap->len) for respective capture_number.
82 *
83 * STAR code_offset
84 * PLUS code_offset
85 * QUEST code_offset
86 * *, +, ?, respectively. Try to gobble as much as possible from the
87 * matched buffer, until code block that follows these instructions
88 * matches. When the longest possible string is matched,
89 * jump to code_offset
90 *
91 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
92 */
93
94 static const char *meta_chars = "|.^$*+?()[\\";
95
96 #ifdef SLRE_TEST
97
98 static void
print_character_set(FILE * fp,const unsigned char * p,int len)99 print_character_set(FILE *fp, const unsigned char *p, int len)
100 {
101 int i;
102
103 for (i = 0; i < len; i++) {
104 if (i > 0)
105 (void) fputc(',', fp);
106 if (p[i] == 0) {
107 i++;
108 if (p[i] == 0)
109 (void) fprintf(fp, "\\x%02x", p[i]);
110 else
111 (void) fprintf(fp, "%s", opcodes[p[i]].name);
112 } else if (isprint(p[i])) {
113 (void) fputc(p[i], fp);
114 } else {
115 (void) fprintf(fp, "\\x%02x", p[i]);
116 }
117 }
118 }
119
120 void
slre_dump(const struct slre * r,FILE * fp)121 slre_dump(const struct slre *r, FILE *fp)
122 {
123 int i, j, ch, op, pc;
124
125 for (pc = 0; pc < r->code_size; pc++) {
126
127 op = r->code[pc];
128 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
129
130 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
131 switch (opcodes[op].flags[i]) {
132 case 'i':
133 (void) fprintf(fp, "%d ", r->code[pc + 1]);
134 pc++;
135 break;
136 case 'o':
137 (void) fprintf(fp, "%d ",
138 pc + r->code[pc + 1] - i);
139 pc++;
140 break;
141 case 'D':
142 print_character_set(fp, r->data +
143 r->code[pc + 1], r->code[pc + 2]);
144 pc += 2;
145 break;
146 case 'd':
147 (void) fputc('"', fp);
148 for (j = 0; j < r->code[pc + 2]; j++) {
149 ch = r->data[r->code[pc + 1] + j];
150 if (isprint(ch)) {
151 (void) fputc(ch, fp);
152 } else {
153 (void) fprintf(fp,
154 "\\x%02x", ch);
155 }
156 }
157 (void) fputc('"', fp);
158 pc += 2;
159 break;
160 }
161
162 (void) fputc('\n', fp);
163 }
164 }
165 #endif /* SLRE_TEST */
166
167 static void
set_jump_offset(struct slre * r,int pc,int offset)168 set_jump_offset(struct slre *r, int pc, int offset)
169 {
170 assert(offset < r->code_size);
171
172 if (r->code_size - offset > 0xff)
173 r->err_str = "Jump offset is too big";
174 else
175 r->code[pc] = (unsigned char) (r->code_size - offset);
176 }
177
178 static void
emit(struct slre * r,int code)179 emit(struct slre *r, int code)
180 {
181 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
182 r->err_str = "RE is too long (code overflow)";
183 else
184 r->code[r->code_size++] = (unsigned char) code;
185 }
186
187 static void
store_char_in_data(struct slre * r,int ch)188 store_char_in_data(struct slre *r, int ch)
189 {
190 if (r->data_size >= (int) sizeof(r->data))
191 r->err_str = "RE is too long (data overflow)";
192 else
193 r->data[r->data_size++] = ch;
194 }
195
196 static void
exact(struct slre * r,const char ** re)197 exact(struct slre *r, const char **re)
198 {
199 int old_data_size = r->data_size;
200
201 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
202 store_char_in_data(r, *(*re)++);
203
204 emit(r, EXACT);
205 emit(r, old_data_size);
206 emit(r, r->data_size - old_data_size);
207 }
208
209 static int
get_escape_char(const char ** re)210 get_escape_char(const char **re)
211 {
212 int res;
213
214 switch (*(*re)++) {
215 case 'n':
216 res = '\n';
217 break;
218 case 'r':
219 res = '\r';
220 break;
221 case 't':
222 res = '\t';
223 break;
224 case '0':
225 res = 0;
226 break;
227 case 'S':
228 res = NONSPACE << 8;
229 break;
230 case 's':
231 res = SPACE << 8;
232 break;
233 case 'd':
234 res = DIGIT << 8;
235 break;
236 default:
237 res = (*re)[-1];
238 break;
239 }
240
241 return res;
242 }
243
244 static void
anyof(struct slre * r,const char ** re)245 anyof(struct slre *r, const char **re)
246 {
247 int esc, old_data_size = r->data_size, op = ANYOF;
248
249 if (**re == '^') {
250 op = ANYBUT;
251 (*re)++;
252 }
253
254 while (**re != '\0')
255
256 switch (*(*re)++) {
257 case ']':
258 emit(r, op);
259 emit(r, old_data_size);
260 emit(r, r->data_size - old_data_size);
261 return;
262 /* NOTREACHED */
263 break;
264 case '-':
265 if (r->data_size == old_data_size || **re == ']') {
266 /* First or last character, just match - itself. */
267 store_char_in_data(r, '-');
268 break;
269 }
270 store_char_in_data(r, 0);
271 store_char_in_data(r, RANGE);
272 break;
273 case '\\':
274 esc = get_escape_char(re);
275 if ((esc & 0xff) == 0) {
276 store_char_in_data(r, 0);
277 store_char_in_data(r, esc >> 8);
278 } else {
279 store_char_in_data(r, esc);
280 }
281 break;
282 default:
283 store_char_in_data(r, (*re)[-1]);
284 break;
285 }
286
287 r->err_str = "No closing ']' bracket";
288 }
289
290 static void
relocate(struct slre * r,int begin,int shift)291 relocate(struct slre *r, int begin, int shift)
292 {
293 emit(r, END);
294 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
295 r->code_size += shift;
296 }
297
298 static void
quantifier(struct slre * r,int prev,int op)299 quantifier(struct slre *r, int prev, int op)
300 {
301 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
302 r->code[prev + 2]--;
303 emit(r, EXACT);
304 emit(r, r->code[prev + 1] + r->code[prev + 2]);
305 emit(r, 1);
306 prev = r->code_size - 3;
307 }
308 relocate(r, prev, 2);
309 r->code[prev] = op;
310 set_jump_offset(r, prev + 1, prev);
311 }
312
313 static void
exact_one_char(struct slre * r,int ch)314 exact_one_char(struct slre *r, int ch)
315 {
316 emit(r, EXACT);
317 emit(r, r->data_size);
318 emit(r, 1);
319 store_char_in_data(r, ch);
320 }
321
322 static void
fixup_branch(struct slre * r,int fixup)323 fixup_branch(struct slre *r, int fixup)
324 {
325 if (fixup > 0) {
326 emit(r, END);
327 set_jump_offset(r, fixup, fixup - 2);
328 }
329 }
330
331 static void
compile(struct slre * r,const char ** re)332 compile(struct slre *r, const char **re)
333 {
334 int op, esc, branch_start, last_op, fixup, cap_no, level;
335
336 fixup = 0;
337 level = r->num_caps;
338 branch_start = last_op = r->code_size;
339
340 for (;;)
341 switch (*(*re)++) {
342 case '\0':
343 (*re)--;
344 return;
345 /* NOTREACHED */
346 break;
347 case '^':
348 emit(r, BOL);
349 break;
350 case '$':
351 emit(r, EOL);
352 break;
353 case '.':
354 last_op = r->code_size;
355 emit(r, ANY);
356 break;
357 case '[':
358 last_op = r->code_size;
359 anyof(r, re);
360 break;
361 case '\\':
362 last_op = r->code_size;
363 esc = get_escape_char(re);
364 if (esc & 0xff00)
365 emit(r, esc >> 8);
366 else
367 exact_one_char(r, esc);
368 break;
369 case '(':
370 last_op = r->code_size;
371 cap_no = ++r->num_caps;
372 emit(r, OPEN);
373 emit(r, cap_no);
374
375 compile(r, re);
376 if (*(*re)++ != ')') {
377 r->err_str = "No closing bracket";
378 return;
379 }
380
381 emit(r, CLOSE);
382 emit(r, cap_no);
383 break;
384 case ')':
385 (*re)--;
386 fixup_branch(r, fixup);
387 if (level == 0) {
388 r->err_str = "Unbalanced brackets";
389 return;
390 }
391 return;
392 /* NOTREACHED */
393 break;
394 case '+':
395 case '*':
396 op = (*re)[-1] == '*' ? STAR : PLUS;
397 if (**re == '?') {
398 (*re)++;
399 op = op == STAR ? STARQ : PLUSQ;
400 }
401 quantifier(r, last_op, op);
402 break;
403 case '?':
404 quantifier(r, last_op, QUEST);
405 break;
406 case '|':
407 fixup_branch(r, fixup);
408 relocate(r, branch_start, 3);
409 r->code[branch_start] = BRANCH;
410 set_jump_offset(r, branch_start + 1, branch_start);
411 fixup = branch_start + 2;
412 r->code[fixup] = 0xff;
413 break;
414 default:
415 (*re)--;
416 last_op = r->code_size;
417 exact(r, re);
418 break;
419 }
420 }
421
422 int
slre_compile(struct slre * r,const char * re)423 slre_compile(struct slre *r, const char *re)
424 {
425 r->err_str = NULL;
426 r->code_size = r->data_size = r->num_caps = 0;
427
428 emit(r, OPEN); /* This will capture what matches full RE */
429 emit(r, 0);
430
431 while (*re != '\0')
432 compile(r, &re);
433
434 if (r->code[2] == BRANCH)
435 fixup_branch(r, 4);
436
437 emit(r, CLOSE);
438 emit(r, 0);
439 emit(r, END);
440
441 return (r->err_str == NULL ? 1 : 0);
442 }
443
444 static int match(const struct slre *, int,
445 const char *, int, int *, struct cap *);
446
447 static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)448 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
449 {
450 int saved_offset, matched_offset;
451
452 matched_offset = *ofs;
453
454 while (match(r, pc + 2, s, len, ofs, NULL)) {
455 saved_offset = *ofs;
456 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
457 matched_offset = saved_offset;
458 *ofs = saved_offset;
459 }
460
461 *ofs = matched_offset;
462 }
463
464 static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)465 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
466 {
467 int saved_offset = *ofs;
468
469 while (match(r, pc + 2, s, len, ofs, NULL)) {
470 saved_offset = *ofs;
471 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
472 break;
473 }
474
475 *ofs = saved_offset;
476 }
477
478 static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)479 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
480 {
481 int i, ch;
482
483 ch = s[*ofs];
484
485 for (i = 0; i < len; i++) {
486 if (p[i] == '\0') {
487 switch (p[++i]) {
488 case NONSPACE:
489 if (!isspace(ch))
490 goto match;
491 break;
492 case SPACE:
493 if (isspace(ch))
494 goto match;
495 break;
496 case DIGIT:
497 if (isdigit(ch))
498 goto match;
499 break;
500 case RANGE:
501 /*
502 * a-z is represented in the data array as {'a', \0, RANGE, 'z'}
503 */
504 ++i;
505 if (p[i - 3] <= (unsigned char)ch && (unsigned char)ch <= p[i])
506 goto match;
507 break;
508 }
509 continue;
510 }
511 if (p[i] == ch)
512 goto match;
513 }
514
515 return 0;
516
517 match:
518 (*ofs)++;
519 return 1;
520 }
521
522 static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)523 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
524 {
525 int dummy = *ofs;
526
527 if (is_any_of(p, len, s, &dummy)) {
528 return 0;
529 } else {
530 (*ofs)++;
531 return 1;
532 }
533 }
534
535 static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)536 match(const struct slre *r, int pc, const char *s, int len,
537 int *ofs, struct cap *caps)
538 {
539 int n, saved_offset, res = 1;
540
541 while (res && r->code[pc] != END) {
542
543 assert(pc < r->code_size);
544 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
545
546 switch (r->code[pc]) {
547 case BRANCH:
548 saved_offset = *ofs;
549 res = match(r, pc + 3, s, len, ofs, caps);
550 if (res == 0) {
551 *ofs = saved_offset;
552 res = match(r, pc + r->code[pc + 1],
553 s, len, ofs, caps);
554 }
555 pc += r->code[pc + 2];
556 break;
557 case EXACT:
558 res = 0;
559 n = r->code[pc + 2]; /* String length */
560 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
561 r->code[pc + 1], n)) {
562 (*ofs) += n;
563 res = 1;
564 }
565 pc += 3;
566 break;
567 case QUEST:
568 res = 1;
569 saved_offset = *ofs;
570 if (!match(r, pc + 2, s, len, ofs, caps))
571 *ofs = saved_offset;
572 pc += r->code[pc + 1];
573 break;
574 case STAR:
575 res = 1;
576 loop_greedy(r, pc, s, len, ofs);
577 pc += r->code[pc + 1];
578 break;
579 case STARQ:
580 res = 1;
581 loop_non_greedy(r, pc, s, len, ofs);
582 pc += r->code[pc + 1];
583 break;
584 case PLUS:
585 res = match(r, pc + 2, s, len, ofs, caps);
586 if (res == 0)
587 break;
588
589 loop_greedy(r, pc, s, len, ofs);
590 pc += r->code[pc + 1];
591 break;
592 case PLUSQ:
593 res = match(r, pc + 2, s, len, ofs, caps);
594 if (res == 0)
595 break;
596
597 loop_non_greedy(r, pc, s, len, ofs);
598 pc += r->code[pc + 1];
599 break;
600 case SPACE:
601 res = 0;
602 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
603 (*ofs)++;
604 res = 1;
605 }
606 pc++;
607 break;
608 case NONSPACE:
609 res = 0;
610 if (*ofs < len &&
611 !isspace(((unsigned char *)s)[*ofs])) {
612 (*ofs)++;
613 res = 1;
614 }
615 pc++;
616 break;
617 case DIGIT:
618 res = 0;
619 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
620 (*ofs)++;
621 res = 1;
622 }
623 pc++;
624 break;
625 case ANY:
626 res = 0;
627 if (*ofs < len) {
628 (*ofs)++;
629 res = 1;
630 }
631 pc++;
632 break;
633 case ANYOF:
634 res = 0;
635 if (*ofs < len)
636 res = is_any_of(r->data + r->code[pc + 1],
637 r->code[pc + 2], s, ofs);
638 pc += 3;
639 break;
640 case ANYBUT:
641 res = 0;
642 if (*ofs < len)
643 res = is_any_but(r->data + r->code[pc + 1],
644 r->code[pc + 2], s, ofs);
645 pc += 3;
646 break;
647 case BOL:
648 res = *ofs == 0 ? 1 : 0;
649 pc++;
650 break;
651 case EOL:
652 res = *ofs == len ? 1 : 0;
653 pc++;
654 break;
655 case OPEN:
656 if (caps != NULL)
657 caps[r->code[pc + 1]].ptr = s + *ofs;
658 pc += 2;
659 break;
660 case CLOSE:
661 if (caps != NULL)
662 caps[r->code[pc + 1]].len = (s + *ofs) -
663 caps[r->code[pc + 1]].ptr;
664 pc += 2;
665 break;
666 case END:
667 pc++;
668 break;
669 default:
670 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
671 assert(0);
672 break;
673 }
674 }
675
676 return res;
677 }
678
679 int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)680 slre_match(const struct slre *r, const char *buf, int len,
681 struct cap *caps)
682 {
683 int i, ofs = 0, res = 0;
684
685 for (i = 0; i <= len && res == 0; i++) {
686 ofs = i;
687 res = match(r, 0, buf, len, &ofs, caps);
688 }
689
690 return res;
691 }
692
693 #ifdef SLRE_TEST
694 #define N_CAPS 5
695
main(int argc,char * argv[])696 int main(int argc, char *argv[])
697 {
698 struct slre slre;
699 struct cap caps[N_CAPS];
700 unsigned char data[1 * 1024 * 1024];
701 FILE *fp;
702 int i, res, len;
703
704 if (argc < 2) {
705 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
706 return 1;
707 }
708
709 fp = fopen(argv[2], "rb");
710 if (fp == NULL) {
711 fprintf(stderr, "Error: cannot open %s:%s\n",
712 argv[2], strerror(errno));
713 return 1;
714 }
715
716 if (!slre_compile(&slre, argv[1])) {
717 (void) fclose(fp);
718 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
719 return 1;
720 }
721
722 slre_dump(&slre, stderr);
723
724 while (fgets(data, sizeof(data), fp) != NULL) {
725 len = strlen(data);
726
727 if ((len > 0) && (data[len-1] == '\n')) {
728 data[len-1] = '\0';
729 --len;
730 }
731
732 printf("Data = \"%s\"\n", data);
733
734 (void) memset(caps, 0, sizeof(caps));
735
736 res = slre_match(&slre, data, len, caps);
737 printf("Result [%d]: %d\n", i, res);
738
739 for (i = 0; i < N_CAPS; i++) {
740 if (caps[i].len > 0) {
741 printf("Substring %d: len=%d [%.*s]\n", i,
742 caps[i].len,
743 caps[i].len, caps[i].ptr);
744 }
745 }
746 printf("----------------------------------------------------\n");
747 }
748 (void) fclose(fp);
749
750 return 0;
751 }
752 #endif /* SLRE_TEST */
753