1 /*
2  * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3  * All rights reserved
4  *
5  * "THE BEER-WARE LICENSE" (Revision 42):
6  * Sergey Lyubka wrote this file.  As long as you retain this notice you
7  * can do whatever you want with this stuff. If we meet some day, and you think
8  * this stuff is worth it, you can buy me a beer in return.
9  */
10 
11 /*
12  * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13  * http://slre.sourceforge.net/1.0/slre.c
14  */
15 
16 #ifdef SLRE_TEST
17 #include <stdio.h>
18 #include <assert.h>
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #else
23 #include <log.h>
24 #include <linux/ctype.h>
25 #include <linux/string.h>
26 #endif /* SLRE_TEST */
27 
28 #include <errno.h>
29 
30 #include <slre.h>
31 
32 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33 	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT, RANGE};
34 
35 #ifdef SLRE_TEST
36 static struct {
37 	const char	*name;
38 	int		narg;
39 	const char	*flags;
40 } opcodes[] = {
41 	{"END",		0, ""},		/* End of code block or program	*/
42 	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
43 	{"ANY",		0, ""},		/* Match any character, "."	*/
44 	{"EXACT",	2, "d"},	/* Match exact string		*/
45 	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
46 	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
47 	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
48 	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
49 	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
50 	{"EOL",		0, ""},		/* End of string, "$"		*/
51 	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
52 	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
53 	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
54 	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
55 	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
56 	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
57 	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
58 	{"DIGIT",	0, ""},		/* Match digit, "\d"		*/
59 	{"RANGE",	0, ""},		/* Range separator -		*/
60 };
61 #endif /* SLRE_TEST */
62 
63 /*
64  * Commands and operands are all unsigned char (1 byte long). All code offsets
65  * are relative to current address, and positive (always point forward). Data
66  * offsets are absolute. Commands with operands:
67  *
68  * BRANCH offset1 offset2
69  *	Try to match the code block that follows the BRANCH instruction
70  *	(code block ends with END). If no match, try to match code block that
71  *	starts at offset1. If either of these match, jump to offset2.
72  *
73  * EXACT data_offset data_length
74  *	Try to match exact string. String is recorded in data section from
75  *	data_offset, and has length data_length.
76  *
77  * OPEN capture_number
78  * CLOSE capture_number
79  *	If the user have passed 'struct cap' array for captures, OPEN
80  *	records the beginning of the matched substring (cap->ptr), CLOSE
81  *	sets the length (cap->len) for respective capture_number.
82  *
83  * STAR code_offset
84  * PLUS code_offset
85  * QUEST code_offset
86  *	*, +, ?, respectively. Try to gobble as much as possible from the
87  *	matched buffer, until code block that follows these instructions
88  *	matches. When the longest possible string is matched,
89  *	jump to code_offset
90  *
91  * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
92  */
93 
94 static const char *meta_chars = "|.^$*+?()[\\";
95 
96 #ifdef SLRE_TEST
97 
98 static void
print_character_set(FILE * fp,const unsigned char * p,int len)99 print_character_set(FILE *fp, const unsigned char *p, int len)
100 {
101 	int	i;
102 
103 	for (i = 0; i < len; i++) {
104 		if (i > 0)
105 			(void) fputc(',', fp);
106 		if (p[i] == 0) {
107 			i++;
108 			if (p[i] == 0)
109 				(void) fprintf(fp, "\\x%02x", p[i]);
110 			else
111 				(void) fprintf(fp, "%s", opcodes[p[i]].name);
112 		} else if (isprint(p[i])) {
113 			(void) fputc(p[i], fp);
114 		} else {
115 			(void) fprintf(fp, "\\x%02x", p[i]);
116 		}
117 	}
118 }
119 
120 void
slre_dump(const struct slre * r,FILE * fp)121 slre_dump(const struct slre *r, FILE *fp)
122 {
123 	int	i, j, ch, op, pc;
124 
125 	for (pc = 0; pc < r->code_size; pc++) {
126 
127 		op = r->code[pc];
128 		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
129 
130 		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
131 			switch (opcodes[op].flags[i]) {
132 			case 'i':
133 				(void) fprintf(fp, "%d ", r->code[pc + 1]);
134 				pc++;
135 				break;
136 			case 'o':
137 				(void) fprintf(fp, "%d ",
138 				    pc + r->code[pc + 1] - i);
139 				pc++;
140 				break;
141 			case 'D':
142 				print_character_set(fp, r->data +
143 				    r->code[pc + 1], r->code[pc + 2]);
144 				pc += 2;
145 				break;
146 			case 'd':
147 				(void) fputc('"', fp);
148 				for (j = 0; j < r->code[pc + 2]; j++) {
149 					ch = r->data[r->code[pc + 1] + j];
150 					if (isprint(ch)) {
151 						(void) fputc(ch, fp);
152 					} else {
153 						(void) fprintf(fp,
154 							"\\x%02x", ch);
155 					}
156 				}
157 				(void) fputc('"', fp);
158 				pc += 2;
159 				break;
160 			}
161 
162 		(void) fputc('\n', fp);
163 	}
164 }
165 #endif /* SLRE_TEST */
166 
167 static void
set_jump_offset(struct slre * r,int pc,int offset)168 set_jump_offset(struct slre *r, int pc, int offset)
169 {
170 	assert(offset < r->code_size);
171 
172 	if (r->code_size - offset > 0xff)
173 		r->err_str = "Jump offset is too big";
174 	else
175 		r->code[pc] = (unsigned char) (r->code_size - offset);
176 }
177 
178 static void
emit(struct slre * r,int code)179 emit(struct slre *r, int code)
180 {
181 	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
182 		r->err_str = "RE is too long (code overflow)";
183 	else
184 		r->code[r->code_size++] = (unsigned char) code;
185 }
186 
187 static void
store_char_in_data(struct slre * r,int ch)188 store_char_in_data(struct slre *r, int ch)
189 {
190 	if (r->data_size >= (int) sizeof(r->data))
191 		r->err_str = "RE is too long (data overflow)";
192 	else
193 		r->data[r->data_size++] = ch;
194 }
195 
196 static void
exact(struct slre * r,const char ** re)197 exact(struct slre *r, const char **re)
198 {
199 	int	old_data_size = r->data_size;
200 
201 	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
202 		store_char_in_data(r, *(*re)++);
203 
204 	emit(r, EXACT);
205 	emit(r, old_data_size);
206 	emit(r, r->data_size - old_data_size);
207 }
208 
209 static int
get_escape_char(const char ** re)210 get_escape_char(const char **re)
211 {
212 	int	res;
213 
214 	switch (*(*re)++) {
215 	case 'n':
216 		res = '\n';
217 		break;
218 	case 'r':
219 		res = '\r';
220 		break;
221 	case 't':
222 		res = '\t';
223 		break;
224 	case '0':
225 		res = 0;
226 		break;
227 	case 'S':
228 		res = NONSPACE << 8;
229 		break;
230 	case 's':
231 		res = SPACE << 8;
232 		break;
233 	case 'd':
234 		res = DIGIT << 8;
235 		break;
236 	default:
237 		res = (*re)[-1];
238 		break;
239 	}
240 
241 	return res;
242 }
243 
244 static void
anyof(struct slre * r,const char ** re)245 anyof(struct slre *r, const char **re)
246 {
247 	int	esc, old_data_size = r->data_size, op = ANYOF;
248 
249 	if (**re == '^') {
250 		op = ANYBUT;
251 		(*re)++;
252 	}
253 
254 	while (**re != '\0')
255 
256 		switch (*(*re)++) {
257 		case ']':
258 			emit(r, op);
259 			emit(r, old_data_size);
260 			emit(r, r->data_size - old_data_size);
261 			return;
262 			/* NOTREACHED */
263 			break;
264 		case '-':
265 			if (r->data_size == old_data_size || **re == ']') {
266 				/* First or last character, just match - itself. */
267 				store_char_in_data(r, '-');
268 				break;
269 			}
270 			store_char_in_data(r, 0);
271 			store_char_in_data(r, RANGE);
272 			break;
273 		case '\\':
274 			esc = get_escape_char(re);
275 			if ((esc & 0xff) == 0) {
276 				store_char_in_data(r, 0);
277 				store_char_in_data(r, esc >> 8);
278 			} else {
279 				store_char_in_data(r, esc);
280 			}
281 			break;
282 		default:
283 			store_char_in_data(r, (*re)[-1]);
284 			break;
285 		}
286 
287 	r->err_str = "No closing ']' bracket";
288 }
289 
290 static void
relocate(struct slre * r,int begin,int shift)291 relocate(struct slre *r, int begin, int shift)
292 {
293 	emit(r, END);
294 	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
295 	r->code_size += shift;
296 }
297 
298 static void
quantifier(struct slre * r,int prev,int op)299 quantifier(struct slre *r, int prev, int op)
300 {
301 	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
302 		r->code[prev + 2]--;
303 		emit(r, EXACT);
304 		emit(r, r->code[prev + 1] + r->code[prev + 2]);
305 		emit(r, 1);
306 		prev = r->code_size - 3;
307 	}
308 	relocate(r, prev, 2);
309 	r->code[prev] = op;
310 	set_jump_offset(r, prev + 1, prev);
311 }
312 
313 static void
exact_one_char(struct slre * r,int ch)314 exact_one_char(struct slre *r, int ch)
315 {
316 	emit(r, EXACT);
317 	emit(r, r->data_size);
318 	emit(r, 1);
319 	store_char_in_data(r, ch);
320 }
321 
322 static void
fixup_branch(struct slre * r,int fixup)323 fixup_branch(struct slre *r, int fixup)
324 {
325 	if (fixup > 0) {
326 		emit(r, END);
327 		set_jump_offset(r, fixup, fixup - 2);
328 	}
329 }
330 
331 static void
compile(struct slre * r,const char ** re)332 compile(struct slre *r, const char **re)
333 {
334 	int	op, esc, branch_start, last_op, fixup, cap_no, level;
335 
336 	fixup = 0;
337 	level = r->num_caps;
338 	branch_start = last_op = r->code_size;
339 
340 	for (;;)
341 		switch (*(*re)++) {
342 		case '\0':
343 			(*re)--;
344 			return;
345 			/* NOTREACHED */
346 			break;
347 		case '^':
348 			emit(r, BOL);
349 			break;
350 		case '$':
351 			emit(r, EOL);
352 			break;
353 		case '.':
354 			last_op = r->code_size;
355 			emit(r, ANY);
356 			break;
357 		case '[':
358 			last_op = r->code_size;
359 			anyof(r, re);
360 			break;
361 		case '\\':
362 			last_op = r->code_size;
363 			esc = get_escape_char(re);
364 			if (esc & 0xff00)
365 				emit(r, esc >> 8);
366 			else
367 				exact_one_char(r, esc);
368 			break;
369 		case '(':
370 			last_op = r->code_size;
371 			cap_no = ++r->num_caps;
372 			emit(r, OPEN);
373 			emit(r, cap_no);
374 
375 			compile(r, re);
376 			if (*(*re)++ != ')') {
377 				r->err_str = "No closing bracket";
378 				return;
379 			}
380 
381 			emit(r, CLOSE);
382 			emit(r, cap_no);
383 			break;
384 		case ')':
385 			(*re)--;
386 			fixup_branch(r, fixup);
387 			if (level == 0) {
388 				r->err_str = "Unbalanced brackets";
389 				return;
390 			}
391 			return;
392 			/* NOTREACHED */
393 			break;
394 		case '+':
395 		case '*':
396 			op = (*re)[-1] == '*' ? STAR : PLUS;
397 			if (**re == '?') {
398 				(*re)++;
399 				op = op == STAR ? STARQ : PLUSQ;
400 			}
401 			quantifier(r, last_op, op);
402 			break;
403 		case '?':
404 			quantifier(r, last_op, QUEST);
405 			break;
406 		case '|':
407 			fixup_branch(r, fixup);
408 			relocate(r, branch_start, 3);
409 			r->code[branch_start] = BRANCH;
410 			set_jump_offset(r, branch_start + 1, branch_start);
411 			fixup = branch_start + 2;
412 			r->code[fixup] = 0xff;
413 			break;
414 		default:
415 			(*re)--;
416 			last_op = r->code_size;
417 			exact(r, re);
418 			break;
419 		}
420 }
421 
422 int
slre_compile(struct slre * r,const char * re)423 slre_compile(struct slre *r, const char *re)
424 {
425 	r->err_str = NULL;
426 	r->code_size = r->data_size = r->num_caps = 0;
427 
428 	emit(r, OPEN);	/* This will capture what matches full RE */
429 	emit(r, 0);
430 
431 	while (*re != '\0')
432 		compile(r, &re);
433 
434 	if (r->code[2] == BRANCH)
435 		fixup_branch(r, 4);
436 
437 	emit(r, CLOSE);
438 	emit(r, 0);
439 	emit(r, END);
440 
441 	return (r->err_str == NULL ? 1 : 0);
442 }
443 
444 static int match(const struct slre *, int,
445 		const char *, int, int *, struct cap *);
446 
447 static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)448 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
449 {
450 	int	saved_offset, matched_offset;
451 
452 	matched_offset = *ofs;
453 
454 	while (match(r, pc + 2, s, len, ofs, NULL)) {
455 		saved_offset = *ofs;
456 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
457 			matched_offset = saved_offset;
458 		*ofs = saved_offset;
459 	}
460 
461 	*ofs = matched_offset;
462 }
463 
464 static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)465 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
466 {
467 	int	saved_offset = *ofs;
468 
469 	while (match(r, pc + 2, s, len, ofs, NULL)) {
470 		saved_offset = *ofs;
471 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
472 			break;
473 	}
474 
475 	*ofs = saved_offset;
476 }
477 
478 static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)479 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
480 {
481 	int	i, ch;
482 
483 	ch = s[*ofs];
484 
485 	for (i = 0; i < len; i++) {
486 		if (p[i] == '\0') {
487 			switch (p[++i]) {
488 			case NONSPACE:
489 				if (!isspace(ch))
490 					goto match;
491 				break;
492 			case SPACE:
493 				if (isspace(ch))
494 					goto match;
495 				break;
496 			case DIGIT:
497 				if (isdigit(ch))
498 					goto match;
499 				break;
500 			case RANGE:
501 				/*
502 				 * a-z is represented in the data array as {'a', \0, RANGE, 'z'}
503 				 */
504 				++i;
505 				if (p[i - 3] <= (unsigned char)ch && (unsigned char)ch <= p[i])
506 					goto match;
507 				break;
508 			}
509 			continue;
510 		}
511 		if (p[i] == ch)
512 			goto match;
513 	}
514 
515 	return 0;
516 
517 match:
518 	(*ofs)++;
519 	return 1;
520 }
521 
522 static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)523 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
524 {
525 	int	dummy = *ofs;
526 
527 	if (is_any_of(p, len, s, &dummy)) {
528 		return 0;
529 	} else {
530 		(*ofs)++;
531 		return 1;
532 	}
533 }
534 
535 static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)536 match(const struct slre *r, int pc, const char *s, int len,
537 		int *ofs, struct cap *caps)
538 {
539 	int	n, saved_offset, res = 1;
540 
541 	while (res && r->code[pc] != END) {
542 
543 		assert(pc < r->code_size);
544 		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
545 
546 		switch (r->code[pc]) {
547 		case BRANCH:
548 			saved_offset = *ofs;
549 			res = match(r, pc + 3, s, len, ofs, caps);
550 			if (res == 0) {
551 				*ofs = saved_offset;
552 				res = match(r, pc + r->code[pc + 1],
553 				    s, len, ofs, caps);
554 			}
555 			pc += r->code[pc + 2];
556 			break;
557 		case EXACT:
558 			res = 0;
559 			n = r->code[pc + 2];	/* String length */
560 			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
561 			    r->code[pc + 1], n)) {
562 				(*ofs) += n;
563 				res = 1;
564 			}
565 			pc += 3;
566 			break;
567 		case QUEST:
568 			res = 1;
569 			saved_offset = *ofs;
570 			if (!match(r, pc + 2, s, len, ofs, caps))
571 				*ofs = saved_offset;
572 			pc += r->code[pc + 1];
573 			break;
574 		case STAR:
575 			res = 1;
576 			loop_greedy(r, pc, s, len, ofs);
577 			pc += r->code[pc + 1];
578 			break;
579 		case STARQ:
580 			res = 1;
581 			loop_non_greedy(r, pc, s, len, ofs);
582 			pc += r->code[pc + 1];
583 			break;
584 		case PLUS:
585 			res = match(r, pc + 2, s, len, ofs, caps);
586 			if (res == 0)
587 				break;
588 
589 			loop_greedy(r, pc, s, len, ofs);
590 			pc += r->code[pc + 1];
591 			break;
592 		case PLUSQ:
593 			res = match(r, pc + 2, s, len, ofs, caps);
594 			if (res == 0)
595 				break;
596 
597 			loop_non_greedy(r, pc, s, len, ofs);
598 			pc += r->code[pc + 1];
599 			break;
600 		case SPACE:
601 			res = 0;
602 			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
603 				(*ofs)++;
604 				res = 1;
605 			}
606 			pc++;
607 			break;
608 		case NONSPACE:
609 			res = 0;
610 			if (*ofs < len &&
611 					!isspace(((unsigned char *)s)[*ofs])) {
612 				(*ofs)++;
613 				res = 1;
614 			}
615 			pc++;
616 			break;
617 		case DIGIT:
618 			res = 0;
619 			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
620 				(*ofs)++;
621 				res = 1;
622 			}
623 			pc++;
624 			break;
625 		case ANY:
626 			res = 0;
627 			if (*ofs < len) {
628 				(*ofs)++;
629 				res = 1;
630 			}
631 			pc++;
632 			break;
633 		case ANYOF:
634 			res = 0;
635 			if (*ofs < len)
636 				res = is_any_of(r->data + r->code[pc + 1],
637 					r->code[pc + 2], s, ofs);
638 			pc += 3;
639 			break;
640 		case ANYBUT:
641 			res = 0;
642 			if (*ofs < len)
643 				res = is_any_but(r->data + r->code[pc + 1],
644 					r->code[pc + 2], s, ofs);
645 			pc += 3;
646 			break;
647 		case BOL:
648 			res = *ofs == 0 ? 1 : 0;
649 			pc++;
650 			break;
651 		case EOL:
652 			res = *ofs == len ? 1 : 0;
653 			pc++;
654 			break;
655 		case OPEN:
656 			if (caps != NULL)
657 				caps[r->code[pc + 1]].ptr = s + *ofs;
658 			pc += 2;
659 			break;
660 		case CLOSE:
661 			if (caps != NULL)
662 				caps[r->code[pc + 1]].len = (s + *ofs) -
663 				    caps[r->code[pc + 1]].ptr;
664 			pc += 2;
665 			break;
666 		case END:
667 			pc++;
668 			break;
669 		default:
670 			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
671 			assert(0);
672 			break;
673 		}
674 	}
675 
676 	return res;
677 }
678 
679 int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)680 slre_match(const struct slre *r, const char *buf, int len,
681 		struct cap *caps)
682 {
683 	int	i, ofs = 0, res = 0;
684 
685 	for (i = 0; i <= len && res == 0; i++) {
686 		ofs = i;
687 		res = match(r, 0, buf, len, &ofs, caps);
688 	}
689 
690 	return res;
691 }
692 
693 #ifdef SLRE_TEST
694 #define N_CAPS	5
695 
main(int argc,char * argv[])696 int main(int argc, char *argv[])
697 {
698 	struct slre	slre;
699 	struct cap	caps[N_CAPS];
700 	unsigned char	data[1 * 1024 * 1024];
701 	FILE		*fp;
702 	int		i, res, len;
703 
704 	if (argc < 2) {
705 		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
706 		return 1;
707 	}
708 
709 	fp = fopen(argv[2], "rb");
710 	if (fp == NULL) {
711 		fprintf(stderr, "Error: cannot open %s:%s\n",
712 			argv[2], strerror(errno));
713 		return 1;
714 	}
715 
716 	if (!slre_compile(&slre, argv[1])) {
717 		(void) fclose(fp);
718 		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
719 		return 1;
720 	}
721 
722 	slre_dump(&slre, stderr);
723 
724 	while (fgets(data, sizeof(data), fp) != NULL) {
725 		len = strlen(data);
726 
727 		if ((len > 0) && (data[len-1] == '\n')) {
728 			data[len-1] = '\0';
729 			--len;
730 		}
731 
732 		printf("Data = \"%s\"\n", data);
733 
734 		(void) memset(caps, 0, sizeof(caps));
735 
736 		res = slre_match(&slre, data, len, caps);
737 		printf("Result [%d]: %d\n", i, res);
738 
739 		for (i = 0; i < N_CAPS; i++) {
740 			if (caps[i].len > 0) {
741 				printf("Substring %d: len=%d  [%.*s]\n", i,
742 					caps[i].len,
743 					caps[i].len, caps[i].ptr);
744 			}
745 		}
746 		printf("----------------------------------------------------\n");
747 	}
748 	(void) fclose(fp);
749 
750 	return 0;
751 }
752 #endif /* SLRE_TEST */
753