1 // Copyright 2014 Paul Sokolovsky.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "re1.5.h"
6
7 #define INSERT_CODE(at, num, pc) \
8 ((code ? memmove(code + at + num, code + at, pc - at) : 0), pc += num)
9 #define REL(at, to) (to - at - 2)
10 #define EMIT(at, byte) (code ? (code[at] = byte) : (at))
11 #define EMIT_CHECKED(at, byte) (_emit_checked(at, code, byte, &err))
12 #define PC (prog->bytelen)
13
_emit_checked(int at,char * code,int val,bool * err)14 static void _emit_checked(int at, char *code, int val, bool *err) {
15 *err |= val != (int8_t)val;
16 if (code) {
17 code[at] = val;
18 }
19 }
20
_compilecode(const char * re,ByteProg * prog,int sizecode)21 static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
22 {
23 char *code = sizecode ? NULL : prog->insts;
24 bool err = false;
25 int start = PC;
26 int term = PC;
27 int alt_label = 0;
28
29 for (; *re && *re != ')'; re++) {
30 switch (*re) {
31 case '\\':
32 re++;
33 if (!*re) return NULL; // Trailing backslash
34 if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
35 term = PC;
36 EMIT(PC++, NamedClass);
37 EMIT(PC++, *re);
38 prog->len++;
39 break;
40 }
41 MP_FALLTHROUGH
42 default:
43 term = PC;
44 EMIT(PC++, Char);
45 EMIT(PC++, *re);
46 prog->len++;
47 break;
48 case '.':
49 term = PC;
50 EMIT(PC++, Any);
51 prog->len++;
52 break;
53 case '[': {
54 int cnt;
55 term = PC;
56 re++;
57 if (*re == '^') {
58 EMIT(PC++, ClassNot);
59 re++;
60 } else {
61 EMIT(PC++, Class);
62 }
63 PC++; // Skip # of pair byte
64 prog->len++;
65 for (cnt = 0; *re != ']'; re++, cnt++) {
66 if (*re == '\\') {
67 ++re;
68 }
69 if (!*re) return NULL;
70 EMIT(PC++, *re);
71 if (re[1] == '-' && re[2] != ']') {
72 re += 2;
73 }
74 EMIT(PC++, *re);
75 }
76 EMIT_CHECKED(term + 1, cnt);
77 break;
78 }
79 case '(': {
80 term = PC;
81 int sub = 0;
82 int capture = re[1] != '?' || re[2] != ':';
83
84 if (capture) {
85 sub = ++prog->sub;
86 EMIT(PC++, Save);
87 EMIT_CHECKED(PC++, 2 * sub);
88 prog->len++;
89 } else {
90 re += 2;
91 }
92
93 re = _compilecode(re + 1, prog, sizecode);
94 if (re == NULL || *re != ')') return NULL; // error, or no matching paren
95
96 if (capture) {
97 EMIT(PC++, Save);
98 EMIT_CHECKED(PC++, 2 * sub + 1);
99 prog->len++;
100 }
101
102 break;
103 }
104 case '?':
105 if (PC == term) return NULL; // nothing to repeat
106 INSERT_CODE(term, 2, PC);
107 if (re[1] == '?') {
108 EMIT(term, RSplit);
109 re++;
110 } else {
111 EMIT(term, Split);
112 }
113 EMIT_CHECKED(term + 1, REL(term, PC));
114 prog->len++;
115 term = PC;
116 break;
117 case '*':
118 if (PC == term) return NULL; // nothing to repeat
119 INSERT_CODE(term, 2, PC);
120 EMIT(PC, Jmp);
121 EMIT_CHECKED(PC + 1, REL(PC, term));
122 PC += 2;
123 if (re[1] == '?') {
124 EMIT(term, RSplit);
125 re++;
126 } else {
127 EMIT(term, Split);
128 }
129 EMIT_CHECKED(term + 1, REL(term, PC));
130 prog->len += 2;
131 term = PC;
132 break;
133 case '+':
134 if (PC == term) return NULL; // nothing to repeat
135 if (re[1] == '?') {
136 EMIT(PC, Split);
137 re++;
138 } else {
139 EMIT(PC, RSplit);
140 }
141 EMIT_CHECKED(PC + 1, REL(PC, term));
142 PC += 2;
143 prog->len++;
144 term = PC;
145 break;
146 case '|':
147 if (alt_label) {
148 EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
149 }
150 INSERT_CODE(start, 2, PC);
151 EMIT(PC++, Jmp);
152 alt_label = PC++;
153 EMIT(start, Split);
154 EMIT_CHECKED(start + 1, REL(start, PC));
155 prog->len += 2;
156 term = PC;
157 break;
158 case '^':
159 EMIT(PC++, Bol);
160 prog->len++;
161 term = PC;
162 break;
163 case '$':
164 EMIT(PC++, Eol);
165 prog->len++;
166 term = PC;
167 break;
168 }
169 }
170
171 if (alt_label) {
172 EMIT_CHECKED(alt_label, REL(alt_label, PC) + 1);
173 }
174 return err ? NULL : re;
175 }
176
re1_5_sizecode(const char * re)177 int re1_5_sizecode(const char *re)
178 {
179 ByteProg dummyprog = {
180 // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
181 .bytelen = 5 + NON_ANCHORED_PREFIX
182 };
183
184 if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
185
186 return dummyprog.bytelen;
187 }
188
re1_5_compilecode(ByteProg * prog,const char * re)189 int re1_5_compilecode(ByteProg *prog, const char *re)
190 {
191 prog->len = 0;
192 prog->bytelen = 0;
193 prog->sub = 0;
194
195 // Add code to implement non-anchored operation ("search"),
196 // for anchored operation ("match"), this code will be just skipped.
197 // TODO: Implement search in much more efficient manner
198 prog->insts[prog->bytelen++] = RSplit;
199 prog->insts[prog->bytelen++] = 3;
200 prog->insts[prog->bytelen++] = Any;
201 prog->insts[prog->bytelen++] = Jmp;
202 prog->insts[prog->bytelen++] = -5;
203 prog->len += 3;
204
205 prog->insts[prog->bytelen++] = Save;
206 prog->insts[prog->bytelen++] = 0;
207 prog->len++;
208
209 re = _compilecode(re, prog, /*sizecode*/0);
210 if (re == NULL || *re) return 1;
211
212 prog->insts[prog->bytelen++] = Save;
213 prog->insts[prog->bytelen++] = 1;
214 prog->len++;
215
216 prog->insts[prog->bytelen++] = Match;
217 prog->len++;
218
219 return 0;
220 }
221
222 #if 0
223 int main(int argc, char *argv[])
224 {
225 int pc = 0;
226 ByteProg *code = re1_5_compilecode(argv[1]);
227 re1_5_dumpcode(code);
228 }
229 #endif
230