1 /*
2 * This file is part of the MicroPython project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <assert.h>
30
31 #include "py/reader.h"
32 #include "py/lexer.h"
33 #include "py/runtime.h"
34
35 #if MICROPY_ENABLE_COMPILER
36
// Width of a tab stop, in columns; next_char() advances lex->column to the
// next tab stop when it consumes a '\t'.
#define TAB_SIZE (8)

// TODO seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it

// End-of-input marker as seen by the lexer: the reader's EOF value cast to unichar.
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The current character, ie the head of the chr0/chr1/chr2 lookahead queue.
#define CUR_CHAR(lex) ((lex)->chr0)
44
is_end(mp_lexer_t * lex)45 STATIC bool is_end(mp_lexer_t *lex) {
46 return lex->chr0 == MP_LEXER_EOF;
47 }
48
is_physical_newline(mp_lexer_t * lex)49 STATIC bool is_physical_newline(mp_lexer_t *lex) {
50 return lex->chr0 == '\n';
51 }
52
is_char(mp_lexer_t * lex,byte c)53 STATIC bool is_char(mp_lexer_t *lex, byte c) {
54 return lex->chr0 == c;
55 }
56
is_char_or(mp_lexer_t * lex,byte c1,byte c2)57 STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
58 return lex->chr0 == c1 || lex->chr0 == c2;
59 }
60
is_char_or3(mp_lexer_t * lex,byte c1,byte c2,byte c3)61 STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
62 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
63 }
64
65 #if MICROPY_PY_FSTRINGS
is_char_or4(mp_lexer_t * lex,byte c1,byte c2,byte c3,byte c4)66 STATIC bool is_char_or4(mp_lexer_t *lex, byte c1, byte c2, byte c3, byte c4) {
67 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3 || lex->chr0 == c4;
68 }
69 #endif
70
is_char_following(mp_lexer_t * lex,byte c)71 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
72 return lex->chr1 == c;
73 }
74
is_char_following_or(mp_lexer_t * lex,byte c1,byte c2)75 STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
76 return lex->chr1 == c1 || lex->chr1 == c2;
77 }
78
is_char_following_following_or(mp_lexer_t * lex,byte c1,byte c2)79 STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
80 return lex->chr2 == c1 || lex->chr2 == c2;
81 }
82
is_char_and(mp_lexer_t * lex,byte c1,byte c2)83 STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
84 return lex->chr0 == c1 && lex->chr1 == c2;
85 }
86
is_whitespace(mp_lexer_t * lex)87 STATIC bool is_whitespace(mp_lexer_t *lex) {
88 return unichar_isspace(lex->chr0);
89 }
90
is_letter(mp_lexer_t * lex)91 STATIC bool is_letter(mp_lexer_t *lex) {
92 return unichar_isalpha(lex->chr0);
93 }
94
is_digit(mp_lexer_t * lex)95 STATIC bool is_digit(mp_lexer_t *lex) {
96 return unichar_isdigit(lex->chr0);
97 }
98
is_following_digit(mp_lexer_t * lex)99 STATIC bool is_following_digit(mp_lexer_t *lex) {
100 return unichar_isdigit(lex->chr1);
101 }
102
is_following_base_char(mp_lexer_t * lex)103 STATIC bool is_following_base_char(mp_lexer_t *lex) {
104 const unichar chr1 = lex->chr1 | 0x20;
105 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
106 }
107
is_following_odigit(mp_lexer_t * lex)108 STATIC bool is_following_odigit(mp_lexer_t *lex) {
109 return lex->chr1 >= '0' && lex->chr1 <= '7';
110 }
111
is_string_or_bytes(mp_lexer_t * lex)112 STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
113 return is_char_or(lex, '\'', '\"')
114 #if MICROPY_PY_FSTRINGS
115 || (is_char_or4(lex, 'r', 'u', 'b', 'f') && is_char_following_or(lex, '\'', '\"'))
116 || (((is_char_and(lex, 'r', 'f') || is_char_and(lex, 'f', 'r'))
117 && is_char_following_following_or(lex, '\'', '\"')))
118 #else
119 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
120 #endif
121 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
122 && is_char_following_following_or(lex, '\'', '\"'));
123 }
124
125 // to easily parse utf-8 identifiers we allow any raw byte with high bit set
is_head_of_identifier(mp_lexer_t * lex)126 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
127 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
128 }
129
is_tail_of_identifier(mp_lexer_t * lex)130 STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
131 return is_head_of_identifier(lex) || is_digit(lex);
132 }
133
next_char(mp_lexer_t * lex)134 STATIC void next_char(mp_lexer_t *lex) {
135 if (lex->chr0 == '\n') {
136 // a new line
137 ++lex->line;
138 lex->column = 1;
139 } else if (lex->chr0 == '\t') {
140 // a tab
141 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
142 } else {
143 // a character worth one column
144 ++lex->column;
145 }
146
147 // shift the input queue forward
148 lex->chr0 = lex->chr1;
149 lex->chr1 = lex->chr2;
150
151 // and add the next byte from either the fstring args or the reader
152 #if MICROPY_PY_FSTRINGS
153 if (lex->fstring_args_idx) {
154 // if there are saved chars, then we're currently injecting fstring args
155 if (lex->fstring_args_idx < lex->fstring_args.len) {
156 lex->chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
157 } else {
158 // no more fstring arg bytes
159 lex->chr2 = '\0';
160 }
161
162 if (lex->chr0 == '\0') {
163 // consumed all fstring data, restore saved input queue
164 lex->chr0 = lex->chr0_saved;
165 lex->chr1 = lex->chr1_saved;
166 lex->chr2 = lex->chr2_saved;
167 // stop consuming fstring arg data
168 vstr_reset(&lex->fstring_args);
169 lex->fstring_args_idx = 0;
170 }
171 } else
172 #endif
173 {
174 lex->chr2 = lex->reader.readbyte(lex->reader.data);
175 }
176
177 if (lex->chr1 == '\r') {
178 // CR is a new line, converted to LF
179 lex->chr1 = '\n';
180 if (lex->chr2 == '\n') {
181 // CR LF is a single new line, throw out the extra LF
182 lex->chr2 = lex->reader.readbyte(lex->reader.data);
183 }
184 }
185
186 // check if we need to insert a newline at end of file
187 if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
188 lex->chr2 = '\n';
189 }
190 }
191
indent_push(mp_lexer_t * lex,size_t indent)192 STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
193 if (lex->num_indent_level >= lex->alloc_indent_level) {
194 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
195 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
196 }
197 lex->indent_level[lex->num_indent_level++] = indent;
198 }
199
indent_top(mp_lexer_t * lex)200 STATIC size_t indent_top(mp_lexer_t *lex) {
201 return lex->indent_level[lex->num_indent_level - 1];
202 }
203
indent_pop(mp_lexer_t * lex)204 STATIC void indent_pop(mp_lexer_t *lex) {
205 lex->num_indent_level -= 1;
206 }
207
// Compact encoding of the delimiter and operator tokens, decoded by
// mp_lexer_to_next().  The string is a sequence of entries, each being:
//     <op>    a bare opchar: begins a new token (or group of tokens) here
//     e<op>   end with <op>: if the next input char is <op> then the token ends with it
//     c<op>   continue with <op>: if the next input char is <op> then keep matching
//             the entries that follow, otherwise matching stops
// This means that if two ops start the same then they share a prefix up to
// their last char.
// Every entry up to (but not including) the special cases at the end has a
// corresponding element, in the same order, in tok_enc_kind.

STATIC const char *const tok_enc =
    "()[]{},;~"   // singles
    ":e="         // : :=
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "@e="         // @ @=
    "=e="         // = ==
    "!.";         // start of special cases: != . ...
230
// Token kind for each entry of tok_enc, in the same order; indexed by the
// entry number computed while decoding tok_enc in mp_lexer_to_next().
// The special-case characters at the end of tok_enc have no element here.
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
    // singles: ( ) [ ] { } , ; ~
    MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_OP_TILDE,

    // one line per group of tok_enc
    MP_TOKEN_DEL_COLON, MP_TOKEN_OP_ASSIGN,
    MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
    MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
    MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_OP_AT, MP_TOKEN_DEL_AT_EQUAL,
    MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
};
252
// Table of keywords, searched linearly by mp_lexer_to_next(), which maps a
// match at index i to the token kind MP_TOKEN_KW_FALSE + i.
// must have the same order as enum in lexer.h
// must be sorted according to strcmp (the search stops early relying on this)
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "__debug__",
    "and",
    "as",
    "assert",
    #if MICROPY_PY_ASYNC_AWAIT
    "async",
    "await",
    #endif
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
};
295
296 // This is called with CUR_CHAR() before first hex digit, and should return with
297 // it pointing to last hex digit
298 // num_digits must be greater than zero
get_hex(mp_lexer_t * lex,size_t num_digits,mp_uint_t * result)299 STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
300 mp_uint_t num = 0;
301 while (num_digits-- != 0) {
302 next_char(lex);
303 unichar c = CUR_CHAR(lex);
304 if (!unichar_isxdigit(c)) {
305 return false;
306 }
307 num = (num << 4) + unichar_xdigit_value(c);
308 }
309 *result = num;
310 return true;
311 }
312
parse_string_literal(mp_lexer_t * lex,bool is_raw,bool is_fstring)313 STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) {
314 // get first quoting character
315 char quote_char = '\'';
316 if (is_char(lex, '\"')) {
317 quote_char = '\"';
318 }
319 next_char(lex);
320
321 // work out if it's a single or triple quoted literal
322 size_t num_quotes;
323 if (is_char_and(lex, quote_char, quote_char)) {
324 // triple quotes
325 next_char(lex);
326 next_char(lex);
327 num_quotes = 3;
328 } else {
329 // single quotes
330 num_quotes = 1;
331 }
332
333 size_t n_closing = 0;
334 #if MICROPY_PY_FSTRINGS
335 if (is_fstring) {
336 // assume there's going to be interpolation, so prep the injection data
337 // fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
338 // only when fstring_args_idx>0 will we consume the arg data
339 // note: lex->fstring_args will be empty already (it's reset when finished)
340 vstr_add_str(&lex->fstring_args, ".format(");
341 }
342 #endif
343
344 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
345 if (is_char(lex, quote_char)) {
346 n_closing += 1;
347 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
348 } else {
349 n_closing = 0;
350
351 #if MICROPY_PY_FSTRINGS
352 while (is_fstring && is_char(lex, '{')) {
353 next_char(lex);
354 if (is_char(lex, '{')) {
355 // "{{" is passed through unchanged to be handled by str.format
356 vstr_add_byte(&lex->vstr, '{');
357 next_char(lex);
358 } else {
359 // remember the start of this argument (if we need it for f'{a=}').
360 size_t i = lex->fstring_args.len;
361 // extract characters inside the { until we reach the
362 // format specifier or closing }.
363 // (MicroPython limitation) note: this is completely unaware of
364 // Python syntax and will not handle any expression containing '}' or ':'.
365 // e.g. f'{"}"}' or f'{foo({})}'.
366 while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
367 // like the default case at the end of this function, stay 8-bit clean
368 vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
369 next_char(lex);
370 }
371 if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
372 // if the last character of the arg was '=', then inject "arg=" before the '{'.
373 // f'{a=}' --> 'a={}'.format(a)
374 vstr_add_strn(&lex->vstr, lex->fstring_args.buf + i, lex->fstring_args.len - i);
375 // remove the trailing '='
376 lex->fstring_args.len--;
377 }
378 // comma-separate args
379 vstr_add_byte(&lex->fstring_args, ',');
380 }
381 vstr_add_byte(&lex->vstr, '{');
382 }
383 #endif
384
385 if (is_char(lex, '\\')) {
386 next_char(lex);
387 unichar c = CUR_CHAR(lex);
388 if (is_raw) {
389 // raw strings allow escaping of quotes, but the backslash is also emitted
390 vstr_add_char(&lex->vstr, '\\');
391 } else {
392 switch (c) {
393 // note: "c" can never be MP_LEXER_EOF because next_char
394 // always inserts a newline at the end of the input stream
395 case '\n':
396 c = MP_LEXER_EOF;
397 break; // backslash escape the newline, just ignore it
398 case '\\':
399 break;
400 case '\'':
401 break;
402 case '"':
403 break;
404 case 'a':
405 c = 0x07;
406 break;
407 case 'b':
408 c = 0x08;
409 break;
410 case 't':
411 c = 0x09;
412 break;
413 case 'n':
414 c = 0x0a;
415 break;
416 case 'v':
417 c = 0x0b;
418 break;
419 case 'f':
420 c = 0x0c;
421 break;
422 case 'r':
423 c = 0x0d;
424 break;
425 case 'u':
426 case 'U':
427 if (lex->tok_kind == MP_TOKEN_BYTES) {
428 // b'\u1234' == b'\\u1234'
429 vstr_add_char(&lex->vstr, '\\');
430 break;
431 }
432 // Otherwise fall through.
433 MP_FALLTHROUGH
434 case 'x': {
435 mp_uint_t num = 0;
436 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
437 // not enough hex chars for escape sequence
438 lex->tok_kind = MP_TOKEN_INVALID;
439 }
440 c = num;
441 break;
442 }
443 case 'N':
444 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
445 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
446 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
447 // roughly half a meg of storage. This form of Unicode escape may be added
448 // later on, but it's definitely not a priority right now. -- CJA 20140607
449 mp_raise_NotImplementedError(MP_ERROR_TEXT("unicode name escapes"));
450 break;
451 default:
452 if (c >= '0' && c <= '7') {
453 // Octal sequence, 1-3 chars
454 size_t digits = 3;
455 mp_uint_t num = c - '0';
456 while (is_following_odigit(lex) && --digits != 0) {
457 next_char(lex);
458 num = num * 8 + (CUR_CHAR(lex) - '0');
459 }
460 c = num;
461 } else {
462 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
463 vstr_add_char(&lex->vstr, '\\');
464 }
465 break;
466 }
467 }
468 if (c != MP_LEXER_EOF) {
469 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
470 if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
471 vstr_add_char(&lex->vstr, c);
472 } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
473 vstr_add_byte(&lex->vstr, c);
474 } else {
475 // unicode character out of range
476 // this raises a generic SyntaxError; could provide more info
477 lex->tok_kind = MP_TOKEN_INVALID;
478 }
479 } else {
480 // without unicode everything is just added as an 8-bit byte
481 if (c < 0x100) {
482 vstr_add_byte(&lex->vstr, c);
483 } else {
484 // 8-bit character out of range
485 // this raises a generic SyntaxError; could provide more info
486 lex->tok_kind = MP_TOKEN_INVALID;
487 }
488 }
489 }
490 } else {
491 // Add the "character" as a byte so that we remain 8-bit clean.
492 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
493 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
494 }
495 }
496 next_char(lex);
497 }
498
499 // check we got the required end quotes
500 if (n_closing < num_quotes) {
501 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
502 }
503
504 // cut off the end quotes from the token text
505 vstr_cut_tail_bytes(&lex->vstr, n_closing);
506 }
507
skip_whitespace(mp_lexer_t * lex,bool stop_at_newline)508 STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
509 bool had_physical_newline = false;
510 while (!is_end(lex)) {
511 if (is_physical_newline(lex)) {
512 if (stop_at_newline && lex->nested_bracket_level == 0) {
513 break;
514 }
515 had_physical_newline = true;
516 next_char(lex);
517 } else if (is_whitespace(lex)) {
518 next_char(lex);
519 } else if (is_char(lex, '#')) {
520 next_char(lex);
521 while (!is_end(lex) && !is_physical_newline(lex)) {
522 next_char(lex);
523 }
524 // had_physical_newline will be set on next loop
525 } else if (is_char_and(lex, '\\', '\n')) {
526 // line-continuation, so don't set had_physical_newline
527 next_char(lex);
528 next_char(lex);
529 } else {
530 break;
531 }
532 }
533 return had_physical_newline;
534 }
535
mp_lexer_to_next(mp_lexer_t * lex)536 void mp_lexer_to_next(mp_lexer_t *lex) {
537 #if MICROPY_PY_FSTRINGS
538 if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
539 // moving onto the next token means the literal string is complete.
540 // switch into injecting the format args.
541 vstr_add_byte(&lex->fstring_args, ')');
542 lex->chr0_saved = lex->chr0;
543 lex->chr1_saved = lex->chr1;
544 lex->chr2_saved = lex->chr2;
545 lex->chr0 = lex->fstring_args.buf[0];
546 lex->chr1 = lex->fstring_args.buf[1];
547 lex->chr2 = lex->fstring_args.buf[2];
548 // we've already extracted 3 chars, but setting this non-zero also
549 // means we'll start consuming the fstring data
550 lex->fstring_args_idx = 3;
551 }
552 #endif
553
554 // start new token text
555 vstr_reset(&lex->vstr);
556
557 // skip white space and comments
558 bool had_physical_newline = skip_whitespace(lex, false);
559
560 // set token source information
561 lex->tok_line = lex->line;
562 lex->tok_column = lex->column;
563
564 if (lex->emit_dent < 0) {
565 lex->tok_kind = MP_TOKEN_DEDENT;
566 lex->emit_dent += 1;
567
568 } else if (lex->emit_dent > 0) {
569 lex->tok_kind = MP_TOKEN_INDENT;
570 lex->emit_dent -= 1;
571
572 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
573 lex->tok_kind = MP_TOKEN_NEWLINE;
574
575 size_t num_spaces = lex->column - 1;
576 if (num_spaces == indent_top(lex)) {
577 } else if (num_spaces > indent_top(lex)) {
578 indent_push(lex, num_spaces);
579 lex->emit_dent += 1;
580 } else {
581 while (num_spaces < indent_top(lex)) {
582 indent_pop(lex);
583 lex->emit_dent -= 1;
584 }
585 if (num_spaces != indent_top(lex)) {
586 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
587 }
588 }
589
590 } else if (is_end(lex)) {
591 lex->tok_kind = MP_TOKEN_END;
592
593 } else if (is_string_or_bytes(lex)) {
594 // a string or bytes literal
595
596 // Python requires adjacent string/bytes literals to be automatically
597 // concatenated. We do it here in the tokeniser to make efficient use of RAM,
598 // because then the lexer's vstr can be used to accumulate the string literal,
599 // in contrast to creating a parse tree of strings and then joining them later
600 // in the compiler. It's also more compact in code size to do it here.
601
602 // MP_TOKEN_END is used to indicate that this is the first string token
603 lex->tok_kind = MP_TOKEN_END;
604
605 // Loop to accumulate string/bytes literals
606 do {
607 // parse type codes
608 bool is_raw = false;
609 bool is_fstring = false;
610 mp_token_kind_t kind = MP_TOKEN_STRING;
611 int n_char = 0;
612 if (is_char(lex, 'u')) {
613 n_char = 1;
614 } else if (is_char(lex, 'b')) {
615 kind = MP_TOKEN_BYTES;
616 n_char = 1;
617 if (is_char_following(lex, 'r')) {
618 is_raw = true;
619 n_char = 2;
620 }
621 } else if (is_char(lex, 'r')) {
622 is_raw = true;
623 n_char = 1;
624 if (is_char_following(lex, 'b')) {
625 kind = MP_TOKEN_BYTES;
626 n_char = 2;
627 }
628 #if MICROPY_PY_FSTRINGS
629 if (is_char_following(lex, 'f')) {
630 // raw-f-strings unsupported, immediately return (invalid) token.
631 lex->tok_kind = MP_TOKEN_FSTRING_RAW;
632 break;
633 }
634 #endif
635 }
636 #if MICROPY_PY_FSTRINGS
637 else if (is_char(lex, 'f')) {
638 if (is_char_following(lex, 'r')) {
639 // raw-f-strings unsupported, immediately return (invalid) token.
640 lex->tok_kind = MP_TOKEN_FSTRING_RAW;
641 break;
642 }
643 n_char = 1;
644 is_fstring = true;
645 }
646 #endif
647
648 // Set or check token kind
649 if (lex->tok_kind == MP_TOKEN_END) {
650 lex->tok_kind = kind;
651 } else if (lex->tok_kind != kind) {
652 // Can't concatenate string with bytes
653 break;
654 }
655
656 // Skip any type code characters
657 if (n_char != 0) {
658 next_char(lex);
659 if (n_char == 2) {
660 next_char(lex);
661 }
662 }
663
664 // Parse the literal
665 parse_string_literal(lex, is_raw, is_fstring);
666
667 // Skip whitespace so we can check if there's another string following
668 skip_whitespace(lex, true);
669
670 } while (is_string_or_bytes(lex));
671
672 } else if (is_head_of_identifier(lex)) {
673 lex->tok_kind = MP_TOKEN_NAME;
674
675 // get first char (add as byte to remain 8-bit clean and support utf-8)
676 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
677 next_char(lex);
678
679 // get tail chars
680 while (!is_end(lex) && is_tail_of_identifier(lex)) {
681 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
682 next_char(lex);
683 }
684
685 // Check if the name is a keyword.
686 // We also check for __debug__ here and convert it to its value. This is
687 // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
688 // need to check for this special token in many places in the compiler.
689 const char *s = vstr_null_terminated_str(&lex->vstr);
690 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
691 int cmp = strcmp(s, tok_kw[i]);
692 if (cmp == 0) {
693 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
694 if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
695 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
696 }
697 break;
698 } else if (cmp < 0) {
699 // Table is sorted and comparison was less-than, so stop searching
700 break;
701 }
702 }
703
704 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
705 bool forced_integer = false;
706 if (is_char(lex, '.')) {
707 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
708 } else {
709 lex->tok_kind = MP_TOKEN_INTEGER;
710 if (is_char(lex, '0') && is_following_base_char(lex)) {
711 forced_integer = true;
712 }
713 }
714
715 // get first char
716 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
717 next_char(lex);
718
719 // get tail chars
720 while (!is_end(lex)) {
721 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
722 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
723 vstr_add_char(&lex->vstr, 'e');
724 next_char(lex);
725 if (is_char(lex, '+') || is_char(lex, '-')) {
726 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
727 next_char(lex);
728 }
729 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
730 if (is_char_or3(lex, '.', 'j', 'J')) {
731 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
732 }
733 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
734 next_char(lex);
735 } else if (is_char(lex, '_')) {
736 next_char(lex);
737 } else {
738 break;
739 }
740 }
741
742 } else {
743 // search for encoded delimiter or operator
744
745 const char *t = tok_enc;
746 size_t tok_enc_index = 0;
747 for (; *t != 0 && !is_char(lex, *t); t += 1) {
748 if (*t == 'e' || *t == 'c') {
749 t += 1;
750 }
751 tok_enc_index += 1;
752 }
753
754 next_char(lex);
755
756 if (*t == 0) {
757 // didn't match any delimiter or operator characters
758 lex->tok_kind = MP_TOKEN_INVALID;
759
760 } else if (*t == '!') {
761 // "!=" is a special case because "!" is not a valid operator
762 if (is_char(lex, '=')) {
763 next_char(lex);
764 lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
765 } else {
766 lex->tok_kind = MP_TOKEN_INVALID;
767 }
768
769 } else if (*t == '.') {
770 // "." and "..." are special cases because ".." is not a valid operator
771 if (is_char_and(lex, '.', '.')) {
772 next_char(lex);
773 next_char(lex);
774 lex->tok_kind = MP_TOKEN_ELLIPSIS;
775 } else {
776 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
777 }
778
779 } else {
780 // matched a delimiter or operator character
781
782 // get the maximum characters for a valid token
783 t += 1;
784 size_t t_index = tok_enc_index;
785 while (*t == 'c' || *t == 'e') {
786 t_index += 1;
787 if (is_char(lex, t[1])) {
788 next_char(lex);
789 tok_enc_index = t_index;
790 if (*t == 'e') {
791 break;
792 }
793 } else if (*t == 'c') {
794 break;
795 }
796 t += 2;
797 }
798
799 // set token kind
800 lex->tok_kind = tok_enc_kind[tok_enc_index];
801
802 // compute bracket level for implicit line joining
803 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
804 lex->nested_bracket_level += 1;
805 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
806 lex->nested_bracket_level -= 1;
807 }
808 }
809 }
810 }
811
mp_lexer_new(qstr src_name,mp_reader_t reader)812 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
813 mp_lexer_t *lex = m_new_obj(mp_lexer_t);
814
815 lex->source_name = src_name;
816 lex->reader = reader;
817 lex->line = 1;
818 lex->column = (size_t)-2; // account for 3 dummy bytes
819 lex->emit_dent = 0;
820 lex->nested_bracket_level = 0;
821 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
822 lex->num_indent_level = 1;
823 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
824 vstr_init(&lex->vstr, 32);
825 #if MICROPY_PY_FSTRINGS
826 vstr_init(&lex->fstring_args, 0);
827 #endif
828
829 // store sentinel for first indentation level
830 lex->indent_level[0] = 0;
831
832 // load lexer with start of file, advancing lex->column to 1
833 // start with dummy bytes and use next_char() for proper EOL/EOF handling
834 lex->chr0 = lex->chr1 = lex->chr2 = 0;
835 next_char(lex);
836 next_char(lex);
837 next_char(lex);
838
839 // preload first token
840 mp_lexer_to_next(lex);
841
842 // Check that the first token is in the first column. If it's not then we
843 // convert the token kind to INDENT so that the parser gives a syntax error.
844 if (lex->tok_column != 1) {
845 lex->tok_kind = MP_TOKEN_INDENT;
846 }
847
848 return lex;
849 }
850
mp_lexer_new_from_str_len(qstr src_name,const char * str,size_t len,size_t free_len)851 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
852 mp_reader_t reader;
853 mp_reader_new_mem(&reader, (const byte *)str, len, free_len);
854 return mp_lexer_new(src_name, reader);
855 }
856
857 #if MICROPY_READER_POSIX || MICROPY_READER_VFS
858
mp_lexer_new_from_file(const char * filename)859 mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
860 mp_reader_t reader;
861 mp_reader_new_file(&reader, filename);
862 return mp_lexer_new(qstr_from_str(filename), reader);
863 }
864
865 #if MICROPY_HELPER_LEXER_UNIX
866
mp_lexer_new_from_fd(qstr filename,int fd,bool close_fd)867 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
868 mp_reader_t reader;
869 mp_reader_new_file_from_fd(&reader, fd, close_fd);
870 return mp_lexer_new(filename, reader);
871 }
872
873 #endif
874
875 #endif
876
// Close the lexer's reader and release all memory owned by the lexer.
// Passing NULL is allowed and does nothing.
void mp_lexer_free(mp_lexer_t *lex) {
    if (!lex) {
        return;
    }

    lex->reader.close(lex->reader.data);
    vstr_clear(&lex->vstr);
    #if MICROPY_PY_FSTRINGS
    vstr_clear(&lex->fstring_args);
    #endif
    m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
    m_del_obj(mp_lexer_t, lex);
}
888
889 #if 0
890 // This function is used to print the current token and should only be
891 // needed to debug the lexer, so it's not available via a config option.
892 void mp_lexer_show_token(const mp_lexer_t *lex) {
893 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
894 if (lex->vstr.len > 0) {
895 const byte *i = (const byte *)lex->vstr.buf;
896 const byte *j = (const byte *)i + lex->vstr.len;
897 printf(" ");
898 while (i < j) {
899 unichar c = utf8_get_char(i);
900 i = utf8_next_char(i);
901 if (unichar_isprint(c)) {
902 printf("%c", (int)c);
903 } else {
904 printf("?");
905 }
906 }
907 }
908 printf("\n");
909 }
910 #endif
911
912 #endif // MICROPY_ENABLE_COMPILER
913