1"""
2Process raw qstr file and output qstr data with length, hash and data bytes.
3
4This script works with Python 2.6, 2.7, 3.3 and 3.4.
5"""
6
7from __future__ import print_function
8
9import re
10import sys
11
# Python 2/3 compatibility shims:
#   - iterating over a byte string yields different things in each version,
#     so qstr bytes are built through the bytes_cons() constructor
#   - codepoint2name is provided by a different module
import platform

_py_major = platform.python_version_tuple()[0]

if _py_major == "2":
    from htmlentitydefs import codepoint2name

    def bytes_cons(val, enc=None):
        # The encoding argument is accepted for signature parity and ignored.
        return bytearray(val)

elif _py_major == "3":
    from html.entities import codepoint2name

    bytes_cons = bytes
# end compatibility code
24
# Register names for characters that have no HTML entity name, so that
# qstr_escape() can spell them out instead of falling back to a hex code.
# (dict() over a generator keeps this usable on Python 2.6.)
codepoint2name.update(
    dict(
        (ord(char), name)
        for char, name in (
            ("-", "hyphen"),
            (" ", "space"),
            ("'", "squot"),
            (",", "comma"),
            (".", "dot"),
            (":", "colon"),
            (";", "semicolon"),
            ("/", "slash"),
            ("%", "percent"),
            ("#", "hash"),
            ("(", "paren_open"),
            (")", "paren_close"),
            ("[", "bracket_open"),
            ("]", "bracket_close"),
            ("{", "brace_open"),
            ("}", "brace_close"),
            ("*", "star"),
            ("!", "bang"),
            ("\\", "backslash"),
            ("+", "plus"),
            ("$", "dollar"),
            ("=", "equals"),
            ("?", "question"),
            ("@", "at_sign"),
            ("^", "caret"),
            ("|", "pipe"),
            ("~", "tilde"),
        )
    )
)
54
# static qstrs, should be sorted
# parse_input_headers() registers these before anything read from the input
# files and gives them the lowest order numbers, so they are emitted first and
# in exactly the order they are listed here.

static_qstr_list = [
    "",
    "__dir__",  # Put __dir__ after empty qstr for builtin dir() to work
    "\n",
    " ",
    "*",
    "/",
    "<module>",
    "_",
    "__call__",
    "__class__",
    "__delitem__",
    "__enter__",
    "__exit__",
    "__getattr__",
    "__getitem__",
    "__hash__",
    "__init__",
    "__int__",
    "__iter__",
    "__len__",
    "__main__",
    "__module__",
    "__name__",
    "__new__",
    "__next__",
    "__qualname__",
    "__repr__",
    "__setitem__",
    "__str__",
    "ArithmeticError",
    "AssertionError",
    "AttributeError",
    "BaseException",
    "EOFError",
    "Ellipsis",
    "Exception",
    "GeneratorExit",
    "ImportError",
    "IndentationError",
    "IndexError",
    "KeyError",
    "KeyboardInterrupt",
    "LookupError",
    "MemoryError",
    "NameError",
    "NoneType",
    "NotImplementedError",
    "OSError",
    "OverflowError",
    "RuntimeError",
    "StopIteration",
    "SyntaxError",
    "SystemExit",
    "TypeError",
    "ValueError",
    "ZeroDivisionError",
    "abs",
    "all",
    "any",
    "append",
    "args",
    "bool",
    "builtins",
    "bytearray",
    "bytecode",
    "bytes",
    "callable",
    "chr",
    "classmethod",
    "clear",
    "close",
    "const",
    "copy",
    "count",
    "dict",
    "dir",
    "divmod",
    "end",
    "endswith",
    "eval",
    "exec",
    "extend",
    "find",
    "format",
    "from_bytes",
    "get",
    "getattr",
    "globals",
    "hasattr",
    "hash",
    "id",
    "index",
    "insert",
    "int",
    "isalpha",
    "isdigit",
    "isinstance",
    "islower",
    "isspace",
    "issubclass",
    "isupper",
    "items",
    "iter",
    "join",
    "key",
    "keys",
    "len",
    "list",
    "little",
    "locals",
    "lower",
    "lstrip",
    "main",
    "map",
    "micropython",
    "next",
    "object",
    "open",
    "ord",
    "pop",
    "popitem",
    "pow",
    "print",
    "range",
    "read",
    "readinto",
    "readline",
    "remove",
    "replace",
    "repr",
    "reverse",
    "rfind",
    "rindex",
    "round",
    "rsplit",
    "rstrip",
    "self",
    "send",
    "sep",
    "set",
    "setattr",
    "setdefault",
    "sort",
    "sorted",
    "split",
    "start",
    "startswith",
    "staticmethod",
    "step",
    "stop",
    "str",
    "strip",
    "sum",
    "super",
    "throw",
    "to_bytes",
    "tuple",
    "type",
    "update",
    "upper",
    "utf-8",
    "value",
    "values",
    "write",
    "zip",
]
224
225# this must match the equivalent function in qstr.c
def compute_hash(qstr, bytes_hash):
    """Return the hash of *qstr* truncated to *bytes_hash* bytes.

    The algorithm must match the equivalent function in qstr.c: start from
    5381 and, for every byte, multiply by 33 and XOR the byte in.

    qstr       -- iterable of integer byte values (e.g. bytes on Python 3 or
                  a bytearray on Python 2).
    bytes_hash -- number of bytes the hash is stored in; the result is masked
                  to 8 * bytes_hash bits.

    A result of zero is replaced by 1, because zero means "hash not
    computed".
    """
    mask = (1 << (8 * bytes_hash)) - 1
    value = 5381
    for b in qstr:
        # Only the low bits survive the final mask, and the low bits of a
        # multiply/XOR depend only on the low bits of its inputs, so masking
        # at every step gives the same result while keeping the accumulator
        # small instead of letting it grow into an ever larger big integer.
        value = ((value * 33) ^ b) & mask
    # Make sure that valid hash is never zero, zero means "hash not computed"
    return (value & mask) or 1
232
233
def qstr_escape(qst):
    """Turn *qst* into text made only of letters, digits and underscores.

    Every other character is replaced by "_<name>_", where <name> is its
    entry in codepoint2name, or the code point as "0x" plus at least two hex
    digits when it has no entry.
    """

    def _spell_out(match):
        code = ord(match.group(0))
        label = codepoint2name.get(code)
        if label is None:
            # No symbolic name known: fall back to the numeric code point.
            label = "0x%02x" % code
        return "_%s_" % label

    return re.compile(r"[^A-Za-z0-9_]").sub(_spell_out, qst)
244
245
def parse_input_headers(infiles):
    """Collect qstr config values and qstr definitions from *infiles*.

    Returns a pair (qcfgs, qstrs):
      qcfgs -- dict mapping each QCFG name to its value string (one pair of
               enclosing parentheses removed from the value).
      qstrs -- dict mapping each escaped identifier to a tuple
               (order, ident, qstr); sorting by order gives the output order.

    The entries of static_qstr_list are always registered first. If no QCFG
    line is found in any input file, an error is written to stderr and the
    process exits with status 1.
    """
    qcfgs = {}
    qstrs = {}

    # Static qstrs come first; offsetting by -300000 orders them ahead of
    # everything read from the input files while keeping their list order.
    for static_qstr in static_qstr_list:
        ident = qstr_escape(static_qstr)

        # the static list itself must not contain duplicates
        assert ident not in qstrs

        qstrs[ident] = (len(qstrs) - 300000, ident, static_qstr)

    # Now scan every input file line by line.
    for infile in infiles:
        with open(infile, "rt") as f:
            for raw_line in f:
                line = raw_line.strip()

                # Config line: QCFG(name, value)
                cfg_match = re.match(r"^QCFG\((.+), (.+)\)", line)
                if cfg_match:
                    value = cfg_match.group(2)
                    if value[0] == "(" and value[-1] == ")":
                        # drop the parentheses wrapped around the value
                        value = value[1:-1]
                    qcfgs[cfg_match.group(1)] = value
                    continue

                # QSTR line: Q(text); anything else is ignored
                qstr_match = re.match(r"^Q\((.*)\)$", line)
                if not qstr_match:
                    continue

                # Control characters are written as escape sequences in the
                # input; map the two supported spellings to the real thing.
                qstr = qstr_match.group(1)
                qstr = {"\\n": "\n", "\\r\\n": "\r\n"}.get(qstr, qstr)

                ident = qstr_escape(qstr)

                # first definition wins, later duplicates are skipped
                if ident in qstrs:
                    continue

                if ident == "":
                    # Sort empty qstr above all still
                    order = -200000
                elif ident == "__dir__":
                    # Put __dir__ after empty qstr for builtin dir() to work
                    order = -190000
                else:
                    # Default: keep the order in which qstrs were first seen.
                    order = len(qstrs)
                    if ident.startswith("__"):
                        # Special method names like __add__ go near the top of
                        # the list, so that their id's fit into a byte.
                        order -= 100000
                qstrs[ident] = (order, ident, qstr)

    if qcfgs:
        return qcfgs, qstrs

    sys.stderr.write("ERROR: Empty preprocessor output - check for errors above\n")
    sys.exit(1)
318
319
def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
    """Render one qstr as the C expression used for its data.

    cfg_bytes_len  -- number of bytes used to store the length.
    cfg_bytes_hash -- number of bytes used to store the hash.
    qstr           -- the qstr text.

    Returns a string of the form
        (const byte*)"<hash><len>" "<data>"
    where <hash> and <len> are "\\xNN" escapes, least significant byte first,
    and <data> is the text itself when it is plain printable ASCII, or its
    UTF-8 bytes as "\\xNN" escapes otherwise.

    Raises AssertionError if the UTF-8 length of *qstr* does not fit in
    cfg_bytes_len bytes.
    """
    qbytes = bytes_cons(qstr, "utf8")
    qlen = len(qbytes)
    if qlen >= (1 << (8 * cfg_bytes_len)):
        # Raise explicitly instead of `print(...); assert False`: a bare
        # assert is removed under `python -O`, which would let a truncated
        # length through, and printing to stdout would put the message into
        # the generated output rather than in front of the user.
        raise AssertionError("qstr is too long: %r" % (qstr,))
    qhash = compute_hash(qbytes, cfg_bytes_hash)
    if all(32 <= ord(c) <= 126 and c != "\\" and c != '"' for c in qstr):
        # qstr is all printable ASCII so render it as-is (for easier debugging);
        # backslash and double quote are excluded because they would need
        # escaping inside the emitted string literal
        qdata = qstr
    else:
        # qstr contains non-printable codes so render entire thing as hex pairs
        qdata = "".join(("\\x%02x" % b) for b in qbytes)
    # Both fields are emitted one byte at a time, lowest byte first.
    qlen_str = ("\\x%02x" * cfg_bytes_len) % tuple(
        ((qlen >> (8 * i)) & 0xFF) for i in range(cfg_bytes_len)
    )
    qhash_str = ("\\x%02x" * cfg_bytes_hash) % tuple(
        ((qhash >> (8 * i)) & 0xFF) for i in range(cfg_bytes_hash)
    )
    return '(const byte*)"%s%s" "%s"' % (qhash_str, qlen_str, qdata)
340
341
def print_qstr_data(qcfgs, qstrs):
    """Print the generated qstr definitions to stdout.

    qcfgs -- config dict; "BYTES_IN_LEN" and "BYTES_IN_HASH" are read from it
             and converted with int().
    qstrs -- dict whose values are (order, ident, qstr) tuples.

    The output is a comment header, a blank line, the MP_QSTRnull entry, and
    then one QDEF line per qstr in ascending order number.
    """
    len_width = int(qcfgs["BYTES_IN_LEN"])
    hash_width = int(qcfgs["BYTES_IN_HASH"])

    # header of the generated C file
    print("// This file was automatically generated by makeqstrdata.py")
    print("")

    # the NULL qstr: all-zero hash and length, no data
    zero_hash = "\\x00" * hash_width
    zero_len = "\\x00" * len_width
    print('QDEF(MP_QSTRnull, (const byte*)"%s%s" "")' % (zero_hash, zero_len))

    # every real qstr, lowest order number first
    entries = sorted(qstrs.values(), key=lambda entry: entry[0])
    for _, ident, qstr in entries:
        rendered = make_bytes(len_width, hash_width, qstr)
        print("QDEF(MP_QSTR_%s, %s)" % (ident, rendered))
361
362
def do_work(infiles):
    """Parse the given input files and print the resulting qstr data."""
    print_qstr_data(*parse_input_headers(infiles))
366
367
if __name__ == "__main__":
    # Every command-line argument is treated as an input file.
    input_files = sys.argv[1:]
    do_work(input_files)
370