1#!/usr/bin/env python3
2#
3# Copyright 2019 The Hafnium Authors.
4#
5# Use of this source code is governed by a BSD-style
6# license that can be found in the LICENSE file or at
7# https://opensource.org/licenses/BSD-3-Clause.
8
"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
11It assumes that strace was invoked with the following arguments:
12    -e trace=%file,chdir,%process   record required syscalls
13    -qq                             silence 'exit code' records
14    -o <file>                       output format is different when writing
15                                    to a file from printing to the console
16"""
17
18import argparse
19import os
20import sys
21
# Syscalls which spawn a new process or thread. Their return value is taken
# to be the PID of the child and is used to link the child to its parent.
FORK_SYSCALLS = [
    "clone",
    # Extended variant of clone() (Linux 5.3+); it also returns the child's
    # PID, so it is tracked exactly like clone().
    "clone3",
    "fork",
    "vfork",
]

# Syscalls whose path argument is recorded as a touched file.
OPEN_SYSCALLS = [
    "access",
    "creat",
    "lstat",
    "mkdir",
    "open",
    "openat",
    "readlink",
    "stat",
]
37
def get_unfinished(line):
    """Return the part of `line` preceding the "<unfinished ...>" marker.

    Returns None if `line` does not contain the marker. Only the first
    occurrence of the marker is considered.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    return head if marker else None
44
def get_resumed(line):
    """Return the part of `line` following the " resumed>" marker.

    Returns None if `line` does not contain the marker. Only the first
    occurrence of the marker is considered.
    """
    _, marker, tail = line.partition(" resumed>")
    return tail if marker else None
51
def merge_unfinished_lines(lines):
    """Process input lines and merge those split by an interrupting syscall.

    A record which strace could not finish in one go is emitted as a line
    ending in "<unfinished ...>" followed, later, by one or more
    "<... name resumed>" lines for the same PID. The pieces are concatenated
    into a single line placed where the record was first started.

    Args:
        lines: Iterable of strace output lines, each starting with a PID.
            Blank lines are ignored.

    Returns:
        List of merged lines, in the order in which they were started.

    Raises:
        AssertionError: If a PID resumes a line that was never started, or
            starts a new line while it still has a pending one.
        ValueError: If the first field of a line is not an integer PID.
    """
    # Lines in the order they were started being written.
    finished = []

    # Pending unfinished lines. Map from PID to index in `finished`.
    cursor = {}

    for line in lines:
        fields = line.split(None, 1)
        if not fields:
            # Blank line (e.g. at the end of the input); nothing to merge.
            continue
        pid = int(fields[0])

        resumed = get_resumed(line)
        if resumed is None:
            # A fresh record: either complete or the start of a split one.
            assert pid not in cursor, (
                "PID {} started a new line while another is pending"
                .format(pid))
            unfinished = get_unfinished(line)
            if unfinished is None:
                finished.append(line)
            else:
                # Line is unfinished. Store its location to `cursor`.
                cursor[pid] = len(finished)
                finished.append(unfinished)
            continue

        # Continuation of a record started earlier by the same PID.
        assert pid in cursor, (
            "PID {} resumed a line which was never started".format(pid))
        unfinished = get_unfinished(resumed)
        if unfinished is None:
            # Final piece; the record is now complete.
            finished[cursor[pid]] += resumed
            del cursor[pid]
        else:
            # Interrupted again; keep the record pending.
            finished[cursor[pid]] += unfinished
    return finished
82
def abs_path(cwd, path):
    """Resolve `path` against `cwd` and return it in normalized form.

    If `path` is relative (including the empty string, which resolves to
    `cwd` itself), it is joined onto `cwd`. The result is then normalized
    lexically and finally has any symbolic links resolved.

    Args:
        cwd: Directory which relative paths are resolved against.
        path: Absolute or relative path, possibly empty.

    Returns:
        The absolute, normalized path with symbolic links resolved.
    """
    # `isabs` is used rather than indexing `path[0]` so that an empty path
    # does not raise IndexError.
    if not os.path.isabs(path):
        path = os.path.join(cwd, path)
    # Collapse "." / ".." / duplicate separators first, then follow symlinks.
    return os.path.realpath(os.path.abspath(path))
93
def _split_syscall_args(arg_str):
    """Split a syscall argument string on commas outside of quoted strings.

    Commas inside double-quoted strings (and backslash-escaped quotes within
    them) do not separate arguments. Each returned argument has surrounding
    whitespace removed.
    """
    args = []
    current = []
    in_quotes = False
    escaped = False
    for ch in arg_str:
        if in_quotes:
            current.append(ch)
            if escaped:
                # Character following a backslash is taken literally.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_quotes = False
        elif ch == ",":
            args.append("".join(current).strip())
            current = []
        else:
            if ch == '"':
                in_quotes = True
            current.append(ch)
    args.append("".join(current).strip())
    return args


def get_touched_files(lines, orig_cwd):
    """Parse strace output and return all files that an open()-like syscall was
       called on.

    Args:
        lines: Iterable of complete strace lines of the form
            `<pid> <syscall info>` (split records already merged).
        orig_cwd: Working directory assigned to the first PID seen.

    Returns:
        Set of `(absolute path, executable name)` tuples, one for every
        recorded call to a syscall listed in OPEN_SYSCALLS.

    Raises:
        KeyError: If a PID shows up which is neither the first PID nor the
            result of a recorded fork, or if a process touches a file or
            forks before any execve was recorded for it.
        AssertionError: If openat() is called relative to anything other
            than AT_FDCWD.
    """
    files = set()

    # Map from PID to the current working directory.
    cwd = {}

    # Map from PID to executable name
    executable = {}

    # Map from PID to the PID of the process which forked it.
    fork_of = {}

    first_pid = True
    for line in lines:
        # Split line: <pid>  <syscall info>
        # Only the PID is split off, so that whitespace inside quoted
        # arguments (e.g. paths containing several spaces) is kept intact.
        fields = line.split(None, 1)
        pid = int(fields[0])
        syscall = fields[1].rstrip() if len(fields) > 1 else ""

        # If seeing a PID for the first time, derive its working directory
        # from its parent.
        if pid not in cwd:
            if first_pid:
                # Very first line of strace output. Set working directory from
                # command line arguments (should match cwd of strace).
                first_pid = False
                cwd[pid] = orig_cwd
            else:
                # There should have been a fork/clone syscall which spawned this
                # process. Inherit its working directory.
                cwd[pid] = cwd[fork_of[pid]]

        # We are looking for lines which match:
        #   name(arg1, arg2, ..., argN) = result
        # Anything else is skipped. This includes lines whose result contains
        # a closing bracket, such as the parenthesised error description
        # which strace appends to failed calls.
        left_bracket = syscall.find("(")
        right_bracket = syscall.rfind(")")
        assign_sign = syscall.rfind("=")
        if left_bracket < 0 or right_bracket < 0 or assign_sign < right_bracket:
            continue

        syscall_name = syscall[:left_bracket]
        syscall_result = syscall[assign_sign+2:]

        # Quote-aware split so that a comma inside a path does not cut the
        # argument short.
        syscall_args = _split_syscall_args(
            syscall[left_bracket+1:right_bracket])

        if syscall_name in FORK_SYSCALLS:
            # If this is a fork, keep track of the parent-child relationship.
            # The child's PID is the syscall's return code.
            new_pid = int(syscall_result)
            fork_of[new_pid] = pid
            executable[new_pid] = executable[pid]
        elif syscall_name == "chdir":
            # If this is a change of working directory, keep track of it.
            # It is in the first argument in quotes.
            new_dir = syscall_args[0][1:-1]
            cwd[pid] = abs_path(cwd[pid], new_dir)
        elif syscall_name == "execve":
            # If this is executing a new program, record its name.
            # It is in the first argument in quotes.
            binary_name = syscall_args[0][1:-1]
            executable[pid] = binary_name
        elif syscall_name in OPEN_SYSCALLS:
            # If this is a syscall touching a file, record the path. The
            # result value itself is not inspected here.
            arg_idx = 0
            if syscall_name == "openat":
                # openat() can open a file (second arg) relative to a given
                # folder (first arg). We only support passing AT_FDCWD, ie.
                # resolve against the current working directory.
                arg_idx = 1
                assert(syscall_args[0] == "AT_FDCWD")
            # Strip the surrounding quotes from the path argument.
            fname = abs_path(cwd[pid], syscall_args[arg_idx][1:-1])
            # Record the file and the name of the program which touched it.
            files.add((fname, executable[pid]))
    return files
173
def filter_results(files, root_dir):
    """Remove paths which are whitelisted from the results.

    Args:
        files: Iterable of `(path, executable)` tuples.
        root_dir: Root directory of Hafnium. It may be relative, carry a
            trailing slash or contain symbolic links; it is canonicalized
            before being compared.

    Returns:
        List of the tuples whose path is neither inside `root_dir` nor
        inside /tmp, in the iteration order of `files`.
    """
    allowed_prefixes = (
        # Anything in the Hafnium directory is allowed. The root is resolved
        # with realpath so that it can match canonical absolute paths, and
        # joined with "" so that it ends in exactly one path separator.
        os.path.join(os.path.realpath(root_dir), ""),
        # Clang puts intermediate files in /tmp.
        "/tmp/",
    )
    return [entry for entry in files
            if not entry[0].startswith(allowed_prefixes)]
181
def main(args):
    """Read strace output from stdin and print the non-whitelisted files.

    Each result is printed on its own line as `<path> (<executable>)`,
    sorted by path and then by executable.

    Args:
        args: Full argument vector including the program name, i.e. the
            same layout as `sys.argv`. The first positional argument is the
            root directory of Hafnium; unrecognized arguments are ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Parse the vector we were handed (minus the program name) rather than
    # letting argparse silently fall back to sys.argv.
    options, _ = parser.parse_known_args(args[1:])

    lines = (line.strip() for line in sys.stdin)
    merged = merge_unfinished_lines(lines)
    files = get_touched_files(merged, options.root_dir)
    files = sorted(filter_results(files, options.root_dir))

    print("\n".join("{} ({})".format(path, binary) for path, binary in files))
195
# Entry point when executed as a script: hand the full argument vector to
# main(), which reads the strace log from standard input.
if __name__ == "__main__":
    main(sys.argv)
198