#!/usr/bin/env python3
#
# Copyright 2019 The Hafnium Authors.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://opensource.org/licenses/BSD-3-Clause.

"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
It assumes that strace was invoked with the following arguments:
    -e trace=%file,chdir,%process   record required syscalls
    -qq                             silence 'exit code' records
    -o <file>                       output format is different when writing
                                    to a file from printing to the console
"""

import argparse
import os
import sys

# Syscalls which create a new process/thread. Their return value is the PID of
# the child, which is used to track parent-child relationships.
FORK_SYSCALLS = [
    "clone",
    "clone3",
    "fork",
    "vfork",
    ]

# Syscalls whose path argument is recorded as a "touched" file.
OPEN_SYSCALLS = [
    "access",
    "creat",
    "lstat",
    "mkdir",
    "open",
    "openat",
    "readlink",
    "stat",
    ]

# Markers strace emits when a syscall record is split by another process.
_UNFINISHED_MARKER = "<unfinished ...>"
_RESUMED_MARKER = " resumed>"

# Name reported for a process for which no successful execve() was seen.
_UNKNOWN_EXECUTABLE = "<unknown>"


def get_unfinished(line):
    """Return the part of `line` preceding the `<unfinished ...>` marker, or
    None if the line does not contain the marker."""
    pos = line.find(_UNFINISHED_MARKER)
    if pos < 0:
        return None
    return line[:pos]


def get_resumed(line):
    """Return the part of `line` following the ` resumed>` marker, or None if
    the line does not contain the marker."""
    pos = line.find(_RESUMED_MARKER)
    if pos < 0:
        return None
    return line[pos + len(_RESUMED_MARKER):]


def merge_unfinished_lines(lines):
    """Process input lines and merge those split by an interrupting syscall.

    Each non-blank line must start with a PID. A line containing
    `<unfinished ...>` is stored at the position where it started and the
    text of the matching `<... resumed>` line of the same PID is appended to
    it. Blank lines are dropped. Returns the list of merged lines.

    Raises AssertionError if a line is resumed without having been started,
    or if a PID starts a new line while it still has an unfinished one."""
    # Lines in the order they were started being written.
    finished = []

    # Pending unfinished lines. Map from PID to index in `finished`.
    cursor = {}

    for line in lines:
        fields = line.split(None, 1)
        if not fields:
            # Blank line, there is no PID to parse.
            continue
        pid = int(fields[0])

        resumed = get_resumed(line)
        if resumed is not None:
            assert pid in cursor, \
                "PID {} resumed a syscall it never started".format(pid)
            unfinished = get_unfinished(resumed)
            if unfinished is not None:
                # The resumed part was interrupted again. Keep it pending.
                finished[cursor[pid]] += unfinished
            else:
                finished[cursor[pid]] += resumed
                del cursor[pid]
        else:
            assert pid not in cursor, \
                "PID {} still has an unfinished syscall".format(pid)
            unfinished = get_unfinished(line)
            if unfinished is not None:
                # Line is unfinished. Store its location to `cursor`.
                cursor[pid] = len(finished)
                finished.append(unfinished)
            else:
                finished.append(line)
    return finished


def abs_path(cwd, path):
    """If `path` is relative, resolve it against the current working directory.
    Also normalize the resulting path.

    An empty `path` is treated as relative, i.e. it resolves to `cwd`."""
    if not path.startswith('/'):
        path = os.path.join(cwd, path)
    # Collapse '.', '..' and repeated slashes lexically first, then resolve
    # any symlinks which exist on this machine.
    path = os.path.abspath(path)
    path = os.path.realpath(path)
    return path


def _split_args(arg_string):
    """Split a syscall argument list on commas which are outside of
    double-quoted strings. Each returned argument is stripped of surrounding
    whitespace; quotes and backslash escapes are kept as printed."""
    args = []
    current = []
    in_quotes = False
    escaped = False
    for char in arg_string:
        if escaped:
            # Character following a backslash inside a string is literal.
            escaped = False
        elif in_quotes and char == "\\":
            escaped = True
        elif char == '"':
            in_quotes = not in_quotes
        elif char == "," and not in_quotes:
            args.append("".join(current).strip())
            current = []
            continue
        current.append(char)
    args.append("".join(current).strip())
    return args


def get_touched_files(lines, orig_cwd):
    """Parse strace output and return all files that an open()-like syscall was
    called on.

    `lines` are strace records of the form `<pid> <syscall info>` with split
    records already merged, `orig_cwd` is the working directory of the first
    process in the trace. Returns a set of `(path, executable)` tuples where
    `path` is absolute and `executable` is the first argument of the last
    execve() recorded for the process (or inherited from its parent), or
    `<unknown>` if there was none.

    Raises KeyError if a PID other than the first one appears without a
    fork-like syscall having returned it, and AssertionError if openat() is
    called with a directory other than AT_FDCWD."""
    files = set()

    # Map from PID to the current working directory.
    cwd = {}

    # Map from PID to executable name
    executable = {}

    # Map from PID to the PID of the process which forked it.
    fork_of = {}

    first_pid = True
    for line in lines:
        # Split line: <pid> <syscall info>
        # Only split off the PID so that whitespace inside of quoted paths
        # is preserved.
        fields = line.split(None, 1)
        if not fields:
            continue
        pid = int(fields[0])
        syscall = fields[1] if len(fields) > 1 else ""

        # If seeing a PID for the first time, derive its working directory
        # from its parent.
        if pid not in cwd:
            if first_pid:
                # Very first line of strace output. Set working directory from
                # command line arguments (should match cwd of strace).
                first_pid = False
                cwd[pid] = orig_cwd
            else:
                # There should have been a fork/clone syscall which spawned this
                # process. Inherit its working directory.
                cwd[pid] = cwd[fork_of[pid]]

        # We are looking for lines which match:
        #     name(arg1, arg2, ..., argN) = result
        # Note that a result followed by a parenthesised error description,
        # e.g. `= -1 ENOENT (No such file or directory)`, does not match
        # because its last ')' comes after the '='. Such lines are skipped.
        left_bracket = syscall.find("(")
        right_bracket = syscall.rfind(")")
        assign_sign = syscall.rfind("=")
        if left_bracket < 0 or right_bracket < 0 or assign_sign < right_bracket:
            continue

        syscall_name = syscall[:left_bracket]
        syscall_result = syscall[assign_sign+2:]
        syscall_args = _split_args(syscall[left_bracket+1:right_bracket])

        if syscall_name in FORK_SYSCALLS:
            # If this is a fork, keep track of the parent-child relationship.
            # The child's PID is the syscall's return code.
            new_pid = int(syscall_result)
            fork_of[new_pid] = pid
            if pid in executable:
                executable[new_pid] = executable[pid]
        elif syscall_name == "chdir":
            # If this is a change of working directory, keep track of it.
            # It is in the first argument in quotes.
            new_dir = syscall_args[0][1:-1]
            cwd[pid] = abs_path(cwd[pid], new_dir)
        elif syscall_name == "execve":
            # If this is executing a new program, record its name.
            # It is in the first argument in quotes.
            binary_name = syscall_args[0][1:-1]
            executable[pid] = binary_name
        elif syscall_name in OPEN_SYSCALLS:
            # If this is a syscall touching a file, record the path.
            # The result value itself is not inspected here.
            arg_idx = 0
            if syscall_name == "openat":
                # openat() can open a file (second arg) relative to a given
                # folder (first arg). We only support passing AT_FDCWD, ie.
                # resolve against the current working directory.
                arg_idx = 1
                assert syscall_args[0] == "AT_FDCWD", \
                    "unsupported openat() directory: " + syscall_args[0]
            fname = abs_path(cwd[pid], syscall_args[arg_idx][1:-1])
            # Record the file and the name of the program which touched it.
            files.add((fname, executable.get(pid, _UNKNOWN_EXECUTABLE)))
    return files


def filter_results(files, root_dir):
    """Remove paths which are whitelisted from the results.

    `files` is an iterable of `(path, executable)` tuples. Returns a list of
    those tuples whose path is neither below `root_dir` nor below `/tmp`."""
    # Anything in the Hafnium directory is allowed. The recorded paths had
    # their symlinks resolved, so resolve the root directory the same way.
    # This also makes a trailing slash in `root_dir` harmless.
    root_prefix = os.path.join(os.path.realpath(root_dir), "")
    # Clang puts intermediate files in /tmp.
    allowed_prefixes = (root_prefix, "/tmp/")
    return [entry for entry in files
            if not entry[0].startswith(allowed_prefixes)]


def main(args):
    """Read strace output from stdin and print the files touched outside of
    the whitelisted folders, one `<path> (<executable>)` entry per line.

    `args` is the full argument vector including the program name, such as
    `sys.argv`. If it is None, `sys.argv` is used. Unknown arguments are
    ignored."""
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Skip the program name. Passing None makes argparse use sys.argv[1:].
    argv = args[1:] if args is not None else None
    parsed_args, _ = parser.parse_known_args(argv)

    lines = [line.strip() for line in sys.stdin]
    lines = merge_unfinished_lines(lines)
    files = get_touched_files(lines, parsed_args.root_dir)
    files = sorted(filter_results(files, parsed_args.root_dir))

    print("\n".join("{} ({})".format(path, exe) for path, exe in files))


if __name__ == "__main__":
    main(sys.argv)