// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

// TODO(ZX-992): Need to be able to r/w MSRs.
// The thought is to use resources (as in ResourceDispatcher), at which point
// this will all get rewritten. Until such time, the goal here is KISS.
// This file contains the lower part of Intel Processor Trace support: the
// pieces that must live in the kernel (so that we can read/write MSRs).
// The userspace driver is in system/dev/misc/cpu-trace/intel-pt.c.
//
// We currently only support Table of Physical Addresses (ToPA) mode:
// it allows discontiguous buffers and supports stop-on-full behavior
// in addition to wrap-around.
//
// IPT tracing has two "modes":
// - per-cpu tracing
// - thread-specific tracing
// Tracing can only be done in one mode at a time, because saving/restoring
// thread PT state via the xsaves/xrstors instructions is controlled by a
// global flag in the XSS MSR.
// Moreover, once a trace has been done with IPT_TRACE_THREADS one cannot go
// back to IPT_TRACE_CPUS: supporting this requires flushing trace state from
// all threads, which is a bit of work. For now it's easy enough to just
// require the user to reboot. See ZX-892.
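//
// A typical per-cpu call sequence, as driven by the userspace driver via
// mtrace (a sketch; 'cpu' and 'regs' here are illustrative):
//   x86_ipt_alloc_trace(IPT_TRACE_CPUS, arch_max_num_cpus());
//   x86_ipt_stage_trace_data(cpu, &regs);   // for each cpu
//   x86_ipt_start();
//   ... run the workload ...
//   x86_ipt_stop();
//   x86_ipt_get_trace_data(cpu, &regs);     // for each cpu
//   x86_ipt_free_trace();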

#include <arch/arch_ops.h>
#include <arch/mmu.h>
#include <arch/x86.h>
#include <arch/x86/feature.h>
#include <arch/x86/mmu.h>
#include <arch/x86/proc_trace.h>
#include <err.h>
#include <fbl/auto_lock.h>
#include <fbl/macros.h>
#include <fbl/mutex.h>
#include <kernel/mp.h>
#include <kernel/thread.h>
#include <ktl/unique_ptr.h>
#include <lib/ktrace.h>
#include <lib/zircon-internal/device/cpu-trace/intel-pt.h>
#include <lib/zircon-internal/ktrace.h>
#include <lib/zircon-internal/mtrace.h>
#include <pow2.h>
#include <string.h>
#include <trace.h>
#include <vm/vm.h>
#include <vm/vm_aspace.h>
#include <zircon/thread_annotations.h>
#include <zircon/types.h>

using fbl::AutoLock;

#define LOCAL_TRACE 0
57 
58 // Control MSRs
59 #define IA32_RTIT_OUTPUT_BASE 0x560
60 #define IA32_RTIT_OUTPUT_MASK_PTRS 0x561
61 #define IA32_RTIT_CTL 0x570
62 #define IA32_RTIT_STATUS 0x571
63 #define IA32_RTIT_CR3_MATCH 0x572
64 #define IA32_RTIT_ADDR0_A 0x580
65 #define IA32_RTIT_ADDR0_B 0x581
66 #define IA32_RTIT_ADDR1_A 0x582
67 #define IA32_RTIT_ADDR1_B 0x583
68 #define IA32_RTIT_ADDR2_A 0x584
69 #define IA32_RTIT_ADDR2_B 0x585
70 #define IA32_RTIT_ADDR3_A 0x586
71 #define IA32_RTIT_ADDR3_B 0x587
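// Note the regular layout: ADDRn_A = 0x580 + 2*n and ADDRn_B = 0x581 + 2*n,
// one A/B pair per supported address-filter range (we handle up to
// IPT_MAX_NUM_ADDR_RANGES of them).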

// We need bits[15:8] to get the "maximum non-turbo ratio".
// See libipt:intel-pt.h:pt_config, and Intel Vol. 3 chapter 35.5.
#define IA32_PLATFORM_INFO 0xce

// Our own copy of what h/w supports, mostly for sanity checking.
static bool supports_pt = false;
static bool supports_cr3_filtering = false;
static bool supports_psb = false;
static bool supports_ip_filtering = false;
static bool supports_mtc = false;
static bool supports_ptwrite = false;
static bool supports_power_events = false;
static bool supports_output_topa = false;
static bool supports_output_topa_multi = false;
static bool supports_output_single = false;
static bool supports_output_transport = false;

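// Per-trace copy of the h/w register state. Userspace stages these values
// before a trace starts and reads them back after it stops; each field
// mirrors the corresponding IA32_RTIT_* MSR.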
struct ipt_trace_state_t {
    uint64_t ctl;
    uint64_t status;
    uint64_t output_base;
    uint64_t output_mask_ptrs;
    uint64_t cr3_match;
    struct {
        uint64_t a, b;
    } addr_ranges[IPT_MAX_NUM_ADDR_RANGES];
};

static fbl::Mutex ipt_lock;

static ipt_trace_state_t* ipt_trace_state TA_GUARDED(ipt_lock);

static bool active TA_GUARDED(ipt_lock) = false;

static ipt_trace_mode_t trace_mode TA_GUARDED(ipt_lock) = IPT_TRACE_CPUS;

// In cpu mode this is arch_max_num_cpus().
// In thread mode this is provided by the user.
static uint32_t ipt_num_traces TA_GUARDED(ipt_lock);

void x86_processor_trace_init(void) {
    if (!x86_feature_test(X86_FEATURE_PT)) {
        return;
    }

    struct cpuid_leaf leaf;
    if (!x86_get_cpuid_subleaf(X86_CPUID_PT, 0, &leaf)) {
        return;
    }

    supports_pt = true;

    // Keep our own copy of these flags, mostly for potential sanity checks.
    supports_cr3_filtering = !!(leaf.b & (1 << 0));
    supports_psb = !!(leaf.b & (1 << 1));
    supports_ip_filtering = !!(leaf.b & (1 << 2));
    supports_mtc = !!(leaf.b & (1 << 3));
    supports_ptwrite = !!(leaf.b & (1 << 4));
    supports_power_events = !!(leaf.b & (1 << 5));

    supports_output_topa = !!(leaf.c & (1 << 0));
    supports_output_topa_multi = !!(leaf.c & (1 << 1));
    supports_output_single = !!(leaf.c & (1 << 2));
    supports_output_transport = !!(leaf.c & (1 << 3));
}

// Intel Processor Trace support needs to be able to map the cr3 values that
// appear in the trace to the pids that ld.so uses when dumping memory maps.
void arch_trace_process_create(uint64_t pid, paddr_t pt_phys) {
    // The cr3 value that appears in Intel PT h/w tracing.
    uint64_t cr3 = pt_phys;
    ktrace(TAG_IPT_PROCESS_CREATE, (uint32_t)pid, (uint32_t)(pid >> 32),
           (uint32_t)cr3, (uint32_t)(cr3 >> 32));
}

// Worker for x86_ipt_alloc_trace to be executed on all cpus.
// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_set_mode_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(!active);

    // When changing modes make sure all PT MSRs are in the init state.
    // We don't want a value to appear in the xsave buffer and have xrstors
    // #gp because XCOMP_BV has the PT bit set that's not set in XSS.
    // We still need to do this, even with ZX-892, when transitioning
    // from IPT_TRACE_CPUS to IPT_TRACE_THREADS.
    write_msr(IA32_RTIT_CTL, 0);
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);
    // TODO(dje): addr range msrs

    ipt_trace_mode_t new_mode = static_cast<ipt_trace_mode_t>(reinterpret_cast<uintptr_t>(raw_context));

    // PT state saving, if supported, was enabled during boot so there's no
    // need to recalculate the xsave space needed.
    x86_set_extended_register_pt_state(new_mode == IPT_TRACE_THREADS);
}

zx_status_t x86_ipt_alloc_trace(ipt_trace_mode_t mode, uint32_t num_traces) {
    AutoLock al(&ipt_lock);

    DEBUG_ASSERT(mode == IPT_TRACE_CPUS || mode == IPT_TRACE_THREADS);
    if (mode == IPT_TRACE_CPUS) {
        if (num_traces != arch_max_num_cpus())
            return ZX_ERR_INVALID_ARGS;
    } else {
        return ZX_ERR_NOT_SUPPORTED;
    }

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    // ZX-892: We don't support changing the mode from IPT_TRACE_THREADS to
    // IPT_TRACE_CPUS: We can't turn off XSS.PT until we're sure all threads
    // have no PT state, and that's too tricky to do right now. Instead,
    // require the developer to reboot.
    if (trace_mode == IPT_TRACE_THREADS && mode == IPT_TRACE_CPUS)
        return ZX_ERR_NOT_SUPPORTED;

    ipt_trace_state =
        reinterpret_cast<ipt_trace_state_t*>(calloc(num_traces,
                                                    sizeof(*ipt_trace_state)));
    if (!ipt_trace_state)
        return ZX_ERR_NO_MEMORY;

    mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_set_mode_task,
                 reinterpret_cast<void*>(static_cast<uintptr_t>(mode)));

    trace_mode = mode;
    ipt_num_traces = num_traces;
    return ZX_OK;
}

// Free resources obtained by x86_ipt_alloc_trace().
// It doesn't matter if the resources have already been freed; this saves
// callers from having to care during cleanup.

zx_status_t x86_ipt_free_trace() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;

    free(ipt_trace_state);
    ipt_trace_state = nullptr;
    return ZX_OK;
}

// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_start_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(active && raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

    DEBUG_ASSERT(!(read_msr(IA32_RTIT_CTL) & IPT_CTL_TRACE_EN_MASK));

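    // Note: in ToPA mode, OUTPUT_BASE holds the physical address of the ToPA
    // table and OUTPUT_MASK_PTRS holds the current table/output offsets (see
    // the trace-output sections of Intel Vol 3).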
    // Load the ToPA configuration.
    write_msr(IA32_RTIT_OUTPUT_BASE, state->output_base);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, state->output_mask_ptrs);

    // Load all other msrs, prior to enabling tracing.
    write_msr(IA32_RTIT_STATUS, state->status);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, state->cr3_match);

    // Enable the trace: writing the staged ctl value (with TraceEn set)
    // is what actually turns tracing on.
    write_msr(IA32_RTIT_CTL, state->ctl);
}

// Begin the trace.

zx_status_t x86_ipt_start() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    uint64_t kernel_cr3 = x86_kernel_cr3();
    TRACEF("Starting processor trace, kernel cr3: 0x%" PRIxPTR "\n",
           kernel_cr3);

    if (LOCAL_TRACE && trace_mode == IPT_TRACE_CPUS) {
        uint32_t num_cpus = ipt_num_traces;
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    active = true;

    // Sideband info needed by the trace reader.
    uint64_t platform_msr = read_msr(IA32_PLATFORM_INFO);
    unsigned nom_freq = (platform_msr >> 8) & 0xff;
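    // This ratio times the bus clock (typically 100 MHz on recent Intel
    // cores) gives the nominal frequency the trace reader uses to decode
    // timing packets.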
    ktrace(TAG_IPT_START, (uint32_t)nom_freq, 0,
           (uint32_t)kernel_cr3, (uint32_t)(kernel_cr3 >> 32));
    const struct x86_model_info* model_info = x86_get_model();
    ktrace(TAG_IPT_CPU_INFO, model_info->processor_type,
           model_info->display_family, model_info->display_model,
           model_info->stepping);

    if (trace_mode == IPT_TRACE_CPUS) {
        mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_start_cpu_task, ipt_trace_state);
    }

    return ZX_OK;
}

// This is invoked via mp_sync_exec which thread safety analysis cannot follow.
static void x86_ipt_stop_cpu_task(void* raw_context) TA_NO_THREAD_SAFETY_ANALYSIS {
    DEBUG_ASSERT(arch_ints_disabled());
    DEBUG_ASSERT(raw_context);

    ipt_trace_state_t* context = reinterpret_cast<ipt_trace_state_t*>(raw_context);
    uint32_t cpu = arch_curr_cpu_num();
    ipt_trace_state_t* state = &context[cpu];

    // Disable the trace.
    write_msr(IA32_RTIT_CTL, 0);

    // Retrieve MSR values to provide to userspace later.
    state->ctl = 0;
    state->status = read_msr(IA32_RTIT_STATUS);
    state->output_base = read_msr(IA32_RTIT_OUTPUT_BASE);
    state->output_mask_ptrs = read_msr(IA32_RTIT_OUTPUT_MASK_PTRS);
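    // Among other things, output_mask_ptrs records how far h/w advanced in
    // the current output region, which the trace reader needs in order to
    // know how much of the buffer is valid.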

    // Zero all MSRs so that we are in the XSAVE initial configuration.
    // This allows h/w to do some optimizations regarding the state.
    write_msr(IA32_RTIT_STATUS, 0);
    write_msr(IA32_RTIT_OUTPUT_BASE, 0);
    write_msr(IA32_RTIT_OUTPUT_MASK_PTRS, 0);
    if (supports_cr3_filtering)
        write_msr(IA32_RTIT_CR3_MATCH, 0);

    // TODO(dje): Make it explicit that packets have been completely written.
    // See Intel Vol 3 chapter 36.2.4.

    // TODO(teisenbe): Clear ADDR* MSRs depending on leaf 1
}

// This can be called while not active, so the caller doesn't have to care
// during any cleanup.

zx_status_t x86_ipt_stop() {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_THREADS)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;

    TRACEF("Stopping processor trace\n");

    if (trace_mode == IPT_TRACE_CPUS) {
        mp_sync_exec(MP_IPI_TARGET_ALL, 0, x86_ipt_stop_cpu_task, ipt_trace_state);
    }

    ktrace(TAG_IPT_STOP, 0, 0, 0, 0);
    active = false;

    if (LOCAL_TRACE && trace_mode == IPT_TRACE_CPUS) {
        uint32_t num_cpus = ipt_num_traces;
        for (uint32_t cpu = 0; cpu < num_cpus; ++cpu) {
            TRACEF("Cpu %u: ctl 0x%" PRIx64 ", status 0x%" PRIx64 ", base 0x%" PRIx64 ", mask 0x%" PRIx64 "\n",
                   cpu, ipt_trace_state[cpu].ctl, ipt_trace_state[cpu].status,
                   ipt_trace_state[cpu].output_base,
                   ipt_trace_state[cpu].output_mask_ptrs);
        }
    }

    return ZX_OK;
}

zx_status_t x86_ipt_stage_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                     const zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_CPUS && active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    if (descriptor >= ipt_num_traces)
        return ZX_ERR_INVALID_ARGS;

    ipt_trace_state[descriptor].ctl = regs->ctl;
    ipt_trace_state[descriptor].status = regs->status;
    ipt_trace_state[descriptor].output_base = regs->output_base;
    ipt_trace_state[descriptor].output_mask_ptrs = regs->output_mask_ptrs;
    ipt_trace_state[descriptor].cr3_match = regs->cr3_match;
    static_assert(sizeof(ipt_trace_state[descriptor].addr_ranges) == sizeof(regs->addr_ranges), "addr_ranges size mismatch");
    memcpy(ipt_trace_state[descriptor].addr_ranges, regs->addr_ranges, sizeof(regs->addr_ranges));

    return ZX_OK;
}

zx_status_t x86_ipt_get_trace_data(zx_itrace_buffer_descriptor_t descriptor,
                                   zx_x86_pt_regs_t* regs) {
    AutoLock al(&ipt_lock);

    if (!supports_pt)
        return ZX_ERR_NOT_SUPPORTED;
    if (trace_mode == IPT_TRACE_CPUS && active)
        return ZX_ERR_BAD_STATE;
    if (!ipt_trace_state)
        return ZX_ERR_BAD_STATE;
    if (descriptor >= ipt_num_traces)
        return ZX_ERR_INVALID_ARGS;

    regs->ctl = ipt_trace_state[descriptor].ctl;
    regs->status = ipt_trace_state[descriptor].status;
    regs->output_base = ipt_trace_state[descriptor].output_base;
    regs->output_mask_ptrs = ipt_trace_state[descriptor].output_mask_ptrs;
    regs->cr3_match = ipt_trace_state[descriptor].cr3_match;
    static_assert(sizeof(regs->addr_ranges) == sizeof(ipt_trace_state[descriptor].addr_ranges), "addr_ranges size mismatch");
    memcpy(regs->addr_ranges, ipt_trace_state[descriptor].addr_ranges, sizeof(regs->addr_ranges));

    return ZX_OK;
}