// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
#include <internal/xyarray.h>
#include <linux/time64.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS 32
#define MAX_PROC 4096
/* we don't need an actual timestamp, just want to put the samples last */
#define OFF_CPU_TIMESTAMP (~0ull << 32)

static struct off_cpu_bpf *skel;

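/*
 * Key of an entry in the off_cpu BPF map; the layout must match the key
 * produced by the BPF program (see bpf_skel/off_cpu.skel.h).
 */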
struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;
	u32 state;
	u64 cgroup_id;
};

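/* Buffer used to synthesize PERF_RECORD_SAMPLE records in off_cpu_write(). */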
union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

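/* Scratch space for the PERF_SAMPLE_RAW payload: pid/tgid, off-CPU time, callchain and cgroup id. */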
u64 off_cpu_raw[MAX_STACKS + 5];
42
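/*
 * Add the bpf-output event used as the sink for off-CPU samples and mark it
 * system-wide so samples can be emitted from any CPU.
 */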
static int off_cpu_config(struct evlist *evlist)
{
	char off_cpu_event[64];
	struct evsel *evsel;

	scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/name=%s/", OFFCPU_EVENT);
	if (parse_event(evlist, off_cpu_event)) {
		pr_err("Failed to open off-cpu event\n");
		return -1;
	}

	evlist__for_each_entry(evlist, evsel) {
		if (evsel__is_offcpu_event(evsel)) {
			evsel->core.system_wide = true;
			break;
		}
	}

	return 0;
}

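/*
 * "record_start" hook: add a forked workload to the task filter, wire each
 * CPU's perf event fd into the BPF output map, then enable the BPF program.
 */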
static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;
	struct evsel *evsel;
	struct perf_cpu pcpu;
	int i;

	/* update task filter for the given workload */
	if (skel->rodata->has_task && skel->rodata->uses_tgid &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	/* update BPF perf_event map */
	evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return;
	}

	perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
		int err;
		int cpu_nr = pcpu.cpu;

		err = bpf_map__update_elem(skel->maps.offcpu_output, &cpu_nr, sizeof(int),
					   xyarray__entry(evsel->core.fd, cpu_nr, 0),
					   sizeof(int), BPF_ANY);
		if (err) {
			pr_err("Failed to update perf event map for direct off-cpu dumping\n");
			return;
		}
	}

	skel->bss->enabled = 1;
}

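/* "record_end" hook: stop collecting off-CPU samples and tear down the skeleton. */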
static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

/* the v5.18 kernel added a prev_state arg, so the sched_switch signature needs to be checked */
static void check_sched_switch_args(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	const struct btf_type *t1, *t2, *t3;
	u32 type_id;

	if (!btf) {
		pr_debug("Missing btf, check if CONFIG_DEBUG_INFO_BTF is enabled\n");
		goto cleanup;
	}

	type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
					 BTF_KIND_TYPEDEF);
	if ((s32)type_id < 0)
		goto cleanup;

	t1 = btf__type_by_id(btf, type_id);
	if (t1 == NULL)
		goto cleanup;

	t2 = btf__type_by_id(btf, t1->type);
	if (t2 == NULL || !btf_is_ptr(t2))
		goto cleanup;

	t3 = btf__type_by_id(btf, t2->type);
	/* btf_trace func proto has one more argument for the context */
	if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
		/* new format: pass prev_state as 4th arg */
		skel->rodata->has_prev_state = true;
	}
cleanup:
	btf__free(btf);
}

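/*
 * Open and load the off_cpu BPF skeleton, size and fill the cpu/task/cgroup
 * filter maps according to the record target, and register the record_start
 * and record_end hooks that enable and disable collection.
 */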
int off_cpu_prepare(struct evlist *evlist, struct target *target,
		    struct record_opts *opts)
{
	int err, fd, i;
	int ncpus = 1, ntasks = 1, ncgrps = 1;
	struct strlist *pid_slist = NULL;
	struct str_node *pos;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
		skel->rodata->has_cpu = 1;
	}

	if (target->pid) {
		pid_slist = strlist__new(target->pid, NULL);
		if (!pid_slist) {
			pr_err("Failed to create a strlist for pid\n");
			return -1;
		}

		ntasks = 0;
		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			ntasks++;
		}

		if (ntasks < MAX_PROC)
			ntasks = MAX_PROC;

		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	} else if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
	} else if (target__none(target)) {
		bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	}

	if (evlist__first(evlist)->cgrp) {
		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
		skel->rodata->has_cgroup = 1;
	}

	if (opts->record_cgroup) {
		skel->rodata->needs_cgroup = true;

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

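	/*
	 * Map sizes and rodata flags must be final before off_cpu_bpf__load()
	 * creates the maps and freezes rodata; the filter entries themselves
	 * are inserted below, after loading.
	 */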
	set_max_rlimit();
	check_sched_switch_args();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target->pid) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			u32 tgid;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			tgid = pid;
			bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
		}
	} else if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (evlist__first(evlist)->cgrp) {
		struct evsel *evsel;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		evlist__for_each_entry(evlist, evsel) {
			struct cgroup *cgrp = evsel->cgrp;

			if (cgrp == NULL)
				continue;

			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
				pr_err("Failed to read cgroup id of %s\n",
				       cgrp->name);
				goto out;
			}

			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
		}
	}

	skel->bss->offcpu_thresh_ns = opts->off_cpu_thresh_ns;

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to set off-cpu record hooks\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}

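/*
 * Called at the end of the record session: drain the off_cpu BPF map and
 * append one synthesized PERF_RECORD_SAMPLE per entry to the perf data file.
 * Returns the number of bytes written.
 */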
int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u32 raw_size;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
		pr_err("not supported sample type: %llx\n",
		       (unsigned long long)sample_type);
		return -1;
	}

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

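	/*
	 * Walk every entry in the off_cpu map; each one becomes a sample whose
	 * raw data carries the pid/tgid, accumulated off-CPU time, the user
	 * callchain and the cgroup id.
	 */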
	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1; /* start from perf_event_header */

		bpf_map_lookup_elem(fd, &key, &val);

		/* zero-fill some of the fields, will be overwritten by raw_data when parsing */
		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP)
			data.array[n++] = 0;  /* will be updated */
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_RAW) {
			/*
			 *  [ size ][ data ]
			 *  [ data ]
			 *  [ data ]
			 *  [ data ]
			 *  [ data ][ empty]
			 */
			int len = 0, i = 0;
			void *raw_data = (void *)data.array + n * sizeof(u64);

			off_cpu_raw[i++] = (u64)key.pid << 32 | key.tgid;
			off_cpu_raw[i++] = val;

			/* off_cpu_raw[i] is callchain->nr (updated later) */
			off_cpu_raw[i + 1] = PERF_CONTEXT_USER;
			off_cpu_raw[i + 2] = 0;

			bpf_map_lookup_elem(stack, &key.stack_id, &off_cpu_raw[i + 2]);
			while (off_cpu_raw[i + 2 + len])
				len++;

			off_cpu_raw[i] = len + 1;
			i += len + 2;

			off_cpu_raw[i++] = key.cgroup_id;

			raw_size = i * sizeof(u64) + sizeof(u32); /* 4 bytes for alignment */
			memcpy(raw_data, &raw_size, sizeof(raw_size));
			memcpy(raw_data + sizeof(u32), off_cpu_raw, i * sizeof(u64));

			n += i + 1;
		}
		if (sample_type & PERF_SAMPLE_CGROUP)
			data.array[n++] = key.cgroup_id;

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* increase dummy timestamp to sort later samples */
		tstamp++;
	}
	return bytes;
}