// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
#include <internal/xyarray.h>
#include <linux/time64.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS  32
#define MAX_PROC  4096
/* we don't need an actual timestamp, just want to put the samples last */
#define OFF_CPU_TIMESTAMP  (~0ull << 32)

static struct off_cpu_bpf *skel;

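/* aggregation key of the off_cpu BPF map: task, blocked state, stack and cgroup */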
struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;
	u32 state;
	u64 cgroup_id;
};

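/*
 * Buffer for one synthesized PERF_RECORD_SAMPLE: the perf_event_header is
 * followed by the sample payload, built as an array of u64 words.
 */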
union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

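/*
 * Scratch buffer for the PERF_SAMPLE_RAW payload: pid/tgid, the off-cpu time,
 * the callchain (nr + PERF_CONTEXT_USER + up to MAX_STACKS entries) and the
 * cgroup id.
 */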
u64 off_cpu_raw[MAX_STACKS + 5];

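/*
 * Add a bpf-output event (named via OFFCPU_EVENT) to the evlist; it carries
 * the off-cpu samples emitted by the BPF program and provides the sample
 * format that off_cpu_write() uses for the accumulated entries.
 */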
static int off_cpu_config(struct evlist *evlist)
{
	char off_cpu_event[64];
	struct evsel *evsel;

	scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/name=%s/", OFFCPU_EVENT);
	if (parse_event(evlist, off_cpu_event)) {
		pr_err("Failed to open off-cpu event\n");
		return -1;
	}

	evlist__for_each_entry(evlist, evsel) {
		if (evsel__is_offcpu_event(evsel)) {
			evsel->core.system_wide = true;
			break;
		}
	}

	return 0;
}

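/*
 * "record_start" perf hook: add the forked workload to the task filter, wire
 * each per-CPU perf event fd of the bpf-output event into the BPF perf_event
 * array for direct dumping, then let the BPF program start counting.
 */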
static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;
	struct evsel *evsel;
	struct perf_cpu pcpu;
	int i;

	/* update task filter for the given workload */
	if (skel->rodata->has_task && skel->rodata->uses_tgid &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	/* update BPF perf_event map */
	evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return;
	}

	perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
		int err;
		int cpu_nr = pcpu.cpu;

		err = bpf_map__update_elem(skel->maps.offcpu_output, &cpu_nr, sizeof(int),
					   xyarray__entry(evsel->core.fd, cpu_nr, 0),
					   sizeof(int), BPF_ANY);
		if (err) {
			pr_err("Failed to update perf event map for direct off-cpu dumping\n");
			return;
		}
	}

	skel->bss->enabled = 1;
}

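/* "record_end" perf hook: stop the BPF program and release the skeleton */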
static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

/* v5.18 kernel added the prev_state arg, so we need to check the signature */
static void check_sched_switch_args(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	const struct btf_type *t1, *t2, *t3;
	u32 type_id;

	if (!btf) {
		pr_debug("Missing btf, check if CONFIG_DEBUG_INFO_BTF is enabled\n");
		goto cleanup;
	}

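	/*
	 * Walk btf_trace_sched_switch: typedef -> pointer -> function
	 * prototype, then count the prototype's arguments.
	 */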
	type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
					 BTF_KIND_TYPEDEF);
	if ((s32)type_id < 0)
		goto cleanup;

	t1 = btf__type_by_id(btf, type_id);
	if (t1 == NULL)
		goto cleanup;

	t2 = btf__type_by_id(btf, t1->type);
	if (t2 == NULL || !btf_is_ptr(t2))
		goto cleanup;

	t3 = btf__type_by_id(btf, t2->type);
	/* btf_trace func proto has one more argument for the context */
	if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
		/* new format: pass prev_state as 4th arg */
		skel->rodata->has_prev_state = true;
	}
cleanup:
	btf__free(btf);
}

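/*
 * Open and load the off_cpu BPF skeleton, size and populate the
 * cpu/task/cgroup filter maps from the record target, and register the
 * record start/end hooks.
 */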
int off_cpu_prepare(struct evlist *evlist, struct target *target,
		    struct record_opts *opts)
{
	int err, fd, i;
	int ncpus = 1, ntasks = 1, ncgrps = 1;
	struct strlist *pid_slist = NULL;
	struct str_node *pos;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
		skel->rodata->has_cpu = 1;
	}

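	/*
	 * -p/--pid: count the valid pid strings so the task filter map can be
	 * sized (to at least MAX_PROC) before load; the filter matches tgids.
	 */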
	if (target->pid) {
		pid_slist = strlist__new(target->pid, NULL);
		if (!pid_slist) {
			pr_err("Failed to create a strlist for pid\n");
			return -1;
		}

		ntasks = 0;
		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			ntasks++;
		}

		if (ntasks < MAX_PROC)
			ntasks = MAX_PROC;

		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	} else if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
	} else if (target__none(target)) {
		bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	}

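	/*
	 * If the events are constrained to cgroups, size the cgroup filter map
	 * and note whether the perf_event cgroup hierarchy is v1 or v2.
	 */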
	if (evlist__first(evlist)->cgrp) {
		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
		skel->rodata->has_cgroup = 1;
	}

	if (opts->record_cgroup) {
		skel->rodata->needs_cgroup = true;

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

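	/*
	 * Bump RLIMIT_MEMLOCK for the BPF maps and finalize the rodata
	 * (prev_state availability) before loading the skeleton.
	 */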
	set_max_rlimit();
	check_sched_switch_args();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

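	/* the maps exist only after load; now populate the requested filters */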
	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target->pid) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			u32 tgid;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			tgid = pid;
			bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
		}
	} else if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (evlist__first(evlist)->cgrp) {
		struct evsel *evsel;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		evlist__for_each_entry(evlist, evsel) {
			struct cgroup *cgrp = evsel->cgrp;

			if (cgrp == NULL)
				continue;

			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
				pr_err("Failed to read cgroup id of %s\n",
				       cgrp->name);
				goto out;
			}

			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
		}
	}

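	/*
	 * The BPF program uses this threshold to decide which off-cpu periods
	 * are dumped directly through the offcpu_output event; the rest stay
	 * in the off_cpu map until off_cpu_write() drains them.
	 */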
	skel->bss->offcpu_thresh_ns = opts->off_cpu_thresh_ns;

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to set off-cpu record hooks\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}

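/*
 * Drain the accumulated off-cpu entries and synthesize one PERF_RECORD_SAMPLE
 * per entry at the end of the data file.  Returns the number of bytes written.
 */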
int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u32 raw_size;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
		pr_err("unsupported sample type: %llx\n",
		       (unsigned long long)sample_type);
		return -1;
	}

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

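	/* walk all keys accumulated in the off_cpu map and emit one sample each */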
	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1;  /* start from perf_event_header */

		bpf_map_lookup_elem(fd, &key, &val);

		/* zero-fill some fields; they are overwritten from raw_data at parse time */
		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP)
			data.array[n++] = 0;  /* will be updated */
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_RAW) {
			/*
			 *  [ size ][ data ]
			 *  [     data     ]
			 *  [     data     ]
			 *  [     data     ]
			 *  [ data ][ empty]
			 */
			int len = 0, i = 0;
			void *raw_data = (void *)data.array + n * sizeof(u64);

			off_cpu_raw[i++] = (u64)key.pid << 32 | key.tgid;
			off_cpu_raw[i++] = val;

			/* off_cpu_raw[i] is callchain->nr (updated later) */
			off_cpu_raw[i + 1] = PERF_CONTEXT_USER;
			off_cpu_raw[i + 2] = 0;

			bpf_map_lookup_elem(stack, &key.stack_id, &off_cpu_raw[i + 2]);
			while (off_cpu_raw[i + 2 + len])
				len++;

			off_cpu_raw[i] = len + 1;
			i += len + 2;

			off_cpu_raw[i++] = key.cgroup_id;

			raw_size = i * sizeof(u64) + sizeof(u32); /* 4 bytes for alignment */
			memcpy(raw_data, &raw_size, sizeof(raw_size));
			memcpy(raw_data + sizeof(u32), off_cpu_raw, i * sizeof(u64));

			n += i + 1;
		}
		if (sample_type & PERF_SAMPLE_CGROUP)
			data.array[n++] = key.cgroup_id;

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* bump the dummy timestamp so later samples keep their order when sorted */
		tstamp++;
	}
	return bytes;
}