1 // SPDX-License-Identifier: GPL-2.0
2 #include <string.h>
3 #include <stdio.h>
4 #include <sys/types.h>
5 #include <dirent.h>
6 #include <fcntl.h>
7 #include <linux/stddef.h>
8 #include <linux/perf_event.h>
9 #include <linux/zalloc.h>
10 #include <api/fs/fs.h>
11 #include <api/io_dir.h>
12 #include <internal/cpumap.h>
13 #include <errno.h>
14
15 #include "../../../util/intel-pt.h"
16 #include "../../../util/intel-bts.h"
17 #include "../../../util/pmu.h"
18 #include "../../../util/fncache.h"
19 #include "../../../util/pmus.h"
20 #include "mem-events.h"
21 #include "util/debug.h"
22 #include "util/env.h"
23 #include "util/header.h"
24
x86__is_intel_graniterapids(void)25 static bool x86__is_intel_graniterapids(void)
26 {
27 static bool checked_if_graniterapids;
28 static bool is_graniterapids;
29
30 if (!checked_if_graniterapids) {
31 const char *graniterapids_cpuid = "GenuineIntel-6-A[DE]";
32 char *cpuid = get_cpuid_str((struct perf_cpu){0});
33
34 is_graniterapids = cpuid && strcmp_cpuid_str(graniterapids_cpuid, cpuid) == 0;
35 free(cpuid);
36 checked_if_graniterapids = true;
37 }
38 return is_graniterapids;
39 }
40
/*
 * Read a CPU list file under sysfs and parse it into a perf_cpu_map.
 * Returns NULL if the file can't be read; caller owns the returned map.
 */
static struct perf_cpu_map *read_sysfs_cpu_map(const char *sysfs_path)
{
	struct perf_cpu_map *result = NULL;
	char *contents = NULL;
	size_t contents_len;

	/* sysfs__read_str allocates contents on success. */
	if (sysfs__read_str(sysfs_path, &contents, &contents_len) >= 0) {
		result = perf_cpu_map__new(contents);
		free(contents);
	}
	return result;
}
54
/*
 * Number of sub-NUMA cluster (SNC) nodes sharing one L3 cache: CPUs
 * sharing CPU0's L3 divided by CPUs in NUMA node 0. Computed once and
 * cached for subsequent calls.
 */
static int snc_nodes_per_l3_cache(void)
{
	static bool cached;
	static int nodes;
	struct perf_cpu_map *node0_cpus, *l3_cpus;

	if (cached)
		return nodes;

	node0_cpus = read_sysfs_cpu_map("devices/system/node/node0/cpulist");
	l3_cpus = read_sysfs_cpu_map("devices/system/cpu/cpu0/cache/index3/shared_cpu_list");
	/*
	 * NOTE(review): relies on perf_cpu_map__nr() never returning 0 for
	 * the node 0 map (libperf treats a NULL map as 1 CPU) — confirm.
	 */
	nodes = perf_cpu_map__nr(l3_cpus) / perf_cpu_map__nr(node0_cpus);
	perf_cpu_map__put(l3_cpus);
	perf_cpu_map__put(node0_cpus);
	cached = true;
	return nodes;
}
73
/* Return true when str begins with prefix (every string starts with ""). */
static bool starts_with(const char *str, const char *prefix)
{
	size_t prefix_len = strlen(prefix);

	return strncmp(str, prefix, prefix_len) == 0;
}
78
num_chas(void)79 static int num_chas(void)
80 {
81 static bool checked_chas;
82 static int num_chas;
83
84 if (!checked_chas) {
85 int fd = perf_pmu__event_source_devices_fd();
86 struct io_dir dir;
87 struct io_dirent64 *dent;
88
89 if (fd < 0)
90 return -1;
91
92 io_dir__init(&dir, fd);
93
94 while ((dent = io_dir__readdir(&dir)) != NULL) {
95 /* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
96 if (starts_with(dent->d_name, "uncore_cha_"))
97 num_chas++;
98 }
99 close(fd);
100 checked_chas = true;
101 }
102 return num_chas;
103 }
104
105 #define MAX_SNCS 6
106
uncore_cha_snc(struct perf_pmu * pmu)107 static int uncore_cha_snc(struct perf_pmu *pmu)
108 {
109 // CHA SNC numbers are ordered correspond to the CHAs number.
110 unsigned int cha_num;
111 int num_cha, chas_per_node, cha_snc;
112 int snc_nodes = snc_nodes_per_l3_cache();
113
114 if (snc_nodes <= 1)
115 return 0;
116
117 num_cha = num_chas();
118 if (num_cha <= 0) {
119 pr_warning("Unexpected: no CHAs found\n");
120 return 0;
121 }
122
123 /* Compute SNC for PMU. */
124 if (sscanf(pmu->name, "uncore_cha_%u", &cha_num) != 1) {
125 pr_warning("Unexpected: unable to compute CHA number '%s'\n", pmu->name);
126 return 0;
127 }
128 chas_per_node = num_cha / snc_nodes;
129 cha_snc = cha_num / chas_per_node;
130
131 /* Range check cha_snc. for unexpected out of bounds. */
132 return cha_snc >= MAX_SNCS ? 0 : cha_snc;
133 }
134
uncore_imc_snc(struct perf_pmu * pmu)135 static int uncore_imc_snc(struct perf_pmu *pmu)
136 {
137 // Compute the IMC SNC using lookup tables.
138 unsigned int imc_num;
139 int snc_nodes = snc_nodes_per_l3_cache();
140 const u8 snc2_map[] = {1, 1, 0, 0, 1, 1, 0, 0};
141 const u8 snc3_map[] = {1, 1, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2};
142 const u8 *snc_map;
143 size_t snc_map_len;
144
145 switch (snc_nodes) {
146 case 2:
147 snc_map = snc2_map;
148 snc_map_len = ARRAY_SIZE(snc2_map);
149 break;
150 case 3:
151 snc_map = snc3_map;
152 snc_map_len = ARRAY_SIZE(snc3_map);
153 break;
154 default:
155 /* Error or no lookup support for SNC with >3 nodes. */
156 return 0;
157 }
158
159 /* Compute SNC for PMU. */
160 if (sscanf(pmu->name, "uncore_imc_%u", &imc_num) != 1) {
161 pr_warning("Unexpected: unable to compute IMC number '%s'\n", pmu->name);
162 return 0;
163 }
164 if (imc_num >= snc_map_len) {
165 pr_warning("Unexpected IMC %d for SNC%d mapping\n", imc_num, snc_nodes);
166 return 0;
167 }
168 return snc_map[imc_num];
169 }
170
uncore_cha_imc_compute_cpu_adjust(int pmu_snc)171 static int uncore_cha_imc_compute_cpu_adjust(int pmu_snc)
172 {
173 static bool checked_cpu_adjust[MAX_SNCS];
174 static int cpu_adjust[MAX_SNCS];
175 struct perf_cpu_map *node_cpus;
176 char node_path[] = "devices/system/node/node0/cpulist";
177
178 /* Was adjust already computed? */
179 if (checked_cpu_adjust[pmu_snc])
180 return cpu_adjust[pmu_snc];
181
182 /* SNC0 doesn't need an adjust. */
183 if (pmu_snc == 0) {
184 cpu_adjust[0] = 0;
185 checked_cpu_adjust[0] = true;
186 return 0;
187 }
188
189 /*
190 * Use NUMA topology to compute first CPU of the NUMA node, we want to
191 * adjust CPU 0 to be this and similarly for other CPUs if there is >1
192 * socket.
193 */
194 assert(pmu_snc >= 0 && pmu_snc <= 9);
195 node_path[24] += pmu_snc; // Shift node0 to be node<pmu_snc>.
196 node_cpus = read_sysfs_cpu_map(node_path);
197 cpu_adjust[pmu_snc] = perf_cpu_map__cpu(node_cpus, 0).cpu;
198 if (cpu_adjust[pmu_snc] < 0) {
199 pr_debug("Failed to read valid CPU list from <sysfs>/%s\n", node_path);
200 cpu_adjust[pmu_snc] = 0;
201 } else {
202 checked_cpu_adjust[pmu_snc] = true;
203 }
204 perf_cpu_map__put(node_cpus);
205 return cpu_adjust[pmu_snc];
206 }
207
static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool cha)
{
	// With sub-NUMA clustering (SNC) there is a NUMA node per SNC in the
	// topology. For example, a two socket graniterapids machine may be set
	// up with 3-way SNC meaning there are 6 NUMA nodes that should be
	// displayed with --per-node. The cpumask of the CHA and IMC PMUs
	// reflects per-socket information meaning, for example, uncore_cha_60
	// on a two socket graniterapids machine with 120 cores per socket will
	// have a cpumask of "0,120". This cpumask needs adjusting to "40,160"
	// to reflect that uncore_cha_60 is used for the 2nd SNC of each
	// socket. Without the adjustment events on uncore_cha_60 will appear in
	// node 0 and node 3 (in our example 2 socket 3-way set up), but with
	// the adjustment they will appear in node 1 and node 4. The number of
	// CHAs is typically larger than the number of cores. The CHA numbers
	// are assumed to split evenly and inorder wrt core numbers. There are
	// fewer memory IMC PMUs than cores and mapping is handled using lookup
	// tables.
	//
	// One adjusted cpumask is cached per SNC node (and per PMU kind), so
	// later PMUs mapping to the same SNC reuse it instead of recomputing.
	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
	int idx, pmu_snc, cpu_adjust;
	struct perf_cpu cpu;
	bool alloc;

	// Cpus from the kernel holds first CPU of each socket. e.g. 0,120.
	if (perf_cpu_map__cpu(pmu->cpus, 0).cpu != 0) {
		pr_debug("Ignoring cpumask adjust for %s as unexpected first CPU\n", pmu->name);
		return;
	}

	// Which SNC node this PMU belongs to; 0 also covers error cases.
	pmu_snc = cha ? uncore_cha_snc(pmu) : uncore_imc_snc(pmu);
	if (pmu_snc == 0) {
		// No adjustment necessary for the first SNC.
		return;
	}

	alloc = adjusted[pmu_snc] == NULL;
	if (alloc) {
		// Hold onto the perf_cpu_map globally to avoid recomputation.
		cpu_adjust = uncore_cha_imc_compute_cpu_adjust(pmu_snc);
		adjusted[pmu_snc] = perf_cpu_map__empty_new(perf_cpu_map__nr(pmu->cpus));
		if (!adjusted[pmu_snc])
			return;
	}

	perf_cpu_map__for_each_cpu(cpu, idx, pmu->cpus) {
		// Compute the new cpu map values or if not allocating, assert
		// that they match expectations. asserts will be removed to
		// avoid overhead in NDEBUG builds.
		if (alloc) {
			// Write directly into the freshly allocated map's slots.
			RC_CHK_ACCESS(adjusted[pmu_snc])->map[idx].cpu = cpu.cpu + cpu_adjust;
		} else if (idx == 0) {
			// Derive the adjust from the cached map's first entry.
			cpu_adjust = perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu - cpu.cpu;
			assert(uncore_cha_imc_compute_cpu_adjust(pmu_snc) == cpu_adjust);
		} else {
			assert(perf_cpu_map__cpu(adjusted[pmu_snc], idx).cpu ==
			       cpu.cpu + cpu_adjust);
		}
	}

	// Swap the PMU's cpumask for the adjusted (shared, refcounted) one.
	perf_cpu_map__put(pmu->cpus);
	pmu->cpus = perf_cpu_map__get(adjusted[pmu_snc]);
}
271
perf_pmu__arch_init(struct perf_pmu * pmu)272 void perf_pmu__arch_init(struct perf_pmu *pmu)
273 {
274 struct perf_pmu_caps *ldlat_cap;
275
276 #ifdef HAVE_AUXTRACE_SUPPORT
277 if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
278 pmu->auxtrace = true;
279 pmu->selectable = true;
280 pmu->perf_event_attr_init_default = intel_pt_pmu_default_config;
281 }
282 if (!strcmp(pmu->name, INTEL_BTS_PMU_NAME)) {
283 pmu->auxtrace = true;
284 pmu->selectable = true;
285 }
286 #endif
287
288 if (x86__is_amd_cpu()) {
289 if (strcmp(pmu->name, "ibs_op"))
290 return;
291
292 pmu->mem_events = perf_mem_events_amd;
293
294 if (!perf_pmu__caps_parse(pmu))
295 return;
296
297 ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
298 if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
299 return;
300
301 perf_mem_events__loads_ldlat = 0;
302 pmu->mem_events = perf_mem_events_amd_ldlat;
303 } else {
304 if (pmu->is_core) {
305 if (perf_pmu__have_event(pmu, "mem-loads-aux"))
306 pmu->mem_events = perf_mem_events_intel_aux;
307 else
308 pmu->mem_events = perf_mem_events_intel;
309 } else if (x86__is_intel_graniterapids()) {
310 if (starts_with(pmu->name, "uncore_cha_"))
311 gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
312 else if (starts_with(pmu->name, "uncore_imc_"))
313 gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
314 }
315 }
316 }
317