1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Performance event support for the System z CPU-measurement Sampling Facility
4 *
5 * Copyright IBM Corp. 2013, 2018
6 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
7 */
8 #define KMSG_COMPONENT "cpum_sf"
9 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
10
11 #include <linux/kernel.h>
12 #include <linux/kernel_stat.h>
13 #include <linux/perf_event.h>
14 #include <linux/percpu.h>
15 #include <linux/pid.h>
16 #include <linux/notifier.h>
17 #include <linux/export.h>
18 #include <linux/slab.h>
19 #include <linux/mm.h>
20 #include <linux/moduleparam.h>
21 #include <asm/cpu_mf.h>
22 #include <asm/irq.h>
23 #include <asm/debug.h>
24 #include <asm/timex.h>
25 #include <asm-generic/io.h>
26
/*
 * Minimum number of sample-data-block-tables (SDBT).
 *
 * The sampling buffer structure needs at least one table. With 4KB pages
 * a single table holds up to 511 pointers to sample-data-blocks (SDB).
 */
#define CPUM_SF_MIN_SDBT	1

/*
 * Number of SDB pointers per SDBT.
 *
 * A table page consists of 8-byte SDB pointers plus one 8-byte table-link
 * entry that points to the origin of the next SDBT.
 */
#define CPUM_SF_SDB_PER_TABLE	((PAGE_SIZE - 8) / 8)

/*
 * Page offset of the table-link entry within an SDBT.
 *
 * Once this offset is reached, the slot must be filled with a table-link
 * entry to the next SDBT instead of an SDB pointer.
 */
#define CPUM_SF_SDBT_TL_OFFSET	(CPUM_SF_SDB_PER_TABLE * 8)
require_table_link(const void * sdbt)44 static inline int require_table_link(const void *sdbt)
45 {
46 return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
47 }
48
49 /* Minimum and maximum sampling buffer sizes:
50 *
51 * This number represents the maximum size of the sampling buffer taking
52 * the number of sample-data-block-tables into account. Note that these
53 * numbers apply to the basic-sampling function only.
54 * The maximum number of SDBs is increased by CPUM_SF_SDB_DIAG_FACTOR if
55 * the diagnostic-sampling function is active.
56 *
57 * Sampling buffer size Buffer characteristics
58 * ---------------------------------------------------
59 * 64KB == 16 pages (4KB per page)
60 * 1 page for SDB-tables
61 * 15 pages for SDBs
62 *
63 * 32MB == 8192 pages (4KB per page)
64 * 16 pages for SDB-tables
65 * 8176 pages for SDBs
66 */
67 static unsigned long __read_mostly CPUM_SF_MIN_SDB = 15;
68 static unsigned long __read_mostly CPUM_SF_MAX_SDB = 8176;
69 static unsigned long __read_mostly CPUM_SF_SDB_DIAG_FACTOR = 1;
70
/* Bookkeeping for one chained SDBT/SDB sampling buffer */
struct sf_buffer {
	/* Origin of the first sample-data-block-table */
	unsigned long *sdbt;
	/* Buffer characteristics (required for buffer increments) */
	/* Number of sample-data-blocks */
	unsigned long num_sdb;
	/* Number of sample-data-block-tables */
	unsigned long num_sdbt;
	/* Table-link entry of the last sample-data-block-table */
	unsigned long *tail;
};
78
79 struct aux_buffer {
80 struct sf_buffer sfb;
81 unsigned long head; /* index of SDB of buffer head */
82 unsigned long alert_mark; /* index of SDB of alert request position */
83 unsigned long empty_mark; /* mark of SDB not marked full */
84 unsigned long *sdb_index; /* SDB address for fast lookup */
85 unsigned long *sdbt_index; /* SDBT address for fast lookup */
86 };
87
88 struct cpu_hw_sf {
89 /* CPU-measurement sampling information block */
90 struct hws_qsi_info_block qsi;
91 /* CPU-measurement sampling control block */
92 struct hws_lsctl_request_block lsctl;
93 struct sf_buffer sfb; /* Sampling buffer */
94 unsigned int flags; /* Status flags */
95 struct perf_event *event; /* Scheduled perf event */
96 struct perf_output_handle handle; /* AUX buffer output handle */
97 };
98 static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);
99
100 /* Debug feature */
101 static debug_info_t *sfdbg;
102
103 /* Sampling control helper functions */
freq_to_sample_rate(struct hws_qsi_info_block * qsi,unsigned long freq)104 static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
105 unsigned long freq)
106 {
107 return (USEC_PER_SEC / freq) * qsi->cpu_speed;
108 }
109
sample_rate_to_freq(struct hws_qsi_info_block * qsi,unsigned long rate)110 static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
111 unsigned long rate)
112 {
113 return USEC_PER_SEC * qsi->cpu_speed / rate;
114 }
115
116 /* Return TOD timestamp contained in an trailer entry */
trailer_timestamp(struct hws_trailer_entry * te)117 static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
118 {
119 /* TOD in STCKE format */
120 if (te->header.t)
121 return *((unsigned long long *)&te->timestamp[1]);
122
123 /* TOD in STCK format */
124 return *((unsigned long long *)&te->timestamp[0]);
125 }
126
127 /* Return pointer to trailer entry of an sample data block */
trailer_entry_ptr(unsigned long v)128 static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v)
129 {
130 void *ret;
131
132 ret = (void *)v;
133 ret += PAGE_SIZE;
134 ret -= sizeof(struct hws_trailer_entry);
135
136 return ret;
137 }
138
/*
 * Return 1 if the sample-data-block-table entry @s is a link to the next
 * SDBT (lowest bit set), 0 otherwise.
 */
static inline int is_link_entry(unsigned long *s)
{
	return (*s & 0x1UL) != 0;
}
147
/*
 * Return the virtual address of the SDBT a table-link entry points to.
 * The link marker in the lowest bit is stripped before the conversion.
 */
static inline unsigned long *get_next_sdbt(unsigned long *s)
{
	unsigned long origin = *s & ~0x1UL;

	return phys_to_virt(origin);
}
153
154 /*
155 * sf_disable() - Switch off sampling facility
156 */
sf_disable(void)157 static int sf_disable(void)
158 {
159 struct hws_lsctl_request_block sreq;
160
161 memset(&sreq, 0, sizeof(sreq));
162 return lsctl(&sreq);
163 }
164
165 /*
166 * sf_buffer_available() - Check for an allocated sampling buffer
167 */
sf_buffer_available(struct cpu_hw_sf * cpuhw)168 static int sf_buffer_available(struct cpu_hw_sf *cpuhw)
169 {
170 return !!cpuhw->sfb.sdbt;
171 }
172
173 /*
174 * deallocate sampling facility buffer
175 */
free_sampling_buffer(struct sf_buffer * sfb)176 static void free_sampling_buffer(struct sf_buffer *sfb)
177 {
178 unsigned long *sdbt, *curr;
179
180 if (!sfb->sdbt)
181 return;
182
183 sdbt = sfb->sdbt;
184 curr = sdbt;
185
186 /* Free the SDBT after all SDBs are processed... */
187 while (1) {
188 if (!*curr || !sdbt)
189 break;
190
191 /* Process table-link entries */
192 if (is_link_entry(curr)) {
193 curr = get_next_sdbt(curr);
194 if (sdbt)
195 free_page((unsigned long) sdbt);
196
197 /* If the origin is reached, sampling buffer is freed */
198 if (curr == sfb->sdbt)
199 break;
200 else
201 sdbt = curr;
202 } else {
203 /* Process SDB pointer */
204 if (*curr) {
205 free_page((unsigned long)phys_to_virt(*curr));
206 curr++;
207 }
208 }
209 }
210
211 debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__,
212 (unsigned long)sfb->sdbt);
213 memset(sfb, 0, sizeof(*sfb));
214 }
215
/*
 * Allocate one zeroed sample-data-block, set the alert-request bit in its
 * trailer and store its physical address in the table slot @sdbt.
 *
 * Returns zero on success, -ENOMEM if no page could be allocated.
 */
static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
	unsigned long page = get_zeroed_page(gfp_flags);

	if (!page)
		return -ENOMEM;

	/* Initialize the trailer of the new sample-data-block */
	trailer_entry_ptr(page)->header.a = 1;

	/* Link SDB into the sample-data-block-table */
	*sdbt = virt_to_phys((void *)page);

	return 0;
}
233
234 /*
235 * realloc_sampling_buffer() - extend sampler memory
236 *
237 * Allocates new sample-data-blocks and adds them to the specified sampling
238 * buffer memory.
239 *
240 * Important: This modifies the sampling buffer and must be called when the
241 * sampling facility is disabled.
242 *
243 * Returns zero on success, non-zero otherwise.
244 */
realloc_sampling_buffer(struct sf_buffer * sfb,unsigned long num_sdb,gfp_t gfp_flags)245 static int realloc_sampling_buffer(struct sf_buffer *sfb,
246 unsigned long num_sdb, gfp_t gfp_flags)
247 {
248 int i, rc;
249 unsigned long *new, *tail, *tail_prev = NULL;
250
251 if (!sfb->sdbt || !sfb->tail)
252 return -EINVAL;
253
254 if (!is_link_entry(sfb->tail))
255 return -EINVAL;
256
257 /* Append to the existing sampling buffer, overwriting the table-link
258 * register.
259 * The tail variables always points to the "tail" (last and table-link)
260 * entry in an SDB-table.
261 */
262 tail = sfb->tail;
263
264 /* Do a sanity check whether the table-link entry points to
265 * the sampling buffer origin.
266 */
267 if (sfb->sdbt != get_next_sdbt(tail)) {
268 debug_sprintf_event(sfdbg, 3, "%s: "
269 "sampling buffer is not linked: origin %#lx"
270 " tail %#lx\n", __func__,
271 (unsigned long)sfb->sdbt,
272 (unsigned long)tail);
273 return -EINVAL;
274 }
275
276 /* Allocate remaining SDBs */
277 rc = 0;
278 for (i = 0; i < num_sdb; i++) {
279 /* Allocate a new SDB-table if it is full. */
280 if (require_table_link(tail)) {
281 new = (unsigned long *) get_zeroed_page(gfp_flags);
282 if (!new) {
283 rc = -ENOMEM;
284 break;
285 }
286 sfb->num_sdbt++;
287 /* Link current page to tail of chain */
288 *tail = virt_to_phys((void *)new) + 1;
289 tail_prev = tail;
290 tail = new;
291 }
292
293 /* Allocate a new sample-data-block.
294 * If there is not enough memory, stop the realloc process
295 * and simply use what was allocated. If this is a temporary
296 * issue, a new realloc call (if required) might succeed.
297 */
298 rc = alloc_sample_data_block(tail, gfp_flags);
299 if (rc) {
300 /* Undo last SDBT. An SDBT with no SDB at its first
301 * entry but with an SDBT entry instead can not be
302 * handled by the interrupt handler code.
303 * Avoid this situation.
304 */
305 if (tail_prev) {
306 sfb->num_sdbt--;
307 free_page((unsigned long) new);
308 tail = tail_prev;
309 }
310 break;
311 }
312 sfb->num_sdb++;
313 tail++;
314 tail_prev = new = NULL; /* Allocated at least one SBD */
315 }
316
317 /* Link sampling buffer to its origin */
318 *tail = virt_to_phys(sfb->sdbt) + 1;
319 sfb->tail = tail;
320
321 debug_sprintf_event(sfdbg, 4, "%s: new buffer"
322 " settings: sdbt %lu sdb %lu\n", __func__,
323 sfb->num_sdbt, sfb->num_sdb);
324 return rc;
325 }
326
327 /*
328 * allocate_sampling_buffer() - allocate sampler memory
329 *
330 * Allocates and initializes a sampling buffer structure using the
331 * specified number of sample-data-blocks (SDB). For each allocation,
332 * a 4K page is used. The number of sample-data-block-tables (SDBT)
333 * are calculated from SDBs.
334 * Also set the ALERT_REQ mask in each SDBs trailer.
335 *
336 * Returns zero on success, non-zero otherwise.
337 */
alloc_sampling_buffer(struct sf_buffer * sfb,unsigned long num_sdb)338 static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
339 {
340 int rc;
341
342 if (sfb->sdbt)
343 return -EINVAL;
344
345 /* Allocate the sample-data-block-table origin */
346 sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
347 if (!sfb->sdbt)
348 return -ENOMEM;
349 sfb->num_sdb = 0;
350 sfb->num_sdbt = 1;
351
352 /* Link the table origin to point to itself to prepare for
353 * realloc_sampling_buffer() invocation.
354 */
355 sfb->tail = sfb->sdbt;
356 *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1;
357
358 /* Allocate requested number of sample-data-blocks */
359 rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
360 if (rc) {
361 free_sampling_buffer(sfb);
362 debug_sprintf_event(sfdbg, 4, "%s: "
363 "realloc_sampling_buffer failed with rc %i\n",
364 __func__, rc);
365 } else
366 debug_sprintf_event(sfdbg, 4,
367 "%s: tear %#lx dear %#lx\n", __func__,
368 (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt);
369 return rc;
370 }
371
sfb_set_limits(unsigned long min,unsigned long max)372 static void sfb_set_limits(unsigned long min, unsigned long max)
373 {
374 struct hws_qsi_info_block si;
375
376 CPUM_SF_MIN_SDB = min;
377 CPUM_SF_MAX_SDB = max;
378
379 memset(&si, 0, sizeof(si));
380 if (!qsi(&si))
381 CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes);
382 }
383
sfb_max_limit(struct hw_perf_event * hwc)384 static unsigned long sfb_max_limit(struct hw_perf_event *hwc)
385 {
386 return SAMPL_DIAG_MODE(hwc) ? CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR
387 : CPUM_SF_MAX_SDB;
388 }
389
sfb_pending_allocs(struct sf_buffer * sfb,struct hw_perf_event * hwc)390 static unsigned long sfb_pending_allocs(struct sf_buffer *sfb,
391 struct hw_perf_event *hwc)
392 {
393 if (!sfb->sdbt)
394 return SFB_ALLOC_REG(hwc);
395 if (SFB_ALLOC_REG(hwc) > sfb->num_sdb)
396 return SFB_ALLOC_REG(hwc) - sfb->num_sdb;
397 return 0;
398 }
399
/* Return 1 if SDB allocations are still outstanding for the event */
static int sfb_has_pending_allocs(struct sf_buffer *sfb,
				   struct hw_perf_event *hwc)
{
	return sfb_pending_allocs(sfb, hwc) != 0;
}
405
sfb_account_allocs(unsigned long num,struct hw_perf_event * hwc)406 static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc)
407 {
408 /* Limit the number of SDBs to not exceed the maximum */
409 num = min_t(unsigned long, num, sfb_max_limit(hwc) - SFB_ALLOC_REG(hwc));
410 if (num)
411 SFB_ALLOC_REG(hwc) += num;
412 }
413
sfb_init_allocs(unsigned long num,struct hw_perf_event * hwc)414 static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
415 {
416 SFB_ALLOC_REG(hwc) = 0;
417 sfb_account_allocs(num, hwc);
418 }
419
deallocate_buffers(struct cpu_hw_sf * cpuhw)420 static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
421 {
422 if (cpuhw->sfb.sdbt)
423 free_sampling_buffer(&cpuhw->sfb);
424 }
425
allocate_buffers(struct cpu_hw_sf * cpuhw,struct hw_perf_event * hwc)426 static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
427 {
428 unsigned long n_sdb, freq;
429 size_t sample_size;
430
431 /* Calculate sampling buffers using 4K pages
432 *
433 * 1. The sampling size is 32 bytes for basic sampling. This size
434 * is the same for all machine types. Diagnostic
435 * sampling uses auxlilary data buffer setup which provides the
436 * memory for SDBs using linux common code auxiliary trace
437 * setup.
438 *
439 * 2. Function alloc_sampling_buffer() sets the Alert Request
440 * Control indicator to trigger a measurement-alert to harvest
441 * sample-data-blocks (SDB). This is done per SDB. This
442 * measurement alert interrupt fires quick enough to handle
443 * one SDB, on very high frequency and work loads there might
444 * be 2 to 3 SBDs available for sample processing.
445 * Currently there is no need for setup alert request on every
446 * n-th page. This is counterproductive as one IRQ triggers
447 * a very high number of samples to be processed at one IRQ.
448 *
449 * 3. Use the sampling frequency as input.
450 * Compute the number of SDBs and ensure a minimum
451 * of CPUM_SF_MIN_SDB. Depending on frequency add some more
452 * SDBs to handle a higher sampling rate.
453 * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
454 * (one SDB) for every 10000 HZ frequency increment.
455 *
456 * 4. Compute the number of sample-data-block-tables (SDBT) and
457 * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
458 * to 511 SDBs).
459 */
460 sample_size = sizeof(struct hws_basic_entry);
461 freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
462 n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);
463
464 /* If there is already a sampling buffer allocated, it is very likely
465 * that the sampling facility is enabled too. If the event to be
466 * initialized requires a greater sampling buffer, the allocation must
467 * be postponed. Changing the sampling buffer requires the sampling
468 * facility to be in the disabled state. So, account the number of
469 * required SDBs and let cpumsf_pmu_enable() resize the buffer just
470 * before the event is started.
471 */
472 sfb_init_allocs(n_sdb, hwc);
473 if (sf_buffer_available(cpuhw))
474 return 0;
475
476 debug_sprintf_event(sfdbg, 3,
477 "%s: rate %lu f %lu sdb %lu/%lu"
478 " sample_size %lu cpuhw %p\n", __func__,
479 SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc),
480 sample_size, cpuhw);
481
482 return alloc_sampling_buffer(&cpuhw->sfb,
483 sfb_pending_allocs(&cpuhw->sfb, hwc));
484 }
485
min_percent(unsigned int percent,unsigned long base,unsigned long min)486 static unsigned long min_percent(unsigned int percent, unsigned long base,
487 unsigned long min)
488 {
489 return min_t(unsigned long, min, DIV_ROUND_UP(percent * base, 100));
490 }
491
/*
 * Compute by how many SDBs the sampling buffer should grow.
 *
 * @ratio is the sample loss in percent, @base the current number of
 * sample-data-blocks. A loss of up to 5% is accepted without growing the
 * buffer; above that the extent is a percentage of @base selected by the
 * loss ratio, with the upper bound that min_percent() applies.
 */
static unsigned long compute_sfb_extent(unsigned long ratio, unsigned long base)
{
	if (ratio <= 5)
		return 0;

	/* The 5..25 and 25..50 bands use the same extent */
	if (ratio <= 50)
		return min_percent(1, base, 1);
	if (ratio <= 75)
		return min_percent(2, base, 2);
	if (ratio <= 100)
		return min_percent(3, base, 3);
	if (ratio <= 250)
		return min_percent(4, base, 4);

	return min_percent(5, base, 8);
}
514
sfb_account_overflows(struct cpu_hw_sf * cpuhw,struct hw_perf_event * hwc)515 static void sfb_account_overflows(struct cpu_hw_sf *cpuhw,
516 struct hw_perf_event *hwc)
517 {
518 unsigned long ratio, num;
519
520 if (!OVERFLOW_REG(hwc))
521 return;
522
523 /* The sample_overflow contains the average number of sample data
524 * that has been lost because sample-data-blocks were full.
525 *
526 * Calculate the total number of sample data entries that has been
527 * discarded. Then calculate the ratio of lost samples to total samples
528 * per second in percent.
529 */
530 ratio = DIV_ROUND_UP(100 * OVERFLOW_REG(hwc) * cpuhw->sfb.num_sdb,
531 sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)));
532
533 /* Compute number of sample-data-blocks */
534 num = compute_sfb_extent(ratio, cpuhw->sfb.num_sdb);
535 if (num)
536 sfb_account_allocs(num, hwc);
537
538 debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n",
539 __func__, OVERFLOW_REG(hwc), ratio, num);
540 OVERFLOW_REG(hwc) = 0;
541 }
542
543 /* extend_sampling_buffer() - Extend sampling buffer
544 * @sfb: Sampling buffer structure (for local CPU)
545 * @hwc: Perf event hardware structure
546 *
547 * Use this function to extend the sampling buffer based on the overflow counter
548 * and postponed allocation extents stored in the specified Perf event hardware.
549 *
550 * Important: This function disables the sampling facility in order to safely
551 * change the sampling buffer structure. Do not call this function
552 * when the PMU is active.
553 */
extend_sampling_buffer(struct sf_buffer * sfb,struct hw_perf_event * hwc)554 static void extend_sampling_buffer(struct sf_buffer *sfb,
555 struct hw_perf_event *hwc)
556 {
557 unsigned long num, num_old;
558 int rc;
559
560 num = sfb_pending_allocs(sfb, hwc);
561 if (!num)
562 return;
563 num_old = sfb->num_sdb;
564
565 /* Disable the sampling facility to reset any states and also
566 * clear pending measurement alerts.
567 */
568 sf_disable();
569
570 /* Extend the sampling buffer.
571 * This memory allocation typically happens in an atomic context when
572 * called by perf. Because this is a reallocation, it is fine if the
573 * new SDB-request cannot be satisfied immediately.
574 */
575 rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC);
576 if (rc)
577 debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n",
578 __func__, rc);
579
580 if (sfb_has_pending_allocs(sfb, hwc))
581 debug_sprintf_event(sfdbg, 5, "%s: "
582 "req %lu alloc %lu remaining %lu\n",
583 __func__, num, sfb->num_sdb - num_old,
584 sfb_pending_allocs(sfb, hwc));
585 }
586
587 /* Number of perf events counting hardware events */
588 static atomic_t num_events;
589 /* Used to avoid races in calling reserve/release_cpumf_hardware */
590 static DEFINE_MUTEX(pmc_reserve_mutex);
591
592 #define PMC_INIT 0
593 #define PMC_RELEASE 1
594 #define PMC_FAILURE 2
setup_pmc_cpu(void * flags)595 static void setup_pmc_cpu(void *flags)
596 {
597 int err;
598 struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf);
599
600 err = 0;
601 switch (*((int *) flags)) {
602 case PMC_INIT:
603 memset(cpusf, 0, sizeof(*cpusf));
604 err = qsi(&cpusf->qsi);
605 if (err)
606 break;
607 cpusf->flags |= PMU_F_RESERVED;
608 err = sf_disable();
609 if (err)
610 pr_err("Switching off the sampling facility failed "
611 "with rc %i\n", err);
612 break;
613 case PMC_RELEASE:
614 cpusf->flags &= ~PMU_F_RESERVED;
615 err = sf_disable();
616 if (err) {
617 pr_err("Switching off the sampling facility failed "
618 "with rc %i\n", err);
619 } else
620 deallocate_buffers(cpusf);
621 break;
622 }
623 if (err)
624 *((int *) flags) |= PMC_FAILURE;
625 }
626
release_pmc_hardware(void)627 static void release_pmc_hardware(void)
628 {
629 int flags = PMC_RELEASE;
630
631 irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
632 on_each_cpu(setup_pmc_cpu, &flags, 1);
633 }
634
reserve_pmc_hardware(void)635 static int reserve_pmc_hardware(void)
636 {
637 int flags = PMC_INIT;
638
639 on_each_cpu(setup_pmc_cpu, &flags, 1);
640 if (flags & PMC_FAILURE) {
641 release_pmc_hardware();
642 return -ENODEV;
643 }
644 irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
645
646 return 0;
647 }
648
hw_perf_event_destroy(struct perf_event * event)649 static void hw_perf_event_destroy(struct perf_event *event)
650 {
651 /* Release PMC if this is the last perf event */
652 if (!atomic_add_unless(&num_events, -1, 1)) {
653 mutex_lock(&pmc_reserve_mutex);
654 if (atomic_dec_return(&num_events) == 0)
655 release_pmc_hardware();
656 mutex_unlock(&pmc_reserve_mutex);
657 }
658 }
659
/* Initialize all period fields of the hardware event with @period */
static void hw_init_period(struct hw_perf_event *hwc, u64 period)
{
	hwc->sample_period = period;
	hwc->last_period = period;
	local64_set(&hwc->period_left, period);
}
666
hw_limit_rate(const struct hws_qsi_info_block * si,unsigned long rate)667 static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si,
668 unsigned long rate)
669 {
670 return clamp_t(unsigned long, rate,
671 si->min_sampl_rate, si->max_sampl_rate);
672 }
673
/*
 * Translate @pid, looked up in the initial pid namespace, into the id of
 * kind @type as seen from the namespace of the (top level) event.
 *
 * Returns 0 for the idle process and -1 if the task cannot be found or
 * is no longer alive.
 */
static u32 cpumsf_pid_type(struct perf_event *event,
			   u32 pid, enum pid_type type)
{
	struct task_struct *tsk;

	/* Idle process */
	if (!pid)
		return pid;

	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (!tsk)
		return -1;

	/*
	 * Only top level events contain the pid namespace in which
	 * they are created.
	 */
	if (event->parent)
		event = event->parent;
	pid = __task_pid_nr_ns(tsk, type, event->ns);

	/*
	 * See also 1d953111b648
	 * "perf/core: Don't report zero PIDs for exiting tasks".
	 */
	if (!pid && !pid_alive(tsk))
		pid = -1;

	return pid;
}
703
cpumsf_output_event_pid(struct perf_event * event,struct perf_sample_data * data,struct pt_regs * regs)704 static void cpumsf_output_event_pid(struct perf_event *event,
705 struct perf_sample_data *data,
706 struct pt_regs *regs)
707 {
708 u32 pid;
709 struct perf_event_header header;
710 struct perf_output_handle handle;
711
712 /*
713 * Obtain the PID from the basic-sampling data entry and
714 * correct the data->tid_entry.pid value.
715 */
716 pid = data->tid_entry.pid;
717
718 /* Protect callchain buffers, tasks */
719 rcu_read_lock();
720
721 perf_prepare_sample(data, event, regs);
722 perf_prepare_header(&header, data, event, regs);
723 if (perf_output_begin(&handle, data, event, header.size))
724 goto out;
725
726 /* Update the process ID (see also kernel/events/core.c) */
727 data->tid_entry.pid = cpumsf_pid_type(event, pid, PIDTYPE_TGID);
728 data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID);
729
730 perf_output_sample(&handle, &header, data, event);
731 perf_output_end(&handle);
732 out:
733 rcu_read_unlock();
734 }
735
getrate(bool freq,unsigned long sample,struct hws_qsi_info_block * si)736 static unsigned long getrate(bool freq, unsigned long sample,
737 struct hws_qsi_info_block *si)
738 {
739 unsigned long rate;
740
741 if (freq) {
742 rate = freq_to_sample_rate(si, sample);
743 rate = hw_limit_rate(si, rate);
744 } else {
745 /* The min/max sampling rates specifies the valid range
746 * of sample periods. If the specified sample period is
747 * out of range, limit the period to the range boundary.
748 */
749 rate = hw_limit_rate(si, sample);
750
751 /* The perf core maintains a maximum sample rate that is
752 * configurable through the sysctl interface. Ensure the
753 * sampling rate does not exceed this value. This also helps
754 * to avoid throttling when pushing samples with
755 * perf_event_overflow().
756 */
757 if (sample_rate_to_freq(si, rate) >
758 sysctl_perf_event_sample_rate) {
759 debug_sprintf_event(sfdbg, 1, "%s: "
760 "Sampling rate exceeds maximum "
761 "perf sample rate\n", __func__);
762 rate = 0;
763 }
764 }
765 return rate;
766 }
767
768 /* The sampling information (si) contains information about the
769 * min/max sampling intervals and the CPU speed. So calculate the
770 * correct sampling interval and avoid the whole period adjust
771 * feedback loop.
772 *
773 * Since the CPU Measurement sampling facility can not handle frequency
774 * calculate the sampling interval when frequency is specified using
775 * this formula:
776 * interval := cpu_speed * 1000000 / sample_freq
777 *
778 * Returns errno on bad input and zero on success with parameter interval
779 * set to the correct sampling rate.
780 *
781 * Note: This function turns off freq bit to avoid calling function
782 * perf_adjust_period(). This causes frequency adjustment in the common
783 * code part which causes tremendous variations in the counter values.
784 */
__hw_perf_event_init_rate(struct perf_event * event,struct hws_qsi_info_block * si)785 static int __hw_perf_event_init_rate(struct perf_event *event,
786 struct hws_qsi_info_block *si)
787 {
788 struct perf_event_attr *attr = &event->attr;
789 struct hw_perf_event *hwc = &event->hw;
790 unsigned long rate;
791
792 if (attr->freq) {
793 if (!attr->sample_freq)
794 return -EINVAL;
795 rate = getrate(attr->freq, attr->sample_freq, si);
796 attr->freq = 0; /* Don't call perf_adjust_period() */
797 SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE;
798 } else {
799 rate = getrate(attr->freq, attr->sample_period, si);
800 if (!rate)
801 return -EINVAL;
802 }
803 attr->sample_period = rate;
804 SAMPL_RATE(hwc) = rate;
805 hw_init_period(hwc, SAMPL_RATE(hwc));
806 debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n",
807 __func__, event->cpu, event->attr.sample_period,
808 event->attr.freq, SAMPLE_FREQ_MODE(hwc));
809 return 0;
810 }
811
__hw_perf_event_init(struct perf_event * event)812 static int __hw_perf_event_init(struct perf_event *event)
813 {
814 struct cpu_hw_sf *cpuhw;
815 struct hws_qsi_info_block si;
816 struct perf_event_attr *attr = &event->attr;
817 struct hw_perf_event *hwc = &event->hw;
818 int cpu, err;
819
820 /* Reserve CPU-measurement sampling facility */
821 err = 0;
822 if (!atomic_inc_not_zero(&num_events)) {
823 mutex_lock(&pmc_reserve_mutex);
824 if (atomic_read(&num_events) == 0 && reserve_pmc_hardware())
825 err = -EBUSY;
826 else
827 atomic_inc(&num_events);
828 mutex_unlock(&pmc_reserve_mutex);
829 }
830 event->destroy = hw_perf_event_destroy;
831
832 if (err)
833 goto out;
834
835 /* Access per-CPU sampling information (query sampling info) */
836 /*
837 * The event->cpu value can be -1 to count on every CPU, for example,
838 * when attaching to a task. If this is specified, use the query
839 * sampling info from the current CPU, otherwise use event->cpu to
840 * retrieve the per-CPU information.
841 * Later, cpuhw indicates whether to allocate sampling buffers for a
842 * particular CPU (cpuhw!=NULL) or each online CPU (cpuw==NULL).
843 */
844 memset(&si, 0, sizeof(si));
845 cpuhw = NULL;
846 if (event->cpu == -1)
847 qsi(&si);
848 else {
849 /* Event is pinned to a particular CPU, retrieve the per-CPU
850 * sampling structure for accessing the CPU-specific QSI.
851 */
852 cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
853 si = cpuhw->qsi;
854 }
855
856 /* Check sampling facility authorization and, if not authorized,
857 * fall back to other PMUs. It is safe to check any CPU because
858 * the authorization is identical for all configured CPUs.
859 */
860 if (!si.as) {
861 err = -ENOENT;
862 goto out;
863 }
864
865 if (si.ribm & CPU_MF_SF_RIBM_NOTAV) {
866 pr_warn("CPU Measurement Facility sampling is temporarily not available\n");
867 err = -EBUSY;
868 goto out;
869 }
870
871 /* Always enable basic sampling */
872 SAMPL_FLAGS(hwc) = PERF_CPUM_SF_BASIC_MODE;
873
874 /* Check if diagnostic sampling is requested. Deny if the required
875 * sampling authorization is missing.
876 */
877 if (attr->config == PERF_EVENT_CPUM_SF_DIAG) {
878 if (!si.ad) {
879 err = -EPERM;
880 goto out;
881 }
882 SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
883 }
884
885 /* Check and set other sampling flags */
886 if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
887 SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;
888
889 err = __hw_perf_event_init_rate(event, &si);
890 if (err)
891 goto out;
892
893 /* Initialize sample data overflow accounting */
894 hwc->extra_reg.reg = REG_OVERFLOW;
895 OVERFLOW_REG(hwc) = 0;
896
897 /* Use AUX buffer. No need to allocate it by ourself */
898 if (attr->config == PERF_EVENT_CPUM_SF_DIAG)
899 return 0;
900
901 /* Allocate the per-CPU sampling buffer using the CPU information
902 * from the event. If the event is not pinned to a particular
903 * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling
904 * buffers for each online CPU.
905 */
906 if (cpuhw)
907 /* Event is pinned to a particular CPU */
908 err = allocate_buffers(cpuhw, hwc);
909 else {
910 /* Event is not pinned, allocate sampling buffer on
911 * each online CPU
912 */
913 for_each_online_cpu(cpu) {
914 cpuhw = &per_cpu(cpu_hw_sf, cpu);
915 err = allocate_buffers(cpuhw, hwc);
916 if (err)
917 break;
918 }
919 }
920
921 /* If PID/TID sampling is active, replace the default overflow
922 * handler to extract and resolve the PIDs from the basic-sampling
923 * data entries.
924 */
925 if (event->attr.sample_type & PERF_SAMPLE_TID)
926 if (is_default_overflow_handler(event))
927 event->overflow_handler = cpumsf_output_event_pid;
928 out:
929 return err;
930 }
931
is_callchain_event(struct perf_event * event)932 static bool is_callchain_event(struct perf_event *event)
933 {
934 u64 sample_type = event->attr.sample_type;
935
936 return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
937 PERF_SAMPLE_STACK_USER);
938 }
939
cpumsf_pmu_event_init(struct perf_event * event)940 static int cpumsf_pmu_event_init(struct perf_event *event)
941 {
942 int err;
943
944 /* No support for taken branch sampling */
945 /* No support for callchain, stacks and registers */
946 if (has_branch_stack(event) || is_callchain_event(event))
947 return -EOPNOTSUPP;
948
949 switch (event->attr.type) {
950 case PERF_TYPE_RAW:
951 if ((event->attr.config != PERF_EVENT_CPUM_SF) &&
952 (event->attr.config != PERF_EVENT_CPUM_SF_DIAG))
953 return -ENOENT;
954 break;
955 case PERF_TYPE_HARDWARE:
956 /* Support sampling of CPU cycles in addition to the
957 * counter facility. However, the counter facility
958 * is more precise and, hence, restrict this PMU to
959 * sampling events only.
960 */
961 if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES)
962 return -ENOENT;
963 if (!is_sampling_event(event))
964 return -ENOENT;
965 break;
966 default:
967 return -ENOENT;
968 }
969
970 /* Check online status of the CPU to which the event is pinned */
971 if (event->cpu >= 0 && !cpu_online(event->cpu))
972 return -ENODEV;
973
974 /* Force reset of idle/hv excludes regardless of what the
975 * user requested.
976 */
977 if (event->attr.exclude_hv)
978 event->attr.exclude_hv = 0;
979 if (event->attr.exclude_idle)
980 event->attr.exclude_idle = 0;
981
982 err = __hw_perf_event_init(event);
983 if (unlikely(err))
984 if (event->destroy)
985 event->destroy(event);
986 return err;
987 }
988
cpumsf_pmu_enable(struct pmu * pmu)989 static void cpumsf_pmu_enable(struct pmu *pmu)
990 {
991 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
992 struct hw_perf_event *hwc;
993 int err;
994
995 if (cpuhw->flags & PMU_F_ENABLED)
996 return;
997
998 if (cpuhw->flags & PMU_F_ERR_MASK)
999 return;
1000
1001 /* Check whether to extent the sampling buffer.
1002 *
1003 * Two conditions trigger an increase of the sampling buffer for a
1004 * perf event:
1005 * 1. Postponed buffer allocations from the event initialization.
1006 * 2. Sampling overflows that contribute to pending allocations.
1007 *
1008 * Note that the extend_sampling_buffer() function disables the sampling
1009 * facility, but it can be fully re-enabled using sampling controls that
1010 * have been saved in cpumsf_pmu_disable().
1011 */
1012 if (cpuhw->event) {
1013 hwc = &cpuhw->event->hw;
1014 if (!(SAMPL_DIAG_MODE(hwc))) {
1015 /*
1016 * Account number of overflow-designated
1017 * buffer extents
1018 */
1019 sfb_account_overflows(cpuhw, hwc);
1020 extend_sampling_buffer(&cpuhw->sfb, hwc);
1021 }
1022 /* Rate may be adjusted with ioctl() */
1023 cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw);
1024 }
1025
1026 /* (Re)enable the PMU and sampling facility */
1027 cpuhw->flags |= PMU_F_ENABLED;
1028 barrier();
1029
1030 err = lsctl(&cpuhw->lsctl);
1031 if (err) {
1032 cpuhw->flags &= ~PMU_F_ENABLED;
1033 pr_err("Loading sampling controls failed: op %i err %i\n",
1034 1, err);
1035 return;
1036 }
1037
1038 /* Load current program parameter */
1039 lpp(&S390_lowcore.lpp);
1040
1041 debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i "
1042 "interval %#lx tear %#lx dear %#lx\n", __func__,
1043 cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed,
1044 cpuhw->lsctl.cd, cpuhw->lsctl.interval,
1045 cpuhw->lsctl.tear, cpuhw->lsctl.dear);
1046 }
1047
cpumsf_pmu_disable(struct pmu * pmu)1048 static void cpumsf_pmu_disable(struct pmu *pmu)
1049 {
1050 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1051 struct hws_lsctl_request_block inactive;
1052 struct hws_qsi_info_block si;
1053 int err;
1054
1055 if (!(cpuhw->flags & PMU_F_ENABLED))
1056 return;
1057
1058 if (cpuhw->flags & PMU_F_ERR_MASK)
1059 return;
1060
1061 /* Switch off sampling activation control */
1062 inactive = cpuhw->lsctl;
1063 inactive.cs = 0;
1064 inactive.cd = 0;
1065
1066 err = lsctl(&inactive);
1067 if (err) {
1068 pr_err("Loading sampling controls failed: op %i err %i\n",
1069 2, err);
1070 return;
1071 }
1072
1073 /* Save state of TEAR and DEAR register contents */
1074 err = qsi(&si);
1075 if (!err) {
1076 /* TEAR/DEAR values are valid only if the sampling facility is
1077 * enabled. Note that cpumsf_pmu_disable() might be called even
1078 * for a disabled sampling facility because cpumsf_pmu_enable()
1079 * controls the enable/disable state.
1080 */
1081 if (si.es) {
1082 cpuhw->lsctl.tear = si.tear;
1083 cpuhw->lsctl.dear = si.dear;
1084 }
1085 } else
1086 debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n",
1087 __func__, err);
1088
1089 cpuhw->flags &= ~PMU_F_ENABLED;
1090 }
1091
1092 /* perf_exclude_event() - Filter event
1093 * @event: The perf event
1094 * @regs: pt_regs structure
1095 * @sde_regs: Sample-data-entry (sde) regs structure
1096 *
1097 * Filter perf events according to their exclude specification.
1098 *
1099 * Return non-zero if the event shall be excluded.
1100 */
perf_exclude_event(struct perf_event * event,struct pt_regs * regs,struct perf_sf_sde_regs * sde_regs)1101 static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
1102 struct perf_sf_sde_regs *sde_regs)
1103 {
1104 if (event->attr.exclude_user && user_mode(regs))
1105 return 1;
1106 if (event->attr.exclude_kernel && !user_mode(regs))
1107 return 1;
1108 if (event->attr.exclude_guest && sde_regs->in_guest)
1109 return 1;
1110 if (event->attr.exclude_host && !sde_regs->in_guest)
1111 return 1;
1112 return 0;
1113 }
1114
1115 /* perf_push_sample() - Push samples to perf
1116 * @event: The perf event
1117 * @sample: Hardware sample data
1118 *
1119 * Use the hardware sample data to create perf event sample. The sample
1120 * is the pushed to the event subsystem and the function checks for
1121 * possible event overflows. If an event overflow occurs, the PMU is
1122 * stopped.
1123 *
1124 * Return non-zero if an event overflow occurred.
1125 */
perf_push_sample(struct perf_event * event,struct hws_basic_entry * basic)1126 static int perf_push_sample(struct perf_event *event,
1127 struct hws_basic_entry *basic)
1128 {
1129 int overflow;
1130 struct pt_regs regs;
1131 struct perf_sf_sde_regs *sde_regs;
1132 struct perf_sample_data data;
1133
1134 /* Setup perf sample */
1135 perf_sample_data_init(&data, 0, event->hw.last_period);
1136
1137 /* Setup pt_regs to look like an CPU-measurement external interrupt
1138 * using the Program Request Alert code. The regs.int_parm_long
1139 * field which is unused contains additional sample-data-entry related
1140 * indicators.
1141 */
1142 memset(®s, 0, sizeof(regs));
1143 regs.int_code = 0x1407;
1144 regs.int_parm = CPU_MF_INT_SF_PRA;
1145 sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long;
1146
1147 psw_bits(regs.psw).ia = basic->ia;
1148 psw_bits(regs.psw).dat = basic->T;
1149 psw_bits(regs.psw).wait = basic->W;
1150 psw_bits(regs.psw).pstate = basic->P;
1151 psw_bits(regs.psw).as = basic->AS;
1152
1153 /*
1154 * Use the hardware provided configuration level to decide if the
1155 * sample belongs to a guest or host. If that is not available,
1156 * fall back to the following heuristics:
1157 * A non-zero guest program parameter always indicates a guest
1158 * sample. Some early samples or samples from guests without
1159 * lpp usage would be misaccounted to the host. We use the asn
1160 * value as an addon heuristic to detect most of these guest samples.
1161 * If the value differs from 0xffff (the host value), we assume to
1162 * be a KVM guest.
1163 */
1164 switch (basic->CL) {
1165 case 1: /* logical partition */
1166 sde_regs->in_guest = 0;
1167 break;
1168 case 2: /* virtual machine */
1169 sde_regs->in_guest = 1;
1170 break;
1171 default: /* old machine, use heuristics */
1172 if (basic->gpp || basic->prim_asn != 0xffff)
1173 sde_regs->in_guest = 1;
1174 break;
1175 }
1176
1177 /*
1178 * Store the PID value from the sample-data-entry to be
1179 * processed and resolved by cpumsf_output_event_pid().
1180 */
1181 data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
1182
1183 overflow = 0;
1184 if (perf_exclude_event(event, ®s, sde_regs))
1185 goto out;
1186 if (perf_event_overflow(event, &data, ®s)) {
1187 overflow = 1;
1188 event->pmu->stop(event, 0);
1189 }
1190 perf_event_update_userpage(event);
1191 out:
1192 return overflow;
1193 }
1194
/* perf_event_count_update() - Add to the event count
 * @event:	The perf event
 * @count:	Value to add to event->count
 */
static void perf_event_count_update(struct perf_event *event, u64 count)
{
	local64_add(count, &event->count);
}
1199
1200 /* hw_collect_samples() - Walk through a sample-data-block and collect samples
1201 * @event: The perf event
1202 * @sdbt: Sample-data-block table
1203 * @overflow: Event overflow counter
1204 *
1205 * Walks through a sample-data-block and collects sampling data entries that are
1206 * then pushed to the perf event subsystem. Depending on the sampling function,
1207 * there can be either basic-sampling or combined-sampling data entries. A
1208 * combined-sampling data entry consists of a basic- and a diagnostic-sampling
1209 * data entry. The sampling function is determined by the flags in the perf
1210 * event hardware structure. The function always works with a combined-sampling
1211 * data entry but ignores the the diagnostic portion if it is not available.
1212 *
1213 * Note that the implementation focuses on basic-sampling data entries and, if
1214 * such an entry is not valid, the entire combined-sampling data entry is
1215 * ignored.
1216 *
1217 * The overflow variables counts the number of samples that has been discarded
1218 * due to a perf event overflow.
1219 */
hw_collect_samples(struct perf_event * event,unsigned long * sdbt,unsigned long long * overflow)1220 static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
1221 unsigned long long *overflow)
1222 {
1223 struct hws_trailer_entry *te;
1224 struct hws_basic_entry *sample;
1225
1226 te = trailer_entry_ptr((unsigned long)sdbt);
1227 sample = (struct hws_basic_entry *)sdbt;
1228 while ((unsigned long *) sample < (unsigned long *) te) {
1229 /* Check for an empty sample */
1230 if (!sample->def || sample->LS)
1231 break;
1232
1233 /* Update perf event period */
1234 perf_event_count_update(event, SAMPL_RATE(&event->hw));
1235
1236 /* Check whether sample is valid */
1237 if (sample->def == 0x0001) {
1238 /* If an event overflow occurred, the PMU is stopped to
1239 * throttle event delivery. Remaining sample data is
1240 * discarded.
1241 */
1242 if (!*overflow) {
1243 /* Check whether sample is consistent */
1244 if (sample->I == 0 && sample->W == 0) {
1245 /* Deliver sample data to perf */
1246 *overflow = perf_push_sample(event,
1247 sample);
1248 }
1249 } else
1250 /* Count discarded samples */
1251 *overflow += 1;
1252 } else {
1253 debug_sprintf_event(sfdbg, 4,
1254 "%s: Found unknown"
1255 " sampling data entry: te->f %i"
1256 " basic.def %#4x (%p)\n", __func__,
1257 te->header.f, sample->def, sample);
1258 /* Sample slot is not yet written or other record.
1259 *
1260 * This condition can occur if the buffer was reused
1261 * from a combined basic- and diagnostic-sampling.
1262 * If only basic-sampling is then active, entries are
1263 * written into the larger diagnostic entries.
1264 * This is typically the case for sample-data-blocks
1265 * that are not full. Stop processing if the first
1266 * invalid format was detected.
1267 */
1268 if (!te->header.f)
1269 break;
1270 }
1271
1272 /* Reset sample slot and advance to next sample */
1273 sample->def = 0;
1274 sample++;
1275 }
1276 }
1277
/* __cdsg() - 128-bit compare-and-swap built on the cdsg instruction
 * @ptr:	Location of the 128-bit value to update
 * @old:	Value expected at @ptr
 * @new:	Value to store at @ptr
 *
 * Returns the content of the @old operand after the instruction.  The
 * callers in this file treat a return value equal to the @old they passed
 * in as a successful swap and retry otherwise.
 */
static inline __uint128_t __cdsg(__uint128_t *ptr, __uint128_t old, __uint128_t new)
{
	asm volatile(
		"	cdsg	%[old],%[new],%[ptr]\n"
		: [old] "+d" (old), [ptr] "+QS" (*ptr)
		: [new] "d" (new)
		: "memory", "cc");
	return old;
}
1287
1288 /* hw_perf_event_update() - Process sampling buffer
1289 * @event: The perf event
1290 * @flush_all: Flag to also flush partially filled sample-data-blocks
1291 *
1292 * Processes the sampling buffer and create perf event samples.
1293 * The sampling buffer position are retrieved and saved in the TEAR_REG
1294 * register of the specified perf event.
1295 *
1296 * Only full sample-data-blocks are processed. Specify the flash_all flag
1297 * to also walk through partially filled sample-data-blocks. It is ignored
1298 * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag
1299 * enforces the processing of full sample-data-blocks only (trailer entries
1300 * with the block-full-indicator bit set).
1301 */
hw_perf_event_update(struct perf_event * event,int flush_all)1302 static void hw_perf_event_update(struct perf_event *event, int flush_all)
1303 {
1304 unsigned long long event_overflow, sampl_overflow, num_sdb;
1305 union hws_trailer_header old, prev, new;
1306 struct hw_perf_event *hwc = &event->hw;
1307 struct hws_trailer_entry *te;
1308 unsigned long *sdbt, sdb;
1309 int done;
1310
1311 /*
1312 * AUX buffer is used when in diagnostic sampling mode.
1313 * No perf events/samples are created.
1314 */
1315 if (SAMPL_DIAG_MODE(&event->hw))
1316 return;
1317
1318 if (flush_all && SDB_FULL_BLOCKS(hwc))
1319 flush_all = 0;
1320
1321 sdbt = (unsigned long *) TEAR_REG(hwc);
1322 done = event_overflow = sampl_overflow = num_sdb = 0;
1323 while (!done) {
1324 /* Get the trailer entry of the sample-data-block */
1325 sdb = (unsigned long)phys_to_virt(*sdbt);
1326 te = trailer_entry_ptr(sdb);
1327
1328 /* Leave loop if no more work to do (block full indicator) */
1329 if (!te->header.f) {
1330 done = 1;
1331 if (!flush_all)
1332 break;
1333 }
1334
1335 /* Check the sample overflow count */
1336 if (te->header.overflow)
1337 /* Account sample overflows and, if a particular limit
1338 * is reached, extend the sampling buffer.
1339 * For details, see sfb_account_overflows().
1340 */
1341 sampl_overflow += te->header.overflow;
1342
1343 /* Timestamps are valid for full sample-data-blocks only */
1344 debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx "
1345 "overflow %llu timestamp %#llx\n",
1346 __func__, sdb, (unsigned long)sdbt,
1347 te->header.overflow,
1348 (te->header.f) ? trailer_timestamp(te) : 0ULL);
1349
1350 /* Collect all samples from a single sample-data-block and
1351 * flag if an (perf) event overflow happened. If so, the PMU
1352 * is stopped and remaining samples will be discarded.
1353 */
1354 hw_collect_samples(event, (unsigned long *)sdb, &event_overflow);
1355 num_sdb++;
1356
1357 /* Reset trailer (using compare-double-and-swap) */
1358 prev.val = READ_ONCE_ALIGNED_128(te->header.val);
1359 do {
1360 old.val = prev.val;
1361 new.val = prev.val;
1362 new.f = 0;
1363 new.a = 1;
1364 new.overflow = 0;
1365 prev.val = __cdsg(&te->header.val, old.val, new.val);
1366 } while (prev.val != old.val);
1367
1368 /* Advance to next sample-data-block */
1369 sdbt++;
1370 if (is_link_entry(sdbt))
1371 sdbt = get_next_sdbt(sdbt);
1372
1373 /* Update event hardware registers */
1374 TEAR_REG(hwc) = (unsigned long) sdbt;
1375
1376 /* Stop processing sample-data if all samples of the current
1377 * sample-data-block were flushed even if it was not full.
1378 */
1379 if (flush_all && done)
1380 break;
1381 }
1382
1383 /* Account sample overflows in the event hardware structure */
1384 if (sampl_overflow)
1385 OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
1386 sampl_overflow, 1 + num_sdb);
1387
1388 /* Perf_event_overflow() and perf_event_account_interrupt() limit
1389 * the interrupt rate to an upper limit. Roughly 1000 samples per
1390 * task tick.
1391 * Hitting this limit results in a large number
1392 * of throttled REF_REPORT_THROTTLE entries and the samples
1393 * are dropped.
1394 * Slightly increase the interval to avoid hitting this limit.
1395 */
1396 if (event_overflow) {
1397 SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
1398 debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n",
1399 __func__,
1400 DIV_ROUND_UP(SAMPL_RATE(hwc), 10));
1401 }
1402
1403 if (sampl_overflow || event_overflow)
1404 debug_sprintf_event(sfdbg, 4, "%s: "
1405 "overflows: sample %llu event %llu"
1406 " total %llu num_sdb %llu\n",
1407 __func__, sampl_overflow, event_overflow,
1408 OVERFLOW_REG(hwc), num_sdb);
1409 }
1410
aux_sdb_index(struct aux_buffer * aux,unsigned long i)1411 static inline unsigned long aux_sdb_index(struct aux_buffer *aux,
1412 unsigned long i)
1413 {
1414 return i % aux->sfb.num_sdb;
1415 }
1416
/* Number of SDBs in the inclusive range @start..@end, or 0 if @end lies
 * before @start.
 */
static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end)
{
	if (end < start)
		return 0;

	return end - start + 1;
}
1421
aux_sdb_num_alert(struct aux_buffer * aux)1422 static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux)
1423 {
1424 return aux_sdb_num(aux->head, aux->alert_mark);
1425 }
1426
aux_sdb_num_empty(struct aux_buffer * aux)1427 static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux)
1428 {
1429 return aux_sdb_num(aux->head, aux->empty_mark);
1430 }
1431
1432 /*
1433 * Get trailer entry by index of SDB.
1434 */
aux_sdb_trailer(struct aux_buffer * aux,unsigned long index)1435 static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
1436 unsigned long index)
1437 {
1438 unsigned long sdb;
1439
1440 index = aux_sdb_index(aux, index);
1441 sdb = aux->sdb_index[index];
1442 return trailer_entry_ptr(sdb);
1443 }
1444
1445 /*
1446 * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu
1447 * disabled. Collect the full SDBs in AUX buffer which have not reached
1448 * the point of alert indicator. And ignore the SDBs which are not
1449 * full.
1450 *
1451 * 1. Scan SDBs to see how much data is there and consume them.
1452 * 2. Remove alert indicator in the buffer.
1453 */
aux_output_end(struct perf_output_handle * handle)1454 static void aux_output_end(struct perf_output_handle *handle)
1455 {
1456 unsigned long i, range_scan, idx;
1457 struct aux_buffer *aux;
1458 struct hws_trailer_entry *te;
1459
1460 aux = perf_get_aux(handle);
1461 if (!aux)
1462 return;
1463
1464 range_scan = aux_sdb_num_alert(aux);
1465 for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
1466 te = aux_sdb_trailer(aux, idx);
1467 if (!te->header.f)
1468 break;
1469 }
1470 /* i is num of SDBs which are full */
1471 perf_aux_output_end(handle, i << PAGE_SHIFT);
1472
1473 /* Remove alert indicators in the buffer */
1474 te = aux_sdb_trailer(aux, aux->alert_mark);
1475 te->header.a = 0;
1476
1477 debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
1478 __func__, i, range_scan, aux->head);
1479 }
1480
1481 /*
1482 * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event
1483 * is first added to the CPU or rescheduled again to the CPU. It is called
1484 * with pmu disabled.
1485 *
1486 * 1. Reset the trailer of SDBs to get ready for new data.
1487 * 2. Tell the hardware where to put the data by reset the SDBs buffer
1488 * head(tear/dear).
1489 */
aux_output_begin(struct perf_output_handle * handle,struct aux_buffer * aux,struct cpu_hw_sf * cpuhw)1490 static int aux_output_begin(struct perf_output_handle *handle,
1491 struct aux_buffer *aux,
1492 struct cpu_hw_sf *cpuhw)
1493 {
1494 unsigned long range, i, range_scan, idx, head, base, offset;
1495 struct hws_trailer_entry *te;
1496
1497 if (WARN_ON_ONCE(handle->head & ~PAGE_MASK))
1498 return -EINVAL;
1499
1500 aux->head = handle->head >> PAGE_SHIFT;
1501 range = (handle->size + 1) >> PAGE_SHIFT;
1502 if (range <= 1)
1503 return -ENOMEM;
1504
1505 /*
1506 * SDBs between aux->head and aux->empty_mark are already ready
1507 * for new data. range_scan is num of SDBs not within them.
1508 */
1509 debug_sprintf_event(sfdbg, 6,
1510 "%s: range %ld head %ld alert %ld empty %ld\n",
1511 __func__, range, aux->head, aux->alert_mark,
1512 aux->empty_mark);
1513 if (range > aux_sdb_num_empty(aux)) {
1514 range_scan = range - aux_sdb_num_empty(aux);
1515 idx = aux->empty_mark + 1;
1516 for (i = 0; i < range_scan; i++, idx++) {
1517 te = aux_sdb_trailer(aux, idx);
1518 te->header.f = 0;
1519 te->header.a = 0;
1520 te->header.overflow = 0;
1521 }
1522 /* Save the position of empty SDBs */
1523 aux->empty_mark = aux->head + range - 1;
1524 }
1525
1526 /* Set alert indicator */
1527 aux->alert_mark = aux->head + range/2 - 1;
1528 te = aux_sdb_trailer(aux, aux->alert_mark);
1529 te->header.a = 1;
1530
1531 /* Reset hardware buffer head */
1532 head = aux_sdb_index(aux, aux->head);
1533 base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
1534 offset = head % CPUM_SF_SDB_PER_TABLE;
1535 cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long);
1536 cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]);
1537
1538 debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld "
1539 "index %ld tear %#lx dear %#lx\n", __func__,
1540 aux->head, aux->alert_mark, aux->empty_mark,
1541 head / CPUM_SF_SDB_PER_TABLE,
1542 cpuhw->lsctl.tear, cpuhw->lsctl.dear);
1543
1544 return 0;
1545 }
1546
/*
 * Set alert indicator on SDB at index @alert_index while sampler is running.
 *
 * @aux:	 AUX buffer that owns the SDB
 * @alert_index: Running SDB index of the SDB to mark
 * @overflow:	 Set to the overflow count found in the SDB trailer; it is
 *		 written on both the success and the failure path
 *
 * On success the overflow count in the trailer is reset to zero together
 * with setting the alert indicator.
 *
 * Return true on success.
 * Return false if the full indicator is already set by the hardware sampler.
 */
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
			  unsigned long long *overflow)
{
	union hws_trailer_header old, prev, new;
	struct hws_trailer_entry *te;

	te = aux_sdb_trailer(aux, alert_index);
	prev.val = READ_ONCE_ALIGNED_128(te->header.val);
	/* Retry until __cdsg() returns the value the update was based on */
	do {
		old.val = prev.val;
		new.val = prev.val;
		*overflow = old.overflow;
		if (old.f) {
			/*
			 * SDB is already set by hardware.
			 * Abort and try to set somewhere
			 * behind.
			 */
			return false;
		}
		new.a = 1;
		new.overflow = 0;
		prev.val = __cdsg(&te->header.val, old.val, new.val);
	} while (prev.val != old.val);
	return true;
}
1579
/*
 * aux_reset_buffer() - Scan and setup SDBs for new samples
 * @aux:	The AUX buffer to set
 * @range:	The range of SDBs to scan started from aux->head
 * @overflow:	Set to overflow count
 *
 * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is
 * marked as empty, check if it is already set full by the hardware sampler.
 * If yes, that means new data is already there before we can set an alert
 * indicator. Caller should try to set alert indicator to some position behind.
 *
 * Scan the SDBs in AUX buffer from behind aux->empty_mark. They were used
 * previously and have already been consumed by user space. Reset these SDBs
 * (clear full indicator and alert indicator) for new data.
 * If aux->alert_mark falls into this area, just set it. Overflow count is
 * recorded while scanning.
 *
 * SDBs between aux->head and aux->empty_mark were already reset last time
 * and are ready for new samples. So scanning this area can be skipped.
 *
 * Return true if alert indicator is set successfully and false if not.
 */
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
			     unsigned long long *overflow)
{
	unsigned long i, range_scan, idx, idx_old;
	union hws_trailer_header old, prev, new;
	unsigned long long orig_overflow;
	struct hws_trailer_entry *te;

	debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
			    "empty %ld\n", __func__, range, aux->head,
			    aux->alert_mark, aux->empty_mark);
	if (range <= aux_sdb_num_empty(aux))
		/*
		 * No need to scan. All SDBs in range are marked as empty.
		 * Just set alert indicator. Should check race with hardware
		 * sampler.
		 */
		return aux_set_alert(aux, aux->alert_mark, overflow);

	if (aux->alert_mark <= aux->empty_mark)
		/*
		 * Set alert indicator on empty SDB. Should check race
		 * with hardware sampler.
		 */
		if (!aux_set_alert(aux, aux->alert_mark, overflow))
			return false;

	/*
	 * Scan the SDBs to clear full and alert indicator used previously.
	 * Start scanning from one SDB behind empty_mark. If the new alert
	 * indicator fall into this range, set it.
	 */
	range_scan = range - aux_sdb_num_empty(aux);
	idx_old = idx = aux->empty_mark + 1;
	for (i = 0; i < range_scan; i++, idx++) {
		te = aux_sdb_trailer(aux, idx);
		prev.val = READ_ONCE_ALIGNED_128(te->header.val);
		/* Retry until __cdsg() returns the value the update was
		 * based on; orig_overflow then holds the overflow count
		 * that was replaced.
		 */
		do {
			old.val = prev.val;
			new.val = prev.val;
			orig_overflow = old.overflow;
			new.f = 0;
			new.overflow = 0;
			if (idx == aux->alert_mark)
				new.a = 1;
			else
				new.a = 0;
			prev.val = __cdsg(&te->header.val, old.val, new.val);
		} while (prev.val != old.val);
		/* Accumulate the overflow counts of all scanned SDBs */
		*overflow += orig_overflow;
	}

	/* Update empty_mark to new position */
	aux->empty_mark = aux->head + range - 1;

	debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld "
			    "empty %ld\n", __func__, range_scan, idx_old,
			    idx - 1, aux->empty_mark);
	return true;
}
1662
1663 /*
1664 * Measurement alert handler for diagnostic mode sampling.
1665 */
hw_collect_aux(struct cpu_hw_sf * cpuhw)1666 static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
1667 {
1668 struct aux_buffer *aux;
1669 int done = 0;
1670 unsigned long range = 0, size;
1671 unsigned long long overflow = 0;
1672 struct perf_output_handle *handle = &cpuhw->handle;
1673 unsigned long num_sdb;
1674
1675 aux = perf_get_aux(handle);
1676 if (WARN_ON_ONCE(!aux))
1677 return;
1678
1679 /* Inform user space new data arrived */
1680 size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
1681 debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__,
1682 size >> PAGE_SHIFT);
1683 perf_aux_output_end(handle, size);
1684
1685 num_sdb = aux->sfb.num_sdb;
1686 while (!done) {
1687 /* Get an output handle */
1688 aux = perf_aux_output_begin(handle, cpuhw->event);
1689 if (handle->size == 0) {
1690 pr_err("The AUX buffer with %lu pages for the "
1691 "diagnostic-sampling mode is full\n",
1692 num_sdb);
1693 debug_sprintf_event(sfdbg, 1,
1694 "%s: AUX buffer used up\n",
1695 __func__);
1696 break;
1697 }
1698 if (WARN_ON_ONCE(!aux))
1699 return;
1700
1701 /* Update head and alert_mark to new position */
1702 aux->head = handle->head >> PAGE_SHIFT;
1703 range = (handle->size + 1) >> PAGE_SHIFT;
1704 if (range == 1)
1705 aux->alert_mark = aux->head;
1706 else
1707 aux->alert_mark = aux->head + range/2 - 1;
1708
1709 if (aux_reset_buffer(aux, range, &overflow)) {
1710 if (!overflow) {
1711 done = 1;
1712 break;
1713 }
1714 size = range << PAGE_SHIFT;
1715 perf_aux_output_end(&cpuhw->handle, size);
1716 pr_err("Sample data caused the AUX buffer with %lu "
1717 "pages to overflow\n", aux->sfb.num_sdb);
1718 debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld "
1719 "overflow %lld\n", __func__,
1720 aux->head, range, overflow);
1721 } else {
1722 size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
1723 perf_aux_output_end(&cpuhw->handle, size);
1724 debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
1725 "already full, try another\n",
1726 __func__,
1727 aux->head, aux->alert_mark);
1728 }
1729 }
1730
1731 if (done)
1732 debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
1733 "empty %ld\n", __func__, aux->head,
1734 aux->alert_mark, aux->empty_mark);
1735 }
1736
1737 /*
1738 * Callback when freeing AUX buffers.
1739 */
aux_buffer_free(void * data)1740 static void aux_buffer_free(void *data)
1741 {
1742 struct aux_buffer *aux = data;
1743 unsigned long i, num_sdbt;
1744
1745 if (!aux)
1746 return;
1747
1748 /* Free SDBT. SDB is freed by the caller */
1749 num_sdbt = aux->sfb.num_sdbt;
1750 for (i = 0; i < num_sdbt; i++)
1751 free_page(aux->sdbt_index[i]);
1752
1753 kfree(aux->sdbt_index);
1754 kfree(aux->sdb_index);
1755 kfree(aux);
1756
1757 debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt);
1758 }
1759
aux_sdb_init(unsigned long sdb)1760 static void aux_sdb_init(unsigned long sdb)
1761 {
1762 struct hws_trailer_entry *te;
1763
1764 te = trailer_entry_ptr(sdb);
1765
1766 /* Save clock base */
1767 te->clock_base = 1;
1768 te->progusage2 = tod_clock_base.tod;
1769 }
1770
/*
 * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling
 * @event:	Event the buffer is setup for, event->cpu == -1 means current
 * @pages:	Array of pointers to buffer pages passed from perf core
 * @nr_pages:	Total pages
 * @snapshot:	Flag for snapshot mode (not evaluated by this function)
 *
 * This is the callback when setup an event using AUX buffer. Perf tool can
 * trigger this by an additional mmap() call on the event. Unlike the buffer
 * for basic samples, AUX buffer belongs to the event. It is scheduled with
 * the task among online cpus when it is a per-thread event.
 *
 * Every page in @pages becomes one SDB.  The SDBs are entered into a chain
 * of newly allocated SDBTs whose last entry links back to the first SDBT.
 *
 * Return the private AUX buffer structure if success or NULL if fails.
 */
static void *aux_buffer_setup(struct perf_event *event, void **pages,
			      int nr_pages, bool snapshot)
{
	struct sf_buffer *sfb;
	struct aux_buffer *aux;
	unsigned long *new, *tail;
	int i, n_sdbt;

	if (!nr_pages || !pages)
		return NULL;

	/* Reject sizes outside the limits for diagnostic sampling */
	if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is larger than the "
		       "maximum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	} else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) {
		pr_err("AUX buffer size (%i pages) is less than the "
		       "minimum sampling buffer limit\n",
		       nr_pages);
		return NULL;
	}

	/* Allocate aux_buffer struct for the event */
	aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL);
	if (!aux)
		goto no_aux;
	sfb = &aux->sfb;

	/* Allocate sdbt_index for fast reference */
	n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE);
	aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
	if (!aux->sdbt_index)
		goto no_sdbt_index;

	/* Allocate sdb_index for fast reference */
	aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
	if (!aux->sdb_index)
		goto no_sdb_index;

	/* Allocate the first SDBT */
	sfb->num_sdbt = 0;
	sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
	if (!sfb->sdbt)
		goto no_sdbt;
	aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
	tail = sfb->tail = sfb->sdbt;

	/*
	 * Link the provided pages of AUX buffer to SDBT.
	 * Allocate SDBT if needed.
	 */
	for (i = 0; i < nr_pages; i++, tail++) {
		/* The current SDBT is full: chain a new one */
		if (require_table_link(tail)) {
			new = (unsigned long *) get_zeroed_page(GFP_KERNEL);
			if (!new)
				goto no_sdbt;
			aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
			/* Link current page to tail of chain */
			*tail = virt_to_phys(new) + 1;
			tail = new;
		}
		/* Tail is the entry in a SDBT */
		*tail = virt_to_phys(pages[i]);
		aux->sdb_index[i] = (unsigned long)pages[i];
		aux_sdb_init((unsigned long)pages[i]);
	}
	sfb->num_sdb = nr_pages;

	/* Link the last entry in the SDBT to the first SDBT */
	*tail = virt_to_phys(sfb->sdbt) + 1;
	sfb->tail = tail;

	/*
	 * Initial all SDBs are zeroed. Mark it as empty.
	 * So there is no need to clear the full indicator
	 * when this event is first added.
	 */
	aux->empty_mark = sfb->num_sdb - 1;

	debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__,
			    sfb->num_sdbt, sfb->num_sdb);

	return aux;

no_sdbt:
	/* SDBs (AUX buffer pages) are freed by caller */
	/* Only the SDBTs recorded in sdbt_index so far are released */
	for (i = 0; i < sfb->num_sdbt; i++)
		free_page(aux->sdbt_index[i]);
	kfree(aux->sdb_index);
no_sdb_index:
	kfree(aux->sdbt_index);
no_sdbt_index:
	kfree(aux);
no_aux:
	return NULL;
}
1882
/* Intentionally a no-op: @event is not touched here. */
static void cpumsf_pmu_read(struct perf_event *event)
{
	/* Nothing to do ... updates are interrupt-driven */
}
1887
1888 /* Check if the new sampling period/freqeuncy is appropriate.
1889 *
1890 * Return non-zero on error and zero on passed checks.
1891 */
cpumsf_pmu_check_period(struct perf_event * event,u64 value)1892 static int cpumsf_pmu_check_period(struct perf_event *event, u64 value)
1893 {
1894 struct hws_qsi_info_block si;
1895 unsigned long rate;
1896 bool do_freq;
1897
1898 memset(&si, 0, sizeof(si));
1899 if (event->cpu == -1) {
1900 if (qsi(&si))
1901 return -ENODEV;
1902 } else {
1903 /* Event is pinned to a particular CPU, retrieve the per-CPU
1904 * sampling structure for accessing the CPU-specific QSI.
1905 */
1906 struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
1907
1908 si = cpuhw->qsi;
1909 }
1910
1911 do_freq = !!SAMPLE_FREQ_MODE(&event->hw);
1912 rate = getrate(do_freq, value, &si);
1913 if (!rate)
1914 return -EINVAL;
1915
1916 event->attr.sample_period = rate;
1917 SAMPL_RATE(&event->hw) = rate;
1918 hw_init_period(&event->hw, SAMPL_RATE(&event->hw));
1919 debug_sprintf_event(sfdbg, 4, "%s:"
1920 " cpu %d value %#llx period %#llx freq %d\n",
1921 __func__, event->cpu, value,
1922 event->attr.sample_period, do_freq);
1923 return 0;
1924 }
1925
1926 /* Activate sampling control.
1927 * Next call of pmu_enable() starts sampling.
1928 */
cpumsf_pmu_start(struct perf_event * event,int flags)1929 static void cpumsf_pmu_start(struct perf_event *event, int flags)
1930 {
1931 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1932
1933 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1934 return;
1935
1936 if (flags & PERF_EF_RELOAD)
1937 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1938
1939 perf_pmu_disable(event->pmu);
1940 event->hw.state = 0;
1941 cpuhw->lsctl.cs = 1;
1942 if (SAMPL_DIAG_MODE(&event->hw))
1943 cpuhw->lsctl.cd = 1;
1944 perf_pmu_enable(event->pmu);
1945 }
1946
1947 /* Deactivate sampling control.
1948 * Next call of pmu_enable() stops sampling.
1949 */
cpumsf_pmu_stop(struct perf_event * event,int flags)1950 static void cpumsf_pmu_stop(struct perf_event *event, int flags)
1951 {
1952 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1953
1954 if (event->hw.state & PERF_HES_STOPPED)
1955 return;
1956
1957 perf_pmu_disable(event->pmu);
1958 cpuhw->lsctl.cs = 0;
1959 cpuhw->lsctl.cd = 0;
1960 event->hw.state |= PERF_HES_STOPPED;
1961
1962 if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
1963 hw_perf_event_update(event, 1);
1964 event->hw.state |= PERF_HES_UPTODATE;
1965 }
1966 perf_pmu_enable(event->pmu);
1967 }
1968
cpumsf_pmu_add(struct perf_event * event,int flags)1969 static int cpumsf_pmu_add(struct perf_event *event, int flags)
1970 {
1971 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
1972 struct aux_buffer *aux;
1973 int err;
1974
1975 if (cpuhw->flags & PMU_F_IN_USE)
1976 return -EAGAIN;
1977
1978 if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt)
1979 return -EINVAL;
1980
1981 err = 0;
1982 perf_pmu_disable(event->pmu);
1983
1984 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1985
1986 /* Set up sampling controls. Always program the sampling register
1987 * using the SDB-table start. Reset TEAR_REG event hardware register
1988 * that is used by hw_perf_event_update() to store the sampling buffer
1989 * position after samples have been flushed.
1990 */
1991 cpuhw->lsctl.s = 0;
1992 cpuhw->lsctl.h = 1;
1993 cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
1994 if (!SAMPL_DIAG_MODE(&event->hw)) {
1995 cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt);
1996 cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt;
1997 TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt;
1998 }
1999
2000 /* Ensure sampling functions are in the disabled state. If disabled,
2001 * switch on sampling enable control. */
2002 if (WARN_ON_ONCE(cpuhw->lsctl.es == 1 || cpuhw->lsctl.ed == 1)) {
2003 err = -EAGAIN;
2004 goto out;
2005 }
2006 if (SAMPL_DIAG_MODE(&event->hw)) {
2007 aux = perf_aux_output_begin(&cpuhw->handle, event);
2008 if (!aux) {
2009 err = -EINVAL;
2010 goto out;
2011 }
2012 err = aux_output_begin(&cpuhw->handle, aux, cpuhw);
2013 if (err)
2014 goto out;
2015 cpuhw->lsctl.ed = 1;
2016 }
2017 cpuhw->lsctl.es = 1;
2018
2019 /* Set in_use flag and store event */
2020 cpuhw->event = event;
2021 cpuhw->flags |= PMU_F_IN_USE;
2022
2023 if (flags & PERF_EF_START)
2024 cpumsf_pmu_start(event, PERF_EF_RELOAD);
2025 out:
2026 perf_event_update_userpage(event);
2027 perf_pmu_enable(event->pmu);
2028 return err;
2029 }
2030
cpumsf_pmu_del(struct perf_event * event,int flags)2031 static void cpumsf_pmu_del(struct perf_event *event, int flags)
2032 {
2033 struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
2034
2035 perf_pmu_disable(event->pmu);
2036 cpumsf_pmu_stop(event, PERF_EF_UPDATE);
2037
2038 cpuhw->lsctl.es = 0;
2039 cpuhw->lsctl.ed = 0;
2040 cpuhw->flags &= ~PMU_F_IN_USE;
2041 cpuhw->event = NULL;
2042
2043 if (SAMPL_DIAG_MODE(&event->hw))
2044 aux_output_end(&cpuhw->handle);
2045 perf_event_update_userpage(event);
2046 perf_pmu_enable(event->pmu);
2047 }
2048
2049 CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF);
2050 CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG);
2051
2052 /* Attribute list for CPU_SF.
2053 *
2054 * The availablitiy depends on the CPU_MF sampling facility authorization
2055 * for basic + diagnositic samples. This is determined at initialization
2056 * time by the sampling facility device driver.
2057 * If the authorization for basic samples is turned off, it should be
2058 * also turned off for diagnostic sampling.
2059 *
2060 * During initialization of the device driver, check the authorization
2061 * level for diagnostic sampling and installs the attribute
2062 * file for diagnostic sampling if necessary.
2063 *
2064 * For now install a placeholder to reference all possible attributes:
2065 * SF_CYCLES_BASIC and SF_CYCLES_BASIC_DIAG.
2066 * Add another entry for the final NULL pointer.
2067 */
2068 enum {
2069 SF_CYCLES_BASIC_ATTR_IDX = 0,
2070 SF_CYCLES_BASIC_DIAG_ATTR_IDX,
2071 SF_CYCLES_ATTR_MAX
2072 };
2073
2074 static struct attribute *cpumsf_pmu_events_attr[SF_CYCLES_ATTR_MAX + 1] = {
2075 [SF_CYCLES_BASIC_ATTR_IDX] = CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC)
2076 };
2077
2078 PMU_FORMAT_ATTR(event, "config:0-63");
2079
2080 static struct attribute *cpumsf_pmu_format_attr[] = {
2081 &format_attr_event.attr,
2082 NULL,
2083 };
2084
2085 static struct attribute_group cpumsf_pmu_events_group = {
2086 .name = "events",
2087 .attrs = cpumsf_pmu_events_attr,
2088 };
2089
2090 static struct attribute_group cpumsf_pmu_format_group = {
2091 .name = "format",
2092 .attrs = cpumsf_pmu_format_attr,
2093 };
2094
2095 static const struct attribute_group *cpumsf_pmu_attr_groups[] = {
2096 &cpumsf_pmu_events_group,
2097 &cpumsf_pmu_format_group,
2098 NULL,
2099 };
2100
2101 static struct pmu cpumf_sampling = {
2102 .pmu_enable = cpumsf_pmu_enable,
2103 .pmu_disable = cpumsf_pmu_disable,
2104
2105 .event_init = cpumsf_pmu_event_init,
2106 .add = cpumsf_pmu_add,
2107 .del = cpumsf_pmu_del,
2108
2109 .start = cpumsf_pmu_start,
2110 .stop = cpumsf_pmu_stop,
2111 .read = cpumsf_pmu_read,
2112
2113 .attr_groups = cpumsf_pmu_attr_groups,
2114
2115 .setup_aux = aux_buffer_setup,
2116 .free_aux = aux_buffer_free,
2117
2118 .check_period = cpumsf_pmu_check_period,
2119 };
2120
cpumf_measurement_alert(struct ext_code ext_code,unsigned int alert,unsigned long unused)2121 static void cpumf_measurement_alert(struct ext_code ext_code,
2122 unsigned int alert, unsigned long unused)
2123 {
2124 struct cpu_hw_sf *cpuhw;
2125
2126 if (!(alert & CPU_MF_INT_SF_MASK))
2127 return;
2128 inc_irq_stat(IRQEXT_CMS);
2129 cpuhw = this_cpu_ptr(&cpu_hw_sf);
2130
2131 /* Measurement alerts are shared and might happen when the PMU
2132 * is not reserved. Ignore these alerts in this case. */
2133 if (!(cpuhw->flags & PMU_F_RESERVED))
2134 return;
2135
2136 /* The processing below must take care of multiple alert events that
2137 * might be indicated concurrently. */
2138
2139 /* Program alert request */
2140 if (alert & CPU_MF_INT_SF_PRA) {
2141 if (cpuhw->flags & PMU_F_IN_USE)
2142 if (SAMPL_DIAG_MODE(&cpuhw->event->hw))
2143 hw_collect_aux(cpuhw);
2144 else
2145 hw_perf_event_update(cpuhw->event, 0);
2146 else
2147 WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE));
2148 }
2149
2150 /* Report measurement alerts only for non-PRA codes */
2151 if (alert != CPU_MF_INT_SF_PRA)
2152 debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__,
2153 alert);
2154
2155 /* Sampling authorization change request */
2156 if (alert & CPU_MF_INT_SF_SACA)
2157 qsi(&cpuhw->qsi);
2158
2159 /* Loss of sample data due to high-priority machine activities */
2160 if (alert & CPU_MF_INT_SF_LSDA) {
2161 pr_err("Sample data was lost\n");
2162 cpuhw->flags |= PMU_F_ERR_LSDA;
2163 sf_disable();
2164 }
2165
2166 /* Invalid sampling buffer entry */
2167 if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) {
2168 pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n",
2169 alert);
2170 cpuhw->flags |= PMU_F_ERR_IBE;
2171 sf_disable();
2172 }
2173 }
2174
cpusf_pmu_setup(unsigned int cpu,int flags)2175 static int cpusf_pmu_setup(unsigned int cpu, int flags)
2176 {
2177 /* Ignore the notification if no events are scheduled on the PMU.
2178 * This might be racy...
2179 */
2180 if (!atomic_read(&num_events))
2181 return 0;
2182
2183 local_irq_disable();
2184 setup_pmc_cpu(&flags);
2185 local_irq_enable();
2186 return 0;
2187 }
2188
s390_pmu_sf_online_cpu(unsigned int cpu)2189 static int s390_pmu_sf_online_cpu(unsigned int cpu)
2190 {
2191 return cpusf_pmu_setup(cpu, PMC_INIT);
2192 }
2193
s390_pmu_sf_offline_cpu(unsigned int cpu)2194 static int s390_pmu_sf_offline_cpu(unsigned int cpu)
2195 {
2196 return cpusf_pmu_setup(cpu, PMC_RELEASE);
2197 }
2198
param_get_sfb_size(char * buffer,const struct kernel_param * kp)2199 static int param_get_sfb_size(char *buffer, const struct kernel_param *kp)
2200 {
2201 if (!cpum_sf_avail())
2202 return -ENODEV;
2203 return sprintf(buffer, "%lu,%lu", CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
2204 }
2205
param_set_sfb_size(const char * val,const struct kernel_param * kp)2206 static int param_set_sfb_size(const char *val, const struct kernel_param *kp)
2207 {
2208 int rc;
2209 unsigned long min, max;
2210
2211 if (!cpum_sf_avail())
2212 return -ENODEV;
2213 if (!val || !strlen(val))
2214 return -EINVAL;
2215
2216 /* Valid parameter values: "min,max" or "max" */
2217 min = CPUM_SF_MIN_SDB;
2218 max = CPUM_SF_MAX_SDB;
2219 if (strchr(val, ','))
2220 rc = (sscanf(val, "%lu,%lu", &min, &max) == 2) ? 0 : -EINVAL;
2221 else
2222 rc = kstrtoul(val, 10, &max);
2223
2224 if (min < 2 || min >= max || max > get_num_physpages())
2225 rc = -EINVAL;
2226 if (rc)
2227 return rc;
2228
2229 sfb_set_limits(min, max);
2230 pr_info("The sampling buffer limits have changed to: "
2231 "min %lu max %lu (diag %lu)\n",
2232 CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB, CPUM_SF_SDB_DIAG_FACTOR);
2233 return 0;
2234 }
2235
2236 #define param_check_sfb_size(name, p) __param_check(name, p, void)
2237 static const struct kernel_param_ops param_ops_sfb_size = {
2238 .set = param_set_sfb_size,
2239 .get = param_get_sfb_size,
2240 };
2241
2242 #define RS_INIT_FAILURE_QSI 0x0001
2243 #define RS_INIT_FAILURE_BSDES 0x0002
2244 #define RS_INIT_FAILURE_ALRT 0x0003
2245 #define RS_INIT_FAILURE_PERF 0x0004
pr_cpumsf_err(unsigned int reason)2246 static void __init pr_cpumsf_err(unsigned int reason)
2247 {
2248 pr_err("Sampling facility support for perf is not available: "
2249 "reason %#x\n", reason);
2250 }
2251
init_cpum_sampling_pmu(void)2252 static int __init init_cpum_sampling_pmu(void)
2253 {
2254 struct hws_qsi_info_block si;
2255 int err;
2256
2257 if (!cpum_sf_avail())
2258 return -ENODEV;
2259
2260 memset(&si, 0, sizeof(si));
2261 if (qsi(&si)) {
2262 pr_cpumsf_err(RS_INIT_FAILURE_QSI);
2263 return -ENODEV;
2264 }
2265
2266 if (!si.as && !si.ad)
2267 return -ENODEV;
2268
2269 if (si.bsdes != sizeof(struct hws_basic_entry)) {
2270 pr_cpumsf_err(RS_INIT_FAILURE_BSDES);
2271 return -EINVAL;
2272 }
2273
2274 if (si.ad) {
2275 sfb_set_limits(CPUM_SF_MIN_SDB, CPUM_SF_MAX_SDB);
2276 /* Sampling of diagnostic data authorized,
2277 * install event into attribute list of PMU device.
2278 */
2279 cpumsf_pmu_events_attr[SF_CYCLES_BASIC_DIAG_ATTR_IDX] =
2280 CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG);
2281 }
2282
2283 sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80);
2284 if (!sfdbg) {
2285 pr_err("Registering for s390dbf failed\n");
2286 return -ENOMEM;
2287 }
2288 debug_register_view(sfdbg, &debug_sprintf_view);
2289
2290 err = register_external_irq(EXT_IRQ_MEASURE_ALERT,
2291 cpumf_measurement_alert);
2292 if (err) {
2293 pr_cpumsf_err(RS_INIT_FAILURE_ALRT);
2294 debug_unregister(sfdbg);
2295 goto out;
2296 }
2297
2298 err = perf_pmu_register(&cpumf_sampling, "cpum_sf", PERF_TYPE_RAW);
2299 if (err) {
2300 pr_cpumsf_err(RS_INIT_FAILURE_PERF);
2301 unregister_external_irq(EXT_IRQ_MEASURE_ALERT,
2302 cpumf_measurement_alert);
2303 debug_unregister(sfdbg);
2304 goto out;
2305 }
2306
2307 cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "perf/s390/sf:online",
2308 s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu);
2309 out:
2310 return err;
2311 }
2312
2313 arch_initcall(init_cpum_sampling_pmu);
2314 core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);
2315