1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2020 Intel Corporation
4 */
5
6 #include <linux/pm_qos.h>
7 #include <linux/sort.h>
8
9 #include "gem/i915_gem_internal.h"
10
11 #include "intel_engine_heartbeat.h"
12 #include "intel_engine_pm.h"
13 #include "intel_engine_regs.h"
14 #include "intel_gpu_commands.h"
15 #include "intel_gt_clock_utils.h"
16 #include "intel_gt_pm.h"
17 #include "intel_rc6.h"
18 #include "selftest_engine_heartbeat.h"
19 #include "selftest_rps.h"
20 #include "selftests/igt_flush_test.h"
21 #include "selftests/igt_spinner.h"
22 #include "selftests/librapl.h"
23
/* Try to isolate the impact of cstates from determining frequency response */
25 #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */
26
/*
 * No-op work handler. The live tests in this file install it as
 * rps->work.func while they run and put the saved handler back afterwards.
 */
static void dummy_rps_work(struct work_struct *wrk)
{
	/* intentionally empty */
}
30
/*
 * sort() comparator ordering u64 elements ascending.
 *
 * Returns -1, 0 or 1 when *A is respectively below, equal to or above *B.
 */
static int cmp_u64(const void *A, const void *B)
{
	const u64 lhs = *(const u64 *)A;
	const u64 rhs = *(const u64 *)B;

	return (lhs > rhs) - (lhs < rhs);
}
42
/*
 * sort() comparator ordering u32 elements ascending.
 *
 * Returns -1, 0 or 1 when *A is respectively below, equal to or above *B.
 */
static int cmp_u32(const void *A, const void *B)
{
	const u32 lhs = *(const u32 *)A;
	const u32 rhs = *(const u32 *)B;

	return (lhs > rhs) - (lhs < rhs);
}
54
55 static struct i915_vma *
create_spin_counter(struct intel_engine_cs * engine,struct i915_address_space * vm,bool srm,u32 ** cancel,u32 ** counter)56 create_spin_counter(struct intel_engine_cs *engine,
57 struct i915_address_space *vm,
58 bool srm,
59 u32 **cancel,
60 u32 **counter)
61 {
62 enum {
63 COUNT,
64 INC,
65 __NGPR__,
66 };
67 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x)
68 struct drm_i915_gem_object *obj;
69 struct i915_vma *vma;
70 unsigned long end;
71 u32 *base, *cs;
72 int loop, i;
73 int err;
74
75 obj = i915_gem_object_create_internal(vm->i915, 64 << 10);
76 if (IS_ERR(obj))
77 return ERR_CAST(obj);
78
79 end = obj->base.size / sizeof(u32) - 1;
80
81 vma = i915_vma_instance(obj, vm, NULL);
82 if (IS_ERR(vma)) {
83 err = PTR_ERR(vma);
84 goto err_put;
85 }
86
87 err = i915_vma_pin(vma, 0, 0, PIN_USER);
88 if (err)
89 goto err_unlock;
90
91 i915_vma_lock(vma);
92
93 base = i915_gem_object_pin_map(obj, I915_MAP_WC);
94 if (IS_ERR(base)) {
95 err = PTR_ERR(base);
96 goto err_unpin;
97 }
98 cs = base;
99
100 *cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2);
101 for (i = 0; i < __NGPR__; i++) {
102 *cs++ = i915_mmio_reg_offset(CS_GPR(i));
103 *cs++ = 0;
104 *cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4;
105 *cs++ = 0;
106 }
107
108 *cs++ = MI_LOAD_REGISTER_IMM(1);
109 *cs++ = i915_mmio_reg_offset(CS_GPR(INC));
110 *cs++ = 1;
111
112 loop = cs - base;
113
114 /* Unroll the loop to avoid MI_BB_START stalls impacting measurements */
115 for (i = 0; i < 1024; i++) {
116 *cs++ = MI_MATH(4);
117 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT));
118 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC));
119 *cs++ = MI_MATH_ADD;
120 *cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU);
121
122 if (srm) {
123 *cs++ = MI_STORE_REGISTER_MEM_GEN8;
124 *cs++ = i915_mmio_reg_offset(CS_GPR(COUNT));
125 *cs++ = lower_32_bits(i915_vma_offset(vma) + end * sizeof(*cs));
126 *cs++ = upper_32_bits(i915_vma_offset(vma) + end * sizeof(*cs));
127 }
128 }
129
130 *cs++ = MI_BATCH_BUFFER_START_GEN8;
131 *cs++ = lower_32_bits(i915_vma_offset(vma) + loop * sizeof(*cs));
132 *cs++ = upper_32_bits(i915_vma_offset(vma) + loop * sizeof(*cs));
133 GEM_BUG_ON(cs - base > end);
134
135 i915_gem_object_flush_map(obj);
136
137 *cancel = base + loop;
138 *counter = srm ? memset32(base + end, 0, 1) : NULL;
139 return vma;
140
141 err_unpin:
142 i915_vma_unpin(vma);
143 err_unlock:
144 i915_vma_unlock(vma);
145 err_put:
146 i915_gem_object_put(obj);
147 return ERR_PTR(err);
148 }
149
/*
 * Poll read_cagf() until it reports @freq, until the reading matches every
 * entry of the recent-sample window (i.e. it has stopped changing), or until
 * @timeout_ms has elapsed. The delay between polls starts at 20us and doubles
 * each round up to a cap of 20 * @timeout_ms.
 *
 * Returns the last value read.
 */
static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms)
{
	u8 window[64], slot;
	unsigned long deadline;
	int delay;

	slot = 0;
	/* Seed the window with the target value */
	memset(window, freq, sizeof(window));
	delay = 20;

	/* The PCU does not change instantly, but drifts towards the goal? */
	deadline = jiffies + msecs_to_jiffies(timeout_ms);
	for (;;) {
		const u8 act = read_cagf(rps);

		if (time_after(jiffies, deadline))
			return act;

		/* Target acquired */
		if (act == freq)
			return act;

		/* Any change within the last N samples? */
		if (!memchr_inv(window, act, sizeof(window)))
			return act;

		window[slot] = act;
		slot = (slot + 1) % ARRAY_SIZE(window);

		usleep_range(delay, 2 * delay);
		delay *= 2;
		if (delay > timeout_ms * 20)
			delay = timeout_ms * 20;
	}
}
186
/*
 * Request @freq through intel_rps_set() under rps->lock, retrying for up to
 * 50ms, then wait for the hardware to follow via wait_for_freq().
 *
 * Returns 0 if the request could not be made in time, otherwise the value
 * returned by wait_for_freq().
 */
static u8 rps_set_check(struct intel_rps *rps, u8 freq)
{
	int timed_out;

	mutex_lock(&rps->lock);
	GEM_BUG_ON(!intel_rps_is_active(rps));
	timed_out = wait_for(!intel_rps_set(rps, freq), 50);
	if (!timed_out)
		GEM_BUG_ON(rps->last_freq != freq);
	mutex_unlock(&rps->lock);

	if (timed_out)
		return 0;

	return wait_for_freq(rps, freq, 50);
}
200
show_pstate_limits(struct intel_rps * rps)201 static void show_pstate_limits(struct intel_rps *rps)
202 {
203 struct drm_i915_private *i915 = rps_to_i915(rps);
204
205 if (IS_BROXTON(i915)) {
206 pr_info("P_STATE_CAP[%x]: 0x%08x\n",
207 i915_mmio_reg_offset(BXT_RP_STATE_CAP),
208 intel_uncore_read(rps_to_uncore(rps),
209 BXT_RP_STATE_CAP));
210 } else if (GRAPHICS_VER(i915) == 9) {
211 pr_info("P_STATE_LIMITS[%x]: 0x%08x\n",
212 i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS),
213 intel_uncore_read(rps_to_uncore(rps),
214 GEN9_RP_STATE_LIMITS));
215 }
216 }
217
live_rps_clock_interval(void * arg)218 int live_rps_clock_interval(void *arg)
219 {
220 struct intel_gt *gt = arg;
221 struct intel_rps *rps = >->rps;
222 void (*saved_work)(struct work_struct *wrk);
223 struct intel_engine_cs *engine;
224 enum intel_engine_id id;
225 struct igt_spinner spin;
226 int err = 0;
227
228 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
229 return 0;
230
231 if (igt_spinner_init(&spin, gt))
232 return -ENOMEM;
233
234 intel_gt_pm_wait_for_idle(gt);
235 saved_work = rps->work.func;
236 rps->work.func = dummy_rps_work;
237
238 intel_gt_pm_get(gt);
239 intel_rps_disable(>->rps);
240
241 intel_gt_check_clock_frequency(gt);
242
243 for_each_engine(engine, gt, id) {
244 struct i915_request *rq;
245 u32 cycles;
246 u64 dt;
247
248 if (!intel_engine_can_store_dword(engine))
249 continue;
250
251 st_engine_heartbeat_disable(engine);
252
253 rq = igt_spinner_create_request(&spin,
254 engine->kernel_context,
255 MI_NOOP);
256 if (IS_ERR(rq)) {
257 st_engine_heartbeat_enable(engine);
258 err = PTR_ERR(rq);
259 break;
260 }
261
262 i915_request_add(rq);
263
264 if (!igt_wait_for_spinner(&spin, rq)) {
265 pr_err("%s: RPS spinner did not start\n",
266 engine->name);
267 igt_spinner_end(&spin);
268 st_engine_heartbeat_enable(engine);
269 intel_gt_set_wedged(engine->gt);
270 err = -EIO;
271 break;
272 }
273
274 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
275
276 intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0);
277
278 /* Set the evaluation interval to infinity! */
279 intel_uncore_write_fw(gt->uncore,
280 GEN6_RP_UP_EI, 0xffffffff);
281 intel_uncore_write_fw(gt->uncore,
282 GEN6_RP_UP_THRESHOLD, 0xffffffff);
283
284 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL,
285 GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG);
286
287 if (wait_for(intel_uncore_read_fw(gt->uncore,
288 GEN6_RP_CUR_UP_EI),
289 10)) {
290 /* Just skip the test; assume lack of HW support */
291 pr_notice("%s: rps evaluation interval not ticking\n",
292 engine->name);
293 err = -ENODEV;
294 } else {
295 ktime_t dt_[5];
296 u32 cycles_[5];
297 int i;
298
299 for (i = 0; i < 5; i++) {
300 preempt_disable();
301
302 dt_[i] = ktime_get();
303 cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
304
305 udelay(1000);
306
307 dt_[i] = ktime_sub(ktime_get(), dt_[i]);
308 cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
309
310 preempt_enable();
311 }
312
313 /* Use the median of both cycle/dt; close enough */
314 sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL);
315 cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4;
316 sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL);
317 dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4);
318 }
319
320 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0);
321 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
322
323 igt_spinner_end(&spin);
324 st_engine_heartbeat_enable(engine);
325
326 if (err == 0) {
327 u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
328 u32 expected =
329 intel_gt_ns_to_pm_interval(gt, dt);
330
331 pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n",
332 engine->name, cycles, time, dt, expected,
333 gt->clock_frequency / 1000);
334
335 if (10 * time < 8 * dt ||
336 8 * time > 10 * dt) {
337 pr_err("%s: rps clock time does not match walltime!\n",
338 engine->name);
339 err = -EINVAL;
340 }
341
342 if (10 * expected < 8 * cycles ||
343 8 * expected > 10 * cycles) {
344 pr_err("%s: walltime does not match rps clock ticks!\n",
345 engine->name);
346 err = -EINVAL;
347 }
348 }
349
350 if (igt_flush_test(gt->i915))
351 err = -EIO;
352
353 break; /* once is enough */
354 }
355
356 intel_rps_enable(>->rps);
357 intel_gt_pm_put(gt);
358
359 igt_spinner_fini(&spin);
360
361 intel_gt_pm_wait_for_idle(gt);
362 rps->work.func = saved_work;
363
364 if (err == -ENODEV) /* skipped, don't report a fail */
365 err = 0;
366
367 return err;
368 }
369
live_rps_control(void * arg)370 int live_rps_control(void *arg)
371 {
372 struct intel_gt *gt = arg;
373 struct intel_rps *rps = >->rps;
374 void (*saved_work)(struct work_struct *wrk);
375 struct intel_engine_cs *engine;
376 enum intel_engine_id id;
377 struct igt_spinner spin;
378 int err = 0;
379
380 /*
381 * Check that the actual frequency matches our requested frequency,
382 * to verify our control mechanism. We have to be careful that the
383 * PCU may throttle the GPU in which case the actual frequency used
384 * will be lowered than requested.
385 */
386
387 if (!intel_rps_is_enabled(rps))
388 return 0;
389
390 if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */
391 return 0;
392
393 if (igt_spinner_init(&spin, gt))
394 return -ENOMEM;
395
396 intel_gt_pm_wait_for_idle(gt);
397 saved_work = rps->work.func;
398 rps->work.func = dummy_rps_work;
399
400 intel_gt_pm_get(gt);
401 for_each_engine(engine, gt, id) {
402 struct i915_request *rq;
403 ktime_t min_dt, max_dt;
404 int f, limit;
405 int min, max;
406
407 if (!intel_engine_can_store_dword(engine))
408 continue;
409
410 st_engine_heartbeat_disable(engine);
411
412 rq = igt_spinner_create_request(&spin,
413 engine->kernel_context,
414 MI_NOOP);
415 if (IS_ERR(rq)) {
416 err = PTR_ERR(rq);
417 break;
418 }
419
420 i915_request_add(rq);
421
422 if (!igt_wait_for_spinner(&spin, rq)) {
423 pr_err("%s: RPS spinner did not start\n",
424 engine->name);
425 igt_spinner_end(&spin);
426 st_engine_heartbeat_enable(engine);
427 intel_gt_set_wedged(engine->gt);
428 err = -EIO;
429 break;
430 }
431
432 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
433 pr_err("%s: could not set minimum frequency [%x], only %x!\n",
434 engine->name, rps->min_freq, read_cagf(rps));
435 igt_spinner_end(&spin);
436 st_engine_heartbeat_enable(engine);
437 show_pstate_limits(rps);
438 err = -EINVAL;
439 break;
440 }
441
442 for (f = rps->min_freq + 1; f < rps->max_freq; f++) {
443 if (rps_set_check(rps, f) < f)
444 break;
445 }
446
447 limit = rps_set_check(rps, f);
448
449 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
450 pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
451 engine->name, rps->min_freq, read_cagf(rps));
452 igt_spinner_end(&spin);
453 st_engine_heartbeat_enable(engine);
454 show_pstate_limits(rps);
455 err = -EINVAL;
456 break;
457 }
458
459 max_dt = ktime_get();
460 max = rps_set_check(rps, limit);
461 max_dt = ktime_sub(ktime_get(), max_dt);
462
463 min_dt = ktime_get();
464 min = rps_set_check(rps, rps->min_freq);
465 min_dt = ktime_sub(ktime_get(), min_dt);
466
467 igt_spinner_end(&spin);
468 st_engine_heartbeat_enable(engine);
469
470 pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
471 engine->name,
472 rps->min_freq, intel_gpu_freq(rps, rps->min_freq),
473 rps->max_freq, intel_gpu_freq(rps, rps->max_freq),
474 limit, intel_gpu_freq(rps, limit),
475 min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt));
476
477 if (limit == rps->min_freq) {
478 pr_err("%s: GPU throttled to minimum!\n",
479 engine->name);
480 show_pstate_limits(rps);
481 err = -ENODEV;
482 break;
483 }
484
485 if (igt_flush_test(gt->i915)) {
486 err = -EIO;
487 break;
488 }
489 }
490 intel_gt_pm_put(gt);
491
492 igt_spinner_fini(&spin);
493
494 intel_gt_pm_wait_for_idle(gt);
495 rps->work.func = saved_work;
496
497 return err;
498 }
499
show_pcu_config(struct intel_rps * rps)500 static void show_pcu_config(struct intel_rps *rps)
501 {
502 struct drm_i915_private *i915 = rps_to_i915(rps);
503 unsigned int max_gpu_freq, min_gpu_freq;
504 intel_wakeref_t wakeref;
505 int gpu_freq;
506
507 if (!HAS_LLC(i915))
508 return;
509
510 min_gpu_freq = rps->min_freq;
511 max_gpu_freq = rps->max_freq;
512 if (GRAPHICS_VER(i915) >= 9) {
513 /* Convert GT frequency to 50 HZ units */
514 min_gpu_freq /= GEN9_FREQ_SCALER;
515 max_gpu_freq /= GEN9_FREQ_SCALER;
516 }
517
518 wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm);
519
520 pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing");
521 for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
522 int ia_freq = gpu_freq;
523
524 snb_pcode_read(rps_to_gt(rps)->uncore, GEN6_PCODE_READ_MIN_FREQ_TABLE,
525 &ia_freq, NULL);
526
527 pr_info("%5d %5d %5d\n",
528 gpu_freq * 50,
529 ((ia_freq >> 0) & 0xff) * 100,
530 ((ia_freq >> 8) & 0xff) * 100);
531 }
532
533 intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref);
534 }
535
/*
 * Sample the 32-bit counter at @cntr across a sleep of @duration_ms to
 * 2 * @duration_ms milliseconds.
 *
 * Returns 1000 * 1000 * (counter delta) / (elapsed ktime); the callers in
 * this file print the result as KHz.
 *
 * The delta is taken in 32-bit arithmetic so that a wrap of the counter
 * between the two reads still yields the true number of increments instead
 * of a value close to 2^64.
 */
static u64 __measure_frequency(u32 *cntr, int duration_ms)
{
	u32 start, delta;
	u64 dt;

	dt = ktime_get();
	start = READ_ONCE(*cntr);
	usleep_range(1000 * duration_ms, 2000 * duration_ms);
	delta = READ_ONCE(*cntr) - start;
	dt = ktime_get() - dt;

	return div64_u64(1000 * 1000 * (u64)delta, dt);
}
548
measure_frequency_at(struct intel_rps * rps,u32 * cntr,int * freq)549 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq)
550 {
551 u64 x[5];
552 int i;
553
554 *freq = rps_set_check(rps, *freq);
555 for (i = 0; i < 5; i++)
556 x[i] = __measure_frequency(cntr, 2);
557 *freq = (*freq + read_cagf(rps)) / 2;
558
559 /* A simple triangle filter for better result stability */
560 sort(x, 5, sizeof(*x), cmp_u64, NULL);
561 return div_u64(x[1] + 2 * x[2] + x[3], 4);
562 }
563
__measure_cs_frequency(struct intel_engine_cs * engine,int duration_ms)564 static u64 __measure_cs_frequency(struct intel_engine_cs *engine,
565 int duration_ms)
566 {
567 u64 dc, dt;
568
569 dt = ktime_get();
570 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0));
571 usleep_range(1000 * duration_ms, 2000 * duration_ms);
572 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc;
573 dt = ktime_get() - dt;
574
575 return div64_u64(1000 * 1000 * dc, dt);
576 }
577
measure_cs_frequency_at(struct intel_rps * rps,struct intel_engine_cs * engine,int * freq)578 static u64 measure_cs_frequency_at(struct intel_rps *rps,
579 struct intel_engine_cs *engine,
580 int *freq)
581 {
582 u64 x[5];
583 int i;
584
585 *freq = rps_set_check(rps, *freq);
586 for (i = 0; i < 5; i++)
587 x[i] = __measure_cs_frequency(engine, 2);
588 *freq = (*freq + read_cagf(rps)) / 2;
589
590 /* A simple triangle filter for better result stability */
591 sort(x, 5, sizeof(*x), cmp_u64, NULL);
592 return div_u64(x[1] + 2 * x[2] + x[3], 4);
593 }
594
/*
 * Check that x lies strictly between (f_n / f_d) * y and (f_d / f_n) * y,
 * using only integer multiplications.
 */
static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d)
{
	/* x must exceed the lower bound: x > y * f_n / f_d */
	if (f_d * x <= f_n * y)
		return false;

	/* ... and stay below the upper bound: x < y * f_d / f_n */
	return f_n * x < f_d * y;
}
599
live_rps_frequency_cs(void * arg)600 int live_rps_frequency_cs(void *arg)
601 {
602 void (*saved_work)(struct work_struct *wrk);
603 struct intel_gt *gt = arg;
604 struct intel_rps *rps = >->rps;
605 struct intel_engine_cs *engine;
606 struct pm_qos_request qos;
607 enum intel_engine_id id;
608 int err = 0;
609
610 /*
611 * The premise is that the GPU does change frequency at our behest.
612 * Let's check there is a correspondence between the requested
613 * frequency, the actual frequency, and the observed clock rate.
614 */
615
616 if (!intel_rps_is_enabled(rps))
617 return 0;
618
619 if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
620 return 0;
621
622 if (CPU_LATENCY >= 0)
623 cpu_latency_qos_add_request(&qos, CPU_LATENCY);
624
625 intel_gt_pm_wait_for_idle(gt);
626 saved_work = rps->work.func;
627 rps->work.func = dummy_rps_work;
628
629 for_each_engine(engine, gt, id) {
630 struct i915_request *rq;
631 struct i915_vma *vma;
632 u32 *cancel, *cntr;
633 struct {
634 u64 count;
635 int freq;
636 } min, max;
637
638 st_engine_heartbeat_disable(engine);
639
640 vma = create_spin_counter(engine,
641 engine->kernel_context->vm, false,
642 &cancel, &cntr);
643 if (IS_ERR(vma)) {
644 err = PTR_ERR(vma);
645 st_engine_heartbeat_enable(engine);
646 break;
647 }
648
649 rq = intel_engine_create_kernel_request(engine);
650 if (IS_ERR(rq)) {
651 err = PTR_ERR(rq);
652 goto err_vma;
653 }
654
655 err = i915_vma_move_to_active(vma, rq, 0);
656 if (!err)
657 err = rq->engine->emit_bb_start(rq,
658 i915_vma_offset(vma),
659 PAGE_SIZE, 0);
660 i915_request_add(rq);
661 if (err)
662 goto err_vma;
663
664 if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)),
665 10)) {
666 pr_err("%s: timed loop did not start\n",
667 engine->name);
668 goto err_vma;
669 }
670
671 min.freq = rps->min_freq;
672 min.count = measure_cs_frequency_at(rps, engine, &min.freq);
673
674 max.freq = rps->max_freq;
675 max.count = measure_cs_frequency_at(rps, engine, &max.freq);
676
677 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
678 engine->name,
679 min.count, intel_gpu_freq(rps, min.freq),
680 max.count, intel_gpu_freq(rps, max.freq),
681 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
682 max.freq * min.count));
683
684 if (!scaled_within(max.freq * min.count,
685 min.freq * max.count,
686 2, 3)) {
687 int f;
688
689 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
690 engine->name,
691 max.freq * min.count,
692 min.freq * max.count);
693 show_pcu_config(rps);
694
695 for (f = min.freq + 1; f <= rps->max_freq; f++) {
696 int act = f;
697 u64 count;
698
699 count = measure_cs_frequency_at(rps, engine, &act);
700 if (act < f)
701 break;
702
703 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
704 engine->name,
705 act, intel_gpu_freq(rps, act), count,
706 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
707 act * min.count));
708
709 f = act; /* may skip ahead [pcu granularity] */
710 }
711
712 err = -EINTR; /* ignore error, continue on with test */
713 }
714
715 err_vma:
716 *cancel = MI_BATCH_BUFFER_END;
717 i915_gem_object_flush_map(vma->obj);
718 i915_gem_object_unpin_map(vma->obj);
719 i915_vma_unpin(vma);
720 i915_vma_unlock(vma);
721 i915_vma_put(vma);
722
723 st_engine_heartbeat_enable(engine);
724 if (igt_flush_test(gt->i915))
725 err = -EIO;
726 if (err)
727 break;
728 }
729
730 intel_gt_pm_wait_for_idle(gt);
731 rps->work.func = saved_work;
732
733 if (CPU_LATENCY >= 0)
734 cpu_latency_qos_remove_request(&qos);
735
736 return err;
737 }
738
live_rps_frequency_srm(void * arg)739 int live_rps_frequency_srm(void *arg)
740 {
741 void (*saved_work)(struct work_struct *wrk);
742 struct intel_gt *gt = arg;
743 struct intel_rps *rps = >->rps;
744 struct intel_engine_cs *engine;
745 struct pm_qos_request qos;
746 enum intel_engine_id id;
747 int err = 0;
748
749 /*
750 * The premise is that the GPU does change frequency at our behest.
751 * Let's check there is a correspondence between the requested
752 * frequency, the actual frequency, and the observed clock rate.
753 */
754
755 if (!intel_rps_is_enabled(rps))
756 return 0;
757
758 if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
759 return 0;
760
761 if (CPU_LATENCY >= 0)
762 cpu_latency_qos_add_request(&qos, CPU_LATENCY);
763
764 intel_gt_pm_wait_for_idle(gt);
765 saved_work = rps->work.func;
766 rps->work.func = dummy_rps_work;
767
768 for_each_engine(engine, gt, id) {
769 struct i915_request *rq;
770 struct i915_vma *vma;
771 u32 *cancel, *cntr;
772 struct {
773 u64 count;
774 int freq;
775 } min, max;
776
777 st_engine_heartbeat_disable(engine);
778
779 vma = create_spin_counter(engine,
780 engine->kernel_context->vm, true,
781 &cancel, &cntr);
782 if (IS_ERR(vma)) {
783 err = PTR_ERR(vma);
784 st_engine_heartbeat_enable(engine);
785 break;
786 }
787
788 rq = intel_engine_create_kernel_request(engine);
789 if (IS_ERR(rq)) {
790 err = PTR_ERR(rq);
791 goto err_vma;
792 }
793
794 err = i915_vma_move_to_active(vma, rq, 0);
795 if (!err)
796 err = rq->engine->emit_bb_start(rq,
797 i915_vma_offset(vma),
798 PAGE_SIZE, 0);
799 i915_request_add(rq);
800 if (err)
801 goto err_vma;
802
803 if (wait_for(READ_ONCE(*cntr), 10)) {
804 pr_err("%s: timed loop did not start\n",
805 engine->name);
806 goto err_vma;
807 }
808
809 min.freq = rps->min_freq;
810 min.count = measure_frequency_at(rps, cntr, &min.freq);
811
812 max.freq = rps->max_freq;
813 max.count = measure_frequency_at(rps, cntr, &max.freq);
814
815 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
816 engine->name,
817 min.count, intel_gpu_freq(rps, min.freq),
818 max.count, intel_gpu_freq(rps, max.freq),
819 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
820 max.freq * min.count));
821
822 if (!scaled_within(max.freq * min.count,
823 min.freq * max.count,
824 1, 2)) {
825 int f;
826
827 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
828 engine->name,
829 max.freq * min.count,
830 min.freq * max.count);
831 show_pcu_config(rps);
832
833 for (f = min.freq + 1; f <= rps->max_freq; f++) {
834 int act = f;
835 u64 count;
836
837 count = measure_frequency_at(rps, cntr, &act);
838 if (act < f)
839 break;
840
841 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
842 engine->name,
843 act, intel_gpu_freq(rps, act), count,
844 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
845 act * min.count));
846
847 f = act; /* may skip ahead [pcu granularity] */
848 }
849
850 err = -EINTR; /* ignore error, continue on with test */
851 }
852
853 err_vma:
854 *cancel = MI_BATCH_BUFFER_END;
855 i915_gem_object_flush_map(vma->obj);
856 i915_gem_object_unpin_map(vma->obj);
857 i915_vma_unpin(vma);
858 i915_vma_unlock(vma);
859 i915_vma_put(vma);
860
861 st_engine_heartbeat_enable(engine);
862 if (igt_flush_test(gt->i915))
863 err = -EIO;
864 if (err)
865 break;
866 }
867
868 intel_gt_pm_wait_for_idle(gt);
869 rps->work.func = saved_work;
870
871 if (CPU_LATENCY >= 0)
872 cpu_latency_qos_remove_request(&qos);
873
874 return err;
875 }
876
sleep_for_ei(struct intel_rps * rps,int timeout_us)877 static void sleep_for_ei(struct intel_rps *rps, int timeout_us)
878 {
879 /* Flush any previous EI */
880 usleep_range(timeout_us, 2 * timeout_us);
881
882 /* Reset the interrupt status */
883 rps_disable_interrupts(rps);
884 GEM_BUG_ON(rps->pm_iir);
885 rps_enable_interrupts(rps);
886
887 /* And then wait for the timeout, for real this time */
888 usleep_range(2 * timeout_us, 3 * timeout_us);
889 }
890
__rps_up_interrupt(struct intel_rps * rps,struct intel_engine_cs * engine,struct igt_spinner * spin)891 static int __rps_up_interrupt(struct intel_rps *rps,
892 struct intel_engine_cs *engine,
893 struct igt_spinner *spin)
894 {
895 struct intel_uncore *uncore = engine->uncore;
896 struct i915_request *rq;
897 u32 timeout;
898
899 if (!intel_engine_can_store_dword(engine))
900 return 0;
901
902 rps_set_check(rps, rps->min_freq);
903
904 rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP);
905 if (IS_ERR(rq))
906 return PTR_ERR(rq);
907
908 i915_request_get(rq);
909 i915_request_add(rq);
910
911 if (!igt_wait_for_spinner(spin, rq)) {
912 pr_err("%s: RPS spinner did not start\n",
913 engine->name);
914 i915_request_put(rq);
915 intel_gt_set_wedged(engine->gt);
916 return -EIO;
917 }
918
919 if (!intel_rps_is_active(rps)) {
920 pr_err("%s: RPS not enabled on starting spinner\n",
921 engine->name);
922 igt_spinner_end(spin);
923 i915_request_put(rq);
924 return -EINVAL;
925 }
926
927 if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) {
928 pr_err("%s: RPS did not register UP interrupt\n",
929 engine->name);
930 i915_request_put(rq);
931 return -EINVAL;
932 }
933
934 if (rps->last_freq != rps->min_freq) {
935 pr_err("%s: RPS did not program min frequency\n",
936 engine->name);
937 i915_request_put(rq);
938 return -EINVAL;
939 }
940
941 timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI);
942 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
943 timeout = DIV_ROUND_UP(timeout, 1000);
944
945 sleep_for_ei(rps, timeout);
946 GEM_BUG_ON(i915_request_completed(rq));
947
948 igt_spinner_end(spin);
949 i915_request_put(rq);
950
951 if (rps->cur_freq != rps->min_freq) {
952 pr_err("%s: Frequency unexpectedly changed [up], now %d!\n",
953 engine->name, intel_rps_read_actual_frequency(rps));
954 return -EINVAL;
955 }
956
957 if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
958 pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n",
959 engine->name, rps->pm_iir,
960 intel_uncore_read(uncore, GEN6_RP_PREV_UP),
961 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
962 intel_uncore_read(uncore, GEN6_RP_UP_EI));
963 return -EINVAL;
964 }
965
966 return 0;
967 }
968
__rps_down_interrupt(struct intel_rps * rps,struct intel_engine_cs * engine)969 static int __rps_down_interrupt(struct intel_rps *rps,
970 struct intel_engine_cs *engine)
971 {
972 struct intel_uncore *uncore = engine->uncore;
973 u32 timeout;
974
975 rps_set_check(rps, rps->max_freq);
976
977 if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) {
978 pr_err("%s: RPS did not register DOWN interrupt\n",
979 engine->name);
980 return -EINVAL;
981 }
982
983 if (rps->last_freq != rps->max_freq) {
984 pr_err("%s: RPS did not program max frequency\n",
985 engine->name);
986 return -EINVAL;
987 }
988
989 timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI);
990 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
991 timeout = DIV_ROUND_UP(timeout, 1000);
992
993 sleep_for_ei(rps, timeout);
994
995 if (rps->cur_freq != rps->max_freq) {
996 pr_err("%s: Frequency unexpectedly changed [down], now %d!\n",
997 engine->name,
998 intel_rps_read_actual_frequency(rps));
999 return -EINVAL;
1000 }
1001
1002 if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) {
1003 pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n",
1004 engine->name, rps->pm_iir,
1005 intel_uncore_read(uncore, GEN6_RP_PREV_DOWN),
1006 intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD),
1007 intel_uncore_read(uncore, GEN6_RP_DOWN_EI),
1008 intel_uncore_read(uncore, GEN6_RP_PREV_UP),
1009 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
1010 intel_uncore_read(uncore, GEN6_RP_UP_EI));
1011 return -EINVAL;
1012 }
1013
1014 return 0;
1015 }
1016
live_rps_interrupt(void * arg)1017 int live_rps_interrupt(void *arg)
1018 {
1019 struct intel_gt *gt = arg;
1020 struct intel_rps *rps = >->rps;
1021 void (*saved_work)(struct work_struct *wrk);
1022 struct intel_engine_cs *engine;
1023 enum intel_engine_id id;
1024 struct igt_spinner spin;
1025 u32 pm_events;
1026 int err = 0;
1027
1028 /*
1029 * First, let's check whether or not we are receiving interrupts.
1030 */
1031
1032 if (!intel_rps_has_interrupts(rps) || GRAPHICS_VER(gt->i915) < 6)
1033 return 0;
1034
1035 intel_gt_pm_get(gt);
1036 pm_events = rps->pm_events;
1037 intel_gt_pm_put(gt);
1038 if (!pm_events) {
1039 pr_err("No RPS PM events registered, but RPS is enabled?\n");
1040 return -ENODEV;
1041 }
1042
1043 if (igt_spinner_init(&spin, gt))
1044 return -ENOMEM;
1045
1046 intel_gt_pm_wait_for_idle(gt);
1047 saved_work = rps->work.func;
1048 rps->work.func = dummy_rps_work;
1049
1050 for_each_engine(engine, gt, id) {
1051 /* Keep the engine busy with a spinner; expect an UP! */
1052 if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
1053 intel_gt_pm_wait_for_idle(engine->gt);
1054 GEM_BUG_ON(intel_rps_is_active(rps));
1055
1056 st_engine_heartbeat_disable(engine);
1057
1058 err = __rps_up_interrupt(rps, engine, &spin);
1059
1060 st_engine_heartbeat_enable(engine);
1061 if (err)
1062 goto out;
1063
1064 intel_gt_pm_wait_for_idle(engine->gt);
1065 }
1066
1067 /* Keep the engine awake but idle and check for DOWN */
1068 if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
1069 st_engine_heartbeat_disable(engine);
1070 intel_rc6_disable(>->rc6);
1071
1072 err = __rps_down_interrupt(rps, engine);
1073
1074 intel_rc6_enable(>->rc6);
1075 st_engine_heartbeat_enable(engine);
1076 if (err)
1077 goto out;
1078 }
1079 }
1080
1081 out:
1082 if (igt_flush_test(gt->i915))
1083 err = -EIO;
1084
1085 igt_spinner_fini(&spin);
1086
1087 intel_gt_pm_wait_for_idle(gt);
1088 rps->work.func = saved_work;
1089
1090 return err;
1091 }
1092
__measure_power(int duration_ms)1093 static u64 __measure_power(int duration_ms)
1094 {
1095 u64 dE, dt;
1096
1097 dt = ktime_get();
1098 dE = librapl_energy_uJ();
1099 usleep_range(1000 * duration_ms, 2000 * duration_ms);
1100 dE = librapl_energy_uJ() - dE;
1101 dt = ktime_get() - dt;
1102
1103 return div64_u64(1000 * 1000 * dE, dt);
1104 }
1105
measure_power(struct intel_rps * rps,int * freq)1106 static u64 measure_power(struct intel_rps *rps, int *freq)
1107 {
1108 u64 x[5];
1109 int i;
1110
1111 for (i = 0; i < 5; i++)
1112 x[i] = __measure_power(5);
1113
1114 *freq = (*freq + intel_rps_read_actual_frequency(rps)) / 2;
1115
1116 /* A simple triangle filter for better result stability */
1117 sort(x, 5, sizeof(*x), cmp_u64, NULL);
1118 return div_u64(x[1] + 2 * x[2] + x[3], 4);
1119 }
1120
measure_power_at(struct intel_rps * rps,int * freq)1121 static u64 measure_power_at(struct intel_rps *rps, int *freq)
1122 {
1123 *freq = rps_set_check(rps, *freq);
1124 return measure_power(rps, freq);
1125 }
1126
live_rps_power(void * arg)1127 int live_rps_power(void *arg)
1128 {
1129 struct intel_gt *gt = arg;
1130 struct intel_rps *rps = >->rps;
1131 void (*saved_work)(struct work_struct *wrk);
1132 struct intel_engine_cs *engine;
1133 enum intel_engine_id id;
1134 struct igt_spinner spin;
1135 int err = 0;
1136
1137 /*
1138 * Our fundamental assumption is that running at lower frequency
1139 * actually saves power. Let's see if our RAPL measurement support
1140 * that theory.
1141 */
1142
1143 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1144 return 0;
1145
1146 if (!librapl_supported(gt->i915))
1147 return 0;
1148
1149 if (igt_spinner_init(&spin, gt))
1150 return -ENOMEM;
1151
1152 intel_gt_pm_wait_for_idle(gt);
1153 saved_work = rps->work.func;
1154 rps->work.func = dummy_rps_work;
1155
1156 for_each_engine(engine, gt, id) {
1157 struct i915_request *rq;
1158 struct {
1159 u64 power;
1160 int freq;
1161 } min, max;
1162
1163 if (!intel_engine_can_store_dword(engine))
1164 continue;
1165
1166 st_engine_heartbeat_disable(engine);
1167
1168 rq = igt_spinner_create_request(&spin,
1169 engine->kernel_context,
1170 MI_NOOP);
1171 if (IS_ERR(rq)) {
1172 st_engine_heartbeat_enable(engine);
1173 err = PTR_ERR(rq);
1174 break;
1175 }
1176
1177 i915_request_add(rq);
1178
1179 if (!igt_wait_for_spinner(&spin, rq)) {
1180 pr_err("%s: RPS spinner did not start\n",
1181 engine->name);
1182 igt_spinner_end(&spin);
1183 st_engine_heartbeat_enable(engine);
1184 intel_gt_set_wedged(engine->gt);
1185 err = -EIO;
1186 break;
1187 }
1188
1189 max.freq = rps->max_freq;
1190 max.power = measure_power_at(rps, &max.freq);
1191
1192 min.freq = rps->min_freq;
1193 min.power = measure_power_at(rps, &min.freq);
1194
1195 igt_spinner_end(&spin);
1196 st_engine_heartbeat_enable(engine);
1197
1198 pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
1199 engine->name,
1200 min.power, intel_gpu_freq(rps, min.freq),
1201 max.power, intel_gpu_freq(rps, max.freq));
1202
1203 if (10 * min.freq >= 9 * max.freq) {
1204 pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n",
1205 min.freq, intel_gpu_freq(rps, min.freq),
1206 max.freq, intel_gpu_freq(rps, max.freq));
1207 continue;
1208 }
1209
1210 if (11 * min.power > 10 * max.power) {
1211 pr_err("%s: did not conserve power when setting lower frequency!\n",
1212 engine->name);
1213 err = -EINVAL;
1214 break;
1215 }
1216
1217 if (igt_flush_test(gt->i915)) {
1218 err = -EIO;
1219 break;
1220 }
1221 }
1222
1223 igt_spinner_fini(&spin);
1224
1225 intel_gt_pm_wait_for_idle(gt);
1226 rps->work.func = saved_work;
1227
1228 return err;
1229 }
1230
live_rps_dynamic(void * arg)1231 int live_rps_dynamic(void *arg)
1232 {
1233 struct intel_gt *gt = arg;
1234 struct intel_rps *rps = >->rps;
1235 struct intel_engine_cs *engine;
1236 enum intel_engine_id id;
1237 struct igt_spinner spin;
1238 int err = 0;
1239
1240 /*
1241 * We've looked at the bascs, and have established that we
1242 * can change the clock frequency and that the HW will generate
1243 * interrupts based on load. Now we check how we integrate those
1244 * moving parts into dynamic reclocking based on load.
1245 */
1246
1247 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1248 return 0;
1249
1250 if (igt_spinner_init(&spin, gt))
1251 return -ENOMEM;
1252
1253 if (intel_rps_has_interrupts(rps))
1254 pr_info("RPS has interrupt support\n");
1255 if (intel_rps_uses_timer(rps))
1256 pr_info("RPS has timer support\n");
1257
1258 for_each_engine(engine, gt, id) {
1259 struct i915_request *rq;
1260 struct {
1261 ktime_t dt;
1262 u8 freq;
1263 } min, max;
1264
1265 if (!intel_engine_can_store_dword(engine))
1266 continue;
1267
1268 intel_gt_pm_wait_for_idle(gt);
1269 GEM_BUG_ON(intel_rps_is_active(rps));
1270 rps->cur_freq = rps->min_freq;
1271
1272 intel_engine_pm_get(engine);
1273 intel_rc6_disable(>->rc6);
1274 GEM_BUG_ON(rps->last_freq != rps->min_freq);
1275
1276 rq = igt_spinner_create_request(&spin,
1277 engine->kernel_context,
1278 MI_NOOP);
1279 if (IS_ERR(rq)) {
1280 err = PTR_ERR(rq);
1281 goto err;
1282 }
1283
1284 i915_request_add(rq);
1285
1286 max.dt = ktime_get();
1287 max.freq = wait_for_freq(rps, rps->max_freq, 500);
1288 max.dt = ktime_sub(ktime_get(), max.dt);
1289
1290 igt_spinner_end(&spin);
1291
1292 min.dt = ktime_get();
1293 min.freq = wait_for_freq(rps, rps->min_freq, 2000);
1294 min.dt = ktime_sub(ktime_get(), min.dt);
1295
1296 pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n",
1297 engine->name,
1298 max.freq, intel_gpu_freq(rps, max.freq),
1299 ktime_to_ns(max.dt),
1300 min.freq, intel_gpu_freq(rps, min.freq),
1301 ktime_to_ns(min.dt));
1302 if (min.freq >= max.freq) {
1303 pr_err("%s: dynamic reclocking of spinner failed\n!",
1304 engine->name);
1305 err = -EINVAL;
1306 }
1307
1308 err:
1309 intel_rc6_enable(>->rc6);
1310 intel_engine_pm_put(engine);
1311
1312 if (igt_flush_test(gt->i915))
1313 err = -EIO;
1314 if (err)
1315 break;
1316 }
1317
1318 igt_spinner_fini(&spin);
1319
1320 return err;
1321 }
1322