1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 /*
3 * Copyright(c) 2015 - 2020 Intel Corporation.
4 */
5
6 #include <linux/topology.h>
7 #include <linux/cpumask.h>
8 #include <linux/interrupt.h>
9 #include <linux/numa.h>
10
11 #include "hfi.h"
12 #include "affinity.h"
13 #include "sdma.h"
14 #include "trace.h"
15
/*
 * Global affinity state shared by all HFI devices: the list of per-NUMA-node
 * affinity entries and the mutex that callers take before walking or
 * modifying it. The remaining members are filled in at run time by
 * node_affinity_init().
 */
struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};
20
/*
 * Name of IRQ types, indexed by enum irq_type.
 * Used with msix->type as the index when logging IRQ -> CPU assignments.
 */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};
29
/*
 * Per NUMA node count of HFI devices.
 * Allocated in node_affinity_init() with one slot per possible NUMA node and
 * released in node_affinity_destroy_all().
 */
static unsigned int *hfi1_per_node_cntr;
32
init_cpu_mask_set(struct cpu_mask_set * set)33 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
34 {
35 cpumask_clear(&set->mask);
36 cpumask_clear(&set->used);
37 set->gen = 0;
38 }
39
40 /* Increment generation of CPU set if needed */
_cpu_mask_set_gen_inc(struct cpu_mask_set * set)41 static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
42 {
43 if (cpumask_equal(&set->mask, &set->used)) {
44 /*
45 * We've used up all the CPUs, bump up the generation
46 * and reset the 'used' map
47 */
48 set->gen++;
49 cpumask_clear(&set->used);
50 }
51 }
52
_cpu_mask_set_gen_dec(struct cpu_mask_set * set)53 static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
54 {
55 if (cpumask_empty(&set->used) && set->gen) {
56 set->gen--;
57 cpumask_copy(&set->used, &set->mask);
58 }
59 }
60
61 /* Get the first CPU from the list of unused CPUs in a CPU set data structure */
cpu_mask_set_get_first(struct cpu_mask_set * set,cpumask_var_t diff)62 static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
63 {
64 int cpu;
65
66 if (!diff || !set)
67 return -EINVAL;
68
69 _cpu_mask_set_gen_inc(set);
70
71 /* Find out CPUs left in CPU mask */
72 cpumask_andnot(diff, &set->mask, &set->used);
73
74 cpu = cpumask_first(diff);
75 if (cpu >= nr_cpu_ids) /* empty */
76 cpu = -EINVAL;
77 else
78 cpumask_set_cpu(cpu, &set->used);
79
80 return cpu;
81 }
82
cpu_mask_set_put(struct cpu_mask_set * set,int cpu)83 static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
84 {
85 if (!set)
86 return;
87
88 cpumask_clear_cpu(cpu, &set->used);
89 _cpu_mask_set_gen_dec(set);
90 }
91
92 /* Initialize non-HT cpu cores mask */
init_real_cpu_mask(void)93 void init_real_cpu_mask(void)
94 {
95 int possible, curr_cpu, i, ht;
96
97 cpumask_clear(&node_affinity.real_cpu_mask);
98
99 /* Start with cpu online mask as the real cpu mask */
100 cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
101
102 /*
103 * Remove HT cores from the real cpu mask. Do this in two steps below.
104 */
105 possible = cpumask_weight(&node_affinity.real_cpu_mask);
106 ht = cpumask_weight(topology_sibling_cpumask(
107 cpumask_first(&node_affinity.real_cpu_mask)));
108 /*
109 * Step 1. Skip over the first N HT siblings and use them as the
110 * "real" cores. Assumes that HT cores are not enumerated in
111 * succession (except in the single core case).
112 */
113 curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
114 for (i = 0; i < possible / ht; i++)
115 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
116 /*
117 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
118 * skip any gaps.
119 */
120 for (; i < possible; i++) {
121 cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
122 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
123 }
124 }
125
node_affinity_init(void)126 int node_affinity_init(void)
127 {
128 int node;
129 struct pci_dev *dev = NULL;
130 const struct pci_device_id *ids = hfi1_pci_tbl;
131
132 cpumask_clear(&node_affinity.proc.used);
133 cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
134
135 node_affinity.proc.gen = 0;
136 node_affinity.num_core_siblings =
137 cpumask_weight(topology_sibling_cpumask(
138 cpumask_first(&node_affinity.proc.mask)
139 ));
140 node_affinity.num_possible_nodes = num_possible_nodes();
141 node_affinity.num_online_nodes = num_online_nodes();
142 node_affinity.num_online_cpus = num_online_cpus();
143
144 /*
145 * The real cpu mask is part of the affinity struct but it has to be
146 * initialized early. It is needed to calculate the number of user
147 * contexts in set_up_context_variables().
148 */
149 init_real_cpu_mask();
150
151 hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
152 sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
153 if (!hfi1_per_node_cntr)
154 return -ENOMEM;
155
156 while (ids->vendor) {
157 dev = NULL;
158 while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
159 node = pcibus_to_node(dev->bus);
160 if (node < 0)
161 goto out;
162
163 hfi1_per_node_cntr[node]++;
164 }
165 ids++;
166 }
167
168 return 0;
169
170 out:
171 /*
172 * Invalid PCI NUMA node information found, note it, and populate
173 * our database 1:1.
174 */
175 pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
176 pr_err("HFI: System BIOS may need to be upgraded\n");
177 for (node = 0; node < node_affinity.num_possible_nodes; node++)
178 hfi1_per_node_cntr[node] = 1;
179
180 pci_dev_put(dev);
181
182 return 0;
183 }
184
/*
 * Release one affinity node entry: its per-CPU completion vector counters
 * and then the entry itself. The entry is not unlinked from any list here.
 */
static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}
190
node_affinity_destroy_all(void)191 void node_affinity_destroy_all(void)
192 {
193 struct list_head *pos, *q;
194 struct hfi1_affinity_node *entry;
195
196 mutex_lock(&node_affinity.lock);
197 list_for_each_safe(pos, q, &node_affinity.list) {
198 entry = list_entry(pos, struct hfi1_affinity_node,
199 list);
200 list_del(pos);
201 node_affinity_destroy(entry);
202 }
203 mutex_unlock(&node_affinity.lock);
204 kfree(hfi1_per_node_cntr);
205 }
206
node_affinity_allocate(int node)207 static struct hfi1_affinity_node *node_affinity_allocate(int node)
208 {
209 struct hfi1_affinity_node *entry;
210
211 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
212 if (!entry)
213 return NULL;
214 entry->node = node;
215 entry->comp_vect_affinity = alloc_percpu(u16);
216 INIT_LIST_HEAD(&entry->list);
217
218 return entry;
219 }
220
/*
 * It appends an entry to the tail of the global node_affinity.list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}
229
230 /* It must be called with node_affinity.lock held */
node_affinity_lookup(int node)231 static struct hfi1_affinity_node *node_affinity_lookup(int node)
232 {
233 struct list_head *pos;
234 struct hfi1_affinity_node *entry;
235
236 list_for_each(pos, &node_affinity.list) {
237 entry = list_entry(pos, struct hfi1_affinity_node, list);
238 if (entry->node == node)
239 return entry;
240 }
241
242 return NULL;
243 }
244
/*
 * Pick the CPU of @possible_cpumask with the smallest per-CPU counter and
 * charge it by one. Ties are resolved in favour of the lowest CPU number.
 *
 * Returns the chosen CPU, or -EINVAL if an argument is NULL or the mask is
 * empty.
 */
static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int cpu, best_cpu;
	u16 count, lowest;

	if (!possible_cpumask || !comp_vect_affinity)
		return -EINVAL;

	best_cpu = cpumask_first(possible_cpumask);
	if (best_cpu >= nr_cpu_ids)
		return -EINVAL;

	lowest = *per_cpu_ptr(comp_vect_affinity, best_cpu);
	for_each_cpu(cpu, possible_cpumask) {
		count = *per_cpu_ptr(comp_vect_affinity, cpu);
		if (count >= lowest)
			continue;

		/* Strictly less loaded than the best candidate so far */
		best_cpu = cpu;
		lowest = count;
	}

	(*per_cpu_ptr(comp_vect_affinity, best_cpu))++;

	return best_cpu;
}
284
/*
 * Pick the CPU of @possible_cpumask with the largest per-CPU counter and
 * discharge it by one. Ties are resolved in favour of the lowest CPU number.
 *
 * Returns the chosen CPU, or -EINVAL if an argument is NULL or the mask is
 * empty.
 */
static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int cpu, busiest_cpu;
	u16 count, highest;

	if (!possible_cpumask || !comp_vect_affinity)
		return -EINVAL;

	busiest_cpu = cpumask_first(possible_cpumask);
	if (busiest_cpu >= nr_cpu_ids)
		return -EINVAL;

	highest = *per_cpu_ptr(comp_vect_affinity, busiest_cpu);
	for_each_cpu(cpu, possible_cpumask) {
		count = *per_cpu_ptr(comp_vect_affinity, cpu);
		if (count <= highest)
			continue;

		/* Strictly more loaded than the busiest candidate so far */
		busiest_cpu = cpu;
		highest = count;
	}

	(*per_cpu_ptr(comp_vect_affinity, busiest_cpu))--;

	return busiest_cpu;
}
317
318 /*
319 * Non-interrupt CPUs are used first, then interrupt CPUs.
320 * Two already allocated cpu masks must be passed.
321 */
_dev_comp_vect_cpu_get(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry,cpumask_var_t non_intr_cpus,cpumask_var_t available_cpus)322 static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
323 struct hfi1_affinity_node *entry,
324 cpumask_var_t non_intr_cpus,
325 cpumask_var_t available_cpus)
326 __must_hold(&node_affinity.lock)
327 {
328 int cpu;
329 struct cpu_mask_set *set = dd->comp_vect;
330
331 lockdep_assert_held(&node_affinity.lock);
332 if (!non_intr_cpus) {
333 cpu = -1;
334 goto fail;
335 }
336
337 if (!available_cpus) {
338 cpu = -1;
339 goto fail;
340 }
341
342 /* Available CPUs for pinning completion vectors */
343 _cpu_mask_set_gen_inc(set);
344 cpumask_andnot(available_cpus, &set->mask, &set->used);
345
346 /* Available CPUs without SDMA engine interrupts */
347 cpumask_andnot(non_intr_cpus, available_cpus,
348 &entry->def_intr.used);
349
350 /* If there are non-interrupt CPUs available, use them first */
351 if (!cpumask_empty(non_intr_cpus))
352 cpu = cpumask_first(non_intr_cpus);
353 else /* Otherwise, use interrupt CPUs */
354 cpu = cpumask_first(available_cpus);
355
356 if (cpu >= nr_cpu_ids) { /* empty */
357 cpu = -1;
358 goto fail;
359 }
360 cpumask_set_cpu(cpu, &set->used);
361
362 fail:
363 return cpu;
364 }
365
_dev_comp_vect_cpu_put(struct hfi1_devdata * dd,int cpu)366 static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
367 {
368 struct cpu_mask_set *set = dd->comp_vect;
369
370 if (cpu < 0)
371 return;
372
373 cpu_mask_set_put(set, cpu);
374 }
375
376 /* _dev_comp_vect_mappings_destroy() is reentrant */
_dev_comp_vect_mappings_destroy(struct hfi1_devdata * dd)377 static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
378 {
379 int i, cpu;
380
381 if (!dd->comp_vect_mappings)
382 return;
383
384 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
385 cpu = dd->comp_vect_mappings[i];
386 _dev_comp_vect_cpu_put(dd, cpu);
387 dd->comp_vect_mappings[i] = -1;
388 hfi1_cdbg(AFFINITY,
389 "[%s] Release CPU %d from completion vector %d",
390 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
391 }
392
393 kfree(dd->comp_vect_mappings);
394 dd->comp_vect_mappings = NULL;
395 }
396
397 /*
398 * This function creates the table for looking up CPUs for completion vectors.
399 * num_comp_vectors needs to have been initilized before calling this function.
400 */
_dev_comp_vect_mappings_create(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry)401 static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
402 struct hfi1_affinity_node *entry)
403 __must_hold(&node_affinity.lock)
404 {
405 int i, cpu, ret;
406 cpumask_var_t non_intr_cpus;
407 cpumask_var_t available_cpus;
408
409 lockdep_assert_held(&node_affinity.lock);
410
411 if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
412 return -ENOMEM;
413
414 if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
415 free_cpumask_var(non_intr_cpus);
416 return -ENOMEM;
417 }
418
419 dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
420 sizeof(*dd->comp_vect_mappings),
421 GFP_KERNEL);
422 if (!dd->comp_vect_mappings) {
423 ret = -ENOMEM;
424 goto fail;
425 }
426 for (i = 0; i < dd->comp_vect_possible_cpus; i++)
427 dd->comp_vect_mappings[i] = -1;
428
429 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
430 cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
431 available_cpus);
432 if (cpu < 0) {
433 ret = -EINVAL;
434 goto fail;
435 }
436
437 dd->comp_vect_mappings[i] = cpu;
438 hfi1_cdbg(AFFINITY,
439 "[%s] Completion Vector %d -> CPU %d",
440 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
441 }
442
443 free_cpumask_var(available_cpus);
444 free_cpumask_var(non_intr_cpus);
445 return 0;
446
447 fail:
448 free_cpumask_var(available_cpus);
449 free_cpumask_var(non_intr_cpus);
450 _dev_comp_vect_mappings_destroy(dd);
451
452 return ret;
453 }
454
hfi1_comp_vectors_set_up(struct hfi1_devdata * dd)455 int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
456 {
457 int ret;
458 struct hfi1_affinity_node *entry;
459
460 mutex_lock(&node_affinity.lock);
461 entry = node_affinity_lookup(dd->node);
462 if (!entry) {
463 ret = -EINVAL;
464 goto unlock;
465 }
466 ret = _dev_comp_vect_mappings_create(dd, entry);
467 unlock:
468 mutex_unlock(&node_affinity.lock);
469
470 return ret;
471 }
472
/*
 * Release the completion vector -> CPU table of a device.
 * Thin wrapper around _dev_comp_vect_mappings_destroy(); note that it does
 * not take node_affinity.lock itself.
 */
void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}
477
hfi1_comp_vect_mappings_lookup(struct rvt_dev_info * rdi,int comp_vect)478 int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
479 {
480 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
481 struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
482
483 if (!dd->comp_vect_mappings)
484 return -EINVAL;
485 if (comp_vect >= dd->comp_vect_possible_cpus)
486 return -EINVAL;
487
488 return dd->comp_vect_mappings[comp_vect];
489 }
490
491 /*
492 * It assumes dd->comp_vect_possible_cpus is available.
493 */
_dev_comp_vect_cpu_mask_init(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry,bool first_dev_init)494 static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
495 struct hfi1_affinity_node *entry,
496 bool first_dev_init)
497 __must_hold(&node_affinity.lock)
498 {
499 int i, j, curr_cpu;
500 int possible_cpus_comp_vect = 0;
501 struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;
502
503 lockdep_assert_held(&node_affinity.lock);
504 /*
505 * If there's only one CPU available for completion vectors, then
506 * there will only be one completion vector available. Othewise,
507 * the number of completion vector available will be the number of
508 * available CPUs divide it by the number of devices in the
509 * local NUMA node.
510 */
511 if (cpumask_weight(&entry->comp_vect_mask) == 1) {
512 possible_cpus_comp_vect = 1;
513 dd_dev_warn(dd,
514 "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
515 } else {
516 possible_cpus_comp_vect +=
517 cpumask_weight(&entry->comp_vect_mask) /
518 hfi1_per_node_cntr[dd->node];
519
520 /*
521 * If the completion vector CPUs available doesn't divide
522 * evenly among devices, then the first device device to be
523 * initialized gets an extra CPU.
524 */
525 if (first_dev_init &&
526 cpumask_weight(&entry->comp_vect_mask) %
527 hfi1_per_node_cntr[dd->node] != 0)
528 possible_cpus_comp_vect++;
529 }
530
531 dd->comp_vect_possible_cpus = possible_cpus_comp_vect;
532
533 /* Reserving CPUs for device completion vector */
534 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
535 curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
536 entry->comp_vect_affinity);
537 if (curr_cpu < 0)
538 goto fail;
539
540 cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
541 }
542
543 hfi1_cdbg(AFFINITY,
544 "[%s] Completion vector affinity CPU set(s) %*pbl",
545 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
546 cpumask_pr_args(dev_comp_vect_mask));
547
548 return 0;
549
550 fail:
551 for (j = 0; j < i; j++)
552 per_cpu_affinity_put_max(&entry->comp_vect_mask,
553 entry->comp_vect_affinity);
554
555 return curr_cpu;
556 }
557
558 /*
559 * It assumes dd->comp_vect_possible_cpus is available.
560 */
_dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry)561 static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
562 struct hfi1_affinity_node *entry)
563 __must_hold(&node_affinity.lock)
564 {
565 int i, cpu;
566
567 lockdep_assert_held(&node_affinity.lock);
568 if (!dd->comp_vect_possible_cpus)
569 return;
570
571 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
572 cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
573 entry->comp_vect_affinity);
574 /* Clearing CPU in device completion vector cpu mask */
575 if (cpu >= 0)
576 cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
577 }
578
579 dd->comp_vect_possible_cpus = 0;
580 }
581
/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 * Sets up the affinity state a device needs: looks up (or, for the first
 * device on a NUMA node, creates and populates) the node's affinity entry,
 * reserves the device's completion vector CPUs and stores the entry in
 * dd->affinity_entry.
 *
 * Returns 0 on success, -ENOMEM if a new entry cannot be allocated, or the
 * negative value returned by _dev_comp_vect_cpu_mask_init(). A newly created
 * entry is destroyed again on failure.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	/* Fall back to CPU 0's core mask if the node has no CPUs */
	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 * One CPU is moved per remaining receive queue of
			 * every device on this node; the loop stops early
			 * once the default list runs out of CPUs.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_empty(&entry->def_intr.mask))
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/*
		 * Determine completion vector CPUs for the entire node:
		 * the node's real CPUs minus the receive and general
		 * interrupt CPUs.
		 */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_empty(&entry->comp_vect_mask))
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	/* new_entry doubles as the "first device on this node" flag */
	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	/* Publish a new entry only once it is fully set up */
	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	/* Only an entry created by this call is ours to free */
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}
714
hfi1_dev_affinity_clean_up(struct hfi1_devdata * dd)715 void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
716 {
717 struct hfi1_affinity_node *entry;
718
719 mutex_lock(&node_affinity.lock);
720 if (!dd->affinity_entry)
721 goto unlock;
722 entry = node_affinity_lookup(dd->node);
723 if (!entry)
724 goto unlock;
725
726 /*
727 * Free device completion vector CPUs to be used by future
728 * completion vectors
729 */
730 _dev_comp_vect_cpu_mask_clean_up(dd, entry);
731 unlock:
732 dd->affinity_entry = NULL;
733 mutex_unlock(&node_affinity.lock);
734 }
735
736 /*
737 * Function updates the irq affinity hint for msix after it has been changed
738 * by the user using the /proc/irq interface. This function only accepts
739 * one cpu in the mask.
740 */
hfi1_update_sdma_affinity(struct hfi1_msix_entry * msix,int cpu)741 static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
742 {
743 struct sdma_engine *sde = msix->arg;
744 struct hfi1_devdata *dd = sde->dd;
745 struct hfi1_affinity_node *entry;
746 struct cpu_mask_set *set;
747 int i, old_cpu;
748
749 if (cpu > num_online_cpus() || cpu == sde->cpu)
750 return;
751
752 mutex_lock(&node_affinity.lock);
753 entry = node_affinity_lookup(dd->node);
754 if (!entry)
755 goto unlock;
756
757 old_cpu = sde->cpu;
758 sde->cpu = cpu;
759 cpumask_clear(&msix->mask);
760 cpumask_set_cpu(cpu, &msix->mask);
761 dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
762 msix->irq, irq_type_names[msix->type],
763 sde->this_idx, cpu);
764 irq_set_affinity_hint(msix->irq, &msix->mask);
765
766 /*
767 * Set the new cpu in the hfi1_affinity_node and clean
768 * the old cpu if it is not used by any other IRQ
769 */
770 set = &entry->def_intr;
771 cpumask_set_cpu(cpu, &set->mask);
772 cpumask_set_cpu(cpu, &set->used);
773 for (i = 0; i < dd->msix_info.max_requested; i++) {
774 struct hfi1_msix_entry *other_msix;
775
776 other_msix = &dd->msix_info.msix_entries[i];
777 if (other_msix->type != IRQ_SDMA || other_msix == msix)
778 continue;
779
780 if (cpumask_test_cpu(old_cpu, &other_msix->mask))
781 goto unlock;
782 }
783 cpumask_clear_cpu(old_cpu, &set->mask);
784 cpumask_clear_cpu(old_cpu, &set->used);
785 unlock:
786 mutex_unlock(&node_affinity.lock);
787 }
788
hfi1_irq_notifier_notify(struct irq_affinity_notify * notify,const cpumask_t * mask)789 static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
790 const cpumask_t *mask)
791 {
792 int cpu = cpumask_first(mask);
793 struct hfi1_msix_entry *msix = container_of(notify,
794 struct hfi1_msix_entry,
795 notify);
796
797 /* Only one CPU configuration supported currently */
798 hfi1_update_sdma_affinity(msix, cpu);
799 }
800
/*
 * Release callback installed in the SDMA affinity notifier by
 * hfi1_setup_sdma_notifier(). Intentionally empty.
 */
static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}
808
hfi1_setup_sdma_notifier(struct hfi1_msix_entry * msix)809 static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
810 {
811 struct irq_affinity_notify *notify = &msix->notify;
812
813 notify->irq = msix->irq;
814 notify->notify = hfi1_irq_notifier_notify;
815 notify->release = hfi1_irq_notifier_release;
816
817 if (irq_set_affinity_notifier(notify->irq, notify))
818 pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
819 notify->irq);
820 }
821
hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry * msix)822 static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
823 {
824 struct irq_affinity_notify *notify = &msix->notify;
825
826 if (irq_set_affinity_notifier(notify->irq, NULL))
827 pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
828 notify->irq);
829 }
830
831 /*
832 * Function sets the irq affinity for msix.
833 * It *must* be called with node_affinity.lock held.
834 */
get_irq_affinity(struct hfi1_devdata * dd,struct hfi1_msix_entry * msix)835 static int get_irq_affinity(struct hfi1_devdata *dd,
836 struct hfi1_msix_entry *msix)
837 {
838 cpumask_var_t diff;
839 struct hfi1_affinity_node *entry;
840 struct cpu_mask_set *set = NULL;
841 struct sdma_engine *sde = NULL;
842 struct hfi1_ctxtdata *rcd = NULL;
843 char extra[64];
844 int cpu = -1;
845
846 extra[0] = '\0';
847 cpumask_clear(&msix->mask);
848
849 entry = node_affinity_lookup(dd->node);
850
851 switch (msix->type) {
852 case IRQ_SDMA:
853 sde = (struct sdma_engine *)msix->arg;
854 scnprintf(extra, 64, "engine %u", sde->this_idx);
855 set = &entry->def_intr;
856 break;
857 case IRQ_GENERAL:
858 cpu = cpumask_first(&entry->general_intr_mask);
859 break;
860 case IRQ_RCVCTXT:
861 rcd = (struct hfi1_ctxtdata *)msix->arg;
862 if (rcd->ctxt == HFI1_CTRL_CTXT)
863 cpu = cpumask_first(&entry->general_intr_mask);
864 else
865 set = &entry->rcv_intr;
866 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
867 break;
868 case IRQ_NETDEVCTXT:
869 rcd = (struct hfi1_ctxtdata *)msix->arg;
870 set = &entry->def_intr;
871 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
872 break;
873 default:
874 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
875 return -EINVAL;
876 }
877
878 /*
879 * The general and control contexts are placed on a particular
880 * CPU, which is set above. Skip accounting for it. Everything else
881 * finds its CPU here.
882 */
883 if (cpu == -1 && set) {
884 if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
885 return -ENOMEM;
886
887 cpu = cpu_mask_set_get_first(set, diff);
888 if (cpu < 0) {
889 free_cpumask_var(diff);
890 dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
891 return cpu;
892 }
893
894 free_cpumask_var(diff);
895 }
896
897 cpumask_set_cpu(cpu, &msix->mask);
898 dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
899 msix->irq, irq_type_names[msix->type],
900 extra, cpu);
901 irq_set_affinity_hint(msix->irq, &msix->mask);
902
903 if (msix->type == IRQ_SDMA) {
904 sde->cpu = cpu;
905 hfi1_setup_sdma_notifier(msix);
906 }
907
908 return 0;
909 }
910
/*
 * Locked wrapper around get_irq_affinity(): holds node_affinity.lock for the
 * duration of the call and returns its result unchanged.
 */
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}
920
hfi1_put_irq_affinity(struct hfi1_devdata * dd,struct hfi1_msix_entry * msix)921 void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
922 struct hfi1_msix_entry *msix)
923 {
924 struct cpu_mask_set *set = NULL;
925 struct hfi1_affinity_node *entry;
926
927 mutex_lock(&node_affinity.lock);
928 entry = node_affinity_lookup(dd->node);
929
930 switch (msix->type) {
931 case IRQ_SDMA:
932 set = &entry->def_intr;
933 hfi1_cleanup_sdma_notifier(msix);
934 break;
935 case IRQ_GENERAL:
936 /* Don't do accounting for general contexts */
937 break;
938 case IRQ_RCVCTXT: {
939 struct hfi1_ctxtdata *rcd = msix->arg;
940
941 /* Don't do accounting for control contexts */
942 if (rcd->ctxt != HFI1_CTRL_CTXT)
943 set = &entry->rcv_intr;
944 break;
945 }
946 case IRQ_NETDEVCTXT:
947 set = &entry->def_intr;
948 break;
949 default:
950 mutex_unlock(&node_affinity.lock);
951 return;
952 }
953
954 if (set) {
955 cpumask_andnot(&set->used, &set->used, &msix->mask);
956 _cpu_mask_set_gen_dec(set);
957 }
958
959 irq_set_affinity_hint(msix->irq, NULL);
960 cpumask_clear(&msix->mask);
961 mutex_unlock(&node_affinity.lock);
962 }
963
964 /* This should be called with node_affinity.lock held */
find_hw_thread_mask(uint hw_thread_no,cpumask_var_t hw_thread_mask,struct hfi1_affinity_node_list * affinity)965 static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
966 struct hfi1_affinity_node_list *affinity)
967 {
968 int possible, curr_cpu, i;
969 uint num_cores_per_socket = node_affinity.num_online_cpus /
970 affinity->num_core_siblings /
971 node_affinity.num_online_nodes;
972
973 cpumask_copy(hw_thread_mask, &affinity->proc.mask);
974 if (affinity->num_core_siblings > 0) {
975 /* Removing other siblings not needed for now */
976 possible = cpumask_weight(hw_thread_mask);
977 curr_cpu = cpumask_first(hw_thread_mask);
978 for (i = 0;
979 i < num_cores_per_socket * node_affinity.num_online_nodes;
980 i++)
981 curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
982
983 for (; i < possible; i++) {
984 cpumask_clear_cpu(curr_cpu, hw_thread_mask);
985 curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
986 }
987
988 /* Identifying correct HW threads within physical cores */
989 cpumask_shift_left(hw_thread_mask, hw_thread_mask,
990 num_cores_per_socket *
991 node_affinity.num_online_nodes *
992 hw_thread_no);
993 }
994 }
995
hfi1_get_proc_affinity(int node)996 int hfi1_get_proc_affinity(int node)
997 {
998 int cpu = -1, ret, i;
999 struct hfi1_affinity_node *entry;
1000 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
1001 const struct cpumask *node_mask,
1002 *proc_mask = current->cpus_ptr;
1003 struct hfi1_affinity_node_list *affinity = &node_affinity;
1004 struct cpu_mask_set *set = &affinity->proc;
1005
1006 /*
1007 * check whether process/context affinity has already
1008 * been set
1009 */
1010 if (current->nr_cpus_allowed == 1) {
1011 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
1012 current->pid, current->comm,
1013 cpumask_pr_args(proc_mask));
1014 /*
1015 * Mark the pre-set CPU as used. This is atomic so we don't
1016 * need the lock
1017 */
1018 cpu = cpumask_first(proc_mask);
1019 cpumask_set_cpu(cpu, &set->used);
1020 goto done;
1021 } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
1022 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
1023 current->pid, current->comm,
1024 cpumask_pr_args(proc_mask));
1025 goto done;
1026 }
1027
1028 /*
1029 * The process does not have a preset CPU affinity so find one to
1030 * recommend using the following algorithm:
1031 *
1032 * For each user process that is opening a context on HFI Y:
1033 * a) If all cores are filled, reinitialize the bitmask
1034 * b) Fill real cores first, then HT cores (First set of HT
1035 * cores on all physical cores, then second set of HT core,
1036 * and, so on) in the following order:
1037 *
1038 * 1. Same NUMA node as HFI Y and not running an IRQ
1039 * handler
1040 * 2. Same NUMA node as HFI Y and running an IRQ handler
1041 * 3. Different NUMA node to HFI Y and not running an IRQ
1042 * handler
1043 * 4. Different NUMA node to HFI Y and running an IRQ
1044 * handler
1045 * c) Mark core as filled in the bitmask. As user processes are
1046 * done, clear cores from the bitmask.
1047 */
1048
1049 ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
1050 if (!ret)
1051 goto done;
1052 ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
1053 if (!ret)
1054 goto free_diff;
1055 ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
1056 if (!ret)
1057 goto free_hw_thread_mask;
1058 ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
1059 if (!ret)
1060 goto free_available_mask;
1061
1062 mutex_lock(&affinity->lock);
1063 /*
1064 * If we've used all available HW threads, clear the mask and start
1065 * overloading.
1066 */
1067 _cpu_mask_set_gen_inc(set);
1068
1069 /*
1070 * If NUMA node has CPUs used by interrupt handlers, include them in the
1071 * interrupt handler mask.
1072 */
1073 entry = node_affinity_lookup(node);
1074 if (entry) {
1075 cpumask_copy(intrs_mask, (entry->def_intr.gen ?
1076 &entry->def_intr.mask :
1077 &entry->def_intr.used));
1078 cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
1079 &entry->rcv_intr.mask :
1080 &entry->rcv_intr.used));
1081 cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
1082 }
1083 hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
1084 cpumask_pr_args(intrs_mask));
1085
1086 cpumask_copy(hw_thread_mask, &set->mask);
1087
1088 /*
1089 * If HT cores are enabled, identify which HW threads within the
1090 * physical cores should be used.
1091 */
1092 if (affinity->num_core_siblings > 0) {
1093 for (i = 0; i < affinity->num_core_siblings; i++) {
1094 find_hw_thread_mask(i, hw_thread_mask, affinity);
1095
1096 /*
1097 * If there's at least one available core for this HW
1098 * thread number, stop looking for a core.
1099 *
1100 * diff will always be not empty at least once in this
1101 * loop as the used mask gets reset when
1102 * (set->mask == set->used) before this loop.
1103 */
1104 cpumask_andnot(diff, hw_thread_mask, &set->used);
1105 if (!cpumask_empty(diff))
1106 break;
1107 }
1108 }
1109 hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
1110 cpumask_pr_args(hw_thread_mask));
1111
1112 node_mask = cpumask_of_node(node);
1113 hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
1114 cpumask_pr_args(node_mask));
1115
1116 /* Get cpumask of available CPUs on preferred NUMA */
1117 cpumask_and(available_mask, hw_thread_mask, node_mask);
1118 cpumask_andnot(available_mask, available_mask, &set->used);
1119 hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
1120 cpumask_pr_args(available_mask));
1121
1122 /*
1123 * At first, we don't want to place processes on the same
1124 * CPUs as interrupt handlers. Then, CPUs running interrupt
1125 * handlers are used.
1126 *
1127 * 1) If diff is not empty, then there are CPUs not running
1128 * non-interrupt handlers available, so diff gets copied
1129 * over to available_mask.
1130 * 2) If diff is empty, then all CPUs not running interrupt
1131 * handlers are taken, so available_mask contains all
1132 * available CPUs running interrupt handlers.
1133 * 3) If available_mask is empty, then all CPUs on the
1134 * preferred NUMA node are taken, so other NUMA nodes are
1135 * used for process assignments using the same method as
1136 * the preferred NUMA node.
1137 */
1138 cpumask_andnot(diff, available_mask, intrs_mask);
1139 if (!cpumask_empty(diff))
1140 cpumask_copy(available_mask, diff);
1141
1142 /* If we don't have CPUs on the preferred node, use other NUMA nodes */
1143 if (cpumask_empty(available_mask)) {
1144 cpumask_andnot(available_mask, hw_thread_mask, &set->used);
1145 /* Excluding preferred NUMA cores */
1146 cpumask_andnot(available_mask, available_mask, node_mask);
1147 hfi1_cdbg(PROC,
1148 "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
1149 cpumask_pr_args(available_mask));
1150
1151 /*
1152 * At first, we don't want to place processes on the same
1153 * CPUs as interrupt handlers.
1154 */
1155 cpumask_andnot(diff, available_mask, intrs_mask);
1156 if (!cpumask_empty(diff))
1157 cpumask_copy(available_mask, diff);
1158 }
1159 hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
1160 cpumask_pr_args(available_mask));
1161
1162 cpu = cpumask_first(available_mask);
1163 if (cpu >= nr_cpu_ids) /* empty */
1164 cpu = -1;
1165 else
1166 cpumask_set_cpu(cpu, &set->used);
1167
1168 mutex_unlock(&affinity->lock);
1169 hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
1170
1171 free_cpumask_var(intrs_mask);
1172 free_available_mask:
1173 free_cpumask_var(available_mask);
1174 free_hw_thread_mask:
1175 free_cpumask_var(hw_thread_mask);
1176 free_diff:
1177 free_cpumask_var(diff);
1178 done:
1179 return cpu;
1180 }
1181
hfi1_put_proc_affinity(int cpu)1182 void hfi1_put_proc_affinity(int cpu)
1183 {
1184 struct hfi1_affinity_node_list *affinity = &node_affinity;
1185 struct cpu_mask_set *set = &affinity->proc;
1186
1187 if (cpu < 0)
1188 return;
1189
1190 mutex_lock(&affinity->lock);
1191 cpu_mask_set_put(set, cpu);
1192 hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
1193 mutex_unlock(&affinity->lock);
1194 }
1195