1 /*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 *
11 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
12 */
13
14 #include <xen/init.h>
15 #include <xen/mm.h>
16 #include <xen/inttypes.h>
17 #include <xen/nodemask.h>
18 #include <xen/acpi.h>
19 #include <xen/numa.h>
20 #include <xen/pfn.h>
21 #include <asm/e820.h>
22 #include <asm/page.h>
23
/* Private copy of the SLIT made at boot; NULL if the table was absent
 * or failed validation (see acpi_numa_slit_init()). */
static struct acpi_table_slit *__read_mostly acpi_slit;

/* Nodes discovered while parsing SRAT memory / processor entries. */
static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
/* Covering [start, end) range per node, grown as memory entries arrive. */
static struct node nodes[MAX_NUMNODES] __initdata;

/* One proximity-domain -> node mapping; .node == NUMA_NO_NODE marks a
 * free slot (see setup_node()). */
struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };

static unsigned node_to_pxm(nodeid_t n);

/* SRAT memory blocks in parse order, with their owning node ids. */
static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
/* Bit i set: node_memblk_range[i] is a hot-pluggable range. */
static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);
43
node_found(unsigned idx,unsigned pxm)44 static inline bool node_found(unsigned idx, unsigned pxm)
45 {
46 return ((pxm2node[idx].pxm == pxm) &&
47 (pxm2node[idx].node != NUMA_NO_NODE));
48 }
49
pxm_to_node(unsigned pxm)50 nodeid_t pxm_to_node(unsigned pxm)
51 {
52 unsigned i;
53
54 if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
55 return pxm2node[pxm].node;
56
57 for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
58 if (node_found(i, pxm))
59 return pxm2node[i].node;
60
61 return NUMA_NO_NODE;
62 }
63
setup_node(unsigned pxm)64 nodeid_t setup_node(unsigned pxm)
65 {
66 nodeid_t node;
67 unsigned idx;
68 static bool warned;
69 static unsigned nodes_found;
70
71 BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);
72
73 if (pxm < ARRAY_SIZE(pxm2node)) {
74 if (node_found(pxm, pxm))
75 return pxm2node[pxm].node;
76
77 /* Try to maintain indexing of pxm2node by pxm */
78 if (pxm2node[pxm].node == NUMA_NO_NODE) {
79 idx = pxm;
80 goto finish;
81 }
82 }
83
84 for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
85 if (pxm2node[idx].node == NUMA_NO_NODE)
86 goto finish;
87
88 if (!warned) {
89 printk(KERN_WARNING "SRAT: Too many proximity domains (%#x)\n",
90 pxm);
91 warned = true;
92 }
93
94 return NUMA_NO_NODE;
95
96 finish:
97 node = nodes_found++;
98 if (node >= MAX_NUMNODES)
99 return NUMA_NO_NODE;
100 pxm2node[idx].pxm = pxm;
101 pxm2node[idx].node = node;
102
103 return node;
104 }
105
valid_numa_range(u64 start,u64 end,nodeid_t node)106 int valid_numa_range(u64 start, u64 end, nodeid_t node)
107 {
108 int i;
109
110 for (i = 0; i < num_node_memblks; i++) {
111 struct node *nd = &node_memblk_range[i];
112
113 if (nd->start <= start && nd->end > end &&
114 memblk_nodeid[i] == node )
115 return 1;
116 }
117
118 return 0;
119 }
120
conflicting_memblks(u64 start,u64 end)121 static __init int conflicting_memblks(u64 start, u64 end)
122 {
123 int i;
124
125 for (i = 0; i < num_node_memblks; i++) {
126 struct node *nd = &node_memblk_range[i];
127 if (nd->start == nd->end)
128 continue;
129 if (nd->end > start && nd->start < end)
130 return i;
131 if (nd->end == end && nd->start == start)
132 return i;
133 }
134 return -1;
135 }
136
cutoff_node(int i,u64 start,u64 end)137 static __init void cutoff_node(int i, u64 start, u64 end)
138 {
139 struct node *nd = &nodes[i];
140 if (nd->start < start) {
141 nd->start = start;
142 if (nd->end < nd->start)
143 nd->start = nd->end;
144 }
145 if (nd->end > end) {
146 nd->end = end;
147 if (nd->start > nd->end)
148 nd->start = nd->end;
149 }
150 }
151
bad_srat(void)152 static __init void bad_srat(void)
153 {
154 int i;
155 printk(KERN_ERR "SRAT: SRAT not used.\n");
156 acpi_numa = -1;
157 for (i = 0; i < MAX_LOCAL_APIC; i++)
158 apicid_to_node[i] = NUMA_NO_NODE;
159 for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
160 pxm2node[i].node = NUMA_NO_NODE;
161 mem_hotplug = 0;
162 }
163
164 /*
165 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
166 * up the NUMA heuristics which wants the local node to have a smaller
167 * distance than the others.
168 * Do some quick checks here and only use the SLIT if it passes.
169 */
slit_valid(struct acpi_table_slit * slit)170 static __init int slit_valid(struct acpi_table_slit *slit)
171 {
172 int i, j;
173 int d = slit->locality_count;
174 for (i = 0; i < d; i++) {
175 for (j = 0; j < d; j++) {
176 u8 val = slit->entry[d*i + j];
177 if (i == j) {
178 if (val != 10)
179 return 0;
180 } else if (val <= 10)
181 return 0;
182 }
183 }
184 return 1;
185 }
186
187 /* Callback for SLIT parsing */
acpi_numa_slit_init(struct acpi_table_slit * slit)188 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
189 {
190 mfn_t mfn;
191
192 if (!slit_valid(slit)) {
193 printk(KERN_INFO "ACPI: SLIT table looks invalid. "
194 "Not used.\n");
195 return;
196 }
197 mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
198 acpi_slit = mfn_to_virt(mfn_x(mfn));
199 memcpy(acpi_slit, slit, slit->header.length);
200 }
201
/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	/* Malformed entry: the whole SRAT is suspect. */
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	/* Disabled entries are legal and simply skipped. */
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	/* APIC id out of table range: ignore just this entry. */
	if (pa->apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
		return;
	}

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}

	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	/* At least one usable affinity entry seen. */
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
	       pxm, pa->apic_id, node);
}
235
/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	/* Malformed entry: the whole SRAT is suspect. */
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	/* Disabled entries are legal and simply skipped. */
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	pxm = pa->proximity_domain_lo;
	/* SRAT rev >= 2 widens the PXM to 32 bits via the hi[] bytes. */
	if (srat_rev >= 2) {
		pxm |= pa->proximity_domain_hi[0] << 8;
		pxm |= pa->proximity_domain_hi[1] << 16;
		pxm |= pa->proximity_domain_hi[2] << 24;
	}
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	/* At least one usable affinity entry seen. */
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
	       pxm, pa->apic_id, node);
}
268
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
{
	u64 start, end;
	unsigned pxm;
	nodeid_t node;
	int i;

	if (srat_disabled())
		return;
	/* Malformed entry: the whole SRAT is suspect. */
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	/* Disabled entries are legal and simply skipped. */
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	/* Fixed-size tracking arrays; overflow invalidates the SRAT. */
	if (num_node_memblks >= NR_NODE_MEMBLKS)
	{
		dprintk(XENLOG_WARNING,
		        "Too many numa entry, try bigger NR_NODE_MEMBLKS \n");
		bad_srat();
		return;
	}

	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	/* SRAT rev < 2 defines only the low 8 bits of the PXM. */
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	/* It is fine to add this area to the nodes data it will be used later*/
	i = conflicting_memblks(start, end);
	if (i < 0)
		/* everything fine */;
	else if (memblk_nodeid[i] == node) {
		/*
		 * Overlap within the same node: fatal only if the two
		 * ranges disagree on hot-pluggability.
		 */
		bool mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
		                !test_bit(i, memblk_hotplug);

		printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
		       mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
		       node_memblk_range[i].start, node_memblk_range[i].end);
		if (mismatch) {
			bad_srat();
			return;
		}
	} else {
		/* Overlap between different nodes is always fatal. */
		printk(KERN_ERR
		       "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
		       pxm, start, end, node_to_pxm(memblk_nodeid[i]),
		       node_memblk_range[i].start, node_memblk_range[i].end);
		bad_srat();
		return;
	}
	/* Only non-hotplug ranges grow the node's covering span. */
	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
		struct node *nd = &nodes[node];

		if (!node_test_and_set(node, memory_nodes_parsed)) {
			/* First range for this node: adopt it verbatim. */
			nd->start = start;
			nd->end = end;
		} else {
			/* Extend the span; holes in between are included. */
			if (start < nd->start)
				nd->start = start;
			if (nd->end < end)
				nd->end = end;
		}
	}
	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
	       node, pxm, start, end,
	       ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");

	/* Record the block itself for later hash-shift computation. */
	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
		__set_bit(num_node_memblks, memblk_hotplug);
		/* Track the highest hotplug end address seen. */
		if (end > mem_hotplug)
			mem_hotplug = end;
	}
	num_node_memblks++;
}
355
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		int j, found;
		unsigned long long start, end;

		/* Only RAM ranges need a proximity domain. */
		if (e820.map[i].type != E820_RAM) {
			continue;
		}

		start = e820.map[i].addr;
		end = e820.map[i].addr + e820.map[i].size - 1;

		/*
		 * Repeatedly trim the range by any node covering either of
		 * its ends, until nothing remains or no node makes
		 * progress.  Handles ranges spanning multiple nodes.
		 */
		do {
			found = 0;
			for_each_node_mask(j, memory_nodes_parsed)
				if (start < nodes[j].end
				    && end > nodes[j].start) {
					if (start >= nodes[j].start) {
						start = nodes[j].end;
						found = 1;
					}
					if (end <= nodes[j].end) {
						end = nodes[j].start;
						found = 1;
					}
				}
		} while (found && start < end);

		/* Anything left over is RAM no PXM claimed. */
		if (start < end) {
			printk(KERN_ERR "SRAT: No PXM for e820 range: "
				"%016Lx - %016Lx\n", start, end);
			return 0;
		}
	}
	return 1;
}
397
acpi_numa_arch_fixup(void)398 void __init acpi_numa_arch_fixup(void) {}
399
/* Accumulated address bits of all SRAT memory ranges, for PDX setup. */
static u64 __initdata srat_region_mask;
401
srat_parse_region(struct acpi_subtable_header * header,const unsigned long end)402 static int __init srat_parse_region(struct acpi_subtable_header *header,
403 const unsigned long end)
404 {
405 struct acpi_srat_mem_affinity *ma;
406
407 if (!header)
408 return -EINVAL;
409
410 ma = container_of(header, struct acpi_srat_mem_affinity, header);
411
412 if (!ma->length ||
413 !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
414 (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
415 return 0;
416
417 if (numa_off)
418 printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
419 ma->base_address, ma->base_address + ma->length - 1);
420
421 srat_region_mask |= ma->base_address |
422 pdx_region_mask(ma->base_address, ma->length);
423
424 return 0;
425 }
426
srat_parse_regions(u64 addr)427 void __init srat_parse_regions(u64 addr)
428 {
429 u64 mask;
430 unsigned int i;
431
432 if (acpi_disabled || acpi_numa < 0 ||
433 acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
434 return;
435
436 srat_region_mask = pdx_init_mask(addr);
437 acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
438 srat_parse_region, 0);
439
440 for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) {
441 if (e820.map[i].type != E820_RAM)
442 continue;
443
444 if (~mask & pdx_region_mask(e820.map[i].addr, e820.map[i].size))
445 mask = 0;
446 }
447
448 pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
449 }
450
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(u64 start, u64 end)
{
	int i;
	nodemask_t all_nodes_parsed;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++)
		cutoff_node(i, start, end);

	/* No usable SRAT data was parsed. */
	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);

	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed);

	/* Finally register nodes */
	for_each_node_mask(i, all_nodes_parsed)
	{
		u64 size = nodes[i].end - nodes[i].start;

		/* CPU-only nodes end up with an empty memory range. */
		if ( size == 0 )
			printk(KERN_WARNING "SRAT: Node %u has no memory. "
			       "BIOS Bug or mis-configured hardware?\n", i);

		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	/* Unlink CPUs mapped to nodes the SRAT never named. */
	for (i = 0; i < nr_cpu_ids; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], processor_nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}
500
node_to_pxm(nodeid_t n)501 static unsigned node_to_pxm(nodeid_t n)
502 {
503 unsigned i;
504
505 if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
506 return pxm2node[n].pxm;
507 for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
508 if (pxm2node[i].node == n)
509 return pxm2node[i].pxm;
510 return 0;
511 }
512
__node_distance(nodeid_t a,nodeid_t b)513 u8 __node_distance(nodeid_t a, nodeid_t b)
514 {
515 unsigned index;
516 u8 slit_val;
517
518 if (!acpi_slit)
519 return a == b ? 10 : 20;
520 index = acpi_slit->locality_count * node_to_pxm(a);
521 slit_val = acpi_slit->entry[index + node_to_pxm(b)];
522
523 /* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
524 if ((slit_val == 0xff) || (slit_val <= 9))
525 return NUMA_NO_DISTANCE;
526 else
527 return slit_val;
528 }
529
530 EXPORT_SYMBOL(__node_distance);
531