/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 *
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/init.h>
#include <xen/mm.h>
#include <xen/inttypes.h>
#include <xen/nodemask.h>
#include <xen/acpi.h>
#include <xen/numa.h>
#include <xen/pfn.h>
#include <asm/e820.h>
#include <asm/page.h>

static struct acpi_table_slit *__read_mostly acpi_slit;

static nodemask_t memory_nodes_parsed __initdata;
static nodemask_t processor_nodes_parsed __initdata;
static struct node nodes[MAX_NUMNODES] __initdata;

struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };

static unsigned node_to_pxm(nodeid_t n);

static int num_node_memblks;
static struct node node_memblk_range[NR_NODE_MEMBLKS];
static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);

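/*
 * Does pxm2node[idx] hold a valid mapping for the given proximity
 * domain?
 */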
static inline bool node_found(unsigned idx, unsigned pxm)
{
	return ((pxm2node[idx].pxm == pxm) &&
		(pxm2node[idx].node != NUMA_NO_NODE));
}

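/*
 * Look up the node ID for a proximity domain.  The common case is that
 * pxm2node[] is indexed by PXM directly; fall back to a linear scan for
 * entries that were placed out of order.  Returns NUMA_NO_NODE for an
 * unknown PXM.
 */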
nodeid_t pxm_to_node(unsigned pxm)
{
	unsigned i;

	if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
		return pxm2node[pxm].node;

	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (node_found(i, pxm))
			return pxm2node[i].node;

	return NUMA_NO_NODE;
}

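/*
 * Return the node ID already assigned to a proximity domain, or allocate
 * the next free one.  Tries to keep pxm2node[] indexed by PXM so that
 * lookups stay O(1).  Returns NUMA_NO_NODE once all table slots or node
 * IDs are exhausted.
 */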
nodeid_t setup_node(unsigned pxm)
{
	nodeid_t node;
	unsigned idx;
	static bool warned;
	static unsigned nodes_found;

	BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);

	if (pxm < ARRAY_SIZE(pxm2node)) {
		if (node_found(pxm, pxm))
			return pxm2node[pxm].node;

		/* Try to maintain indexing of pxm2node by pxm */
		if (pxm2node[pxm].node == NUMA_NO_NODE) {
			idx = pxm;
			goto finish;
		}
	}

	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
		if (pxm2node[idx].node == NUMA_NO_NODE)
			goto finish;

	if (!warned) {
		printk(KERN_WARNING "SRAT: Too many proximity domains (%#x)\n",
		       pxm);
		warned = true;
	}

	return NUMA_NO_NODE;

 finish:
	node = nodes_found++;
	if (node >= MAX_NUMNODES)
		return NUMA_NO_NODE;
	pxm2node[idx].pxm = pxm;
	pxm2node[idx].node = node;

	return node;
}

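/*
 * Check that the range [start, end) is covered by a single parsed SRAT
 * memory block belonging to the given node.
 */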
int valid_numa_range(u64 start, u64 end, nodeid_t node)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];

		if (nd->start <= start && nd->end > end &&
		    memblk_nodeid[i] == node)
			return 1;
	}

	return 0;
}

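/*
 * Return the index of a previously parsed memory block overlapping
 * [start, end), or -1 if there is none.  Zero-length blocks are skipped.
 */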
static __init int conflicting_memblks(u64 start, u64 end)
{
	int i;

	for (i = 0; i < num_node_memblks; i++) {
		struct node *nd = &node_memblk_range[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

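/* Clamp node i's range to [start, end), collapsing it if it becomes empty. */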
static __init void cutoff_node(int i, u64 start, u64 end)
{
	struct node *nd = &nodes[i];
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

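/*
 * Give up on ACPI NUMA: mark the SRAT as unusable and discard any state
 * accumulated so far.
 */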
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		pxm2node[i].node = NUMA_NO_NODE;
	mem_hotplug = 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	mfn_t mfn;

	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. "
		       "Not used.\n");
		return;
	}
	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
	acpi_slit = mfn_to_virt(mfn_x(mfn));
	memcpy(acpi_slit, slit, slit->header.length);
}

/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	if (pa->apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
		return;
	}

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}

	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	pxm = pa->proximity_domain_lo;
	if (srat_rev >= 2) {
		pxm |= pa->proximity_domain_hi[0] << 8;
		pxm |= pa->proximity_domain_hi[1] << 16;
		pxm |= pa->proximity_domain_hi[2] << 24;
	}
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	node_set(node, processor_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
	       pxm, pa->apic_id, node);
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
{
	u64 start, end;
	unsigned pxm;
	nodeid_t node;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	if (num_node_memblks >= NR_NODE_MEMBLKS) {
		dprintk(XENLOG_WARNING,
			"Too many NUMA memory entries; raise NR_NODE_MEMBLKS\n");
		bad_srat();
		return;
	}

	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		bad_srat();
		return;
	}
	/* It is fine to add this area to the node's data; it will be used later. */
	i = conflicting_memblks(start, end);
	if (i < 0)
		/* everything fine */;
	else if (memblk_nodeid[i] == node) {
		bool mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
		                !test_bit(i, memblk_hotplug);

		printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with itself (%"PRIx64"-%"PRIx64")\n",
		       mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
		       node_memblk_range[i].start, node_memblk_range[i].end);
		if (mismatch) {
			bad_srat();
			return;
		}
	} else {
		printk(KERN_ERR
		       "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with PXM %u (%"PRIx64"-%"PRIx64")\n",
		       pxm, start, end, node_to_pxm(memblk_nodeid[i]),
		       node_memblk_range[i].start, node_memblk_range[i].end);
		bad_srat();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
		struct node *nd = &nodes[node];

		if (!node_test_and_set(node, memory_nodes_parsed)) {
			nd->start = start;
			nd->end = end;
		} else {
			if (start < nd->start)
				nd->start = start;
			if (nd->end < end)
				nd->end = end;
		}
	}
	printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n",
	       node, pxm, start, end,
	       ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? " (hotplug)" : "");

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
		__set_bit(num_node_memblks, memblk_hotplug);
		if (end > mem_hotplug)
			mem_hotplug = end;
	}
	num_node_memblks++;
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		int j, found;
		unsigned long long start, end;

		if (e820.map[i].type != E820_RAM) {
			continue;
		}

		start = e820.map[i].addr;
		end = e820.map[i].addr + e820.map[i].size - 1;

		do {
			found = 0;
			for_each_node_mask(j, memory_nodes_parsed)
				if (start < nodes[j].end
				    && end > nodes[j].start) {
					if (start >= nodes[j].start) {
						start = nodes[j].end;
						found = 1;
					}
					if (end <= nodes[j].end) {
						end = nodes[j].start;
						found = 1;
					}
				}
		} while (found && start < end);

		if (start < end) {
			printk(KERN_ERR "SRAT: No PXM for e820 range: "
				"%016Lx - %016Lx\n", start, end);
			return 0;
		}
	}
	return 1;
}

void __init acpi_numa_arch_fixup(void) {}

static u64 __initdata srat_region_mask;

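/*
 * Accumulate the address bits used by each enabled, volatile SRAT memory
 * region into srat_region_mask, as input for setting up the PFN<->PDX
 * translation below.
 */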
static int __init srat_parse_region(struct acpi_subtable_header *header,
				    const unsigned long end)
{
	struct acpi_srat_mem_affinity *ma;

	if (!header)
		return -EINVAL;

	ma = container_of(header, struct acpi_srat_mem_affinity, header);

	if (!ma->length ||
	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
		return 0;

	if (numa_off)
		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
		       ma->base_address, ma->base_address + ma->length - 1);

	srat_region_mask |= ma->base_address |
			    pdx_region_mask(ma->base_address, ma->length);

	return 0;
}

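/*
 * Work out which address bits are actually used by RAM, according to
 * both the SRAT and the e820 map, and hand the result to
 * pfn_pdx_hole_setup() so unused ranges can be compressed out of the
 * PDX space.  If any e820 RAM falls outside the SRAT-derived mask, the
 * mask is zeroed and no compression takes place.
 */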
void __init srat_parse_regions(u64 addr)
{
	u64 mask;
	unsigned int i;

	if (acpi_disabled || acpi_numa < 0 ||
	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
		return;

	srat_region_mask = pdx_init_mask(addr);
	acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
			      srat_parse_region, 0);

	for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)
			continue;

		if (~mask & pdx_region_mask(e820.map[i].addr, e820.map[i].size))
			mask = 0;
	}

	pfn_pdx_hole_setup(mask >> PAGE_SHIFT);
}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(u64 start, u64 end)
{
	int i;
	nodemask_t all_nodes_parsed;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++)
		cutoff_node(i, start, end);

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);

	if (memnode_shift < 0) {
		printk(KERN_ERR
		       "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed);

	/* Finally register nodes */
	for_each_node_mask(i, all_nodes_parsed)
	{
		u64 size = nodes[i].end - nodes[i].start;
		if (size == 0)
			printk(KERN_WARNING "SRAT: Node %u has no memory. "
			       "BIOS Bug or mis-configured hardware?\n", i);

		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	for (i = 0; i < nr_cpu_ids; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], processor_nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

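/* Reverse mapping: find the proximity domain for a node ID (0 if unknown). */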
static unsigned node_to_pxm(nodeid_t n)
{
	unsigned i;

	if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
		return pxm2node[n].pxm;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (pxm2node[i].node == n)
			return pxm2node[i].pxm;
	return 0;
}

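/*
 * SLIT distance between two nodes.  ACPI normalizes entries so that the
 * local distance is 10; without a SLIT we report the conventional 10
 * (local) or 20 (remote).
 */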
u8 __node_distance(nodeid_t a, nodeid_t b)
{
	unsigned index;
	u8 slit_val;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->locality_count * node_to_pxm(a);
	slit_val = acpi_slit->entry[index + node_to_pxm(b)];

	/* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
	if ((slit_val == 0xff) || (slit_val <= 9))
		return NUMA_NO_DISTANCE;
	else
		return slit_val;
}

EXPORT_SYMBOL(__node_distance);