1 /*
2  * ACPI 3.0 based NUMA setup
3  * Copyright 2004 Andi Kleen, SuSE Labs.
4  *
5  * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6  *
7  * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8  * Assumes all memory regions belonging to a single proximity domain
9  * are in one chunk. Holes between them will be included in the node.
10  *
11  * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
12  */
13 
14 #include <xen/init.h>
15 #include <xen/mm.h>
16 #include <xen/inttypes.h>
17 #include <xen/nodemask.h>
18 #include <xen/acpi.h>
19 #include <xen/numa.h>
20 #include <xen/pfn.h>
21 #include <asm/e820.h>
22 #include <asm/page.h>
23 #include <asm/spec_ctrl.h>
24 
/* Private copy of the firmware SLIT, mapped at boot; NULL if absent/invalid. */
static struct acpi_table_slit *__read_mostly acpi_slit;

/*
 * Proximity-domain (PXM) to Xen node ID mapping. Where possible the table
 * is indexed by PXM (see setup_node()); unused slots carry NUMA_NO_NODE.
 */
struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };
33 
node_found(unsigned idx,unsigned pxm)34 static inline bool node_found(unsigned idx, unsigned pxm)
35 {
36 	return ((pxm2node[idx].pxm == pxm) &&
37 		(pxm2node[idx].node != NUMA_NO_NODE));
38 }
39 
pxm_to_node(unsigned pxm)40 nodeid_t pxm_to_node(unsigned pxm)
41 {
42 	unsigned i;
43 
44 	if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
45 		return pxm2node[pxm].node;
46 
47 	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
48 		if (node_found(i, pxm))
49 			return pxm2node[i].node;
50 
51 	return NUMA_NO_NODE;
52 }
53 
setup_node(unsigned pxm)54 nodeid_t setup_node(unsigned pxm)
55 {
56 	nodeid_t node;
57 	unsigned idx;
58 	static unsigned nodes_found;
59 
60 	BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);
61 
62 	if (pxm < ARRAY_SIZE(pxm2node)) {
63 		if (node_found(pxm, pxm))
64 			return pxm2node[pxm].node;
65 
66 		/* Try to maintain indexing of pxm2node by pxm */
67 		if (pxm2node[pxm].node == NUMA_NO_NODE) {
68 			idx = pxm;
69 			goto finish;
70 		}
71 	}
72 
73 	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
74 		if (pxm2node[idx].node == NUMA_NO_NODE)
75 			goto finish;
76 
77 	printk_once(XENLOG_WARNING "SRAT: Too many proximity domains (%#x)\n",
78 		    pxm);
79 
80 	return NUMA_NO_NODE;
81 
82  finish:
83 	node = nodes_found++;
84 	if (node >= MAX_NUMNODES)
85 		return NUMA_NO_NODE;
86 	pxm2node[idx].pxm = pxm;
87 	pxm2node[idx].node = node;
88 
89 	return node;
90 }
91 
numa_fw_bad(void)92 void __init numa_fw_bad(void)
93 {
94 	int i;
95 	printk(KERN_ERR "SRAT: SRAT not used.\n");
96 	acpi_numa = -1;
97 	for (i = 0; i < MAX_LOCAL_APIC; i++)
98 		apicid_to_node[i] = NUMA_NO_NODE;
99 	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
100 		pxm2node[i].node = NUMA_NO_NODE;
101 	mem_hotplug = 0;
102 }
103 
104 /*
105  * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
106  * up the NUMA heuristics which wants the local node to have a smaller
107  * distance than the others.
108  * Do some quick checks here and only use the SLIT if it passes.
109  */
slit_valid(struct acpi_table_slit * slit)110 static __init int slit_valid(struct acpi_table_slit *slit)
111 {
112 	int i, j;
113 	int d = slit->locality_count;
114 	for (i = 0; i < d; i++) {
115 		for (j = 0; j < d; j++)  {
116 			u8 val = slit->entry[d*i + j];
117 			if (i == j) {
118 				if (val != 10)
119 					return 0;
120 			} else if (val <= 10)
121 				return 0;
122 		}
123 	}
124 	return 1;
125 }
126 
127 /* Callback for SLIT parsing */
acpi_numa_slit_init(struct acpi_table_slit * slit)128 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
129 {
130 	mfn_t mfn;
131 
132 	if (!slit_valid(slit)) {
133 		printk(KERN_INFO "ACPI: SLIT table looks invalid. "
134 		       "Not used.\n");
135 		return;
136 	}
137 	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
138 	acpi_slit = vmap_contig(mfn, PFN_UP(slit->header.length));
139 	if ( !acpi_slit )
140 		panic("Unable to map the ACPI SLIT. Retry with numa=off");
141 	memcpy(acpi_slit, slit, slit->header.length);
142 }
143 
144 /* Callback for Proximity Domain -> x2APIC mapping */
145 void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity * pa)146 acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
147 {
148 	unsigned pxm;
149 	nodeid_t node;
150 
151 	if (numa_disabled())
152 		return;
153 	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
154 		numa_fw_bad();
155 		return;
156 	}
157 	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
158 		return;
159 	if (pa->apic_id >= MAX_LOCAL_APIC) {
160 		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
161 		return;
162 	}
163 
164 	pxm = pa->proximity_domain;
165 	node = setup_node(pxm);
166 	if (node == NUMA_NO_NODE) {
167 		numa_fw_bad();
168 		return;
169 	}
170 
171 	apicid_to_node[pa->apic_id] = node;
172 	numa_set_processor_nodes_parsed(node);
173 	acpi_numa = 1;
174 
175 	if (opt_acpi_verbose)
176 		printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
177 		       pxm, pa->apic_id, node);
178 }
179 
180 /* Callback for Proximity Domain -> LAPIC mapping */
181 void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity * pa)182 acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
183 {
184 	unsigned pxm;
185 	nodeid_t node;
186 
187 	if (numa_disabled())
188 		return;
189 	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
190 		numa_fw_bad();
191 		return;
192 	}
193 	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
194 		return;
195 	pxm = pa->proximity_domain_lo;
196 	if (srat_rev >= 2) {
197 		pxm |= pa->proximity_domain_hi[0] << 8;
198 		pxm |= pa->proximity_domain_hi[1] << 16;
199 		pxm |= pa->proximity_domain_hi[2] << 24;
200 	}
201 	node = setup_node(pxm);
202 	if (node == NUMA_NO_NODE) {
203 		numa_fw_bad();
204 		return;
205 	}
206 	apicid_to_node[pa->apic_id] = node;
207 	numa_set_processor_nodes_parsed(node);
208 	acpi_numa = 1;
209 
210 	if (opt_acpi_verbose)
211 		printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
212 		       pxm, pa->apic_id, node);
213 }
214 
215 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
216 void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity * ma)217 acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
218 {
219 	unsigned pxm;
220 	nodeid_t node;
221 
222 	if (numa_disabled())
223 		return;
224 	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
225 		numa_fw_bad();
226 		return;
227 	}
228 	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
229 		return;
230 
231 	/* Supplement the heuristics in l1tf_calculations(). */
232 	l1tf_safe_maddr = max(l1tf_safe_maddr,
233 			      ROUNDUP(ma->base_address + ma->length,
234 				      PAGE_SIZE));
235 
236 	if (!numa_memblks_available()) {
237 		dprintk(XENLOG_WARNING,
238 			"Too many numa entries, try bigger NR_NODE_MEMBLKS!\n");
239 		numa_fw_bad();
240 		return;
241 	}
242 
243 	pxm = ma->proximity_domain;
244 	if (srat_rev < 2)
245 		pxm &= 0xff;
246 	node = setup_node(pxm);
247 	if (node == NUMA_NO_NODE) {
248 		numa_fw_bad();
249 		return;
250 	}
251 
252 	/*
253 	 * In an extremely unlikely case, srat_parse_regions might not
254 	 * be called. So set the variable here just in case.
255 	 */
256 	numa_fw_nid_name = "PXM";
257 	if (!numa_update_node_memblks(node, pxm, ma->base_address, ma->length,
258 				      ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE))
259 		numa_fw_bad();
260 }
261 
acpi_numa_arch_fixup(void)262 void __init acpi_numa_arch_fixup(void) {}
263 
srat_parse_region(struct acpi_subtable_header * header,const unsigned long end)264 static int __init cf_check srat_parse_region(
265     struct acpi_subtable_header *header, const unsigned long end)
266 {
267 	struct acpi_srat_mem_affinity *ma;
268 
269 	if (!header)
270 		return -EINVAL;
271 
272 	ma = container_of(header, struct acpi_srat_mem_affinity, header);
273 
274 	if (!ma->length ||
275 	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
276 	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
277 		return 0;
278 
279 	if (numa_off)
280 		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
281 		       ma->base_address, ma->base_address + ma->length - 1);
282 
283 	pfn_pdx_add_region(ma->base_address, ma->length);
284 
285 	return 0;
286 }
287 
srat_parse_regions(paddr_t addr)288 void __init srat_parse_regions(paddr_t addr)
289 {
290 	unsigned int i;
291 
292 	if (acpi_disabled || acpi_numa < 0 ||
293 	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
294 		return;
295 
296 	/* Set "PXM" as early as feasible. */
297 	numa_fw_nid_name = "PXM";
298 	acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
299 			      srat_parse_region, 0);
300 
301 	if (!pfn_pdx_compression_setup(addr))
302 		return;
303 
304 	/* Ensure all RAM ranges in the e820 are covered. */
305 	for (i = 0; i < e820.nr_map; i++) {
306 		if (e820.map[i].type != E820_RAM)
307 			continue;
308 
309 		if (!pdx_is_region_compressible(
310 		          e820.map[i].addr,
311 		          PFN_UP(e820.map[i].addr + e820.map[i].size) -
312 		          PFN_DOWN(e820.map[i].addr))) {
313 			pfn_pdx_compression_reset();
314 			printk(XENLOG_WARNING
315 			       "PFN compression disabled, RAM region [%#" PRIx64
316 			       ", %#" PRIx64 "] not covered\n",
317 			       e820.map[i].addr,
318 			       e820.map[i].addr + e820.map[i].size - 1);
319 			return;
320 		}
321 	}
322 
323 	/* If we got this far compression is working as expected. */
324 	setup_force_cpu_cap(X86_FEATURE_PDX_COMPRESSION);
325 }
326 
/*
 * Reverse mapping: Xen node ID -> firmware proximity domain.
 * Falls back to PXM 0 when the node is unknown.
 */
unsigned int numa_node_to_arch_nid(nodeid_t n)
{
	unsigned int idx;

	/* Fast path: slot n usually maps node n (PXM-indexed table). */
	if (n < ARRAY_SIZE(pxm2node) && pxm2node[n].node == n)
		return pxm2node[n].pxm;

	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++) {
		if (pxm2node[idx].node == n)
			return pxm2node[idx].pxm;
	}

	return 0;
}
338 
__node_distance(nodeid_t a,nodeid_t b)339 u8 __node_distance(nodeid_t a, nodeid_t b)
340 {
341 	unsigned index;
342 	u8 slit_val;
343 
344 	if (!acpi_slit)
345 		return a == b ? 10 : 20;
346 	index = acpi_slit->locality_count * numa_node_to_arch_nid(a);
347 	slit_val = acpi_slit->entry[index + numa_node_to_arch_nid(b)];
348 
349 	/* ACPI defines 0xff as an unreachable node and 0-9 are undefined */
350 	if ((slit_val == 0xff) || (slit_val <= 9))
351 		return NUMA_NO_DISTANCE;
352 	else
353 		return slit_val;
354 }
355 
356 EXPORT_SYMBOL(__node_distance);
357