/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 *
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/init.h>
#include <xen/mm.h>
#include <xen/inttypes.h>
#include <xen/nodemask.h>
#include <xen/acpi.h>
#include <xen/numa.h>
#include <xen/pfn.h>
#include <asm/e820.h>
#include <asm/page.h>
#include <asm/spec_ctrl.h>

static struct acpi_table_slit *__read_mostly acpi_slit;

struct pxm2node {
	unsigned pxm;
	nodeid_t node;
};
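/* PXM-to-node translation table; setup_node() keeps it indexed by PXM when it can. */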
static struct pxm2node __read_mostly pxm2node[MAX_NUMNODES] =
	{ [0 ... MAX_NUMNODES - 1] = {.node = NUMA_NO_NODE} };

static inline bool node_found(unsigned idx, unsigned pxm)
{
	return ((pxm2node[idx].pxm == pxm) &&
		(pxm2node[idx].node != NUMA_NO_NODE));
}

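/* Translate a proximity domain to its node ID, preferring the PXM-indexed slot over a linear search. */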
nodeid_t pxm_to_node(unsigned pxm)
{
	unsigned i;

	if ((pxm < ARRAY_SIZE(pxm2node)) && node_found(pxm, pxm))
		return pxm2node[pxm].node;

	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (node_found(i, pxm))
			return pxm2node[i].node;

	return NUMA_NO_NODE;
}

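/* Look up, or allocate, the node ID for a proximity domain; returns NUMA_NO_NODE once the table is exhausted. */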
nodeid_t setup_node(unsigned pxm)
{
	nodeid_t node;
	unsigned idx;
	static unsigned nodes_found;

	BUILD_BUG_ON(MAX_NUMNODES >= NUMA_NO_NODE);

	if (pxm < ARRAY_SIZE(pxm2node)) {
		if (node_found(pxm, pxm))
			return pxm2node[pxm].node;

		/* Try to maintain indexing of pxm2node by pxm */
		if (pxm2node[pxm].node == NUMA_NO_NODE) {
			idx = pxm;
			goto finish;
		}
	}

	for (idx = 0; idx < ARRAY_SIZE(pxm2node); idx++)
		if (pxm2node[idx].node == NUMA_NO_NODE)
			goto finish;

	printk_once(XENLOG_WARNING "SRAT: Too many proximity domains (%#x)\n",
		    pxm);

	return NUMA_NO_NODE;

 finish:
	node = nodes_found++;
	if (node >= MAX_NUMNODES)
		return NUMA_NO_NODE;
	pxm2node[idx].pxm = pxm;
	pxm2node[idx].node = node;

	return node;
}

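/* Discard all state derived from the firmware tables once they prove unusable. */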
void __init numa_fw_bad(void)
{
	int i;

	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		pxm2node[i].node = NUMA_NO_NODE;
	mem_hotplug = 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;

	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];

			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	mfn_t mfn;

	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
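	/*
	 * Keep a private copy of the SLIT: the firmware table itself may not
	 * remain mapped once boot-time ACPI processing is done.
	 */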
	mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1);
	acpi_slit = vmap_contig(mfn, PFN_UP(slit->header.length));
	if (!acpi_slit)
		panic("Unable to map the ACPI SLIT. Retry with numa=off");
	memcpy(acpi_slit, slit, slit->header.length);
}

/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (numa_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		numa_fw_bad();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	if (pa->apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: APIC %08x ignored\n", pa->apic_id);
		return;
	}

	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		numa_fw_bad();
		return;
	}

	apicid_to_node[pa->apic_id] = node;
	numa_set_processor_nodes_parsed(node);
	acpi_numa = 1;

	if (opt_acpi_verbose)
		printk(KERN_INFO "SRAT: PXM %u -> APIC %08x -> Node %u\n",
		       pxm, pa->apic_id, node);
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
{
	unsigned pxm;
	nodeid_t node;

	if (numa_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		numa_fw_bad();
		return;
	}
	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
		return;
	pxm = pa->proximity_domain_lo;
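	/* SRAT revision 2 widened the proximity domain from 8 to 32 bits. */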
	if (srat_rev >= 2) {
		pxm |= pa->proximity_domain_hi[0] << 8;
		pxm |= pa->proximity_domain_hi[1] << 16;
		pxm |= pa->proximity_domain_hi[2] << 24;
	}
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		numa_fw_bad();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	numa_set_processor_nodes_parsed(node);
	acpi_numa = 1;

	if (opt_acpi_verbose)
		printk(KERN_INFO "SRAT: PXM %u -> APIC %02x -> Node %u\n",
		       pxm, pa->apic_id, node);
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
{
	unsigned pxm;
	nodeid_t node;

	if (numa_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		numa_fw_bad();
		return;
	}
	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
		return;

	/* Supplement the heuristics in l1tf_calculations(). */
	l1tf_safe_maddr = max(l1tf_safe_maddr,
			      ROUNDUP(ma->base_address + ma->length,
				      PAGE_SIZE));

	if (!numa_memblks_available()) {
		dprintk(XENLOG_WARNING,
			"Too many numa entries, try bigger NR_NODE_MEMBLKS!\n");
		numa_fw_bad();
		return;
	}

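	/* Pre-revision-2 SRAT defines only the low 8 bits of the proximity domain. */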
	pxm = ma->proximity_domain;
	if (srat_rev < 2)
		pxm &= 0xff;
	node = setup_node(pxm);
	if (node == NUMA_NO_NODE) {
		numa_fw_bad();
		return;
	}

	/*
	 * In an extremely unlikely case, srat_parse_regions might not
	 * be called. So set the variable here just in case.
	 */
	numa_fw_nid_name = "PXM";
	if (!numa_update_node_memblks(node, pxm, ma->base_address, ma->length,
				      ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE))
		numa_fw_bad();
}

void __init acpi_numa_arch_fixup(void) {}

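/*
 * Per-entry callback for srat_parse_regions(): feed each usable RAM range
 * into the PFN <-> PDX translation setup.
 */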
static int __init cf_check srat_parse_region(
	struct acpi_subtable_header *header, const unsigned long end)
{
	struct acpi_srat_mem_affinity *ma;

	if (!header)
		return -EINVAL;

	ma = container_of(header, struct acpi_srat_mem_affinity, header);

	if (!ma->length ||
	    !(ma->flags & ACPI_SRAT_MEM_ENABLED) ||
	    (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE))
		return 0;

	if (numa_off)
		printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n",
		       ma->base_address, ma->base_address + ma->length - 1);

	pfn_pdx_add_region(ma->base_address, ma->length);

	return 0;
}

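/*
 * Walk the SRAT memory affinity entries and enable PFN ("PDX") compression,
 * provided every RAM range in the e820 can be covered by it.
 */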
void __init srat_parse_regions(paddr_t addr)
{
	unsigned int i;

	if (acpi_disabled || acpi_numa < 0 ||
	    acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat))
		return;

	/* Set "PXM" as early as feasible. */
	numa_fw_nid_name = "PXM";
	acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
			      srat_parse_region, 0);

	if (!pfn_pdx_compression_setup(addr))
		return;

	/* Ensure all RAM ranges in the e820 are covered. */
	for (i = 0; i < e820.nr_map; i++) {
		if (e820.map[i].type != E820_RAM)
			continue;

		if (!pdx_is_region_compressible(
			    e820.map[i].addr,
			    PFN_UP(e820.map[i].addr + e820.map[i].size) -
			    PFN_DOWN(e820.map[i].addr))) {
			pfn_pdx_compression_reset();
			printk(XENLOG_WARNING
			       "PFN compression disabled, RAM region [%#" PRIx64
			       ", %#" PRIx64 "] not covered\n",
			       e820.map[i].addr,
			       e820.map[i].addr + e820.map[i].size - 1);
			return;
		}
	}

	/* If we got this far, compression is working as expected. */
	setup_force_cpu_cap(X86_FEATURE_PDX_COMPRESSION);
}

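/* Reverse lookup: translate a node ID back to its proximity domain (used to index the SLIT). */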
unsigned int numa_node_to_arch_nid(nodeid_t n)
{
	unsigned int i;

	if ((n < ARRAY_SIZE(pxm2node)) && (pxm2node[n].node == n))
		return pxm2node[n].pxm;
	for (i = 0; i < ARRAY_SIZE(pxm2node); i++)
		if (pxm2node[i].node == n)
			return pxm2node[i].pxm;
	return 0;
}

u8 __node_distance(nodeid_t a, nodeid_t b)
{
	unsigned index;
	u8 slit_val;

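	/* Without a SLIT, assume the conventional defaults: 10 (local), 20 (remote). */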
	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->locality_count * numa_node_to_arch_nid(a);
	slit_val = acpi_slit->entry[index + numa_node_to_arch_nid(b)];

	/* ACPI defines 0xff as an unreachable node and 0-9 as undefined. */
	if ((slit_val == 0xff) || (slit_val <= 9))
		return NUMA_NO_DISTANCE;
	else
		return slit_val;
}

EXPORT_SYMBOL(__node_distance);