/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
#include <xen/softirq.h>

static int numa_setup(const char *s);
custom_param("numa", numa_setup);

#ifndef Dprintk
#define Dprintk(x...)
#endif

/* from proto.h */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
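/*
 * Illustration only: y must be a power of two, e.g.
 * round_up(0x1234, 0x1000) == 0x2000.
 */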

struct node_data node_data[MAX_NUMNODES];

/* Mapping from pdx to node id */
int memnode_shift;
static typeof(*memnodemap) _memnodemap[64];
unsigned long memnodemapsize;
u8 *memnodemap;
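/*
 * Lookup sketch (derived from populate_memnodemap() below): each
 * memnodemap[] entry covers 2^memnode_shift contiguous pdx values, so a
 * physical address resolves to a node via
 * memnodemap[paddr_to_pdx(addr) >> memnode_shift].  memnodemapsize is the
 * number of entries in the map; dump_numa() uses phys_to_nid() to
 * sanity-check exactly this mapping.
 */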

nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
    [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/*
 * Keep the BIOS-provided CPU-to-node information; it should not be used
 * for memory allocation.
 */
nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };

bool numa_off;
s8 acpi_numa = 0;
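/*
 * Note (summarising the uses in this file): a negative acpi_numa, set e.g.
 * by "numa=noacpi" in numa_setup() below, makes srat_disabled() return
 * true and thereby suppresses SRAT-based node setup.
 */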

int srat_disabled(void)
{
    return numa_off || acpi_numa < 0;
}

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct node *nodes,
                                      int numnodes, int shift, nodeid_t *nodeids)
{
    unsigned long spdx, epdx;
    int i, res = -1;

    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        if ( (epdx >> shift) >= memnodemapsize )
            return 0;
        do {
            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
                return -1;

            if ( !nodeids )
                memnodemap[spdx >> shift] = i;
            else
                memnodemap[spdx >> shift] = nodeids[i];

            spdx += (1UL << shift);
        } while ( spdx < epdx );
        res = 1;
    }

    return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));

    memnodemap = mfn_to_virt(mfn);
    mfn <<= PAGE_SHIFT;
    size <<= PAGE_SHIFT;
    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
           mfn, mfn + size);
    memnodemapsize = size / sizeof(*memnodemap);

    return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
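/*
 * Illustration only (hypothetical layout): nodes starting at pdx 0 and
 * pdx 0x800000 OR together to a bitfield of 0x800000, whose first set bit
 * is 23, so the returned shift is 23 and each memnodemap[] entry then
 * spans 2^23 contiguous pdx values.
 */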
static int __init extract_lsb_from_nodes(const struct node *nodes,
                                         int numnodes)
{
    int i, nodes_used = 0;
    unsigned long spdx, epdx;
    unsigned long bitfield = 0, memtop = 0;

    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        bitfield |= spdx;
        nodes_used++;
        if ( epdx > memtop )
            memtop = epdx;
    }
    if ( nodes_used <= 1 )
        i = BITS_PER_LONG - 1;
    else
        i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
    memnodemapsize = (memtop >> i) + 1;
    return i;
}

int __init compute_hash_shift(struct node *nodes, int numnodes,
                              nodeid_t *nodeids)
{
    int shift;

    shift = extract_lsb_from_nodes(nodes, numnodes);
    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
        memnodemap = _memnodemap;
    else if ( allocate_cachealigned_memnodemap() )
        return -1;
    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
    {
        printk(KERN_INFO "Your memory is not aligned; you need to "
               "rebuild your hypervisor with a bigger NODEMAPSIZE, "
               "shift=%d\n", shift);
        return -1;
    }

    return shift;
}

/* initialize NODE_DATA given nodeid and start/end */
void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

    node_set_online(nodeid);
}

void __init numa_init_array(void)
{
    int rr, i;

    /*
     * There are unfortunately some poorly designed mainboards around that
     * only connect memory to a single CPU.  This breaks the 1:1 cpu->node
     * mapping.  To avoid this, fill in the mapping for all possible CPUs,
     * as the number of CPUs is not known yet.  We round robin the existing
     * nodes.
     */
    rr = first_node(node_online_map);
    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        if ( cpu_to_node[i] != NUMA_NO_NODE )
            continue;
        numa_set_node(i, rr);
        rr = next_node(rr, node_online_map);
        if ( rr == MAX_NUMNODES )
            rr = first_node(node_online_map);
    }
}

#ifdef CONFIG_NUMA_EMU
static int numa_fake __initdata = 0;

/* Numa emulation */
static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    u64 sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function */
    if ( hweight64(sz) > 1 )
    {
        u64 x = 1;
        while ( (x << 1) < sz )
            x <<= 1;
        if ( x < sz/2 )
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }

    memset(&nodes, 0, sizeof(nodes));
    for ( i = 0; i < numa_fake; i++ )
    {
        nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
        if ( i == numa_fake - 1 )
            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
    if ( memnode_shift < 0 )
    {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node ( i )
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();

    return 0;
}
#endif
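/*
 * Usage note (illustrative): booting Xen with "numa=fake=<n>" on the
 * command line makes numa_setup() below set numa_fake, after which
 * numa_initmem_init() splits memory into <n> evenly sized fake nodes via
 * numa_emulation() above.
 */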

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;

#ifdef CONFIG_NUMA_EMU
    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
        return;
#endif

#ifdef CONFIG_ACPI_NUMA
    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
                                       (u64)end_pfn << PAGE_SHIFT) )
        return;
#endif

    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");

    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
           (u64)start_pfn << PAGE_SHIFT,
           (u64)end_pfn << PAGE_SHIFT);
    /* setup dummy node covering all memory */
    memnode_shift = BITS_PER_LONG - 1;
    memnodemap = _memnodemap;
    nodes_clear(node_online_map);
    node_set_online(0);
    for ( i = 0; i < nr_cpu_ids; i++ )
        numa_set_node(i, 0);
    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
                       (u64)end_pfn << PAGE_SHIFT);
}

void numa_add_cpu(int cpu)
{
    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void numa_set_node(int cpu, nodeid_t node)
{
    cpu_to_node[cpu] = node;
}

/* [numa=off | on | fake=<n> | noacpi] */
static __init int numa_setup(const char *opt)
{
    if ( !strncmp(opt, "off", 3) )
        numa_off = true;
    else if ( !strncmp(opt, "on", 2) )
        numa_off = false;
#ifdef CONFIG_NUMA_EMU
    else if ( !strncmp(opt, "fake=", 5) )
    {
        numa_off = false;
        numa_fake = simple_strtoul(opt+5, NULL, 0);
        if ( numa_fake >= MAX_NUMNODES )
            numa_fake = MAX_NUMNODES;
    }
#endif
#ifdef CONFIG_ACPI_NUMA
    else if ( !strncmp(opt, "noacpi", 6) )
    {
        numa_off = false;
        acpi_numa = -1;
    }
#endif
    else
        return -EINVAL;

    return 0;
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only for CPUs that have valid entries in both
 * x86_cpu_to_apicid[] and apicid_to_node[].  This means cpu_to_node[]
 * initialisation is skipped for NUMA emulation and for the faked-node case
 * (a NUMA-enabled build running on a non-NUMA box), which is fine: in those
 * cases cpu_to_node[] was already initialised in a round-robin manner by
 * numa_init_array(), prior to this call, and that is good enough for the
 * fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
    unsigned int i;
    nodeid_t node;

    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        u32 apicid = x86_cpu_to_apicid[i];
        if ( apicid == BAD_APICID )
            continue;
        node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
        if ( node == NUMA_NO_NODE || !node_online(node) )
            node = 0;
        numa_set_node(i, node);
    }
}

unsigned int __init arch_get_dma_bitsize(void)
{
    unsigned int node;

    for_each_online_node(node)
        if ( node_spanned_pages(node) &&
             !(node_start_pfn(node) >> (32 - PAGE_SHIFT)) )
            break;
    if ( node >= MAX_NUMNODES )
        panic("No node with memory below 4Gb");

    /*
     * Try not to reserve the whole of a node's memory for DMA, but divide
     * its spanned pages by (the arbitrarily chosen) 4.
     */
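    /*
     * Worked example with made-up numbers: a node at pfn 0 spanning
     * 0x40000 pages (1GiB of 4KiB pages) gives flsl(0x40000/4 - 1) +
     * PAGE_SHIFT = 16 + 12 = 28, i.e. a 28-bit (256MiB) DMA width.
     */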
    return min_t(unsigned int,
                 flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                 + PAGE_SHIFT, 32);
}

static void dump_numa(unsigned char key)
{
    s_time_t now = NOW();
    unsigned int i, j, n;
    int err;
    struct domain *d;
    struct page_info *page;
    unsigned int page_num_node[MAX_NUMNODES];
    const struct vnuma_info *vnuma;

    printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
           (u32)(now>>32), (u32)now);

    for_each_online_node ( i )
    {
        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);

        printk("NODE%u start->%lu size->%lu free->%lu\n",
               i, node_start_pfn(i), node_spanned_pages(i),
               avail_node_heap_pages(i));
        /* sanity check phys_to_nid() */
        if ( phys_to_nid(pa) != i )
            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
                   pa, phys_to_nid(pa), i);
    }

    j = cpumask_first(&cpu_online_map);
    n = 0;
    for_each_online_cpu ( i )
    {
        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
        {
            if ( n > 1 )
                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
            else
                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
            j = i;
            n = 1;
        }
        else
            ++n;
    }
    if ( n > 1 )
        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
    else
        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);

    rcu_read_lock(&domlist_read_lock);

    printk("Memory location of each domain:\n");
    for_each_domain ( d )
    {
        process_pending_softirqs();

        printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);

        for_each_online_node ( i )
            page_num_node[i] = 0;

        spin_lock(&d->page_alloc_lock);
        page_list_for_each(page, &d->page_list)
        {
            i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
            page_num_node[i]++;
        }
        spin_unlock(&d->page_alloc_lock);

        for_each_online_node ( i )
            printk("    Node %u: %u\n", i, page_num_node[i]);

        if ( !read_trylock(&d->vnuma_rwlock) )
            continue;

        if ( !d->vnuma )
        {
            read_unlock(&d->vnuma_rwlock);
            continue;
        }

        vnuma = d->vnuma;
        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
               vnuma->nr_vnodes, d->max_vcpus);
        for ( i = 0; i < vnuma->nr_vnodes; i++ )
        {
            unsigned int start_cpu = ~0U;

            err = snprintf(keyhandler_scratch, 12, "%3u",
                           vnuma->vnode_to_pnode[i]);
            if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
                strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));

            printk("       %3u: pnode %s,", i, keyhandler_scratch);

            printk(" vcpus ");

            for ( j = 0; j < d->max_vcpus; j++ )
            {
                if ( !(j & 0x3f) )
                    process_pending_softirqs();

                if ( vnuma->vcpu_to_vnode[j] == i )
                {
                    if ( start_cpu == ~0U )
                    {
                        printk("%d", j);
                        start_cpu = j;
                    }
                }
                else if ( start_cpu != ~0U )
                {
                    if ( j - 1 != start_cpu )
                        printk("-%d ", j - 1);
                    else
                        printk(" ");
                    start_cpu = ~0U;
                }
            }

            if ( start_cpu != ~0U && start_cpu != j - 1 )
                printk("-%d", j - 1);

            printk("\n");

            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
            {
                if ( vnuma->vmemrange[j].nid == i )
                    printk("           %016"PRIx64" - %016"PRIx64"\n",
                           vnuma->vmemrange[j].start,
                           vnuma->vmemrange[j].end);
            }
        }

        read_unlock(&d->vnuma_rwlock);
    }

    rcu_read_unlock(&domlist_read_lock);
}

static __init int register_numa_trigger(void)
{
    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
    return 0;
}
__initcall(register_numa_trigger);