/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */

#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
#include <xen/softirq.h>

static int numa_setup(const char *s);
custom_param("numa", numa_setup);

#ifndef Dprintk
#define Dprintk(x...)
#endif

/* from proto.h */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
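/* E.g. round_up(0x1234, 0x1000) == 0x2000; y must be a power of two. */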

struct node_data node_data[MAX_NUMNODES];

/* Mapping from pdx to node id */
int memnode_shift;
static typeof(*memnodemap) _memnodemap[64];
unsigned long memnodemapsize;
u8 *memnodemap;
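/*
 * Each memnodemap[] entry maps 2^memnode_shift contiguous pdx values to a
 * node; phys_to_nid() performs the lookup as
 * memnodemap[paddr_to_pdx(addr) >> memnode_shift].
 */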

nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
    [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/*
 * Keep the BIOS's CPU-to-node information; it should not be used for
 * memory allocation.
 */
nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
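/* Bit 0 set: node 0 is online from boot; other nodes are set online later. */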
nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };

bool numa_off;
s8 acpi_numa = 0;

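/* True if NUMA is off or ACPI SRAT parsing is disabled/failed. */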
int srat_disabled(void)
{
    return numa_off || acpi_numa < 0;
}

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init populate_memnodemap(const struct node *nodes,
                                      int numnodes, int shift, nodeid_t *nodeids)
{
    unsigned long spdx, epdx;
    int i, res = -1;

    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        if ( (epdx >> shift) >= memnodemapsize )
            return 0;
        do {
            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
                return -1;

            if ( !nodeids )
                memnodemap[spdx >> shift] = i;
            else
                memnodemap[spdx >> shift] = nodeids[i];

            spdx += (1UL << shift);
        } while ( spdx < epdx );
        res = 1;
    }

    return res;
}

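/*
 * Fallback for when memnodemap[] does not fit in the static _memnodemap[]:
 * allocate a page-granular map from the boot allocator instead.
 */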
static int __init allocate_cachealigned_memnodemap(void)
{
    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));

    memnodemap = mfn_to_virt(mfn);
    mfn <<= PAGE_SHIFT;
    size <<= PAGE_SHIFT;
    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
           mfn, mfn + size);
    memnodemapsize = size / sizeof(*memnodemap);

    return 0;
}

/*
 * The LSB of all start addresses in the node map is the value of the
 * maximum possible shift.
 */
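/*
 * Worked example (hypothetical layout): nodes starting at pdx 0x100000 and
 * 0x200000 OR together to bitfield 0x300000, whose first set bit is 20, so
 * the shift is 20 and each memnodemap[] entry spans 2^20 pages.
 */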
static int __init extract_lsb_from_nodes(const struct node *nodes,
                                         int numnodes)
{
    int i, nodes_used = 0;
    unsigned long spdx, epdx;
    unsigned long bitfield = 0, memtop = 0;

    for ( i = 0; i < numnodes; i++ )
    {
        spdx = paddr_to_pdx(nodes[i].start);
        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
        if ( spdx >= epdx )
            continue;
        bitfield |= spdx;
        nodes_used++;
        if ( epdx > memtop )
            memtop = epdx;
    }
    if ( nodes_used <= 1 )
        i = BITS_PER_LONG - 1;
    else
        i = find_first_bit(&bitfield, BITS_PER_LONG);
    memnodemapsize = (memtop >> i) + 1;
    return i;
}

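/*
 * Compute the memnodemap[] shift for the given nodes: use the static
 * _memnodemap[] when it is large enough, otherwise fall back to a
 * boot-allocated map. Returns the shift, or -1 on failure.
 */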
int __init compute_hash_shift(struct node *nodes, int numnodes,
                              nodeid_t *nodeids)
{
    int shift;

    shift = extract_lsb_from_nodes(nodes, numnodes);
    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
        memnodemap = _memnodemap;
    else if ( allocate_cachealigned_memnodemap() )
        return -1;
    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
    {
        printk(KERN_INFO "Your memory is not aligned; you need to "
               "rebuild your hypervisor with a bigger NODEMAPSIZE, "
               "shift=%d\n", shift);
        return -1;
    }

    return shift;
}

/* Initialize NODE_DATA given nodeid and start/end. */
void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = start >> PAGE_SHIFT;
    end_pfn = end >> PAGE_SHIFT;

    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

    node_set_online(nodeid);
}

void __init numa_init_array(void)
{
    int rr, i;

    /*
     * There are unfortunately some poorly designed mainboards around that
     * only connect memory to a single CPU. This breaks the 1:1 cpu->node
     * mapping. To avoid this, fill in the mapping for all possible CPUs,
     * as the number of CPUs is not known yet. We round robin the existing
     * nodes.
     */
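    /* E.g. with only nodes 0 and 2 online, CPUs 0-3 map to nodes 0,2,0,2. */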
    rr = first_node(node_online_map);
    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        if ( cpu_to_node[i] != NUMA_NO_NODE )
            continue;
        numa_set_node(i, rr);
        rr = next_node(rr, node_online_map);
        if ( rr == MAX_NUMNODES )
            rr = first_node(node_online_map);
    }
}

#ifdef CONFIG_NUMA_EMU
static int numa_fake __initdata = 0;

/* NUMA emulation */
static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    u64 sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function: round sz down to a power of two. */
    if ( hweight64(sz) > 1 )
    {
        u64 x = 1;
        while ( (x << 1) < sz )
            x <<= 1;
        if ( x < sz/2 )
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }
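
    /*
     * Example (hypothetical: 4GiB of RAM, numa=fake=3): sz of ~1365MiB is
     * rounded down to 1GiB, so nodes 0 and 1 span 1GiB each and the last
     * node absorbs the remaining 2GiB below.
     */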

    memset(&nodes, 0, sizeof(nodes));
    for ( i = 0; i < numa_fake; i++ )
    {
        nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
        if ( i == numa_fake - 1 )
            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
    if ( memnode_shift < 0 )
    {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node ( i )
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();

    return 0;
}
#endif

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    int i;

#ifdef CONFIG_NUMA_EMU
    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
        return;
#endif

#ifdef CONFIG_ACPI_NUMA
    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
                                       (u64)end_pfn << PAGE_SHIFT) )
        return;
#endif

    printk(KERN_INFO "%s\n",
           numa_off ? "NUMA turned off" : "No NUMA configuration found");

    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
           (u64)start_pfn << PAGE_SHIFT,
           (u64)end_pfn << PAGE_SHIFT);
    /* Set up a dummy node covering all memory. */
    memnode_shift = BITS_PER_LONG - 1;
    memnodemap = _memnodemap;
    nodes_clear(node_online_map);
    node_set_online(0);
    for ( i = 0; i < nr_cpu_ids; i++ )
        numa_set_node(i, 0);
    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
                       (u64)end_pfn << PAGE_SHIFT);
}

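/* Record @cpu in the CPU mask of the node it has been assigned to. */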
void numa_add_cpu(int cpu)
{
    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

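/* Set the node of @cpu. */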
void numa_set_node(int cpu, nodeid_t node)
{
    cpu_to_node[cpu] = node;
}

/*
 * Handle the "numa=" boot option. Accepted values:
 *   off, on, fake=<n> (CONFIG_NUMA_EMU only), noacpi (CONFIG_ACPI_NUMA only)
 */
static __init int numa_setup(const char *opt)
{
    if ( !strncmp(opt, "off", 3) )
        numa_off = true;
    else if ( !strncmp(opt, "on", 2) )
        numa_off = false;
#ifdef CONFIG_NUMA_EMU
    else if ( !strncmp(opt, "fake=", 5) )
    {
        numa_off = false;
        numa_fake = simple_strtoul(opt + 5, NULL, 0);
        if ( numa_fake >= MAX_NUMNODES )
            numa_fake = MAX_NUMNODES;
    }
#endif
#ifdef CONFIG_ACPI_NUMA
    else if ( !strncmp(opt, "noacpi", 6) )
    {
        numa_off = false;
        acpi_numa = -1;
    }
#endif
    else
        return -EINVAL;

    return 0;
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU. This means we
 * skip cpu_to_node[] initialisation for NUMA emulation and the faked
 * node case (when running a kernel compiled for NUMA on a non-NUMA box),
 * which is fine as cpu_to_node[] has already been initialised in a
 * round-robin manner at numa_init_array(), prior to this call, and that
 * initialisation is good enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
    unsigned int i;
    nodeid_t node;

    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        u32 apicid = x86_cpu_to_apicid[i];

        if ( apicid == BAD_APICID )
            continue;
        node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
        if ( node == NUMA_NO_NODE || !node_online(node) )
            node = 0;
        numa_set_node(i, node);
    }
}

unsigned int __init arch_get_dma_bitsize(void)
{
    unsigned int node;

    for_each_online_node(node)
        if ( node_spanned_pages(node) &&
             !(node_start_pfn(node) >> (32 - PAGE_SHIFT)) )
            break;
    if ( node >= MAX_NUMNODES )
        panic("No node with memory below 4GiB");

    /*
     * Try not to reserve the whole of a node's memory for DMA; instead,
     * divide its spanned pages by the (arbitrarily chosen) 4.
     */
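    /*
     * For example (hypothetical layout): a node spanning the first 2^20
     * pages (4GiB with 4KiB pages) yields flsl(2^20/4 - 1) + PAGE_SHIFT
     * = 18 + 12 = 30, i.e. a 1GiB DMA address width.
     */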
    return min_t(unsigned int,
                 flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                 + PAGE_SHIFT, 32);
}

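/* Keyhandler ('u'): dump the physical NUMA layout and per-domain placement. */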
static void dump_numa(unsigned char key)
{
    s_time_t now = NOW();
    unsigned int i, j, n;
    int err;
    struct domain *d;
    struct page_info *page;
    unsigned int page_num_node[MAX_NUMNODES];
    const struct vnuma_info *vnuma;

    printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
           (u32)(now >> 32), (u32)now);

    for_each_online_node ( i )
    {
        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);

        printk("NODE%u start->%lu size->%lu free->%lu\n",
               i, node_start_pfn(i), node_spanned_pages(i),
               avail_node_heap_pages(i));
        /* Sanity check phys_to_nid(). */
        if ( phys_to_nid(pa) != i )
            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
                   pa, phys_to_nid(pa), i);
    }

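    /* Coalesce runs of consecutive CPUs that share a node into one line. */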
    j = cpumask_first(&cpu_online_map);
    n = 0;
    for_each_online_cpu ( i )
    {
        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
        {
            if ( n > 1 )
                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
            else
                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
            j = i;
            n = 1;
        }
        else
            ++n;
    }
    if ( n > 1 )
        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
    else
        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);

    rcu_read_lock(&domlist_read_lock);

    printk("Memory location of each domain:\n");
    for_each_domain ( d )
    {
        process_pending_softirqs();

        printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);

        for_each_online_node ( i )
            page_num_node[i] = 0;

        spin_lock(&d->page_alloc_lock);
        page_list_for_each(page, &d->page_list)
        {
            i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
            page_num_node[i]++;
        }
        spin_unlock(&d->page_alloc_lock);

        for_each_online_node ( i )
            printk("    Node %u: %u\n", i, page_num_node[i]);

        if ( !read_trylock(&d->vnuma_rwlock) )
            continue;

        if ( !d->vnuma )
        {
            read_unlock(&d->vnuma_rwlock);
            continue;
        }

        vnuma = d->vnuma;
        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
               vnuma->nr_vnodes, d->max_vcpus);
        for ( i = 0; i < vnuma->nr_vnodes; i++ )
        {
            unsigned int start_cpu = ~0U;

            err = snprintf(keyhandler_scratch, 12, "%3u",
                           vnuma->vnode_to_pnode[i]);
            if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
                strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));

            printk("       %3u: pnode %s,", i, keyhandler_scratch);

            printk(" vcpus ");

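            /* Coalesce consecutive vcpus on this vnode into ranges. */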
            for ( j = 0; j < d->max_vcpus; j++ )
            {
                if ( !(j & 0x3f) )
                    process_pending_softirqs();

                if ( vnuma->vcpu_to_vnode[j] == i )
                {
                    if ( start_cpu == ~0U )
                    {
                        printk("%d", j);
                        start_cpu = j;
                    }
                }
                else if ( start_cpu != ~0U )
                {
                    if ( j - 1 != start_cpu )
                        printk("-%d ", j - 1);
                    else
                        printk(" ");
                    start_cpu = ~0U;
                }
            }

            if ( start_cpu != ~0U && start_cpu != j - 1 )
                printk("-%d", j - 1);

            printk("\n");

            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
            {
                if ( vnuma->vmemrange[j].nid == i )
                    printk("           %016"PRIx64" - %016"PRIx64"\n",
                           vnuma->vmemrange[j].start,
                           vnuma->vmemrange[j].end);
            }
        }

        read_unlock(&d->vnuma_rwlock);
    }

    rcu_read_unlock(&domlist_read_lock);
}

static __init int register_numa_trigger(void)
{
    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
    return 0;
}
__initcall(register_numa_trigger);