/*
 * x86 SMP booting functions
 *
 * This inherits a great deal from Linux's SMP boot code:
 *  (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *  (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/mm.h>
#include <xen/domain.h>
#include <xen/domain_page.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/tasklet.h>
#include <xen/serial.h>
#include <xen/numa.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/desc.h>
#include <asm/div64.h>
#include <asm/flushtlb.h>
#include <asm/guest.h>
#include <asm/msr.h>
#include <asm/mtrr.h>
#include <asm/time.h>
#include <asm/tboot.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
#undef page_to_mfn
#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))

#define setup_trampoline()    (bootsym_phys(trampoline_realmode_entry))

unsigned long __read_mostly trampoline_phys;

/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);

DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
static cpumask_t scratch_cpu0mask;

cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);

unsigned int __read_mostly nr_sockets;
cpumask_t **__read_mostly socket_cpumask;
static cpumask_t *secondary_socket_cpumask;

struct cpuinfo_x86 cpu_data[NR_CPUS];

u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
	{ [0 ... NR_CPUS-1] = BAD_APICID };

static int cpu_error;
static enum cpu_state {
    CPU_STATE_DYING,    /* slave -> master: I am dying */
    CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
    CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
    CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
} cpu_state;
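/*
 * The mb() ensures that all writes made before a state transition are
 * globally visible before the other side of the handshake can observe the
 * new cpu_state value.
 */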
#define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0)

void *stack_base[NR_CPUS];

void initialize_cpu_data(unsigned int cpu)
{
    cpu_data[cpu] = boot_cpu_data;
}

static void smp_store_cpu_info(int id)
{
    unsigned int socket;

    identify_cpu(&cpu_data[id]);

    socket = cpu_to_socket(id);
    if ( !socket_cpumask[socket] )
    {
        socket_cpumask[socket] = secondary_socket_cpumask;
        secondary_socket_cpumask = NULL;
    }
}

/*
 * The TSC's upper 32 bits can't be written on earlier CPUs (before
 * Prescott), so there is no way to resync one AP against the BP.
 */
bool disable_tsc_sync;

static atomic_t tsc_count;
static uint64_t tsc_value;
static cpumask_t tsc_sync_cpu_mask;

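/*
 * TSC synchronisation is a five-round handshake over tsc_count: in each
 * round the master samples its TSC into tsc_value and bumps the counter to
 * an odd value; the slave waits for that, copies tsc_value into its own
 * TSC, and bumps the counter to an even value, releasing the master for
 * the next round.
 */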
static void synchronize_tsc_master(unsigned int slave)
{
    unsigned int i;

    if ( disable_tsc_sync )
        return;

    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
        return;

    for ( i = 1; i <= 5; i++ )
    {
        tsc_value = rdtsc_ordered();
        wmb();
        atomic_inc(&tsc_count);
        while ( atomic_read(&tsc_count) != (i<<1) )
            cpu_relax();
    }

    atomic_set(&tsc_count, 0);
    cpumask_clear_cpu(slave, &tsc_sync_cpu_mask);
}

static void synchronize_tsc_slave(unsigned int slave)
{
    unsigned int i;

    if ( disable_tsc_sync )
        return;

    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
        return;

    for ( i = 1; i <= 5; i++ )
    {
        while ( atomic_read(&tsc_count) != ((i<<1)-1) )
            cpu_relax();
        rmb();
        /*
         * If a CPU has been physically hotplugged, we may as well write
         * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
         * not sync up a new CPU's TSC for us.
         */
        __write_tsc(tsc_value);
        atomic_inc(&tsc_count);
    }
}

static void smp_callin(void)
{
    unsigned int cpu = smp_processor_id();
    int i, rc;

    /* Wait 2s total for startup. */
    Dprintk("Waiting for CALLOUT.\n");
    for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
    {
        BUG_ON(i >= 200);
        cpu_relax();
        mdelay(10);
    }

    /*
     * The boot CPU has finished the init stage and is spinning on cpu_state
     * update until we finish. We are free to set up this CPU: first the APIC.
     */
    Dprintk("CALLIN, before setup_local_APIC().\n");
    x2apic_ap_setup();
    setup_local_APIC();

    /* Save our processor parameters. */
    smp_store_cpu_info(cpu);

    if ( (rc = hvm_cpu_up()) != 0 )
    {
        printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu);
        cpu_error = rc;
        clear_local_APIC();
        spin_debug_enable();
        cpu_exit_clear(cpu);
        (*dead_idle)();
    }

    /* Allow the master to continue. */
    set_cpu_state(CPU_STATE_CALLIN);

    synchronize_tsc_slave(cpu);

    /* And wait for our final Ack. */
    while ( cpu_state != CPU_STATE_ONLINE )
        cpu_relax();
}

static int booting_cpu;

/* CPUs for which sibling maps can be computed. */
static cpumask_t cpu_sibling_setup_map;

static void link_thread_siblings(int cpu1, int cpu2)
{
    cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2));
    cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1));
    cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2));
    cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
}

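/*
 * Record @cpu in the sibling (hyperthread) and core maps, linking it with
 * every already-registered CPU sharing the same core or package, and keep
 * the booted_cores counts consistent across the package.
 */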
static void set_cpu_sibling_map(int cpu)
{
    int i;
    struct cpuinfo_x86 *c = cpu_data;

    cpumask_set_cpu(cpu, &cpu_sibling_setup_map);

    cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);

    if ( c[cpu].x86_num_siblings > 1 )
    {
        for_each_cpu ( i, &cpu_sibling_setup_map )
        {
            if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
                if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
                     (c[cpu].compute_unit_id == c[i].compute_unit_id) )
                    link_thread_siblings(cpu, i);
            } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
                        (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
                link_thread_siblings(cpu, i);
            }
        }
    }
    else
    {
        cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
    }

    if ( c[cpu].x86_max_cores == 1 )
    {
        cpumask_copy(per_cpu(cpu_core_mask, cpu),
                     per_cpu(cpu_sibling_mask, cpu));
        c[cpu].booted_cores = 1;
        return;
    }

    for_each_cpu ( i, &cpu_sibling_setup_map )
    {
        if ( c[cpu].phys_proc_id == c[i].phys_proc_id )
        {
            cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu));
            cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i));
            /*
             * Does this new cpu bring up a new core?
             */
            if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
            {
                /*
                 * For each core in package, increment
                 * the booted_cores for this new cpu.
                 */
                if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i )
                    c[cpu].booted_cores++;
                /*
                 * Increment the core count for all
                 * the other cpus in this package.
                 */
                if ( i != cpu )
                    c[i].booted_cores++;
            }
            else if ( (i != cpu) && !c[cpu].booted_cores )
            {
                c[cpu].booted_cores = c[i].booted_cores;
            }
        }
    }
}

void start_secondary(void *unused)
{
    /*
     * Don't put anything before smp_callin(): SMP booting is so fragile that
     * we want to limit the things done here to the most necessary things.
     */
    unsigned int cpu = booting_cpu;

    /* Critical region without IDT or TSS.  Any fault is deadly! */

    set_processor_id(cpu);
    set_current(idle_vcpu[cpu]);
    this_cpu(curr_vcpu) = idle_vcpu[cpu];
    rdmsrl(MSR_EFER, this_cpu(efer));

    /*
     * Just as during early bootstrap, it is convenient here to disable
     * spinlock checking while we have IRQs disabled. This allows us to
     * acquire IRQ-unsafe locks when it would otherwise be disallowed.
     *
     * It is safe because the race we are usually trying to avoid involves
     * a group of CPUs rendezvousing in an IPI handler, where one cannot
     * join because it is spinning with IRQs disabled waiting to acquire a
     * lock held by another in the rendezvous group (the lock must be an
     * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and
     * hence had IRQs enabled). This is a deadlock scenario.
     *
     * However, no CPU can be involved in rendezvous until it is online,
     * hence no such group can be waiting for this CPU until it is
     * visible in cpu_online_map. Hence such a deadlock is not possible.
     */
    spin_debug_disable();

    load_system_tables();

    /* Full exception support from here on in. */

    /* Safe to enable features such as CR4.MCE with the IDT set up now. */
    write_cr4(mmu_cr4_features);

    percpu_traps_init();

    cpu_init();

    initialize_cpu_data(cpu);

    if ( system_state <= SYS_STATE_smp_boot )
        early_microcode_update_cpu(false);
    else
        microcode_resume_cpu(cpu);

    if ( xen_guest )
        hypervisor_ap_setup();

    smp_callin();

    init_percpu_time();

    setup_secondary_APIC_clock();

    /*
     * Low-memory mappings have been cleared, flush them from
     * the local TLBs too.
     */
    flush_tlb_local();

    /* This must be done before setting cpu_online_map */
    spin_debug_enable();
    set_cpu_sibling_map(cpu);
    notify_cpu_starting(cpu);
    wmb();

    /*
     * We need to hold vector_lock so that the set of online cpus
     * does not change while we are assigning vectors to cpus.  Holding
     * this lock ensures we don't half assign or remove an irq from a cpu.
     */
    lock_vector_lock();
    setup_vector_irq(cpu);
    cpumask_set_cpu(cpu, &cpu_online_map);
    unlock_vector_lock();

    /* We can take interrupts now: we're officially "up". */
    local_irq_enable();
    mtrr_ap_init();

    wmb();
    startup_cpu_idle_loop();
}

extern void *stack_start;

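/*
 * Kick a secondary CPU with the standard INIT (plus deassert on xAPIC)
 * followed by two STARTUP IPIs.  The STARTUP vector is the page number of
 * start_eip, so the trampoline must be page-aligned.
 */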
static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
    unsigned long send_status = 0, accept_status = 0;
    int maxlvt, timeout, i;

    /*
     * Be paranoid about clearing APIC errors.
     */
    apic_write(APIC_ESR, 0);
    apic_read(APIC_ESR);

    Dprintk("Asserting INIT.\n");

    /*
     * Turn INIT on target chip via IPI
     */
    apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
                   phys_apicid);

    if ( !x2apic_enabled )
    {
        Dprintk("Waiting for send to finish...\n");
        timeout = 0;
        do {
            Dprintk("+");
            udelay(100);
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
        } while ( send_status && (timeout++ < 1000) );

        mdelay(10);

        Dprintk("Deasserting INIT.\n");

        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);

        Dprintk("Waiting for send to finish...\n");
        timeout = 0;
        do {
            Dprintk("+");
            udelay(100);
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
        } while ( send_status && (timeout++ < 1000) );
    }
    else if ( tboot_in_measured_env() )
    {
        /*
         * With tboot the AP is actually spinning in a mini-guest before
         * receiving INIT. Upon receiving the INIT IPI, the AP needs time to
         * VMExit, update the VMCS to track SIPIs, and VMResume.
         *
         * While the AP is in root mode handling the INIT the CPU will drop
         * any SIPIs.
         */
        udelay(10);
    }

    maxlvt = get_maxlvt();

    for ( i = 0; i < 2; i++ )
    {
        Dprintk("Sending STARTUP #%d.\n", i+1);
        apic_write(APIC_ESR, 0);
        apic_read(APIC_ESR);
        Dprintk("After apic_write.\n");

        /*
         * STARTUP IPI
         * Boot on the stack
         */
        apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);

        if ( !x2apic_enabled )
        {
            /* Give the other CPU some time to accept the IPI. */
            udelay(300);

            Dprintk("Startup point 1.\n");

            Dprintk("Waiting for send to finish...\n");
            timeout = 0;
            do {
                Dprintk("+");
                udelay(100);
                send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
            } while ( send_status && (timeout++ < 1000) );

            /* Give the other CPU some time to accept the IPI. */
            udelay(200);
        }

        /* Due to the Pentium erratum 3AP. */
        if ( maxlvt > 3 )
        {
            apic_write(APIC_ESR, 0);
        }
        accept_status = (apic_read(APIC_ESR) & 0xEF);
        if ( send_status || accept_status )
            break;
    }
    Dprintk("After Startup.\n");

    if ( send_status )
        printk("APIC never delivered???\n");
    if ( accept_status )
        printk("APIC delivery error (%lx).\n", accept_status);

    return (send_status | accept_status);
}

int alloc_cpu_id(void)
{
    cpumask_t tmp_map;
    int cpu;

    cpumask_complement(&tmp_map, &cpu_present_map);
    cpu = cpumask_first(&tmp_map);
    return (cpu < nr_cpu_ids) ? cpu : -ENODEV;
}

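/*
 * Bring a single AP out of reset: save MTRR state for it to copy, point
 * stack_start at its stack, set the warm reset vector, send the wakeup
 * IPIs, then wait (up to 5s) for the AP to reach CPU_STATE_CALLIN and
 * synchronise its TSC with ours.
 */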
static int do_boot_cpu(int apicid, int cpu)
{
    int timeout, boot_error = 0, rc = 0;
    unsigned long start_eip;

    /*
     * Save current MTRR state in case it was changed since early boot
     * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
     */
    mtrr_save_state();

    booting_cpu = cpu;

    /* start_eip had better be page-aligned! */
    start_eip = setup_trampoline();

    /* So we see what's up */
    if ( opt_cpu_info )
        printk("Booting processor %d/%d eip %lx\n",
               cpu, apicid, start_eip);

    stack_start = stack_base[cpu];

    /* This grunge runs the startup process for the targeted processor. */

    set_cpu_state(CPU_STATE_INIT);

    Dprintk("Setting warm reset code and vector.\n");

    smpboot_setup_warm_reset_vector(start_eip);

    /* Starting actual IPI sequence... */
    if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) )
        boot_error = wakeup_secondary_cpu(apicid, start_eip);

    if ( !boot_error )
    {
        /* Allow AP to start initializing. */
        set_cpu_state(CPU_STATE_CALLOUT);
        Dprintk("After Callout %d.\n", cpu);

        /* Wait 5s total for a response. */
        for ( timeout = 0; timeout < 50000; timeout++ )
        {
            if ( cpu_state != CPU_STATE_CALLOUT )
                break;
            udelay(100);
        }

        if ( cpu_state == CPU_STATE_CALLIN )
        {
            /* number CPUs logically, starting from 1 (BSP is 0) */
            Dprintk("OK.\n");
            print_cpu_info(cpu);
            synchronize_tsc_master(cpu);
            Dprintk("CPU has booted.\n");
        }
        else if ( cpu_state == CPU_STATE_DEAD )
        {
            rmb();
            rc = cpu_error;
        }
        else
        {
            boot_error = 1;
            mb();
            if ( bootsym(trampoline_cpu_started) == 0xA5 )
                /* trampoline started but...? */
                printk("Stuck ??\n");
            else
                /* trampoline code not run */
                printk("Not responding.\n");
        }
    }

    if ( boot_error )
    {
        cpu_exit_clear(cpu);
        rc = -EIO;
    }

    /* mark "stuck" area as not stuck */
    bootsym(trampoline_cpu_started) = 0;
    mb();

    smpboot_restore_warm_reset_vector();

    return rc;
}

#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

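/*
 * Map (and on first use allocate) the per-CPU stub page just below
 * XEN_VIRT_END.  Up to STUBS_PER_PAGE CPUs share one physical page, each
 * owning a STUB_BUF_SIZE slot; the page is mapped read-execute and filled
 * with 0xcc (int3) so unused slots trap.
 */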
unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
{
    unsigned long stub_va;
    struct page_info *pg;

    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));

    if ( *mfn )
        pg = mfn_to_page(_mfn(*mfn));
    else
    {
        nodeid_t node = cpu_to_node(cpu);
        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;

        pg = alloc_domheap_page(NULL, memflags);
        if ( !pg )
            return 0;

        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
    }

    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
    if ( map_pages_to_xen(stub_va, mfn_x(page_to_mfn(pg)), 1,
                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
    {
        if ( !*mfn )
            free_domheap_page(pg);
        stub_va = 0;
    }
    else if ( !*mfn )
        *mfn = mfn_x(page_to_mfn(pg));

    return stub_va;
}

void cpu_exit_clear(unsigned int cpu)
{
    cpu_uninit(cpu);
    set_cpu_state(CPU_STATE_DEAD);
}

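/*
 * Undo cpu_smpboot_alloc(): release the sibling/core/scratch masks, the
 * stub mapping (freeing the shared stub page only once every slot has been
 * reset to 0xcc), the GDTs, the IDT and the stack.
 */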
static void cpu_smpboot_free(unsigned int cpu)
{
    unsigned int order, socket = cpu_to_socket(cpu);
    struct cpuinfo_x86 *c = cpu_data;

    if ( cpumask_empty(socket_cpumask[socket]) )
    {
        xfree(socket_cpumask[socket]);
        socket_cpumask[socket] = NULL;
    }

    c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
    c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
    c[cpu].compute_unit_id = INVALID_CUID;
    cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);

    free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
    free_cpumask_var(per_cpu(cpu_core_mask, cpu));
    if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
        free_cpumask_var(per_cpu(scratch_cpumask, cpu));

    if ( per_cpu(stubs.addr, cpu) )
    {
        mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
        unsigned char *stub_page = map_domain_page(mfn);
        unsigned int i;

        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
        for ( i = 0; i < STUBS_PER_PAGE; ++i )
            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
                break;
        unmap_domain_page(stub_page);
        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
        if ( i == STUBS_PER_PAGE )
            free_domheap_page(mfn_to_page(mfn));
    }

    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
    free_xenheap_pages(per_cpu(gdt_table, cpu), order);

    free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);

    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
    free_xenheap_pages(idt_tables[cpu], order);
    idt_tables[cpu] = NULL;

    if ( stack_base[cpu] != NULL )
    {
        memguard_unguard_stack(stack_base[cpu]);
        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
        stack_base[cpu] = NULL;
    }
}

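/*
 * Allocate everything a CPU needs before it can be started: stack, per-CPU
 * and compat GDTs, IDT, a stub page slot (shared with node-local neighbours
 * where possible) and the various cpumasks, preferring memory on the CPU's
 * own NUMA node.
 */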
static int cpu_smpboot_alloc(unsigned int cpu)
{
    unsigned int i, order, memflags = 0;
    nodeid_t node = cpu_to_node(cpu);
    struct desc_struct *gdt;
    unsigned long stub_page;

    if ( node != NUMA_NO_NODE )
        memflags = MEMF_node(node);

    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
    if ( stack_base[cpu] == NULL )
        goto oom;
    memguard_guard_stack(stack_base[cpu]);

    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
    per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
    if ( gdt == NULL )
        goto oom;
    memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
    BUILD_BUG_ON(NR_CPUS > 0x10000);
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;

    per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
    if ( gdt == NULL )
        goto oom;
    memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;

    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
    idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
    if ( idt_tables[cpu] == NULL )
        goto oom;
    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
        if ( cpu_online(i) && cpu_to_node(i) == node )
        {
            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
            break;
        }
    BUG_ON(i == cpu);
    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
    if ( !stub_page )
        goto oom;
    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);

    if ( secondary_socket_cpumask == NULL &&
         (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
        goto oom;

    if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
         alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
        return 0;

 oom:
    cpu_smpboot_free(cpu);
    return -ENOMEM;
}

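/* Notifier glue: allocate on CPU_UP_PREPARE, free on CPU_UP_CANCELED/CPU_DEAD. */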
static int cpu_smpboot_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_smpboot_alloc(cpu);
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_smpboot_free(cpu);
        break;
    default:
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_smpboot_nfb = {
    .notifier_call = cpu_smpboot_callback
};

void __init smp_prepare_cpus(unsigned int max_cpus)
{
    register_cpu_notifier(&cpu_smpboot_nfb);

    mtrr_aps_sync_begin();

    /* Setup boot CPU information */
    initialize_cpu_data(0); /* Final full version of the data */
    print_cpu_info(0);

    boot_cpu_physical_apicid = get_apic_id();
    x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;

    stack_base[0] = stack_start;

    set_nr_sockets();

    socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
    if ( socket_cpumask == NULL ||
         (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
        panic("No memory for socket CPU siblings map");

    if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
        panic("No memory for boot CPU sibling/core maps");

    set_cpu_sibling_map(0);

    /*
     * If we couldn't find an SMP configuration at boot time,
     * get out of here now!
     */
    if ( !smp_found_config && !acpi_lapic )
    {
        printk(KERN_NOTICE "SMP motherboard not detected.\n");
    init_uniprocessor:
        physids_clear(phys_cpu_present_map);
        physid_set(0, phys_cpu_present_map);
        if (APIC_init_uniprocessor())
            printk(KERN_NOTICE "Local APIC not detected."
                   " Using dummy APIC emulation.\n");
        return;
    }

    /*
     * Should not be necessary because the MP table should list the boot
     * CPU too, but we do it for the sake of robustness anyway.
     * Makes no sense to do this check in clustered apic mode, so skip it.
     */
    if ( !check_apicid_present(boot_cpu_physical_apicid) )
    {
        printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
               boot_cpu_physical_apicid);
        physid_set(get_apic_id(), phys_cpu_present_map);
    }

    /* If we couldn't find a local APIC, then get out of here now! */
    if ( !cpu_has_apic )
    {
        printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
               boot_cpu_physical_apicid);
        goto init_uniprocessor;
    }

    verify_local_APIC();

    connect_bsp_APIC();
    setup_local_APIC();

    smpboot_setup_io_apic();

    setup_boot_APIC_clock();
}

void __init smp_prepare_boot_cpu(void)
{
    unsigned int cpu = smp_processor_id();

    cpumask_set_cpu(cpu, &cpu_online_map);
    cpumask_set_cpu(cpu, &cpu_present_map);
#if NR_CPUS > 2 * BITS_PER_LONG
    per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
#endif
}

static void
remove_siblinginfo(int cpu)
{
    int sibling;

    cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);

    for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
    {
        cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
        /* Last thread sibling in this cpu core going down. */
        if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
            cpu_data[sibling].booted_cores--;
    }

    for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu))
        cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
    cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
    cpumask_clear(per_cpu(cpu_core_mask, cpu));
}

void __cpu_disable(void)
{
    int cpu = smp_processor_id();

    set_cpu_state(CPU_STATE_DYING);

    local_irq_disable();
    clear_local_APIC();
    /* Allow any queued timer interrupts to get serviced */
    local_irq_enable();
    mdelay(1);
    local_irq_disable();

    time_suspend();

    remove_siblinginfo(cpu);

    /* It's now safe to remove this processor from the online map */
    cpumask_clear_cpu(cpu, &cpu_online_map);
    fixup_irqs(&cpu_online_map, 1);
    fixup_eoi();

    if ( cpu_disable_scheduler(cpu) )
        BUG();
}

void __cpu_die(unsigned int cpu)
{
    /* We don't do anything here: idle task is faking death itself. */
    unsigned int i = 0;
    enum cpu_state seen_state;

    while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
    {
        BUG_ON(seen_state != CPU_STATE_DYING);
        mdelay(100);
        cpu_relax();
        process_pending_softirqs();
        if ( (++i % 10) == 0 )
            printk(KERN_ERR "CPU %u still not dead...\n", cpu);
    }
}

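/*
 * Register a physically hot-added CPU: validate and record its APIC/ACPI
 * IDs, attach it to the NUMA node derived from @pxm and, where TSC_RELIABLE
 * is claimed, flag it for explicit TSC resynchronisation.  Returns the new
 * CPU index or a -ve error code.
 */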
int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
{
    int cpu = -1;

    dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
            apic_id, acpi_id, pxm);

    if ( (acpi_id >= MAX_MADT_ENTRIES) ||
         (apic_id >= MAX_APICS) ||
         (pxm >= 256) )
        return -EINVAL;

    if ( !cpu_hotplug_begin() )
        return -EBUSY;

    /* Detect if the cpu has been added before */
    if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID )
    {
        cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id)
            ? -EINVAL : -EEXIST;
        goto out;
    }

    if ( physid_isset(apic_id, phys_cpu_present_map) )
    {
        cpu = -EEXIST;
        goto out;
    }

    if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 )
        goto out;

    x86_acpiid_to_apicid[acpi_id] = apic_id;

    if ( !srat_disabled() )
    {
        nodeid_t node = setup_node(pxm);

        if ( node == NUMA_NO_NODE )
        {
            dprintk(XENLOG_WARNING,
                    "Setup node failed for pxm %x\n", pxm);
            x86_acpiid_to_apicid[acpi_id] = BAD_APICID;
            mp_unregister_lapic(apic_id, cpu);
            cpu = node;
            goto out;
        }
        if ( apic_id < MAX_LOCAL_APIC )
             apicid_to_node[apic_id] = node;
    }

    /* Physically added CPUs do not have synchronised TSC. */
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
    {
        static bool once_only;

        if ( !test_and_set_bool(once_only) )
            printk(XENLOG_WARNING
                   " ** New physical CPU %u may have skewed TSC and hence "
                   "break assumed cross-CPU TSC coherency.\n"
                   " ** Consider using boot parameter \"tsc=skewed\" "
                   "which forces TSC emulation where appropriate.\n", cpu);
        cpumask_set_cpu(cpu, &tsc_sync_cpu_mask);
    }

    srat_detect_node(cpu);
    numa_add_cpu(cpu);
    dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
 out:
    cpu_hotplug_done();
    return cpu;
}


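/*
 * Bring @cpu online: boot it via do_boot_cpu(), then release it to go fully
 * online and wait until it shows up in cpu_online_map.
 */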
int __cpu_up(unsigned int cpu)
{
    int apicid, ret;

    if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
        return -ENODEV;

    if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
        return ret;

    time_latch_stamps();

    set_cpu_state(CPU_STATE_ONLINE);
    while ( !cpu_online(cpu) )
    {
        cpu_relax();
        process_pending_softirqs();
    }

    return 0;
}


void __init smp_cpus_done(void)
{
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        check_nmi_watchdog();

    setup_ioapic_dest();

    mtrr_save_state();
    mtrr_aps_sync_end();
}

void __init smp_intr_init(void)
{
    int irq, vector, seridx, cpu = smp_processor_id();

    /*
     * IRQ0 must be given a fixed assignment and initialized,
     * because it's used before the IO-APIC is set up.
     */
    irq_to_desc(0)->arch.vector = IRQ0_VECTOR;

    /*
     * Also ensure serial interrupts are high priority. We do not
     * want them to be blocked by unacknowledged guest-bound interrupts.
     */
    for ( seridx = 0; seridx <= SERHND_IDX; seridx++ )
    {
        if ( (irq = serial_irq(seridx)) < 0 )
            continue;
        vector = alloc_hipriority_vector();
        per_cpu(vector_irq, cpu)[vector] = irq;
        irq_to_desc(irq)->arch.vector = vector;
        cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map);
    }

    /* Direct IPI vectors. */
    set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
    set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt);
    set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
    set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt);
}