/*
 * x86 SMP booting functions
 *
 * This inherits a great deal from Linux's SMP boot code:
 *  (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *  (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/mm.h>
#include <xen/domain.h>
#include <xen/domain_page.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/softirq.h>
#include <xen/tasklet.h>
#include <xen/serial.h>
#include <xen/numa.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/desc.h>
#include <asm/div64.h>
#include <asm/flushtlb.h>
#include <asm/guest.h>
#include <asm/msr.h>
#include <asm/mtrr.h>
#include <asm/time.h>
#include <asm/tboot.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
#undef page_to_mfn
#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))

#define setup_trampoline() (bootsym_phys(trampoline_realmode_entry))

unsigned long __read_mostly trampoline_phys;

/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
/* representing HT and core siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);

DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
static cpumask_t scratch_cpu0mask;

cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);

unsigned int __read_mostly nr_sockets;
cpumask_t **__read_mostly socket_cpumask;
static cpumask_t *secondary_socket_cpumask;

struct cpuinfo_x86 cpu_data[NR_CPUS];

u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
    { [0 ... NR_CPUS-1] = BAD_APICID };

static int cpu_error;
static enum cpu_state {
    CPU_STATE_DYING,    /* slave -> master: I am dying */
    CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
    CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
    CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
} cpu_state;
#define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0)

void *stack_base[NR_CPUS];

void initialize_cpu_data(unsigned int cpu)
{
    cpu_data[cpu] = boot_cpu_data;
}

static void smp_store_cpu_info(int id)
{
    unsigned int socket;

    identify_cpu(&cpu_data[id]);

    socket = cpu_to_socket(id);
    if ( !socket_cpumask[socket] )
    {
        socket_cpumask[socket] = secondary_socket_cpumask;
        secondary_socket_cpumask = NULL;
    }
}

/*
 * The TSC's upper 32 bits can't be written on earlier CPUs (before
 * Prescott), so there is no way to resync an AP against the BSP.
 */
bool disable_tsc_sync;

static atomic_t tsc_count;
static uint64_t tsc_value;
static cpumask_t tsc_sync_cpu_mask;

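/*
 * TSC synchronisation is a five-round handshake between the boot CPU
 * (master) and the AP being brought up (slave), mediated by tsc_count:
 * in round i the master samples its TSC into tsc_value and bumps tsc_count
 * to the odd value 2i-1; the slave spins for that odd value, writes
 * tsc_value into its own TSC and bumps tsc_count to 2i, releasing the
 * master into the next round.  CPUs marked in tsc_sync_cpu_mask (physically
 * hotplugged ones) are synchronised even when X86_FEATURE_TSC_RELIABLE is
 * set.
 */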
static void synchronize_tsc_master(unsigned int slave)
{
    unsigned int i;

    if ( disable_tsc_sync )
        return;

    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
        return;

    for ( i = 1; i <= 5; i++ )
    {
        tsc_value = rdtsc_ordered();
        wmb();
        atomic_inc(&tsc_count);
        while ( atomic_read(&tsc_count) != (i<<1) )
            cpu_relax();
    }

    atomic_set(&tsc_count, 0);
    cpumask_clear_cpu(slave, &tsc_sync_cpu_mask);
}

static void synchronize_tsc_slave(unsigned int slave)
{
    unsigned int i;

    if ( disable_tsc_sync )
        return;

    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) &&
         !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) )
        return;

    for ( i = 1; i <= 5; i++ )
    {
        while ( atomic_read(&tsc_count) != ((i<<1)-1) )
            cpu_relax();
        rmb();
        /*
         * If a CPU has been physically hotplugged, we may as well write
         * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does
         * not sync up a new CPU's TSC for us.
         */
        __write_tsc(tsc_value);
        atomic_inc(&tsc_count);
    }
}

static void smp_callin(void)
{
    unsigned int cpu = smp_processor_id();
    int i, rc;

    /* Wait 2s total for startup. */
    Dprintk("Waiting for CALLOUT.\n");
    for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
    {
        BUG_ON(i >= 200);
        cpu_relax();
        mdelay(10);
    }

    /*
     * The boot CPU has finished the init stage and is spinning on cpu_state
     * update until we finish. We are free to set up this CPU: first the APIC.
     */
    Dprintk("CALLIN, before setup_local_APIC().\n");
    x2apic_ap_setup();
    setup_local_APIC();

    /* Save our processor parameters. */
    smp_store_cpu_info(cpu);

    if ( (rc = hvm_cpu_up()) != 0 )
    {
        printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu);
        cpu_error = rc;
        clear_local_APIC();
        spin_debug_enable();
        cpu_exit_clear(cpu);
        (*dead_idle)();
    }

    /* Allow the master to continue. */
    set_cpu_state(CPU_STATE_CALLIN);

    synchronize_tsc_slave(cpu);

    /* And wait for our final Ack. */
    while ( cpu_state != CPU_STATE_ONLINE )
        cpu_relax();
}

static int booting_cpu;

/* CPUs for which sibling maps can be computed. */
static cpumask_t cpu_sibling_setup_map;

static void link_thread_siblings(int cpu1, int cpu2)
{
    cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2));
    cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1));
    cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2));
    cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1));
}

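/*
 * Record the topology relationship of a newly booted CPU: every CPU in the
 * same physical package is added to cpu_core_mask, and CPUs sharing a core
 * (same core id, or the same compute unit on TOPOEXT-capable AMD parts) are
 * additionally added to cpu_sibling_mask.  booted_cores counts the cores in
 * the package that have at least one thread online.
 */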
static void set_cpu_sibling_map(int cpu)
{
    int i;
    struct cpuinfo_x86 *c = cpu_data;

    cpumask_set_cpu(cpu, &cpu_sibling_setup_map);

    cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);

    if ( c[cpu].x86_num_siblings > 1 )
    {
        for_each_cpu ( i, &cpu_sibling_setup_map )
        {
            if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) {
                if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
                     (c[cpu].compute_unit_id == c[i].compute_unit_id) )
                    link_thread_siblings(cpu, i);
            } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) &&
                        (c[cpu].cpu_core_id == c[i].cpu_core_id) ) {
                link_thread_siblings(cpu, i);
            }
        }
    }
    else
    {
        cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu));
    }

    if ( c[cpu].x86_max_cores == 1 )
    {
        cpumask_copy(per_cpu(cpu_core_mask, cpu),
                     per_cpu(cpu_sibling_mask, cpu));
        c[cpu].booted_cores = 1;
        return;
    }

    for_each_cpu ( i, &cpu_sibling_setup_map )
    {
        if ( c[cpu].phys_proc_id == c[i].phys_proc_id )
        {
            cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu));
            cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i));
            /*
             * Does this new cpu bring up a new core?
             */
            if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
            {
                /*
                 * For each core in the package, increment
                 * booted_cores for this new cpu.
                 */
                if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i )
                    c[cpu].booted_cores++;
                /*
                 * Increment the core count for all
                 * the other cpus in this package.
                 */
                if ( i != cpu )
                    c[i].booted_cores++;
            }
            else if ( (i != cpu) && !c[cpu].booted_cores )
            {
                c[cpu].booted_cores = c[i].booted_cores;
            }
        }
    }
}

void start_secondary(void *unused)
{
    /*
     * Don't put anything before smp_callin(); SMP booting is so fragile that
     * we want to limit the things done here to the most necessary things.
     */
    unsigned int cpu = booting_cpu;

    /* Critical region without IDT or TSS.  Any fault is deadly! */

    set_processor_id(cpu);
    set_current(idle_vcpu[cpu]);
    this_cpu(curr_vcpu) = idle_vcpu[cpu];
    rdmsrl(MSR_EFER, this_cpu(efer));

    /*
     * Just as during early bootstrap, it is convenient here to disable
     * spinlock checking while we have IRQs disabled. This allows us to
     * acquire IRQ-unsafe locks when it would otherwise be disallowed.
     *
     * It is safe because the race we are usually trying to avoid involves
     * a group of CPUs rendezvousing in an IPI handler, where one cannot
     * join because it is spinning with IRQs disabled waiting to acquire a
     * lock held by another in the rendezvous group (the lock must be an
     * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and
     * hence had IRQs enabled). This is a deadlock scenario.
     *
     * However, no CPU can be involved in rendezvous until it is online,
     * hence no such group can be waiting for this CPU until it is
     * visible in cpu_online_map. Hence such a deadlock is not possible.
     */
    spin_debug_disable();

    load_system_tables();

    /* Full exception support from here on in. */

    /* Safe to enable features such as CR4.MCE with the IDT set up now. */
    write_cr4(mmu_cr4_features);

    percpu_traps_init();

    cpu_init();

    initialize_cpu_data(cpu);

    if ( system_state <= SYS_STATE_smp_boot )
        early_microcode_update_cpu(false);
    else
        microcode_resume_cpu(cpu);

    if ( xen_guest )
        hypervisor_ap_setup();

    smp_callin();

    init_percpu_time();

    setup_secondary_APIC_clock();

    /*
     * Low-memory mappings have been cleared; flush them from
     * the local TLBs too.
     */
    flush_tlb_local();

    /* This must be done before setting cpu_online_map */
    spin_debug_enable();
    set_cpu_sibling_map(cpu);
    notify_cpu_starting(cpu);
    wmb();

    /*
     * We need to hold vector_lock so that the set of online cpus
     * does not change while we are assigning vectors to cpus.  Holding
     * this lock ensures we don't half assign or remove an irq from a cpu.
     */
    lock_vector_lock();
    setup_vector_irq(cpu);
    cpumask_set_cpu(cpu, &cpu_online_map);
    unlock_vector_lock();

    /* We can take interrupts now: we're officially "up". */
    local_irq_enable();
    mtrr_ap_init();

    wmb();
    startup_cpu_idle_loop();
}

extern void *stack_start;

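/*
 * Kick a secondary CPU into life via the local APIC: assert (and, without
 * x2APIC, explicitly deassert) INIT on the target, then send two STARTUP
 * IPIs whose vector encodes the page frame of the real-mode trampoline
 * (start_eip >> 12).  Returns non-zero if the APIC reported a send or
 * delivery error.
 */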
static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
    unsigned long send_status = 0, accept_status = 0;
    int maxlvt, timeout, i;

    /*
     * Be paranoid about clearing APIC errors.
     */
    apic_write(APIC_ESR, 0);
    apic_read(APIC_ESR);

    Dprintk("Asserting INIT.\n");

    /*
     * Turn INIT on target chip via IPI
     */
    apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
                   phys_apicid);

    if ( !x2apic_enabled )
    {
        Dprintk("Waiting for send to finish...\n");
        timeout = 0;
        do {
            Dprintk("+");
            udelay(100);
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
        } while ( send_status && (timeout++ < 1000) );

        mdelay(10);

        Dprintk("Deasserting INIT.\n");

        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);

        Dprintk("Waiting for send to finish...\n");
        timeout = 0;
        do {
            Dprintk("+");
            udelay(100);
            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
        } while ( send_status && (timeout++ < 1000) );
    }
    else if ( tboot_in_measured_env() )
    {
        /*
         * With tboot the AP is actually spinning in a mini-guest before
         * receiving INIT.  Upon receiving the INIT IPI, the AP needs time
         * to VMExit, update the VMCS to track SIPIs, and VMResume.
         *
         * While the AP is in root mode handling the INIT, the CPU will
         * drop any SIPIs.
         */
        udelay(10);
    }

    maxlvt = get_maxlvt();

    for ( i = 0; i < 2; i++ )
    {
        Dprintk("Sending STARTUP #%d.\n", i+1);
        apic_write(APIC_ESR, 0);
        apic_read(APIC_ESR);
        Dprintk("After apic_write.\n");

        /*
         * STARTUP IPI
         * Boot on the stack
         */
        apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);

        if ( !x2apic_enabled )
        {
            /* Give the other CPU some time to accept the IPI. */
            udelay(300);

            Dprintk("Startup point 1.\n");

            Dprintk("Waiting for send to finish...\n");
            timeout = 0;
            do {
                Dprintk("+");
                udelay(100);
                send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
            } while ( send_status && (timeout++ < 1000) );

            /* Give the other CPU some time to accept the IPI. */
            udelay(200);
        }

        /* Due to the Pentium erratum 3AP. */
        if ( maxlvt > 3 )
        {
            apic_write(APIC_ESR, 0);
        }
        accept_status = (apic_read(APIC_ESR) & 0xEF);
        if ( send_status || accept_status )
            break;
    }
    Dprintk("After Startup.\n");

    if ( send_status )
        printk("APIC never delivered???\n");
    if ( accept_status )
        printk("APIC delivery error (%lx).\n", accept_status);

    return (send_status | accept_status);
}

int alloc_cpu_id(void)
{
    cpumask_t tmp_map;
    int cpu;

    cpumask_complement(&tmp_map, &cpu_present_map);
    cpu = cpumask_first(&tmp_map);
    return (cpu < nr_cpu_ids) ? cpu : -ENODEV;
}

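/*
 * Bring one AP up: save the MTRR state for the AP to copy, point the warm
 * reset vector at the trampoline, wake the target with INIT/SIPI (or via
 * tboot), then wait up to five seconds for it to reach CPU_STATE_CALLIN
 * before synchronising its TSC.  Returns 0 on success, the AP's reported
 * error if it died during bringup, or -EIO if it never responded.
 */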
static int do_boot_cpu(int apicid, int cpu)
{
    int timeout, boot_error = 0, rc = 0;
    unsigned long start_eip;

    /*
     * Save current MTRR state in case it was changed since early boot
     * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
     */
    mtrr_save_state();

    booting_cpu = cpu;

    /* start_eip had better be page-aligned! */
    start_eip = setup_trampoline();

    /* So we see what's up */
    if ( opt_cpu_info )
        printk("Booting processor %d/%d eip %lx\n",
               cpu, apicid, start_eip);

    stack_start = stack_base[cpu];

    /* This grunge runs the startup process for the targeted processor. */

    set_cpu_state(CPU_STATE_INIT);

    Dprintk("Setting warm reset code and vector.\n");

    smpboot_setup_warm_reset_vector(start_eip);

    /* Starting actual IPI sequence... */
    if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) )
        boot_error = wakeup_secondary_cpu(apicid, start_eip);

    if ( !boot_error )
    {
        /* Allow AP to start initializing. */
        set_cpu_state(CPU_STATE_CALLOUT);
        Dprintk("After Callout %d.\n", cpu);

        /* Wait 5s total for a response. */
        for ( timeout = 0; timeout < 50000; timeout++ )
        {
            if ( cpu_state != CPU_STATE_CALLOUT )
                break;
            udelay(100);
        }

        if ( cpu_state == CPU_STATE_CALLIN )
        {
            /* number CPUs logically, starting from 1 (BSP is 0) */
            Dprintk("OK.\n");
            print_cpu_info(cpu);
            synchronize_tsc_master(cpu);
            Dprintk("CPU has booted.\n");
        }
        else if ( cpu_state == CPU_STATE_DEAD )
        {
            rmb();
            rc = cpu_error;
        }
        else
        {
            boot_error = 1;
            mb();
            if ( bootsym(trampoline_cpu_started) == 0xA5 )
                /* trampoline started but...? */
                printk("Stuck ??\n");
            else
                /* trampoline code not run */
                printk("Not responding.\n");
        }
    }

    if ( boot_error )
    {
        cpu_exit_clear(cpu);
        rc = -EIO;
    }

    /* mark "stuck" area as not stuck */
    bootsym(trampoline_cpu_started) = 0;
    mb();

    smpboot_restore_warm_reset_vector();

    return rc;
}

#define STUB_BUF_CPU_OFFS(cpu) (((cpu) & (STUBS_PER_PAGE - 1)) * STUB_BUF_SIZE)

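/*
 * Each CPU gets a STUB_BUF_SIZE slot inside a stub page mapped read-execute
 * just below XEN_VIRT_END, at XEN_VIRT_END - (cpu + 1) * PAGE_SIZE.  CPUs in
 * the same STUBS_PER_PAGE-aligned group may share a single physical page:
 * if *mfn is already non-zero that page is simply remapped for this CPU,
 * otherwise a fresh page is allocated, filled with 0xcc (int3) poison, and
 * its MFN handed back through *mfn.
 */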
unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
{
    unsigned long stub_va;
    struct page_info *pg;

    BUILD_BUG_ON(STUBS_PER_PAGE & (STUBS_PER_PAGE - 1));

    if ( *mfn )
        pg = mfn_to_page(_mfn(*mfn));
    else
    {
        nodeid_t node = cpu_to_node(cpu);
        unsigned int memflags = node != NUMA_NO_NODE ? MEMF_node(node) : 0;

        pg = alloc_domheap_page(NULL, memflags);
        if ( !pg )
            return 0;

        unmap_domain_page(memset(__map_domain_page(pg), 0xcc, PAGE_SIZE));
    }

    stub_va = XEN_VIRT_END - (cpu + 1) * PAGE_SIZE;
    if ( map_pages_to_xen(stub_va, mfn_x(page_to_mfn(pg)), 1,
                          PAGE_HYPERVISOR_RX | MAP_SMALL_PAGES) )
    {
        if ( !*mfn )
            free_domheap_page(pg);
        stub_va = 0;
    }
    else if ( !*mfn )
        *mfn = mfn_x(page_to_mfn(pg));

    return stub_va;
}

void cpu_exit_clear(unsigned int cpu)
{
    cpu_uninit(cpu);
    set_cpu_state(CPU_STATE_DEAD);
}

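/*
 * Release the per-CPU resources allocated by cpu_smpboot_alloc().  The stub
 * page may be shared with other CPUs, so this CPU's slot is re-poisoned with
 * 0xcc and the page is only freed once every slot in it reads back as
 * poison, i.e. when no other CPU in the group is still using it.
 */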
static void cpu_smpboot_free(unsigned int cpu)
{
    unsigned int order, socket = cpu_to_socket(cpu);
    struct cpuinfo_x86 *c = cpu_data;

    if ( cpumask_empty(socket_cpumask[socket]) )
    {
        xfree(socket_cpumask[socket]);
        socket_cpumask[socket] = NULL;
    }

    c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID;
    c[cpu].cpu_core_id = XEN_INVALID_CORE_ID;
    c[cpu].compute_unit_id = INVALID_CUID;
    cpumask_clear_cpu(cpu, &cpu_sibling_setup_map);

    free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
    free_cpumask_var(per_cpu(cpu_core_mask, cpu));
    if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask )
        free_cpumask_var(per_cpu(scratch_cpumask, cpu));

    if ( per_cpu(stubs.addr, cpu) )
    {
        mfn_t mfn = _mfn(per_cpu(stubs.mfn, cpu));
        unsigned char *stub_page = map_domain_page(mfn);
        unsigned int i;

        memset(stub_page + STUB_BUF_CPU_OFFS(cpu), 0xcc, STUB_BUF_SIZE);
        for ( i = 0; i < STUBS_PER_PAGE; ++i )
            if ( stub_page[i * STUB_BUF_SIZE] != 0xcc )
                break;
        unmap_domain_page(stub_page);
        destroy_xen_mappings(per_cpu(stubs.addr, cpu) & PAGE_MASK,
                             (per_cpu(stubs.addr, cpu) | ~PAGE_MASK) + 1);
        if ( i == STUBS_PER_PAGE )
            free_domheap_page(mfn_to_page(mfn));
    }

    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
    free_xenheap_pages(per_cpu(gdt_table, cpu), order);

    free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order);

    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
    free_xenheap_pages(idt_tables[cpu], order);
    idt_tables[cpu] = NULL;

    if ( stack_base[cpu] != NULL )
    {
        memguard_unguard_stack(stack_base[cpu]);
        free_xenheap_pages(stack_base[cpu], STACK_ORDER);
        stack_base[cpu] = NULL;
    }
}

static int cpu_smpboot_alloc(unsigned int cpu)
{
    unsigned int i, order, memflags = 0;
    nodeid_t node = cpu_to_node(cpu);
    struct desc_struct *gdt;
    unsigned long stub_page;

    if ( node != NUMA_NO_NODE )
        memflags = MEMF_node(node);

    stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, memflags);
    if ( stack_base[cpu] == NULL )
        goto oom;
    memguard_guard_stack(stack_base[cpu]);

    order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
    per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
    if ( gdt == NULL )
        goto oom;
    memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
    BUILD_BUG_ON(NR_CPUS > 0x10000);
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;

    per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, memflags);
    if ( gdt == NULL )
        goto oom;
    memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE);
    gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;

    order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t));
    idt_tables[cpu] = alloc_xenheap_pages(order, memflags);
    if ( idt_tables[cpu] == NULL )
        goto oom;
    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
    set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

    /* Reuse the stub page of an online CPU in the same group and node. */
    for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
          i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
        if ( cpu_online(i) && cpu_to_node(i) == node )
        {
            per_cpu(stubs.mfn, cpu) = per_cpu(stubs.mfn, i);
            break;
        }
    BUG_ON(i == cpu);
    stub_page = alloc_stub_page(cpu, &per_cpu(stubs.mfn, cpu));
    if ( !stub_page )
        goto oom;
    per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);

    if ( secondary_socket_cpumask == NULL &&
         (secondary_socket_cpumask = xzalloc(cpumask_t)) == NULL )
        goto oom;

    if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
         alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
        return 0;

 oom:
    cpu_smpboot_free(cpu);
    return -ENOMEM;
}

static int cpu_smpboot_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_smpboot_alloc(cpu);
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_smpboot_free(cpu);
        break;
    default:
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_smpboot_nfb = {
    .notifier_call = cpu_smpboot_callback
};

void __init smp_prepare_cpus(unsigned int max_cpus)
{
    register_cpu_notifier(&cpu_smpboot_nfb);

    mtrr_aps_sync_begin();

    /* Setup boot CPU information */
    initialize_cpu_data(0); /* Final full version of the data */
    print_cpu_info(0);

    boot_cpu_physical_apicid = get_apic_id();
    x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;

    stack_base[0] = stack_start;

    set_nr_sockets();

    socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
    if ( socket_cpumask == NULL ||
         (socket_cpumask[cpu_to_socket(0)] = xzalloc(cpumask_t)) == NULL )
        panic("No memory for socket CPU siblings map");

    if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
        panic("No memory for boot CPU sibling/core maps");

    set_cpu_sibling_map(0);

    /*
     * If we couldn't find an SMP configuration at boot time,
     * get out of here now!
     */
    if ( !smp_found_config && !acpi_lapic )
    {
        printk(KERN_NOTICE "SMP motherboard not detected.\n");
    init_uniprocessor:
        physids_clear(phys_cpu_present_map);
        physid_set(0, phys_cpu_present_map);
        if (APIC_init_uniprocessor())
            printk(KERN_NOTICE "Local APIC not detected."
                   " Using dummy APIC emulation.\n");
        return;
    }

    /*
     * Should not be necessary because the MP table should list the boot
     * CPU too, but we do it for the sake of robustness anyway.
     * Makes no sense to do this check in clustered apic mode, so skip it.
     */
    if ( !check_apicid_present(boot_cpu_physical_apicid) )
    {
        printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
               boot_cpu_physical_apicid);
        physid_set(get_apic_id(), phys_cpu_present_map);
    }

    /* If we couldn't find a local APIC, then get out of here now! */
    if ( !cpu_has_apic )
    {
        printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
               boot_cpu_physical_apicid);
        goto init_uniprocessor;
    }

    verify_local_APIC();

    connect_bsp_APIC();
    setup_local_APIC();

    smpboot_setup_io_apic();

    setup_boot_APIC_clock();
}

void __init smp_prepare_boot_cpu(void)
{
    unsigned int cpu = smp_processor_id();

    cpumask_set_cpu(cpu, &cpu_online_map);
    cpumask_set_cpu(cpu, &cpu_present_map);
#if NR_CPUS > 2 * BITS_PER_LONG
    per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
#endif
}

static void
remove_siblinginfo(int cpu)
{
    int sibling;

    cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);

    for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
    {
        cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
        /* Last thread sibling in this cpu core going down. */
        if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 )
            cpu_data[sibling].booted_cores--;
    }

    for_each_cpu ( sibling, per_cpu(cpu_sibling_mask, cpu) )
        cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling));
    cpumask_clear(per_cpu(cpu_sibling_mask, cpu));
    cpumask_clear(per_cpu(cpu_core_mask, cpu));
}

void __cpu_disable(void)
{
    int cpu = smp_processor_id();

    set_cpu_state(CPU_STATE_DYING);

    local_irq_disable();
    clear_local_APIC();
    /* Allow any queued timer interrupts to get serviced */
    local_irq_enable();
    mdelay(1);
    local_irq_disable();

    time_suspend();

    remove_siblinginfo(cpu);

    /* It's now safe to remove this processor from the online map */
    cpumask_clear_cpu(cpu, &cpu_online_map);
    fixup_irqs(&cpu_online_map, 1);
    fixup_eoi();

    if ( cpu_disable_scheduler(cpu) )
        BUG();
}

void __cpu_die(unsigned int cpu)
{
    /* We don't do anything here: idle task is faking death itself. */
    unsigned int i = 0;
    enum cpu_state seen_state;

    while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
    {
        BUG_ON(seen_state != CPU_STATE_DYING);
        mdelay(100);
        cpu_relax();
        process_pending_softirqs();
        if ( (++i % 10) == 0 )
            printk(KERN_ERR "CPU %u still not dead...\n", cpu);
    }
}

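/*
 * Register a physically hot-added CPU: validate the IDs, register its local
 * APIC, bind it to the NUMA node derived from the supplied proximity domain,
 * and, when X86_FEATURE_TSC_RELIABLE is set, mark it in tsc_sync_cpu_mask so
 * that its TSC still gets resynced on bringup (the platform does not
 * synchronise a new CPU's TSC for us).  Returns the new CPU number on
 * success or a -E* error code.
 */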
int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
{
    int cpu = -1;

    dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n",
            apic_id, acpi_id, pxm);

    if ( (acpi_id >= MAX_MADT_ENTRIES) ||
         (apic_id >= MAX_APICS) ||
         (pxm >= 256) )
        return -EINVAL;

    if ( !cpu_hotplug_begin() )
        return -EBUSY;

    /* Detect if the cpu has been added before */
    if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID )
    {
        cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id)
            ? -EINVAL : -EEXIST;
        goto out;
    }

    if ( physid_isset(apic_id, phys_cpu_present_map) )
    {
        cpu = -EEXIST;
        goto out;
    }

    if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 )
        goto out;

    x86_acpiid_to_apicid[acpi_id] = apic_id;

    if ( !srat_disabled() )
    {
        nodeid_t node = setup_node(pxm);

        if ( node == NUMA_NO_NODE )
        {
            dprintk(XENLOG_WARNING,
                    "Setup node failed for pxm %x\n", pxm);
            x86_acpiid_to_apicid[acpi_id] = BAD_APICID;
            mp_unregister_lapic(apic_id, cpu);
            cpu = node;
            goto out;
        }
        if ( apic_id < MAX_LOCAL_APIC )
            apicid_to_node[apic_id] = node;
    }

    /* Physically added CPUs do not have synchronised TSC. */
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
    {
        static bool once_only;

        if ( !test_and_set_bool(once_only) )
            printk(XENLOG_WARNING
                   " ** New physical CPU %u may have skewed TSC and hence "
                   "break assumed cross-CPU TSC coherency.\n"
                   " ** Consider using boot parameter \"tsc=skewed\" "
                   "which forces TSC emulation where appropriate.\n", cpu);
        cpumask_set_cpu(cpu, &tsc_sync_cpu_mask);
    }

    srat_detect_node(cpu);
    numa_add_cpu(cpu);
    dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu);
 out:
    cpu_hotplug_done();
    return cpu;
}


int __cpu_up(unsigned int cpu)
{
    int apicid, ret;

    if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID )
        return -ENODEV;

    if ( (ret = do_boot_cpu(apicid, cpu)) != 0 )
        return ret;

    time_latch_stamps();

    set_cpu_state(CPU_STATE_ONLINE);
    while ( !cpu_online(cpu) )
    {
        cpu_relax();
        process_pending_softirqs();
    }

    return 0;
}


void __init smp_cpus_done(void)
{
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        check_nmi_watchdog();

    setup_ioapic_dest();

    mtrr_save_state();
    mtrr_aps_sync_end();
}

void __init smp_intr_init(void)
{
    int irq, vector, seridx, cpu = smp_processor_id();

    /*
     * IRQ0 must be given a fixed assignment and initialized,
     * because it's used before the IO-APIC is set up.
     */
    irq_to_desc(0)->arch.vector = IRQ0_VECTOR;

    /*
     * Also ensure serial interrupts are high priority. We do not
     * want them to be blocked by unacknowledged guest-bound interrupts.
     */
    for ( seridx = 0; seridx <= SERHND_IDX; seridx++ )
    {
        if ( (irq = serial_irq(seridx)) < 0 )
            continue;
        vector = alloc_hipriority_vector();
        per_cpu(vector_irq, cpu)[vector] = irq;
        irq_to_desc(irq)->arch.vector = vector;
        cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map);
    }

    /* Direct IPI vectors. */
    set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
    set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt);
    set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
    set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt);
}