// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2014 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <kernel/mp.h>

#include <arch/mp.h>
#include <arch/ops.h>
#include <assert.h>
#include <debug.h>
#include <dev/interrupt.h>
#include <err.h>
#include <fbl/algorithm.h>
#include <inttypes.h>
#include <kernel/align.h>
#include <kernel/dpc.h>
#include <kernel/event.h>
#include <kernel/mutex.h>
#include <kernel/sched.h>
#include <kernel/spinlock.h>
#include <kernel/stats.h>
#include <kernel/timer.h>
#include <lk/init.h>
#include <platform.h>
#include <platform/timer.h>
#include <stdlib.h>
#include <trace.h>
#include <zircon/types.h>

#define LOCAL_TRACE 0

// a global state structure, aligned on a cpu cache line to avoid false sharing
struct mp_state mp __CPU_ALIGN_EXCLUSIVE;

// Helpers used for implementing mp_sync
struct mp_sync_context;
static void mp_sync_task(void* context);

void mp_init(void) {
    mutex_init(&mp.hotplug_lock);
    mp.ipi_task_lock = SPIN_LOCK_INITIAL_VALUE;
    for (uint i = 0; i < fbl::count_of(mp.ipi_task_list); ++i) {
        list_initialize(&mp.ipi_task_list[i]);
    }
}

void mp_prepare_current_cpu_idle_state(bool idle) {
    arch_prepare_current_cpu_idle_state(idle);
}

void mp_reschedule(cpu_mask_t mask, uint flags) {
    // we must be holding the thread lock to access some of the cpu state
    // bitmaps, and some arch_mp_reschedule implementations require it as well.
    DEBUG_ASSERT(thread_lock_held());

    const cpu_num_t local_cpu = arch_curr_cpu_num();

    LTRACEF("local %u, mask %#x\n", local_cpu, mask);

    // mask out cpus that are not active and the local cpu
    mask &= mp.active_cpus;
    mask &= ~cpu_num_to_mask(local_cpu);

    // mask out cpus that are currently running realtime code
    if ((flags & MP_RESCHEDULE_FLAG_REALTIME) == 0) {
        mask &= ~mp.realtime_cpus;
    }

    LTRACEF("local %u, post mask target now 0x%x\n", local_cpu, mask);

    // if we have no work to do, return
    if (mask == 0) {
        return;
    }

    arch_mp_reschedule(mask);
}
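
// Illustrative sketch (comment only, not part of the build): a hypothetical
// call site that asks one remote CPU to reschedule.  The thread lock must
// already be held, as asserted above; |target_cpu| is a made-up name.
//
//   spin_lock_saved_state_t state;
//   spin_lock_irqsave(&thread_lock, state);
//   mp_reschedule(cpu_num_to_mask(target_cpu), 0);
//   spin_unlock_irqrestore(&thread_lock, state);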

void mp_interrupt(mp_ipi_target_t target, cpu_mask_t mask) {
    arch_mp_send_ipi(target, mask, MP_IPI_INTERRUPT);
}

struct mp_sync_context {
    mp_sync_task_t task;
    void* task_context;
    // Mask of which CPUs need to finish the task
    volatile cpu_mask_t outstanding_cpus;
};

static void mp_sync_task(void* raw_context) {
    auto context = reinterpret_cast<mp_sync_context*>(raw_context);
    context->task(context->task_context);
    // use seq-cst atomic to ensure this update is not seen before the
    // side-effects of context->task
    atomic_and((int*)&context->outstanding_cpus, ~cpu_num_to_mask(arch_curr_cpu_num()));
}

/* @brief Execute a task on the specified CPUs, and block on the calling
 *        CPU until all CPUs have finished the task.
 *
 *  If MP_IPI_TARGET_ALL or MP_IPI_TARGET_ALL_BUT_LOCAL is the target, the online CPU
 *  mask will be used to determine actual targets.
 *
 * Interrupts must be disabled if calling with MP_IPI_TARGET_ALL_BUT_LOCAL as the target.
 *
 * The callback in |task| will always be called with |arch_blocking_disallowed()|
 * set to true.
 */
void mp_sync_exec(mp_ipi_target_t target, cpu_mask_t mask, mp_sync_task_t task, void* context) {
    uint num_cpus = arch_max_num_cpus();

    if (target == MP_IPI_TARGET_ALL) {
        mask = mp_get_online_mask();
    } else if (target == MP_IPI_TARGET_ALL_BUT_LOCAL) {
        // targeting all other CPUs but the current one is hazardous
        // if the local CPU may be changed underneath us
        DEBUG_ASSERT(arch_ints_disabled());
        mask = mp_get_online_mask() & ~cpu_num_to_mask(arch_curr_cpu_num());
    } else {
        // Mask any offline CPUs from target list
        mask &= mp_get_online_mask();
    }

    // disable interrupts so our current CPU doesn't change
    spin_lock_saved_state_t irqstate;
    arch_interrupt_save(&irqstate, SPIN_LOCK_FLAG_INTERRUPTS);
    smp_mb();

    const cpu_num_t local_cpu = arch_curr_cpu_num();

    // remove self from the target mask, since there's no need to IPI ourselves
    bool targeting_self = !!(mask & cpu_num_to_mask(local_cpu));
    mask &= ~cpu_num_to_mask(local_cpu);

    // create tasks to enqueue (we need one per target, since each contains
    // a linked list node)
    struct mp_sync_context sync_context = {
        .task = task,
        .task_context = context,
        .outstanding_cpus = mask,
    };

    struct mp_ipi_task sync_tasks[SMP_MAX_CPUS] = {};
    for (uint i = 0; i < num_cpus; ++i) {
        sync_tasks[i].func = mp_sync_task;
        sync_tasks[i].context = &sync_context;
    }

    // enqueue tasks
    spin_lock(&mp.ipi_task_lock);
    cpu_mask_t remaining = mask;
    uint cpu_id = 0;
    while (remaining && cpu_id < num_cpus) {
        if (remaining & 1) {
            list_add_tail(&mp.ipi_task_list[cpu_id], &sync_tasks[cpu_id].node);
        }
        remaining >>= 1;
        cpu_id++;
    }
    spin_unlock(&mp.ipi_task_lock);

    // let CPUs know to begin executing
    __UNUSED zx_status_t status = arch_mp_send_ipi(MP_IPI_TARGET_MASK, mask, MP_IPI_GENERIC);
    DEBUG_ASSERT(status == ZX_OK);

    if (targeting_self) {
        bool previous_blocking_disallowed = arch_blocking_disallowed();
        arch_set_blocking_disallowed(true);
        mp_sync_task(&sync_context);
        arch_set_blocking_disallowed(previous_blocking_disallowed);
    }
    smp_mb();

    // we can take interrupts again once we've executed our task
    arch_interrupt_restore(irqstate, SPIN_LOCK_FLAG_INTERRUPTS);

    bool ints_disabled = arch_ints_disabled();
    // wait for all other CPUs to be done with the context
    while (1) {
        // See comment in mp_unplug_trampoline about related CPU hotplug
        // guarantees.
        cpu_mask_t outstanding = atomic_load_relaxed(
            (int*)&sync_context.outstanding_cpus);
        cpu_mask_t online = mp_get_online_mask();
        if ((outstanding & online) == 0) {
            break;
        }

        // If interrupts are still disabled, we need to attempt to process any
        // tasks queued for us in order to prevent deadlock.
        if (ints_disabled) {
            // Optimistically check if our task list has work without the lock.
            // mp_mbx_generic_irq will take the lock and check again.
            if (!list_is_empty(&mp.ipi_task_list[local_cpu])) {
                bool previous_blocking_disallowed = arch_blocking_disallowed();
                arch_set_blocking_disallowed(true);
                mp_mbx_generic_irq(nullptr);
                arch_set_blocking_disallowed(previous_blocking_disallowed);
                continue;
            }
        }

        arch_spinloop_pause();
    }
    smp_mb();

    // make sure the sync_tasks aren't in lists anymore, since they're
    // stack allocated
    spin_lock_irqsave(&mp.ipi_task_lock, irqstate);
    for (uint i = 0; i < num_cpus; ++i) {
        // If a task is still around, it's because the CPU went offline.
        if (list_in_list(&sync_tasks[i].node)) {
            list_delete(&sync_tasks[i].node);
        }
    }
    spin_unlock_irqrestore(&mp.ipi_task_lock, irqstate);
}
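
// Illustrative sketch (comment only, not part of the build) of how a caller
// might use mp_sync_exec to run a callback on every online CPU and wait for
// all of them to finish.  |flush_local_state| and |some_state_t| are made-up
// names for the example.
//
//   static void flush_local_state(void* arg) {
//       auto s = static_cast<some_state_t*>(arg);
//       // per-CPU work goes here; it must not block, since
//       // arch_blocking_disallowed() is true while the task runs
//   }
//
//   some_state_t state;
//   mp_sync_exec(MP_IPI_TARGET_ALL, 0, flush_local_state, &state);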

static void mp_unplug_trampoline(void) TA_REQ(thread_lock) __NO_RETURN;
static void mp_unplug_trampoline(void) {
    // We're still holding the thread lock from the reschedule that took us
    // here.

    thread_t* ct = get_current_thread();
    auto unplug_done = reinterpret_cast<event_t*>(ct->arg);

    cpu_num_t cpu_num = arch_curr_cpu_num();
    sched_transition_off_cpu(cpu_num);

    // Note that before this invocation, but after we stopped accepting
    // interrupts, we may have received a synchronous task to perform.
    // Clearing this flag will cause the mp_sync_exec caller to consider
    // this CPU done.  If this CPU comes back online before all of the
    // other CPUs finish their work (very unlikely, since tasks should be
    // quick), then this CPU may execute the task.
    mp_set_curr_cpu_online(false);

    // do *not* enable interrupts, we want this CPU to never receive another
    // interrupt
    spin_unlock(&thread_lock);

    // Stop and then shutdown this CPU's platform timer.
    platform_stop_timer();
    platform_shutdown_timer();

    // Shutdown the interrupt controller for this CPU.  On some platforms (arm64 with GIC) receiving
    // an interrupt at a powered off CPU can result in implementation defined behavior (including
    // resetting the whole system).
    shutdown_interrupts_curr_cpu();

    // flush all of our caches
    arch_flush_state_and_halt(unplug_done);
}

// Hotplug the given cpus.  Blocks until the CPUs are up, or a failure is
// detected.
//
// This should be called in a thread context
zx_status_t mp_hotplug_cpu_mask(cpu_mask_t cpu_mask) {
    DEBUG_ASSERT(!arch_ints_disabled());

    zx_status_t status = ZX_OK;

    mutex_acquire(&mp.hotplug_lock);

    // Make sure all of the requested CPUs are offline
    if (cpu_mask & mp_get_online_mask()) {
        status = ZX_ERR_BAD_STATE;
        goto cleanup_mutex;
    }

    while (cpu_mask != 0) {
        cpu_num_t cpu_id = highest_cpu_set(cpu_mask);
        cpu_mask &= ~cpu_num_to_mask(cpu_id);

        status = platform_mp_cpu_hotplug(cpu_id);
        if (status != ZX_OK) {
            break;
        }
    }
cleanup_mutex:
    mutex_release(&mp.hotplug_lock);
    return status;
}
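
// Illustrative sketch (comment only, not part of the build): bringing CPU 1
// back online from a kernel thread, assuming it was previously unplugged.
//
//   zx_status_t st = mp_hotplug_cpu_mask(cpu_num_to_mask(1));
//   if (st != ZX_OK) {
//       printf("failed to hotplug cpu 1: %d\n", st);
//   }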

// Unplug a single CPU.  Must be called while holding the hotplug lock
static zx_status_t mp_unplug_cpu_mask_single_locked(cpu_num_t cpu_id) {
    // Wait for |cpu_id| to complete any in-progress DPCs and terminate its DPC thread.  Later, once
    // nothing is running on it, we'll migrate its queued DPCs to another CPU.
    dpc_shutdown(cpu_id);

    // TODO(maniscalco): |cpu_id| is about to shutdown.  We should ensure it has no pinned threads
    // (except maybe the idle thread).  Once we're confident we've terminated/migrated them all,
    // this would be a good place to DEBUG_ASSERT.

    // Create a thread for the unplug.  We will cause the target CPU to
    // context switch to this thread.  After this happens, it should no
    // longer be accessing system state and can be safely shut down.
    //
    // This thread is pinned to the target CPU and set to run with the
    // highest priority.  This should cause the target CPU to pick up the
    // thread immediately (or very soon, if for some reason there is another
    // HIGHEST_PRIORITY task scheduled in between when we resume the
    // thread and when the CPU is woken up).
    event_t unplug_done = EVENT_INITIAL_VALUE(unplug_done, false, 0);
    thread_t* t = thread_create_etc(
        NULL,
        "unplug_thread",
        NULL,
        &unplug_done,
        HIGHEST_PRIORITY,
        mp_unplug_trampoline);
    if (t == NULL) {
        return ZX_ERR_NO_MEMORY;
    }

    zx_status_t status = platform_mp_prep_cpu_unplug(cpu_id);
    if (status != ZX_OK) {
        return status;
    }

    // Pin to the target CPU
    thread_set_cpu_affinity(t, cpu_num_to_mask(cpu_id));
    // Set real time to cancel the preemption timer
    thread_set_real_time(t);

    status = thread_detach_and_resume(t);
    if (status != ZX_OK) {
        goto cleanup_thread;
    }

    // Wait for the unplug thread to get scheduled on the target
    do {
        status = event_wait(&unplug_done);
    } while (status != ZX_OK);

    // Now that the CPU is no longer processing tasks, move all of its timers
    timer_transition_off_cpu(cpu_id);
    // Move the CPU's queued DPCs to the current CPU.
    dpc_shutdown_transition_off_cpu(cpu_id);

    status = platform_mp_cpu_unplug(cpu_id);
    if (status != ZX_OK) {
        // Do not clean up the unplug thread in this case.  We have successfully
        // unplugged the CPU from the scheduler's perspective, but the platform
        // may have failed to shut down the CPU
        return status;
    }

// Fall through.  Since the thread is scheduled, it should not be in any
// queues.  Since the CPU running this thread is now shutdown, we can just
// erase the thread's existence.
cleanup_thread:
    thread_forget(t);
    return status;
}

// Unplug the given cpus.  Blocks until the CPUs are removed.  Partial
// failure may occur (in which case some CPUs are removed but not others).
//
// This should be called in a thread context
zx_status_t mp_unplug_cpu_mask(cpu_mask_t cpu_mask) {
    DEBUG_ASSERT(!arch_ints_disabled());

    zx_status_t status = ZX_OK;

    mutex_acquire(&mp.hotplug_lock);

    // Make sure all of the requested CPUs are online
    if (cpu_mask & ~mp_get_online_mask()) {
        status = ZX_ERR_BAD_STATE;
        goto cleanup_mutex;
    }

    while (cpu_mask != 0) {
        cpu_num_t cpu_id = highest_cpu_set(cpu_mask);
        cpu_mask &= ~cpu_num_to_mask(cpu_id);

        status = mp_unplug_cpu_mask_single_locked(cpu_id);
        if (status != ZX_OK) {
            break;
        }
    }

cleanup_mutex:
    mutex_release(&mp.hotplug_lock);
    return status;
}
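
// Illustrative sketch (comment only, not part of the build): taking every CPU
// except the boot CPU offline, assuming cpu 0 is the boot CPU.
//
//   cpu_mask_t secondary = mp_get_online_mask() & ~cpu_num_to_mask(0);
//   if (secondary != 0) {
//       zx_status_t st = mp_unplug_cpu_mask(secondary);
//       // On failure, some CPUs may have been removed while others were not.
//   }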

interrupt_eoi mp_mbx_generic_irq(void*) {
    DEBUG_ASSERT(arch_ints_disabled());
    const cpu_num_t local_cpu = arch_curr_cpu_num();

    CPU_STATS_INC(generic_ipis);

    while (1) {
        struct mp_ipi_task* task;
        spin_lock(&mp.ipi_task_lock);
        task = list_remove_head_type(&mp.ipi_task_list[local_cpu], struct mp_ipi_task, node);
        spin_unlock(&mp.ipi_task_lock);
        if (task == NULL) {
            break;
        }

        task->func(task->context);
    }

    return IRQ_EOI_DEACTIVATE;
}

interrupt_eoi mp_mbx_reschedule_irq(void*) {
    const cpu_num_t cpu = arch_curr_cpu_num();

    LTRACEF("cpu %u\n", cpu);

    CPU_STATS_INC(reschedule_ipis);

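    // only mark a preemption as pending if this CPU is still in the active set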
    if (mp.active_cpus & cpu_num_to_mask(cpu)) {
        thread_preempt_set_pending();
    }

    return IRQ_EOI_DEACTIVATE;
}

interrupt_eoi mp_mbx_interrupt_irq(void*) {
    const cpu_num_t cpu = arch_curr_cpu_num();

    LTRACEF("cpu %u\n", cpu);

    // do nothing, the entire point of this interrupt is to simply have one
    // delivered to the cpu.

    return IRQ_EOI_DEACTIVATE;
}

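// Weak default implementations.  Architectures and platforms that support CPU
// hotplug/unplug override these; otherwise the operations fail with
// ZX_ERR_NOT_SUPPORTED.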
__WEAK zx_status_t arch_mp_cpu_hotplug(uint cpu_id) {
    return ZX_ERR_NOT_SUPPORTED;
}
__WEAK zx_status_t arch_mp_prep_cpu_unplug(uint cpu_id) {
    return ZX_ERR_NOT_SUPPORTED;
}
__WEAK zx_status_t arch_mp_cpu_unplug(uint cpu_id) {
    return ZX_ERR_NOT_SUPPORTED;
}
__WEAK zx_status_t platform_mp_cpu_hotplug(uint cpu_id) {
    return arch_mp_cpu_hotplug(cpu_id);
}
__WEAK zx_status_t platform_mp_prep_cpu_unplug(uint cpu_id) {
    return arch_mp_prep_cpu_unplug(cpu_id);
}
__WEAK zx_status_t platform_mp_cpu_unplug(uint cpu_id) {
    return arch_mp_cpu_unplug(cpu_id);
}