/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright (C) Allen Kay <allen.m.kay@intel.com>
 * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com>
 */

#include <xen/event.h>
#include <xen/iommu.h>
#include <xen/cpu.h>
#include <xen/irq.h>
#include <asm/hvm/irq.h>
#include <asm/hvm/support.h>
#include <asm/io_apic.h>

static DEFINE_PER_CPU(struct list_head, dpci_list);

/*
 * These two bit states help to safely schedule, deschedule, and wait until
 * the softirq has finished.
 *
 * The semantics behind these two bits are as follows:
 *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
 *  - STATE_RUN - only the softirq is allowed to set and clear it. If it has
 *      been set hvm_dirq_assist will RUN with a saved value of the
 *      'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
 *
 * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
 * STATE_SCHED(unset) -> STATE_RUN(unset).
 *
 * However the states can also diverge, such as: STATE_SCHED(set) ->
 * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
 * 'hvm_dirq_assist' never ran and the softirq did not do any
 * ref-counting.
 */

enum {
    STATE_SCHED,
    STATE_RUN
};

/*
 * This can be called multiple times, but the softirq is only raised once.
 * That is, until the STATE_SCHED state has been cleared. The state can be
 * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
 * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
 * the softirq has had a chance to run).
 */
static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
{
    unsigned long flags;

    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
        return;

    get_knownalive_domain(pirq_dpci->dom);

    local_irq_save(flags);
    list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
    local_irq_restore(flags);

    raise_softirq(HVM_DPCI_SOFTIRQ);
}

/*
 * If we are racing with 'dpci_softirq' (STATE_SCHED is set) we return
 * true. Otherwise we return false.
 *
 * If it is false, it is the caller's responsibility to make sure
 * that the softirq (with the event_lock dropped) has run.
 */
bool pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
{
    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
        return true;

    /*
     * If in the future we were to call 'raise_softirq_for' right away
     * after 'pt_pirq_softirq_active', we MUST reset the list first
     * (otherwise it might contain stale data).
     */
    return false;
}

/*
 * Reset the pirq_dpci->dom parameter to NULL.
 *
 * This function checks the different states to make sure it can do it
 * at the right time. If it unschedules 'hvm_dirq_assist' from running
 * it also drops the domain ref-count (which is what the softirq would
 * have done).
 */
static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
{
    struct domain *d = pirq_dpci->dom;

    ASSERT(spin_is_locked(&d->event_lock));

    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
    {
    case (1 << STATE_SCHED):
        /*
         * We are going to try to de-schedule the softirq before it goes into
         * STATE_RUN. Whoever clears STATE_SCHED MUST drop the ref-count on
         * 'dom'.
         */
        put_domain(d);
        /* fallthrough. */
    case (1 << STATE_RUN):
    case (1 << STATE_RUN) | (1 << STATE_SCHED):
        /*
         * The reason it is OK to reset 'dom' when the STATE_RUN bit is set is
         * due to a shortcut the 'dpci_softirq' implements. It stashes the
         * 'dom' in a local variable before it sets STATE_RUN - and therefore
         * will not dereference '->dom', which would crash.
         */
        pirq_dpci->dom = NULL;
        break;
    }
    /*
     * Inhibit 'hvm_dirq_assist' from doing anything useful and, at worst,
     * from calling 'set_timer', which would blow up (as we have called
     * kill_timer or never initialized it). Note that we hold the lock that
     * 'hvm_dirq_assist' could be spinning on.
     */
    pirq_dpci->masked = 0;
}

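/*
 * A timeout timer is only used for mappings that are neither guest-MSI nor
 * translated (HVM_IRQ_DPCI_GUEST_MSI / HVM_IRQ_DPCI_TRANSLATE); see the
 * comment ahead of set_timer() in hvm_dirq_assist() for why line interrupts
 * need it.
 */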
bool pt_irq_need_timer(uint32_t flags)
{
    return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE));
}

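/*
 * pt_pirq_iterate() callback (also called directly for identity-mapped
 * GSIs): if pt_irq_time_out() latched an EOI for this entry, clear the
 * pending state and EOI the machine interrupt.
 */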
static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
                            void *arg)
{
    if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
                              &pirq_dpci->flags) )
    {
        pirq_dpci->masked = 0;
        pirq_dpci->pending = 0;
        pirq_guest_eoi(dpci_pirq(pirq_dpci));
    }

    return 0;
}

static void pt_irq_time_out(void *data)
{
    struct hvm_pirq_dpci *irq_map = data;
    const struct hvm_irq_dpci *dpci;
    const struct dev_intx_gsi_link *digl;

    spin_lock(&irq_map->dom->event_lock);

    if ( irq_map->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
    {
        ASSERT(is_hardware_domain(irq_map->dom));
        /*
         * Identity mapped, no need to iterate over the guest GSI list to find
         * other pirqs sharing the same guest GSI.
         *
         * In the identity mapped case the EOI can also be done now; this way
         * the iteration over the list of domain pirqs is avoided.
         */
        hvm_gsi_deassert(irq_map->dom, dpci_pirq(irq_map)->pirq);
        irq_map->flags |= HVM_IRQ_DPCI_EOI_LATCH;
        pt_irq_guest_eoi(irq_map->dom, irq_map, NULL);
        spin_unlock(&irq_map->dom->event_lock);
        return;
    }

    dpci = domain_get_irq_dpci(irq_map->dom);
    if ( unlikely(!dpci) )
    {
        ASSERT_UNREACHABLE();
        spin_unlock(&irq_map->dom->event_lock);
        return;
    }
    list_for_each_entry ( digl, &irq_map->digl_list, list )
    {
        unsigned int guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
        const struct hvm_girq_dpci_mapping *girq;

        list_for_each_entry ( girq, &dpci->girq[guest_gsi], list )
        {
            struct pirq *pirq = pirq_info(irq_map->dom, girq->machine_gsi);

            pirq_dpci(pirq)->flags |= HVM_IRQ_DPCI_EOI_LATCH;
        }
        hvm_pci_intx_deassert(irq_map->dom, digl->device, digl->intx);
    }

    pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);

    spin_unlock(&irq_map->dom->event_lock);
}

struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
{
    if ( !d || !is_hvm_domain(d) )
        return NULL;

    return hvm_domain_irq(d)->dpci;
}

void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
{
    xfree(dpci);
}

/*
 * This routine handles lowest-priority interrupts using the vector-hashing
 * mechanism. As an example, modern Intel CPUs use this method to handle
 * lowest-priority interrupts.
 *
 * Here are the details of the vector-hashing mechanism:
 * 1. For lowest-priority interrupts, store all the possible destination
 *    vCPUs in an array.
 * 2. Use "gvec % max number of destination vCPUs" to find the right
 *    destination vCPU in the array for the lowest-priority interrupt.
 */
static struct vcpu *vector_hashing_dest(const struct domain *d,
                                        uint32_t dest_id,
                                        bool dest_mode,
                                        uint8_t gvec)

{
    unsigned long *dest_vcpu_bitmap;
    unsigned int dest_vcpus = 0;
    struct vcpu *v, *dest = NULL;
    unsigned int i;

    dest_vcpu_bitmap = xzalloc_array(unsigned long,
                                     BITS_TO_LONGS(d->max_vcpus));
    if ( !dest_vcpu_bitmap )
        return NULL;

    for_each_vcpu ( d, v )
    {
        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
                                dest_id, dest_mode) )
            continue;

        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
        dest_vcpus++;
    }

    if ( dest_vcpus != 0 )
    {
        unsigned int mod = gvec % dest_vcpus;
        unsigned int idx = 0;

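        /*
         * Find the (gvec % dest_vcpus)-th set bit in the bitmap; after the
         * loop 'idx - 1' is the vcpu_id of the selected destination.
         */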
        for ( i = 0; i <= mod; i++ )
        {
            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
            BUG_ON(idx > d->max_vcpus);
        }

        dest = d->vcpu[idx - 1];
    }

    xfree(dest_vcpu_bitmap);

    return dest;
}

int pt_irq_create_bind(
    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
{
    struct hvm_irq_dpci *hvm_irq_dpci;
    struct hvm_pirq_dpci *pirq_dpci;
    struct pirq *info;
    int rc, pirq = pt_irq_bind->machine_irq;

    if ( pirq < 0 || pirq >= d->nr_pirqs )
        return -EINVAL;

 restart:
    spin_lock(&d->event_lock);

    hvm_irq_dpci = domain_get_irq_dpci(d);
    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
    {
        unsigned int i;

        /*
         * NB: the hardware domain doesn't use a hvm_irq_dpci struct because
         * it's only allowed to identity map GSIs, and so the data contained in
         * that struct (used to map guest GSIs into machine GSIs and perform
         * interrupt routing) is completely useless to it.
         */
        hvm_irq_dpci = xzalloc(struct hvm_irq_dpci);
        if ( hvm_irq_dpci == NULL )
        {
            spin_unlock(&d->event_lock);
            return -ENOMEM;
        }
        for ( i = 0; i < NR_HVM_DOMU_IRQS; i++ )
            INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);

        hvm_domain_irq(d)->dpci = hvm_irq_dpci;
    }

    info = pirq_get_info(d, pirq);
    if ( !info )
    {
        spin_unlock(&d->event_lock);
        return -ENOMEM;
    }
    pirq_dpci = pirq_dpci(info);

    /*
     * A crude 'while' loop with us dropping the spinlock and giving
     * the 'dpci_softirq' a chance to run.
     * We MUST check for this condition as the softirq could be scheduled
     * and not have run yet. Note that this code replaced tasklet_kill, which
     * would have spun forever and would do the same thing (wait to flush out
     * outstanding hvm_dirq_assist calls).
     */
    if ( pt_pirq_softirq_active(pirq_dpci) )
    {
        spin_unlock(&d->event_lock);
        cpu_relax();
        goto restart;
    }

    switch ( pt_irq_bind->irq_type )
    {
    case PT_IRQ_TYPE_MSI:
    {
        uint8_t dest, delivery_mode;
        bool dest_mode;
        int dest_vcpu_id;
        const struct vcpu *vcpu;
        uint32_t gflags = pt_irq_bind->u.msi.gflags &
                          ~XEN_DOMCTL_VMSI_X86_UNMASKED;

        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI |
                               HVM_IRQ_DPCI_GUEST_MSI;
            pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
            pirq_dpci->gmsi.gflags = gflags;
            /*
             * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'.
             * The 'pirq_cleanup_check' which would free the structure is only
             * called if the event channel for the PIRQ is active. However
             * OS-es that use event channels usually bind PIRQs to event
             * channels and unbind them before calling 'pt_irq_destroy_bind' -
             * with the result that we re-use the 'dpci' structure. This can
             * be reproduced by unloading and loading the driver for a device.
             *
             * As such on every 'pt_irq_create_bind' call we MUST set it.
             */
            pirq_dpci->dom = d;
            /*
             * Bind after hvm_irq_dpci is set up to avoid a race with the
             * IRQ handler.
             */
            rc = pirq_guest_bind(d->vcpu[0], info, 0);
            if ( rc == 0 && pt_irq_bind->u.msi.gtable )
            {
                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
                if ( unlikely(rc) )
                {
                    pirq_guest_unbind(d, info);
                    /*
                     * Between 'pirq_guest_bind' and 'pirq_guest_unbind' an
                     * interrupt can be scheduled. No more of them are going
                     * to be scheduled, but we must deal with the one that may
                     * be in the queue.
                     */
                    pt_pirq_softirq_reset(pirq_dpci);
                }
            }
            if ( unlikely(rc) )
            {
                pirq_dpci->gmsi.gflags = 0;
                pirq_dpci->gmsi.gvec = 0;
                pirq_dpci->dom = NULL;
                pirq_dpci->flags = 0;
                pirq_cleanup_check(info, d);
                spin_unlock(&d->event_lock);
                return rc;
            }
        }
        else
        {
            uint32_t mask = HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI;

            if ( (pirq_dpci->flags & mask) != mask )
            {
                spin_unlock(&d->event_lock);
                return -EBUSY;
            }

            /* If pirq is already mapped as vmsi, update guest data/addr. */
            if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec ||
                 pirq_dpci->gmsi.gflags != gflags )
            {
                /* Directly clear pending EOIs before enabling new MSI info. */
                pirq_guest_eoi(info);

                pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
                pirq_dpci->gmsi.gflags = gflags;
            }
        }
        /* Calculate dest_vcpu_id for MSI-type pirq migration. */
        dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
                         XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
        dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
        delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                  XEN_DOMCTL_VMSI_X86_DELIV_MASK);

        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
        pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
        spin_unlock(&d->event_lock);

        pirq_dpci->gmsi.posted = false;
        vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
        if ( iommu_intpost )
        {
            if ( delivery_mode == dest_LowestPrio )
                vcpu = vector_hashing_dest(d, dest, dest_mode,
                                           pirq_dpci->gmsi.gvec);
            if ( vcpu )
                pirq_dpci->gmsi.posted = true;
        }
        if ( dest_vcpu_id >= 0 )
            hvm_migrate_pirqs(d->vcpu[dest_vcpu_id]);

        /* Use interrupt posting if it is supported. */
        if ( iommu_intpost )
            pi_update_irte(vcpu ? &vcpu->arch.hvm_vmx.pi_desc : NULL,
                           info, pirq_dpci->gmsi.gvec);

        if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
        {
            unsigned long flags;
            struct irq_desc *desc = pirq_spin_lock_irq_desc(info, &flags);

            if ( !desc )
            {
                pt_irq_destroy_bind(d, pt_irq_bind);
                return -EINVAL;
            }

            guest_mask_msi_irq(desc, false);
            spin_unlock_irqrestore(&desc->lock, flags);
        }

        break;
    }

    case PT_IRQ_TYPE_PCI:
    case PT_IRQ_TYPE_MSI_TRANSLATE:
    {
        struct dev_intx_gsi_link *digl = NULL;
        struct hvm_girq_dpci_mapping *girq = NULL;
        unsigned int guest_gsi;

        /*
         * Mapping GSIs for the hardware domain is different from doing it for
         * an unprivileged guest: the hardware domain is only allowed to
         * identity map GSIs, and as such all the data in the u.pci union is
         * discarded.
         */
        if ( hvm_irq_dpci )
        {
            unsigned int link;

            digl = xmalloc(struct dev_intx_gsi_link);
            girq = xmalloc(struct hvm_girq_dpci_mapping);

            if ( !digl || !girq )
            {
                spin_unlock(&d->event_lock);
                xfree(girq);
                xfree(digl);
                return -ENOMEM;
            }

            girq->bus = digl->bus = pt_irq_bind->u.pci.bus;
            girq->device = digl->device = pt_irq_bind->u.pci.device;
            girq->intx = digl->intx = pt_irq_bind->u.pci.intx;
            list_add_tail(&digl->list, &pirq_dpci->digl_list);

            guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
            link = hvm_pci_intx_link(digl->device, digl->intx);

            hvm_irq_dpci->link_cnt[link]++;

            girq->machine_gsi = pirq;
            list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]);
        }
        else
        {
            ASSERT(is_hardware_domain(d));

            /* MSI_TRANSLATE is not supported for the hardware domain. */
            if ( pt_irq_bind->irq_type != PT_IRQ_TYPE_PCI ||
                 pirq >= hvm_domain_irq(d)->nr_gsis )
            {
                spin_unlock(&d->event_lock);

                return -EINVAL;
            }
            guest_gsi = pirq;
        }

        /* Bind the same machine IRQ only once per domain. */
        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            unsigned int share;

            /* MUST be set, as the pirq_dpci can be re-used. */
            pirq_dpci->dom = d;
            if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
            {
                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
                                   HVM_IRQ_DPCI_MACH_MSI |
                                   HVM_IRQ_DPCI_GUEST_PCI |
                                   HVM_IRQ_DPCI_TRANSLATE;
                share = 0;
            }
            else    /* PT_IRQ_TYPE_PCI */
            {
                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
                                   HVM_IRQ_DPCI_MACH_PCI |
                                   HVM_IRQ_DPCI_GUEST_PCI;
                if ( !is_hardware_domain(d) )
                    share = BIND_PIRQ__WILL_SHARE;
                else
                {
                    int mask = vioapic_get_mask(d, guest_gsi);
                    int trigger_mode = vioapic_get_trigger_mode(d, guest_gsi);

                    if ( mask < 0 || trigger_mode < 0 )
                    {
                        spin_unlock(&d->event_lock);

                        ASSERT_UNREACHABLE();
                        return -EINVAL;
                    }
                    pirq_dpci->flags |= HVM_IRQ_DPCI_IDENTITY_GSI;
                    /*
                     * Check whether the corresponding vIO APIC pin is
                     * configured as level or edge triggered; level triggered
                     * interrupts will be marked as shareable.
                     */
                    ASSERT(!mask);
                    share = trigger_mode;
                }
            }

            /* Init the timer before binding. */
            if ( pt_irq_need_timer(pirq_dpci->flags) )
                init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
            /* Deal with the GSI for legacy devices. */
            rc = pirq_guest_bind(d->vcpu[0], info, share);
            if ( unlikely(rc) )
            {
                if ( pt_irq_need_timer(pirq_dpci->flags) )
                    kill_timer(&pirq_dpci->timer);
                /*
                 * There is no path for __do_IRQ to schedule the softirq as
                 * IRQ_GUEST is not set. As such we can reset 'dom' directly.
                 */
                pirq_dpci->dom = NULL;
                if ( hvm_irq_dpci )
                {
                    unsigned int link;

                    ASSERT(girq && digl);
                    list_del(&girq->list);
                    list_del(&digl->list);
                    link = hvm_pci_intx_link(digl->device, digl->intx);
                    hvm_irq_dpci->link_cnt[link]--;
                }
                pirq_dpci->flags = 0;
                pirq_cleanup_check(info, d);
                spin_unlock(&d->event_lock);
                xfree(girq);
                xfree(digl);
                return rc;
            }
        }

        spin_unlock(&d->event_lock);

        if ( iommu_verbose )
        {
            char buf[24] = "";

            if ( digl )
                snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
                         digl->bus, PCI_SLOT(digl->device),
                         PCI_FUNC(digl->device), digl->intx);

            printk(XENLOG_G_INFO "d%d: bind: m_gsi=%u g_gsi=%u%s\n",
                   d->domain_id, pirq, guest_gsi, buf);
        }
        break;
    }

    default:
        spin_unlock(&d->event_lock);
        return -EOPNOTSUPP;
    }

    return 0;
}

int pt_irq_destroy_bind(
    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
{
    struct hvm_irq_dpci *hvm_irq_dpci;
    struct hvm_pirq_dpci *pirq_dpci;
    unsigned int machine_gsi = pt_irq_bind->machine_irq;
    struct pirq *pirq;
    const char *what = NULL;

    switch ( pt_irq_bind->irq_type )
    {
    case PT_IRQ_TYPE_PCI:
    case PT_IRQ_TYPE_MSI_TRANSLATE:
        if ( iommu_verbose )
        {
            unsigned int device = pt_irq_bind->u.pci.device;
            unsigned int intx = pt_irq_bind->u.pci.intx;

            printk(XENLOG_G_INFO
                   "d%d: unbind: m_gsi=%u g_gsi=%u dev=%02x:%02x.%u intx=%u\n",
                   d->domain_id, machine_gsi, hvm_pci_intx_gsi(device, intx),
                   pt_irq_bind->u.pci.bus,
                   PCI_SLOT(device), PCI_FUNC(device), intx);
        }
        break;
    case PT_IRQ_TYPE_MSI:
        break;
    default:
        return -EOPNOTSUPP;
    }

    spin_lock(&d->event_lock);

    hvm_irq_dpci = domain_get_irq_dpci(d);

    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
    {
        spin_unlock(&d->event_lock);
        return -EINVAL;
    }

    pirq = pirq_info(d, machine_gsi);
    pirq_dpci = pirq_dpci(pirq);

    if ( hvm_irq_dpci && pt_irq_bind->irq_type != PT_IRQ_TYPE_MSI )
    {
        unsigned int bus = pt_irq_bind->u.pci.bus;
        unsigned int device = pt_irq_bind->u.pci.device;
        unsigned int intx = pt_irq_bind->u.pci.intx;
        unsigned int guest_gsi = hvm_pci_intx_gsi(device, intx);
        unsigned int link = hvm_pci_intx_link(device, intx);
        struct hvm_girq_dpci_mapping *girq;
        struct dev_intx_gsi_link *digl, *tmp;

        list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
        {
            if ( girq->bus         == bus &&
                 girq->device      == device &&
                 girq->intx        == intx &&
                 girq->machine_gsi == machine_gsi )
            {
                list_del(&girq->list);
                xfree(girq);
                girq = NULL;
                break;
            }
        }

        if ( girq )
        {
            spin_unlock(&d->event_lock);
            return -EINVAL;
        }

        hvm_irq_dpci->link_cnt[link]--;

        /* Clear the machine IRQ info. */
        if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
            {
                if ( digl->bus    == bus &&
                     digl->device == device &&
                     digl->intx   == intx )
                {
                    list_del(&digl->list);
                    xfree(digl);
                }
            }
            what = list_empty(&pirq_dpci->digl_list) ? "final" : "partial";
        }
        else
            what = "bogus";
    }
    else if ( pirq_dpci && pirq_dpci->gmsi.posted )
        pi_update_irte(NULL, pirq, 0);

    if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
         list_empty(&pirq_dpci->digl_list) )
    {
        pirq_guest_unbind(d, pirq);
        msixtbl_pt_unregister(d, pirq);
        if ( pt_irq_need_timer(pirq_dpci->flags) )
            kill_timer(&pirq_dpci->timer);
        pirq_dpci->flags = 0;
        /*
         * See the comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
         * call to pt_pirq_softirq_reset.
         */
        pt_pirq_softirq_reset(pirq_dpci);

        pirq_cleanup_check(pirq, d);
    }

    spin_unlock(&d->event_lock);

    if ( what && iommu_verbose )
    {
        unsigned int device = pt_irq_bind->u.pci.device;
        char buf[24] = "";

        if ( hvm_irq_dpci )
            snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
                     pt_irq_bind->u.pci.bus, PCI_SLOT(device),
                     PCI_FUNC(device), pt_irq_bind->u.pci.intx);

        printk(XENLOG_G_INFO "d%d %s unmap: m_irq=%u%s\n",
               d->domain_id, what, machine_gsi, buf);
    }

    return 0;
}

void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci)
{
    INIT_LIST_HEAD(&dpci->digl_list);
    dpci->gmsi.dest_vcpu_id = -1;
}

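/*
 * Returns true (and clears ->dom) only when the entry has no flags set and
 * no softirq is scheduled or running for it, i.e. it is idle and can be
 * reclaimed by the caller.
 */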
bool pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
{
    if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
    {
        dpci->dom = NULL;
        return true;
    }
    return false;
}

int pt_pirq_iterate(struct domain *d,
                    int (*cb)(struct domain *,
                              struct hvm_pirq_dpci *, void *),
                    void *arg)
{
    int rc = 0;
    unsigned int pirq = 0, n, i;
    struct pirq *pirqs[8];

    ASSERT(spin_is_locked(&d->event_lock));

    do {
        n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq,
                                   ARRAY_SIZE(pirqs));
        for ( i = 0; i < n; ++i )
        {
            struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirqs[i]);

            pirq = pirqs[i]->pirq;
            if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
                rc = cb(d, pirq_dpci, arg);
        }
    } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );

    return rc;
}

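/*
 * Marks a passed-through pirq as pending and defers the actual injection to
 * the HVM_DPCI_SOFTIRQ handler. Returns 1 if the interrupt will be handled
 * that way, 0 if it is not handled here (pirq not mapped for passthrough,
 * or the IOMMU is disabled).
 */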
int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
{
    struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d);
    struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirq);

    ASSERT(is_hvm_domain(d));

    if ( !iommu_enabled || (!is_hardware_domain(d) && !dpci) ||
         !pirq_dpci || !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        return 0;

    pirq_dpci->masked = 1;
    raise_softirq_for(pirq_dpci);
    return 1;
}

/* Called with d->event_lock held. */
static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci)
{
    irq_desc_t *desc;

    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
         (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) )
    {
        struct pirq *pirq = dpci_pirq(pirq_dpci);

        BUG_ON(!local_irq_is_enabled());
        desc = pirq_spin_lock_irq_desc(pirq, NULL);
        if ( !desc )
            return;
        desc_guest_eoi(desc, pirq);
    }
}

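/*
 * pt_pirq_iterate() callback: EOI the machine MSI whose guest vector and
 * destination match the vLAPIC that performed the guest-side EOI.
 */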
static int _hvm_dpci_msi_eoi(struct domain *d,
                             struct hvm_pirq_dpci *pirq_dpci, void *arg)
{
    int vector = (long)arg;

    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
         (pirq_dpci->gmsi.gvec == vector) )
    {
        unsigned int dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
        bool dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;

        if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest,
                               dest_mode) )
        {
            __msi_pirq_eoi(pirq_dpci);
            return 1;
        }
    }

    return 0;
}

void hvm_dpci_msi_eoi(struct domain *d, int vector)
{
    if ( !iommu_enabled || !hvm_domain_irq(d)->dpci )
        return;

    spin_lock(&d->event_lock);
    pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector);
    spin_unlock(&d->event_lock);
}

static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
{
    if ( unlikely(!hvm_domain_irq(d)->dpci) && !is_hardware_domain(d) )
    {
        ASSERT_UNREACHABLE();
        return;
    }

    spin_lock(&d->event_lock);
    if ( test_and_clear_bool(pirq_dpci->masked) )
    {
        struct pirq *pirq = dpci_pirq(pirq_dpci);
        const struct dev_intx_gsi_link *digl;

        if ( hvm_domain_use_pirq(d, pirq) )
        {
            send_guest_pirq(d, pirq);

            if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
            {
                spin_unlock(&d->event_lock);
                return;
            }
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
        {
            vmsi_deliver_pirq(d, pirq_dpci);
            spin_unlock(&d->event_lock);
            return;
        }

        list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
        {
            ASSERT(!(pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI));
            hvm_pci_intx_assert(d, digl->device, digl->intx);
            pirq_dpci->pending++;
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
        {
            hvm_gsi_assert(d, pirq->pirq);
            pirq_dpci->pending++;
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_TRANSLATE )
        {
            /* For translated MSI-to-INTx interrupts, EOI as early as possible. */
            __msi_pirq_eoi(pirq_dpci);
            spin_unlock(&d->event_lock);
            return;
        }

        /*
         * Set a timer to see if the guest can finish the interrupt or not. For
         * example, the guest OS may unmask the PIC during boot, before the
         * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the
         * guest will never deal with the irq, and the physical interrupt line
         * will never be deasserted.
         */
        ASSERT(pt_irq_need_timer(pirq_dpci->flags));
        set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
    }
    spin_unlock(&d->event_lock);
}

static void hvm_pirq_eoi(struct pirq *pirq,
                         const union vioapic_redir_entry *ent)
{
    struct hvm_pirq_dpci *pirq_dpci;

    if ( !pirq )
    {
        ASSERT_UNREACHABLE();
        return;
    }

    pirq_dpci = pirq_dpci(pirq);

    /*
     * No need to get vector lock for timer
     * since interrupt is still not EOIed
     */
    if ( --pirq_dpci->pending ||
         (ent && ent->fields.mask) ||
         !pt_irq_need_timer(pirq_dpci->flags) )
        return;

    stop_timer(&pirq_dpci->timer);
    pirq_guest_eoi(pirq);
}

static void __hvm_dpci_eoi(struct domain *d,
                           const struct hvm_girq_dpci_mapping *girq,
                           const union vioapic_redir_entry *ent)
{
    struct pirq *pirq = pirq_info(d, girq->machine_gsi);

    if ( !hvm_domain_use_pirq(d, pirq) )
        hvm_pci_intx_deassert(d, girq->device, girq->intx);

    hvm_pirq_eoi(pirq, ent);
}

static void hvm_gsi_eoi(struct domain *d, unsigned int gsi,
                        const union vioapic_redir_entry *ent)
{
    struct pirq *pirq = pirq_info(d, gsi);

    /* Check if GSI is actually mapped. */
    if ( !pirq_dpci(pirq) )
        return;

    hvm_gsi_deassert(d, gsi);
    hvm_pirq_eoi(pirq, ent);
}

void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
                  const union vioapic_redir_entry *ent)
{
    const struct hvm_irq_dpci *hvm_irq_dpci;
    const struct hvm_girq_dpci_mapping *girq;

    if ( !iommu_enabled )
        return;

    if ( is_hardware_domain(d) )
    {
        spin_lock(&d->event_lock);
        hvm_gsi_eoi(d, guest_gsi, ent);
        goto unlock;
    }

    if ( guest_gsi < NR_ISAIRQS )
    {
        hvm_dpci_isairq_eoi(d, guest_gsi);
        return;
    }

    spin_lock(&d->event_lock);
    hvm_irq_dpci = domain_get_irq_dpci(d);

    if ( !hvm_irq_dpci )
        goto unlock;

    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
        __hvm_dpci_eoi(d, girq, ent);

unlock:
    spin_unlock(&d->event_lock);
}

/*
 * Note: 'pt_pirq_softirq_reset' can clear STATE_SCHED before we get to
 * doing it. If that is the case we let 'pt_pirq_softirq_reset' do the
 * ref-counting.
 */
static void dpci_softirq(void)
{
    unsigned int cpu = smp_processor_id();
    LIST_HEAD(our_list);

    local_irq_disable();
    list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
    local_irq_enable();

    while ( !list_empty(&our_list) )
    {
        struct hvm_pirq_dpci *pirq_dpci;
        struct domain *d;

        pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, softirq_list);
        list_del(&pirq_dpci->softirq_list);

        d = pirq_dpci->dom;
        smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
        {
            unsigned long flags;

            /* Put back on the list and retry. */
            local_irq_save(flags);
            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
            local_irq_restore(flags);

            raise_softirq(HVM_DPCI_SOFTIRQ);
            continue;
        }
        /*
         * Whoever clears STATE_SCHED MUST drop the domain reference taken in
         * raise_softirq_for.
         */
        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
        {
            hvm_dirq_assist(d, pirq_dpci);
            put_domain(d);
        }
        clear_bit(STATE_RUN, &pirq_dpci->state);
    }
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        /*
         * On CPU_DYING this callback is called (on the CPU that is dying)
         * with a possible HVM_DPCI_SOFTIRQ pending - at which point we can
         * clear out any outstanding domains (by virtue of the idle loop
         * calling the softirq later). In the CPU_DEAD case the CPU is dead
         * and there are no pending softirqs for us to handle, so we can chill.
         */
        ASSERT(list_empty(&per_cpu(dpci_list, cpu)));
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback,
};

static int __init setup_dpci_softirq(void)
{
    unsigned int cpu;

    for_each_online_cpu(cpu)
        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));

    open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
__initcall(setup_dpci_softirq);