/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright (C) Allen Kay <allen.m.kay@intel.com>
 * Copyright (C) Xiaohui Xin <xiaohui.xin@intel.com>
 */

#include <xen/event.h>
#include <xen/iommu.h>
#include <xen/cpu.h>
#include <xen/irq.h>
#include <asm/hvm/irq.h>
#include <asm/hvm/support.h>
#include <asm/io_apic.h>

static DEFINE_PER_CPU(struct list_head, dpci_list);

/*
 * These two bit states help to safely schedule, deschedule, and wait until
 * the softirq has finished.
 *
 * The semantics behind these two bits are as follows:
 *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
 *  - STATE_RUN - only the softirq is allowed to set and clear it. If it has
 *    been set, hvm_dirq_assist will RUN with a saved value of the
 *    'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
 *
 * The usual sequence of states is: STATE_SCHED(set) -> STATE_RUN(set) ->
 * STATE_SCHED(unset) -> STATE_RUN(unset).
 *
 * However the states can also diverge, e.g.: STATE_SCHED(set) ->
 * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
 * 'hvm_dirq_assist' never ran and the softirq did not do any ref-counting.
 */
enum {
    STATE_SCHED,
    STATE_RUN
};

/*
 * This can be called multiple times, but the softirq is only raised once.
 * That is until the STATE_SCHED state has been cleared. The state can be
 * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
 * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
 * the softirq had a chance to run).
 */
static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
{
    unsigned long flags;

    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
        return;

    get_knownalive_domain(pirq_dpci->dom);

    local_irq_save(flags);
    list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
    local_irq_restore(flags);

    raise_softirq(HVM_DPCI_SOFTIRQ);
}

/*
 * If we are racing with softirq_dpci (STATE_SCHED) we return
 * true. Otherwise we return false.
 *
 * If it is false, it is the caller's responsibility to make sure
 * that the softirq (with the event_lock dropped) has run.
 */
bool pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
{
    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
        return true;

    /*
     * If in the future we were to call 'raise_softirq_for' right away
     * after 'pt_pirq_softirq_active' we MUST reset the list (otherwise it
     * might have stale data).
     */
    return false;
}
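
/*
 * A typical use of the check above: 'pt_irq_create_bind' loops, dropping
 * the event_lock and calling cpu_relax() between iterations, until
 * pt_pirq_softirq_active() returns false - i.e. until any scheduled
 * 'hvm_dirq_assist' invocation has been flushed out.
 */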

/*
 * Reset the pirq_dpci->dom parameter to NULL.
 *
 * This function checks the different states to make sure it can do it
 * at the right time. If it unschedules 'hvm_dirq_assist' from running
 * it also drops the domain refcount (which is what the softirq would
 * have done).
 */
static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
{
    struct domain *d = pirq_dpci->dom;

    ASSERT(spin_is_locked(&d->event_lock));

    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
    {
    case (1 << STATE_SCHED):
        /*
         * We are going to try to de-schedule the softirq before it goes in
         * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
         */
        put_domain(d);
        /* fallthrough. */
    case (1 << STATE_RUN):
    case (1 << STATE_RUN) | (1 << STATE_SCHED):
        /*
         * The reason it is OK to reset 'dom' when the STATE_RUN bit is set
         * is due to a shortcut the 'dpci_softirq' implements. It stashes
         * the 'dom' in a local variable before it sets STATE_RUN - and
         * therefore will not dereference '->dom' which would crash.
         */
        pirq_dpci->dom = NULL;
        break;
    }

    /*
     * Inhibit 'hvm_dirq_assist' from doing anything useful and at worst
     * calling 'set_timer' which will blow up (as we have called kill_timer
     * or never initialized it). Note that we hold the lock that
     * 'hvm_dirq_assist' could be spinning on.
     */
    pirq_dpci->masked = 0;
}

bool pt_irq_need_timer(uint32_t flags)
{
    return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE));
}

static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
                            void *arg)
{
    if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
                              &pirq_dpci->flags) )
    {
        pirq_dpci->masked = 0;
        pirq_dpci->pending = 0;
        pirq_guest_eoi(dpci_pirq(pirq_dpci));
    }

    return 0;
}

static void pt_irq_time_out(void *data)
{
    struct hvm_pirq_dpci *irq_map = data;
    const struct hvm_irq_dpci *dpci;
    const struct dev_intx_gsi_link *digl;

    spin_lock(&irq_map->dom->event_lock);

    if ( irq_map->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
    {
        ASSERT(is_hardware_domain(irq_map->dom));
        /*
         * Identity mapped, no need to iterate over the guest GSI list to
         * find other pirqs sharing the same guest GSI.
         *
         * In the identity mapped case the EOI can also be done now; this
         * way the iteration over the list of domain pirqs is avoided.
         */
        hvm_gsi_deassert(irq_map->dom, dpci_pirq(irq_map)->pirq);
        irq_map->flags |= HVM_IRQ_DPCI_EOI_LATCH;
        pt_irq_guest_eoi(irq_map->dom, irq_map, NULL);
        spin_unlock(&irq_map->dom->event_lock);
        return;
    }

    dpci = domain_get_irq_dpci(irq_map->dom);
    if ( unlikely(!dpci) )
    {
        ASSERT_UNREACHABLE();
        spin_unlock(&irq_map->dom->event_lock);
        return;
    }

    list_for_each_entry ( digl, &irq_map->digl_list, list )
    {
        unsigned int guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
        const struct hvm_girq_dpci_mapping *girq;

        list_for_each_entry ( girq, &dpci->girq[guest_gsi], list )
        {
            struct pirq *pirq = pirq_info(irq_map->dom, girq->machine_gsi);

            pirq_dpci(pirq)->flags |= HVM_IRQ_DPCI_EOI_LATCH;
        }
        hvm_pci_intx_deassert(irq_map->dom, digl->device, digl->intx);
    }

    pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);

    spin_unlock(&irq_map->dom->event_lock);
}

struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
{
    if ( !d || !is_hvm_domain(d) )
        return NULL;

    return hvm_domain_irq(d)->dpci;
}

void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci)
{
    xfree(dpci);
}

/*
 * This routine handles lowest-priority interrupts using the vector-hashing
 * mechanism. As an example, modern Intel CPUs use this method to handle
 * lowest-priority interrupts.
 *
 * Here are the details of the vector-hashing mechanism:
 * 1. For lowest-priority interrupts, store all the possible destination
 *    vCPUs in an array.
 * 2. Use "gvec % max number of destination vCPUs" to find the right
 *    destination vCPU in the array for the lowest-priority interrupt.
 */
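/*
 * Worked example (with illustrative, assumed values): suppose the guest
 * vector gvec is 0x21 (33) and the destination mask matches four vCPUs
 * {0, 2, 5, 6}. Then mod = 33 % 4 = 1, and the loop below picks the
 * second set bit in the bitmap, i.e. vCPU 2, as the destination.
 */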
static struct vcpu *vector_hashing_dest(const struct domain *d,
                                        uint32_t dest_id,
                                        bool dest_mode,
                                        uint8_t gvec)
{
    unsigned long *dest_vcpu_bitmap;
    unsigned int dest_vcpus = 0;
    struct vcpu *v, *dest = NULL;
    unsigned int i;

    dest_vcpu_bitmap = xzalloc_array(unsigned long,
                                     BITS_TO_LONGS(d->max_vcpus));
    if ( !dest_vcpu_bitmap )
        return NULL;

    for_each_vcpu ( d, v )
    {
        if ( !vlapic_match_dest(vcpu_vlapic(v), NULL, APIC_DEST_NOSHORT,
                                dest_id, dest_mode) )
            continue;

        __set_bit(v->vcpu_id, dest_vcpu_bitmap);
        dest_vcpus++;
    }

    if ( dest_vcpus != 0 )
    {
        unsigned int mod = gvec % dest_vcpus;
        unsigned int idx = 0;

        for ( i = 0; i <= mod; i++ )
        {
            idx = find_next_bit(dest_vcpu_bitmap, d->max_vcpus, idx) + 1;
            BUG_ON(idx > d->max_vcpus);
        }

        dest = d->vcpu[idx - 1];
    }

    xfree(dest_vcpu_bitmap);

    return dest;
}

int pt_irq_create_bind(
    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
{
    struct hvm_irq_dpci *hvm_irq_dpci;
    struct hvm_pirq_dpci *pirq_dpci;
    struct pirq *info;
    int rc, pirq = pt_irq_bind->machine_irq;

    if ( pirq < 0 || pirq >= d->nr_pirqs )
        return -EINVAL;

 restart:
    spin_lock(&d->event_lock);

    hvm_irq_dpci = domain_get_irq_dpci(d);
    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
    {
        unsigned int i;

        /*
         * NB: the hardware domain doesn't use a hvm_irq_dpci struct because
         * it's only allowed to identity map GSIs, and so the data contained
         * in that struct (used to map guest GSIs into machine GSIs and
         * perform interrupt routing) is completely useless to it.
         */
        hvm_irq_dpci = xzalloc(struct hvm_irq_dpci);
        if ( hvm_irq_dpci == NULL )
        {
            spin_unlock(&d->event_lock);
            return -ENOMEM;
        }
        for ( i = 0; i < NR_HVM_DOMU_IRQS; i++ )
            INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);

        hvm_domain_irq(d)->dpci = hvm_irq_dpci;
    }

    info = pirq_get_info(d, pirq);
    if ( !info )
    {
        spin_unlock(&d->event_lock);
        return -ENOMEM;
    }
    pirq_dpci = pirq_dpci(info);

    /*
     * A crude 'while' loop with us dropping the spinlock and giving
     * the softirq_dpci a chance to run.
     * We MUST check for this condition as the softirq could be scheduled
     * and hasn't run yet. Note that this code replaced tasklet_kill, which
     * would have spun forever and would do the same thing (wait to flush
     * out outstanding hvm_dirq_assist calls).
     */
    if ( pt_pirq_softirq_active(pirq_dpci) )
    {
        spin_unlock(&d->event_lock);
        cpu_relax();
        goto restart;
    }

    switch ( pt_irq_bind->irq_type )
    {
    case PT_IRQ_TYPE_MSI:
    {
        uint8_t dest, delivery_mode;
        bool dest_mode;
        int dest_vcpu_id;
        const struct vcpu *vcpu;
        uint32_t gflags = pt_irq_bind->u.msi.gflags &
                          ~XEN_DOMCTL_VMSI_X86_UNMASKED;

        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI |
                               HVM_IRQ_DPCI_GUEST_MSI;
            pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
            pirq_dpci->gmsi.gflags = gflags;
            /*
             * 'pt_irq_create_bind' can be called after 'pt_irq_destroy_bind'.
             * The 'pirq_cleanup_check' which would free the structure is only
             * called if the event channel for the PIRQ is active. However
             * OS-es that use event channels usually bind PIRQs to event
             * channels and unbind them before calling 'pt_irq_destroy_bind' -
             * with the result that we re-use the 'dpci' structure. This can
             * be reproduced by unloading and loading the driver for a device.
             *
             * As such, on every 'pt_irq_create_bind' call we MUST set it.
             */
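            /*
             * 'dom' must be valid before the pirq_guest_bind() below: once
             * the IRQ is bound, raise_softirq_for() may run and will both
             * ref-count and later dereference 'pirq_dpci->dom'.
             */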
            pirq_dpci->dom = d;
            /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/
            rc = pirq_guest_bind(d->vcpu[0], info, 0);
            if ( rc == 0 && pt_irq_bind->u.msi.gtable )
            {
                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
                if ( unlikely(rc) )
                {
                    pirq_guest_unbind(d, info);
                    /*
                     * Between 'pirq_guest_bind' and 'pirq_guest_unbind' an
                     * interrupt can be scheduled. No more of them are going
                     * to be scheduled, but we must deal with the one that
                     * may be in the queue.
                     */
                    pt_pirq_softirq_reset(pirq_dpci);
                }
            }
            if ( unlikely(rc) )
            {
                pirq_dpci->gmsi.gflags = 0;
                pirq_dpci->gmsi.gvec = 0;
                pirq_dpci->dom = NULL;
                pirq_dpci->flags = 0;
                pirq_cleanup_check(info, d);
                spin_unlock(&d->event_lock);
                return rc;
            }
        }
        else
        {
            uint32_t mask = HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI;

            if ( (pirq_dpci->flags & mask) != mask )
            {
                spin_unlock(&d->event_lock);
                return -EBUSY;
            }

            /* If pirq is already mapped as vmsi, update guest data/addr. */
            if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec ||
                 pirq_dpci->gmsi.gflags != gflags )
            {
                /* Directly clear pending EOIs before enabling new MSI info. */
                pirq_guest_eoi(info);

                pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
                pirq_dpci->gmsi.gflags = gflags;
            }
        }
        /* Calculate dest_vcpu_id for MSI-type pirq migration. */
        dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
                         XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
        dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;
        delivery_mode = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                  XEN_DOMCTL_VMSI_X86_DELIV_MASK);

        dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
        pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
        spin_unlock(&d->event_lock);

        pirq_dpci->gmsi.posted = false;
        vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
        if ( iommu_intpost )
        {
            if ( delivery_mode == dest_LowestPrio )
                vcpu = vector_hashing_dest(d, dest, dest_mode,
                                           pirq_dpci->gmsi.gvec);
            if ( vcpu )
                pirq_dpci->gmsi.posted = true;
        }
        if ( dest_vcpu_id >= 0 )
            hvm_migrate_pirqs(d->vcpu[dest_vcpu_id]);

        /* Use interrupt posting if it is supported. */
        if ( iommu_intpost )
            pi_update_irte(vcpu ? &vcpu->arch.hvm_vmx.pi_desc : NULL,
                           info, pirq_dpci->gmsi.gvec);

        if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
        {
            unsigned long flags;
            struct irq_desc *desc = pirq_spin_lock_irq_desc(info, &flags);

            if ( !desc )
            {
                pt_irq_destroy_bind(d, pt_irq_bind);
                return -EINVAL;
            }

            guest_mask_msi_irq(desc, false);
            spin_unlock_irqrestore(&desc->lock, flags);
        }

        break;
    }

    case PT_IRQ_TYPE_PCI:
    case PT_IRQ_TYPE_MSI_TRANSLATE:
    {
        struct dev_intx_gsi_link *digl = NULL;
        struct hvm_girq_dpci_mapping *girq = NULL;
        unsigned int guest_gsi;

        /*
         * Mapping GSIs for the hardware domain is different from doing it
         * for an unprivileged guest: the hardware domain is only allowed to
         * identity map GSIs, and as such all the data in the u.pci union is
         * discarded.
         */
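        /*
         * By this point 'hvm_irq_dpci' can only be NULL for the hardware
         * domain: for any other domain it was allocated (if missing) at
         * the top of this function.
         */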
        if ( hvm_irq_dpci )
        {
            unsigned int link;

            digl = xmalloc(struct dev_intx_gsi_link);
            girq = xmalloc(struct hvm_girq_dpci_mapping);

            if ( !digl || !girq )
            {
                spin_unlock(&d->event_lock);
                xfree(girq);
                xfree(digl);
                return -ENOMEM;
            }

            girq->bus = digl->bus = pt_irq_bind->u.pci.bus;
            girq->device = digl->device = pt_irq_bind->u.pci.device;
            girq->intx = digl->intx = pt_irq_bind->u.pci.intx;
            list_add_tail(&digl->list, &pirq_dpci->digl_list);

            guest_gsi = hvm_pci_intx_gsi(digl->device, digl->intx);
            link = hvm_pci_intx_link(digl->device, digl->intx);

            hvm_irq_dpci->link_cnt[link]++;

            girq->machine_gsi = pirq;
            list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]);
        }
        else
        {
            ASSERT(is_hardware_domain(d));

            /* MSI_TRANSLATE is not supported for the hardware domain. */
            if ( pt_irq_bind->irq_type != PT_IRQ_TYPE_PCI ||
                 pirq >= hvm_domain_irq(d)->nr_gsis )
            {
                spin_unlock(&d->event_lock);
                return -EINVAL;
            }
            guest_gsi = pirq;
        }

        /* Bind the same mirq once in the same domain. */
        if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            unsigned int share;

            /* MUST be set, as the pirq_dpci can be re-used. */
            pirq_dpci->dom = d;
            if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
            {
                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
                                   HVM_IRQ_DPCI_MACH_MSI |
                                   HVM_IRQ_DPCI_GUEST_PCI |
                                   HVM_IRQ_DPCI_TRANSLATE;
                share = 0;
            }
            else    /* PT_IRQ_TYPE_PCI */
            {
                pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
                                   HVM_IRQ_DPCI_MACH_PCI |
                                   HVM_IRQ_DPCI_GUEST_PCI;
                if ( !is_hardware_domain(d) )
                    share = BIND_PIRQ__WILL_SHARE;
                else
                {
                    int mask = vioapic_get_mask(d, guest_gsi);
                    int trigger_mode = vioapic_get_trigger_mode(d, guest_gsi);

                    if ( mask < 0 || trigger_mode < 0 )
                    {
                        spin_unlock(&d->event_lock);

                        ASSERT_UNREACHABLE();
                        return -EINVAL;
                    }
                    pirq_dpci->flags |= HVM_IRQ_DPCI_IDENTITY_GSI;
                    /*
                     * Check whether the corresponding vIO-APIC pin is
                     * configured level or edge triggered; level triggered
                     * interrupts will be marked as shareable.
                     */
                    ASSERT(!mask);
                    share = trigger_mode;
                }
            }

            /* Init timer before binding. */
            if ( pt_irq_need_timer(pirq_dpci->flags) )
                init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
            /* Deal with gsi for legacy devices. */
            rc = pirq_guest_bind(d->vcpu[0], info, share);
            if ( unlikely(rc) )
            {
                if ( pt_irq_need_timer(pirq_dpci->flags) )
                    kill_timer(&pirq_dpci->timer);
                /*
                 * There is no path for __do_IRQ to schedule the softirq as
                 * IRQ_GUEST is not set. As such we can reset 'dom' directly.
                 */
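                /*
                 * (Contrast this with the PT_IRQ_TYPE_MSI error path above,
                 * which must go through pt_pirq_softirq_reset() because an
                 * interrupt may already have been scheduled there.)
                 */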
                pirq_dpci->dom = NULL;
                if ( hvm_irq_dpci )
                {
                    unsigned int link;

                    ASSERT(girq && digl);
                    list_del(&girq->list);
                    list_del(&digl->list);
                    link = hvm_pci_intx_link(digl->device, digl->intx);
                    hvm_irq_dpci->link_cnt[link]--;
                }
                pirq_dpci->flags = 0;
                pirq_cleanup_check(info, d);
                spin_unlock(&d->event_lock);
                xfree(girq);
                xfree(digl);
                return rc;
            }
        }

        spin_unlock(&d->event_lock);

        if ( iommu_verbose )
        {
            char buf[24] = "";

            if ( digl )
                snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
                         digl->bus, PCI_SLOT(digl->device),
                         PCI_FUNC(digl->device), digl->intx);

            printk(XENLOG_G_INFO "d%d: bind: m_gsi=%u g_gsi=%u%s\n",
                   d->domain_id, pirq, guest_gsi, buf);
        }
        break;
    }

    default:
        spin_unlock(&d->event_lock);
        return -EOPNOTSUPP;
    }

    return 0;
}

int pt_irq_destroy_bind(
    struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
{
    struct hvm_irq_dpci *hvm_irq_dpci;
    struct hvm_pirq_dpci *pirq_dpci;
    unsigned int machine_gsi = pt_irq_bind->machine_irq;
    struct pirq *pirq;
    const char *what = NULL;

    switch ( pt_irq_bind->irq_type )
    {
    case PT_IRQ_TYPE_PCI:
    case PT_IRQ_TYPE_MSI_TRANSLATE:
        if ( iommu_verbose )
        {
            unsigned int device = pt_irq_bind->u.pci.device;
            unsigned int intx = pt_irq_bind->u.pci.intx;

            printk(XENLOG_G_INFO
                   "d%d: unbind: m_gsi=%u g_gsi=%u dev=%02x:%02x.%u intx=%u\n",
                   d->domain_id, machine_gsi, hvm_pci_intx_gsi(device, intx),
                   pt_irq_bind->u.pci.bus,
                   PCI_SLOT(device), PCI_FUNC(device), intx);
        }
        break;

    case PT_IRQ_TYPE_MSI:
        break;

    default:
        return -EOPNOTSUPP;
    }

    spin_lock(&d->event_lock);

    hvm_irq_dpci = domain_get_irq_dpci(d);

    if ( !hvm_irq_dpci && !is_hardware_domain(d) )
    {
        spin_unlock(&d->event_lock);
        return -EINVAL;
    }

    pirq = pirq_info(d, machine_gsi);
    pirq_dpci = pirq_dpci(pirq);

    if ( hvm_irq_dpci && pt_irq_bind->irq_type != PT_IRQ_TYPE_MSI )
    {
        unsigned int bus = pt_irq_bind->u.pci.bus;
        unsigned int device = pt_irq_bind->u.pci.device;
        unsigned int intx = pt_irq_bind->u.pci.intx;
        unsigned int guest_gsi = hvm_pci_intx_gsi(device, intx);
        unsigned int link = hvm_pci_intx_link(device, intx);
        struct hvm_girq_dpci_mapping *girq;
        struct dev_intx_gsi_link *digl, *tmp;

        list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
        {
            if ( girq->bus == bus &&
                 girq->device == device &&
                 girq->intx == intx &&
                 girq->machine_gsi == machine_gsi )
            {
                list_del(&girq->list);
                xfree(girq);
                girq = NULL;
                break;
            }
        }

        if ( girq )
        {
            spin_unlock(&d->event_lock);
            return -EINVAL;
        }

        hvm_irq_dpci->link_cnt[link]--;

        /* clear the mirq info */
        if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        {
            list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list )
            {
                if ( digl->bus == bus &&
                     digl->device == device &&
                     digl->intx == intx )
                {
                    list_del(&digl->list);
                    xfree(digl);
                }
            }
            what = list_empty(&pirq_dpci->digl_list) ? "final" : "partial";
        }
        else
            what = "bogus";
    }
    else if ( pirq_dpci && pirq_dpci->gmsi.posted )
        pi_update_irte(NULL, pirq, 0);

    if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
         list_empty(&pirq_dpci->digl_list) )
    {
        pirq_guest_unbind(d, pirq);
        msixtbl_pt_unregister(d, pirq);
        if ( pt_irq_need_timer(pirq_dpci->flags) )
            kill_timer(&pirq_dpci->timer);
        pirq_dpci->flags = 0;
        /*
         * See comment in pt_irq_create_bind's PT_IRQ_TYPE_MSI before the
         * call to pt_pirq_softirq_reset.
         */
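        /*
         * Note that pt_pirq_softirq_reset() also clears 'pirq_dpci->masked',
         * so a straggling 'hvm_dirq_assist' invocation becomes a no-op.
         */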
        pt_pirq_softirq_reset(pirq_dpci);

        pirq_cleanup_check(pirq, d);
    }

    spin_unlock(&d->event_lock);

    if ( what && iommu_verbose )
    {
        unsigned int device = pt_irq_bind->u.pci.device;
        char buf[24] = "";

        if ( hvm_irq_dpci )
            snprintf(buf, ARRAY_SIZE(buf), " dev=%02x.%02x.%u intx=%u",
                     pt_irq_bind->u.pci.bus, PCI_SLOT(device),
                     PCI_FUNC(device), pt_irq_bind->u.pci.intx);

        printk(XENLOG_G_INFO "d%d %s unmap: m_irq=%u%s\n",
               d->domain_id, what, machine_gsi, buf);
    }

    return 0;
}

void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci)
{
    INIT_LIST_HEAD(&dpci->digl_list);
    dpci->gmsi.dest_vcpu_id = -1;
}

bool pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
{
    if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
    {
        dpci->dom = NULL;
        return true;
    }

    return false;
}

int pt_pirq_iterate(struct domain *d,
                    int (*cb)(struct domain *,
                              struct hvm_pirq_dpci *, void *),
                    void *arg)
{
    int rc = 0;
    unsigned int pirq = 0, n, i;
    struct pirq *pirqs[8];

    ASSERT(spin_is_locked(&d->event_lock));

    do {
        n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq,
                                   ARRAY_SIZE(pirqs));
        for ( i = 0; i < n; ++i )
        {
            struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirqs[i]);

            pirq = pirqs[i]->pirq;
            if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
                rc = cb(d, pirq_dpci, arg);
        }
    } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );

    return rc;
}

int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
{
    struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d);
    struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirq);

    ASSERT(is_hvm_domain(d));

    if ( !iommu_enabled || (!is_hardware_domain(d) && !dpci) ||
         !pirq_dpci || !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
        return 0;

    pirq_dpci->masked = 1;
    raise_softirq_for(pirq_dpci);
    return 1;
}

/* called with d->event_lock held */
static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci)
{
    irq_desc_t *desc;

    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
         (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) )
    {
        struct pirq *pirq = dpci_pirq(pirq_dpci);

        BUG_ON(!local_irq_is_enabled());
        desc = pirq_spin_lock_irq_desc(pirq, NULL);
        if ( !desc )
            return;
        desc_guest_eoi(desc, pirq);
    }
}

static int _hvm_dpci_msi_eoi(struct domain *d,
                             struct hvm_pirq_dpci *pirq_dpci, void *arg)
{
    int vector = (long)arg;

    if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
         (pirq_dpci->gmsi.gvec == vector) )
    {
        unsigned int dest = MASK_EXTR(pirq_dpci->gmsi.gflags,
                                      XEN_DOMCTL_VMSI_X86_DEST_ID_MASK);
        bool dest_mode = pirq_dpci->gmsi.gflags & XEN_DOMCTL_VMSI_X86_DM_MASK;

        if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest,
                               dest_mode) )
        {
            __msi_pirq_eoi(pirq_dpci);
            return 1;
        }
    }

    return 0;
}

void hvm_dpci_msi_eoi(struct domain *d, int vector)
{
    if ( !iommu_enabled || !hvm_domain_irq(d)->dpci )
        return;

    spin_lock(&d->event_lock);
    pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector);
    spin_unlock(&d->event_lock);
}

static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
{
    if ( unlikely(!hvm_domain_irq(d)->dpci) && !is_hardware_domain(d) )
    {
        ASSERT_UNREACHABLE();
        return;
    }

    spin_lock(&d->event_lock);
    if ( test_and_clear_bool(pirq_dpci->masked) )
    {
        struct pirq *pirq = dpci_pirq(pirq_dpci);
        const struct dev_intx_gsi_link *digl;

        if ( hvm_domain_use_pirq(d, pirq) )
        {
            send_guest_pirq(d, pirq);

            if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
            {
                spin_unlock(&d->event_lock);
                return;
            }
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
        {
            vmsi_deliver_pirq(d, pirq_dpci);
            spin_unlock(&d->event_lock);
            return;
        }

        list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
        {
            ASSERT(!(pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI));
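            /*
             * Identity mapped GSIs never have 'digl' entries added to
             * digl_list (see pt_irq_create_bind), hence the assertion
             * above.
             */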
            hvm_pci_intx_assert(d, digl->device, digl->intx);
            pirq_dpci->pending++;
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
        {
            hvm_gsi_assert(d, pirq->pirq);
            pirq_dpci->pending++;
        }

        if ( pirq_dpci->flags & HVM_IRQ_DPCI_TRANSLATE )
        {
            /* for translated MSI to INTx interrupt, eoi as early as possible */
            __msi_pirq_eoi(pirq_dpci);
            spin_unlock(&d->event_lock);
            return;
        }

        /*
         * Set a timer to see if the guest can finish the interrupt or not.
         * For example, the guest OS may unmask the PIC during boot, before
         * the guest driver is loaded. hvm_pci_intx_assert() may succeed,
         * but the guest will never deal with the irq, then the physical
         * interrupt line will never be deasserted.
         */
        ASSERT(pt_irq_need_timer(pirq_dpci->flags));
        set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
    }
    spin_unlock(&d->event_lock);
}

static void hvm_pirq_eoi(struct pirq *pirq,
                         const union vioapic_redir_entry *ent)
{
    struct hvm_pirq_dpci *pirq_dpci;

    if ( !pirq )
    {
        ASSERT_UNREACHABLE();
        return;
    }

    pirq_dpci = pirq_dpci(pirq);

    /*
     * No need to get vector lock for timer
     * since interrupt is still not EOIed
     */
    if ( --pirq_dpci->pending || (ent && ent->fields.mask) ||
         !pt_irq_need_timer(pirq_dpci->flags) )
        return;

    stop_timer(&pirq_dpci->timer);
    pirq_guest_eoi(pirq);
}

static void __hvm_dpci_eoi(struct domain *d,
                           const struct hvm_girq_dpci_mapping *girq,
                           const union vioapic_redir_entry *ent)
{
    struct pirq *pirq = pirq_info(d, girq->machine_gsi);

    if ( !hvm_domain_use_pirq(d, pirq) )
        hvm_pci_intx_deassert(d, girq->device, girq->intx);

    hvm_pirq_eoi(pirq, ent);
}

static void hvm_gsi_eoi(struct domain *d, unsigned int gsi,
                        const union vioapic_redir_entry *ent)
{
    struct pirq *pirq = pirq_info(d, gsi);

    /* Check if GSI is actually mapped. */
    if ( !pirq_dpci(pirq) )
        return;

    hvm_gsi_deassert(d, gsi);
    hvm_pirq_eoi(pirq, ent);
}

void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi,
                  const union vioapic_redir_entry *ent)
{
    const struct hvm_irq_dpci *hvm_irq_dpci;
    const struct hvm_girq_dpci_mapping *girq;

    if ( !iommu_enabled )
        return;

    if ( is_hardware_domain(d) )
    {
        spin_lock(&d->event_lock);
        hvm_gsi_eoi(d, guest_gsi, ent);
        goto unlock;
    }

    if ( guest_gsi < NR_ISAIRQS )
    {
        hvm_dpci_isairq_eoi(d, guest_gsi);
        return;
    }

    spin_lock(&d->event_lock);
    hvm_irq_dpci = domain_get_irq_dpci(d);

    if ( !hvm_irq_dpci )
        goto unlock;

    list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list )
        __hvm_dpci_eoi(d, girq, ent);

 unlock:
    spin_unlock(&d->event_lock);
}

/*
 * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
 * doing it. If that is the case we let 'pt_pirq_softirq_reset' do the
 * ref-counting.
 */
static void dpci_softirq(void)
{
    unsigned int cpu = smp_processor_id();
    LIST_HEAD(our_list);

    local_irq_disable();
    list_splice_init(&per_cpu(dpci_list, cpu), &our_list);
    local_irq_enable();

    while ( !list_empty(&our_list) )
    {
        struct hvm_pirq_dpci *pirq_dpci;
        struct domain *d;

        pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci,
                               softirq_list);
        list_del(&pirq_dpci->softirq_list);

        d = pirq_dpci->dom;
        smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
        {
            unsigned long flags;

            /* Put back on the list and retry. */
            local_irq_save(flags);
            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
            local_irq_restore(flags);

            raise_softirq(HVM_DPCI_SOFTIRQ);
            continue;
        }
        /*
         * The one who clears STATE_SCHED MUST refcount the domain.
         */
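        /*
         * If the bit was already clear, 'pt_pirq_softirq_reset' won the
         * race and has dropped the domain reference on our behalf, so
         * neither 'hvm_dirq_assist' nor 'put_domain' may be called here.
         */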
        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
        {
            hvm_dirq_assist(d, pirq_dpci);
            put_domain(d);
        }
        clear_bit(STATE_RUN, &pirq_dpci->state);
    }
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        /*
         * On CPU_DYING this callback is called (on the CPU that is dying)
         * with a possible HVM_DPCI_SOFTIRQ pending - at which point we can
         * clear out any outstanding domains (by virtue of the idle loop
         * calling the softirq later). In the CPU_DEAD case the CPU is dead
         * and there are no pending softirqs for us to handle, so we can
         * chill.
         */
        ASSERT(list_empty(&per_cpu(dpci_list, cpu)));
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback,
};

static int __init setup_dpci_softirq(void)
{
    unsigned int cpu;

    for_each_online_cpu(cpu)
        INIT_LIST_HEAD(&per_cpu(dpci_list, cpu));

    open_softirq(HVM_DPCI_SOFTIRQ, dpci_softirq);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
__initcall(setup_dpci_softirq);