/******************************************************************************
* vm_event.c
*
* VM event support.
*
* Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/sched.h>
#include <xen/event.h>
#include <xen/wait.h>
#include <xen/vm_event.h>
#include <xen/mem_access.h>
#include <asm/p2m.h>
#include <asm/monitor.h>
#include <asm/vm_event.h>
#include <xsm/xsm.h>
/* for public/io/ring.h macros */
#define xen_mb() smp_mb()
#define xen_rmb() smp_rmb()
#define xen_wmb() smp_wmb()
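/* Per-ring lock: serialises ring updates, producer accounting and wake-ups. */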
#define vm_event_ring_lock_init(_ved) spin_lock_init(&(_ved)->ring_lock)
#define vm_event_ring_lock(_ved) spin_lock(&(_ved)->ring_lock)
#define vm_event_ring_unlock(_ved) spin_unlock(&(_ved)->ring_lock)
static int vm_event_enable(
struct domain *d,
struct xen_domctl_vm_event_op *vec,
struct vm_event_domain **ved,
int pause_flag,
int param,
xen_event_channel_notification_t notification_fn)
{
int rc;
unsigned long ring_gfn = d->arch.hvm_domain.params[param];
if ( !*ved )
*ved = xzalloc(struct vm_event_domain);
if ( !*ved )
return -ENOMEM;
/* Only one helper at a time. If the helper crashed,
* the ring is in an undefined state and so is the guest.
*/
    if ( (*ved)->ring_page )
        return -EBUSY;
    /* The ring gfn param (one of the HVM_PARAM_*_RING_PFN params) defaults
     * to zero; the toolstack must have set it before enabling. */
if ( ring_gfn == 0 )
return -ENOSYS;
vm_event_ring_lock_init(*ved);
vm_event_ring_lock(*ved);
rc = vm_event_init_domain(d);
if ( rc < 0 )
goto err;
rc = prepare_ring_for_helper(d, ring_gfn, &(*ved)->ring_pg_struct,
&(*ved)->ring_page);
if ( rc < 0 )
goto err;
/* Set the number of currently blocked vCPUs to 0. */
(*ved)->blocked = 0;
/* Allocate event channel */
rc = alloc_unbound_xen_event_channel(d, 0, current->domain->domain_id,
notification_fn);
if ( rc < 0 )
goto err;
(*ved)->xen_port = vec->port = rc;
/* Prepare ring buffer */
FRONT_RING_INIT(&(*ved)->front_ring,
(vm_event_sring_t *)(*ved)->ring_page,
PAGE_SIZE);
/* Save the pause flag for this particular ring. */
(*ved)->pause_flag = pause_flag;
/* Initialize the last-chance wait queue. */
init_waitqueue_head(&(*ved)->wq);
vm_event_ring_unlock(*ved);
return 0;
err:
destroy_ring_for_helper(&(*ved)->ring_page,
(*ved)->ring_pg_struct);
vm_event_ring_unlock(*ved);
xfree(*ved);
*ved = NULL;
return rc;
}
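/*
 * Slots still unreserved on the ring: the free requests reported by the
 * shared ring, minus the slots already claimed by target (in-guest) and
 * foreign producers that have yet to place their events.
 */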
static unsigned int vm_event_ring_available(struct vm_event_domain *ved)
{
int avail_req = RING_FREE_REQUESTS(&ved->front_ring);
avail_req -= ved->target_producers;
avail_req -= ved->foreign_producers;
BUG_ON(avail_req < 0);
return avail_req;
}
/*
* vm_event_wake_blocked() will wakeup vcpus waiting for room in the
* ring. These vCPUs were paused on their way out after placing an event,
 * but need to be resumed once the ring is capable of processing at least
* one event from them.
*/
static void vm_event_wake_blocked(struct domain *d, struct vm_event_domain *ved)
{
struct vcpu *v;
unsigned int avail_req = vm_event_ring_available(ved);
if ( avail_req == 0 || ved->blocked == 0 )
return;
    /* We remember which vcpu last woke up to avoid always scanning linearly
     * from zero and starving higher-numbered vcpus under high load. */
if ( d->vcpu )
{
int i, j, k;
        for ( i = ved->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++ )
{
k = i % d->max_vcpus;
v = d->vcpu[k];
if ( !v )
continue;
if ( !(ved->blocked) || avail_req == 0 )
break;
if ( test_and_clear_bit(ved->pause_flag, &v->pause_flags) )
{
vcpu_unpause(v);
avail_req--;
ved->blocked--;
ved->last_vcpu_wake_up = k;
}
}
}
}
/*
* In the event that a vCPU attempted to place an event in the ring and
* was unable to do so, it is queued on a wait queue. These are woken as
* needed, and take precedence over the blocked vCPUs.
*/
static void vm_event_wake_queued(struct domain *d, struct vm_event_domain *ved)
{
unsigned int avail_req = vm_event_ring_available(ved);
if ( avail_req > 0 )
wake_up_nr(&ved->wq, avail_req);
}
/*
* vm_event_wake() will wakeup all vcpus waiting for the ring to
* become available. If we have queued vCPUs, they get top priority. We
* are guaranteed that they will go through code paths that will eventually
* call vm_event_wake() again, ensuring that any blocked vCPUs will get
* unpaused once all the queued vCPUs have made it through.
*/
void vm_event_wake(struct domain *d, struct vm_event_domain *ved)
{
    if ( !list_empty(&ved->wq.list) )
vm_event_wake_queued(d, ved);
else
vm_event_wake_blocked(d, ved);
}
static int vm_event_disable(struct domain *d, struct vm_event_domain **ved)
{
if ( vm_event_check_ring(*ved) )
{
struct vcpu *v;
vm_event_ring_lock(*ved);
if ( !list_empty(&(*ved)->wq.list) )
{
vm_event_ring_unlock(*ved);
return -EBUSY;
}
/* Free domU's event channel and leave the other one unbound */
free_xen_event_channel(d, (*ved)->xen_port);
/* Unblock all vCPUs */
for_each_vcpu ( d, v )
{
if ( test_and_clear_bit((*ved)->pause_flag, &v->pause_flags) )
{
vcpu_unpause(v);
(*ved)->blocked--;
}
}
destroy_ring_for_helper(&(*ved)->ring_page,
(*ved)->ring_pg_struct);
vm_event_cleanup_domain(d);
vm_event_ring_unlock(*ved);
}
xfree(*ved);
*ved = NULL;
return 0;
}
static inline void vm_event_release_slot(struct domain *d,
struct vm_event_domain *ved)
{
/* Update the accounting */
if ( current->domain == d )
ved->target_producers--;
else
ved->foreign_producers--;
/* Kick any waiters */
vm_event_wake(d, ved);
}
/*
 * vm_event_mark_and_pause() tags the vcpu and puts it to sleep.
* The vcpu will resume execution in vm_event_wake_blocked().
*/
void vm_event_mark_and_pause(struct vcpu *v, struct vm_event_domain *ved)
{
if ( !test_and_set_bit(ved->pause_flag, &v->pause_flags) )
{
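        /* nosync: don't wait for the vCPU to be descheduled; it is
         * typically pausing itself on its way back to guest context. */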
vcpu_pause_nosync(v);
ved->blocked++;
}
}
/*
* This must be preceded by a call to claim_slot(), and is guaranteed to
* succeed. As a side-effect however, the vCPU may be paused if the ring is
* overly full and its continued execution would cause stalling and excessive
* waiting. The vCPU will be automatically unpaused when the ring clears.
*/
void vm_event_put_request(struct domain *d,
struct vm_event_domain *ved,
vm_event_request_t *req)
{
vm_event_front_ring_t *front_ring;
int free_req;
unsigned int avail_req;
RING_IDX req_prod;
struct vcpu *curr = current;
    if ( !vm_event_check_ring(ved) )
return;
if ( curr->domain != d )
{
req->flags |= VM_EVENT_FLAG_FOREIGN;
#ifndef NDEBUG
if ( !(req->flags & VM_EVENT_FLAG_VCPU_PAUSED) )
gdprintk(XENLOG_G_WARNING, "d%dv%d was not paused.\n",
d->domain_id, req->vcpu_id);
#endif
}
req->version = VM_EVENT_INTERFACE_VERSION;
vm_event_ring_lock(ved);
/* Due to the reservations, this step must succeed. */
front_ring = &ved->front_ring;
free_req = RING_FREE_REQUESTS(front_ring);
ASSERT(free_req > 0);
/* Copy request */
req_prod = front_ring->req_prod_pvt;
memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req));
req_prod++;
/* Update ring */
front_ring->req_prod_pvt = req_prod;
RING_PUSH_REQUESTS(front_ring);
/* We've actually *used* our reservation, so release the slot. */
vm_event_release_slot(d, ved);
    /* Give this vCPU a black eye if necessary, on the way out.
     * See the comments above vm_event_wake_blocked() for more information
     * on how this mechanism works to avoid waiting. */
avail_req = vm_event_ring_available(ved);
    if ( curr->domain == d && avail_req < d->max_vcpus &&
         !atomic_read(&curr->vm_event_pause_count) )
vm_event_mark_and_pause(curr, ved);
vm_event_ring_unlock(ved);
notify_via_xen_event_channel(d, ved->xen_port);
}
int vm_event_get_response(struct domain *d, struct vm_event_domain *ved,
vm_event_response_t *rsp)
{
vm_event_front_ring_t *front_ring;
RING_IDX rsp_cons;
vm_event_ring_lock(ved);
front_ring = &ved->front_ring;
rsp_cons = front_ring->rsp_cons;
if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) )
{
vm_event_ring_unlock(ved);
return 0;
}
/* Copy response */
memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp));
rsp_cons++;
/* Update ring */
front_ring->rsp_cons = rsp_cons;
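    /* Request a notification from the helper when the next response lands. */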
front_ring->sring->rsp_event = rsp_cons + 1;
/* Kick any waiters -- since we've just consumed an event,
* there may be additional space available in the ring. */
vm_event_wake(d, ved);
vm_event_ring_unlock(ved);
return 1;
}
/*
* Pull all responses from the given ring and unpause the corresponding vCPU
* if required. Based on the response type, here we can also call custom
* handlers.
*
* Note: responses are handled the same way regardless of which ring they
* arrive on.
*/
void vm_event_resume(struct domain *d, struct vm_event_domain *ved)
{
vm_event_response_t rsp;
/*
* vm_event_resume() runs in either XEN_DOMCTL_VM_EVENT_OP_*, or
* EVTCHN_send context from the introspection consumer. Both contexts
* are guaranteed not to be the subject of vm_event responses.
* While we could ASSERT(v != current) for each VCPU in d in the loop
* below, this covers the case where we would need to iterate over all
 * of them more succinctly.
*/
ASSERT(d != current->domain);
/* Pull all responses off the ring. */
while ( vm_event_get_response(d, ved, &rsp) )
{
struct vcpu *v;
if ( rsp.version != VM_EVENT_INTERFACE_VERSION )
{
printk(XENLOG_G_WARNING "vm_event interface version mismatch\n");
continue;
}
/* Validate the vcpu_id in the response. */
if ( (rsp.vcpu_id >= d->max_vcpus) || !d->vcpu[rsp.vcpu_id] )
continue;
v = d->vcpu[rsp.vcpu_id];
/*
* In some cases the response type needs extra handling, so here
* we call the appropriate handlers.
*/
/* Check flags which apply only when the vCPU is paused */
if ( atomic_read(&v->vm_event_pause_count) )
{
#ifdef CONFIG_HAS_MEM_PAGING
if ( rsp.reason == VM_EVENT_REASON_MEM_PAGING )
p2m_mem_paging_resume(d, &rsp);
#endif
/*
* Check emulation flags in the arch-specific handler only, as it
* has to set arch-specific flags when supported, and to avoid
* bitmask overhead when it isn't supported.
*/
vm_event_emulate_check(v, &rsp);
/*
* Check in arch-specific handler to avoid bitmask overhead when
* not supported.
*/
vm_event_register_write_resume(v, &rsp);
/*
* Check in arch-specific handler to avoid bitmask overhead when
* not supported.
*/
vm_event_toggle_singlestep(d, v, &rsp);
/* Check for altp2m switch */
if ( rsp.flags & VM_EVENT_FLAG_ALTERNATE_P2M )
p2m_altp2m_check(v, rsp.altp2m_idx);
if ( rsp.flags & VM_EVENT_FLAG_SET_REGISTERS )
vm_event_set_registers(v, &rsp);
if ( rsp.flags & VM_EVENT_FLAG_GET_NEXT_INTERRUPT )
vm_event_monitor_next_interrupt(v);
if ( rsp.flags & VM_EVENT_FLAG_VCPU_PAUSED )
vm_event_vcpu_unpause(v);
}
}
}
void vm_event_cancel_slot(struct domain *d, struct vm_event_domain *ved)
{
    if ( !vm_event_check_ring(ved) )
return;
vm_event_ring_lock(ved);
vm_event_release_slot(d, ved);
vm_event_ring_unlock(ved);
}
static int vm_event_grab_slot(struct vm_event_domain *ved, int foreign)
{
unsigned int avail_req;
if ( !ved->ring_page )
return -ENOSYS;
vm_event_ring_lock(ved);
avail_req = vm_event_ring_available(ved);
if ( avail_req == 0 )
{
vm_event_ring_unlock(ved);
return -EBUSY;
}
if ( !foreign )
ved->target_producers++;
else
ved->foreign_producers++;
vm_event_ring_unlock(ved);
return 0;
}
/* Simple try_grab wrapper for use in the wait_event() macro. */
static int vm_event_wait_try_grab(struct vm_event_domain *ved, int *rc)
{
*rc = vm_event_grab_slot(ved, 0);
return *rc;
}
/* Call vm_event_grab_slot() until either a slot is reserved or the ring goes away. */
static int vm_event_wait_slot(struct vm_event_domain *ved)
{
int rc = -EBUSY;
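    /* Sleep on the ring's waitqueue until the grab returns anything other
     * than -EBUSY, i.e. a slot was reserved or the ring went away. */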
wait_event(ved->wq, vm_event_wait_try_grab(ved, &rc) != -EBUSY);
return rc;
}
bool_t vm_event_check_ring(struct vm_event_domain *ved)
{
return (ved && ved->ring_page);
}
/*
* Determines whether or not the current vCPU belongs to the target domain,
* and calls the appropriate wait function. If it is a guest vCPU, then we
* use vm_event_wait_slot() to reserve a slot. As long as there is a ring,
* this function will always return 0 for a guest. For a non-guest, we check
* for space and return -EBUSY if the ring is not available.
*
 * Return codes: -EOPNOTSUPP: the ring is not yet configured
 *               -EBUSY: the ring is busy
 *               0: a spot has been reserved
 */
int __vm_event_claim_slot(struct domain *d, struct vm_event_domain *ved,
bool_t allow_sleep)
{
if ( !vm_event_check_ring(ved) )
return -EOPNOTSUPP;
if ( (current->domain == d) && allow_sleep )
return vm_event_wait_slot(ved);
else
return vm_event_grab_slot(ved, (current->domain != d));
}
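/*
 * A minimal producer-side sketch of the claim/put protocol (illustrative
 * only; real call sites live in the arch monitor/paging/sharing code):
 *
 *     vm_event_request_t req = { .reason = ..., .vcpu_id = v->vcpu_id };
 *
 *     if ( __vm_event_claim_slot(d, ved, 1) )
 *         return;                          // ring missing or full: back off
 *     ...
 *     vm_event_put_request(d, ved, &req);  // or vm_event_cancel_slot(d, ved)
 */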
#ifdef CONFIG_HAS_MEM_PAGING
/* Registered with Xen-bound event channel for incoming notifications. */
static void mem_paging_notification(struct vcpu *v, unsigned int port)
{
struct domain *domain = v->domain;
if ( likely(vm_event_check_ring(domain->vm_event_paging)) )
vm_event_resume(domain, domain->vm_event_paging);
}
#endif
/* Registered with Xen-bound event channel for incoming notifications. */
static void monitor_notification(struct vcpu *v, unsigned int port)
{
struct domain *domain = v->domain;
if ( likely(vm_event_check_ring(domain->vm_event_monitor)) )
vm_event_resume(domain, domain->vm_event_monitor);
}
#ifdef CONFIG_HAS_MEM_SHARING
/* Registered with Xen-bound event channel for incoming notifications. */
static void mem_sharing_notification(struct vcpu *v, unsigned int port)
{
struct domain *domain = v->domain;
if ( likely(vm_event_check_ring(domain->vm_event_share)) )
vm_event_resume(domain, domain->vm_event_share);
}
#endif
/* Clean up on domain destruction */
void vm_event_cleanup(struct domain *d)
{
#ifdef CONFIG_HAS_MEM_PAGING
if ( vm_event_check_ring(d->vm_event_paging) )
{
/* Destroying the wait queue head means waking up all
* queued vcpus. This will drain the list, allowing
* the disable routine to complete. It will also drop
* all domain refs the wait-queued vcpus are holding.
* Finally, because this code path involves previously
* pausing the domain (domain_kill), unpausing the
* vcpus causes no harm. */
destroy_waitqueue_head(&d->vm_event_paging->wq);
(void)vm_event_disable(d, &d->vm_event_paging);
}
#endif
if ( vm_event_check_ring(d->vm_event_monitor) )
{
destroy_waitqueue_head(&d->vm_event_monitor->wq);
(void)vm_event_disable(d, &d->vm_event_monitor);
}
#ifdef CONFIG_HAS_MEM_SHARING
if ( vm_event_check_ring(d->vm_event_share) )
{
destroy_waitqueue_head(&d->vm_event_share->wq);
(void)vm_event_disable(d, &d->vm_event_share);
}
#endif
}
int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec,
XEN_GUEST_HANDLE_PARAM(void) u_domctl)
{
int rc;
rc = xsm_vm_event_control(XSM_PRIV, d, vec->mode, vec->op);
if ( rc )
return rc;
if ( unlikely(d == current->domain) ) /* no domain_pause() */
{
gdprintk(XENLOG_INFO, "Tried to do a memory event op on itself.\n");
return -EINVAL;
}
if ( unlikely(d->is_dying) )
{
gdprintk(XENLOG_INFO, "Ignoring memory event op on dying domain %u\n",
d->domain_id);
return 0;
}
if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) )
{
gdprintk(XENLOG_INFO,
"Memory event op on a domain (%u) with no vcpus\n",
d->domain_id);
return -EINVAL;
}
rc = -ENOSYS;
switch ( vec->mode )
{
#ifdef CONFIG_HAS_MEM_PAGING
case XEN_DOMCTL_VM_EVENT_OP_PAGING:
{
rc = -EINVAL;
        switch ( vec->op )
{
case XEN_VM_EVENT_ENABLE:
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
rc = -EOPNOTSUPP;
/* hvm fixme: p2m_is_foreign types need addressing */
if ( is_hvm_domain(hardware_domain) )
break;
rc = -ENODEV;
/* Only HAP is supported */
if ( !hap_enabled(d) )
break;
/* No paging if iommu is used */
rc = -EMLINK;
if ( unlikely(need_iommu(d)) )
break;
rc = -EXDEV;
/* Disallow paging in a PoD guest */
if ( p2m->pod.entry_count )
break;
/* domain_pause() not required here, see XSA-99 */
rc = vm_event_enable(d, vec, &d->vm_event_paging, _VPF_mem_paging,
HVM_PARAM_PAGING_RING_PFN,
mem_paging_notification);
}
break;
case XEN_VM_EVENT_DISABLE:
if ( vm_event_check_ring(d->vm_event_paging) )
{
domain_pause(d);
rc = vm_event_disable(d, &d->vm_event_paging);
domain_unpause(d);
}
break;
case XEN_VM_EVENT_RESUME:
if ( vm_event_check_ring(d->vm_event_paging) )
vm_event_resume(d, d->vm_event_paging);
else
rc = -ENODEV;
break;
default:
rc = -ENOSYS;
break;
}
}
break;
#endif
case XEN_DOMCTL_VM_EVENT_OP_MONITOR:
{
rc = -EINVAL;
        switch ( vec->op )
{
case XEN_VM_EVENT_ENABLE:
/* domain_pause() not required here, see XSA-99 */
rc = arch_monitor_init_domain(d);
if ( rc )
break;
rc = vm_event_enable(d, vec, &d->vm_event_monitor, _VPF_mem_access,
HVM_PARAM_MONITOR_RING_PFN,
monitor_notification);
break;
case XEN_VM_EVENT_DISABLE:
if ( vm_event_check_ring(d->vm_event_monitor) )
{
domain_pause(d);
rc = vm_event_disable(d, &d->vm_event_monitor);
arch_monitor_cleanup_domain(d);
domain_unpause(d);
}
break;
case XEN_VM_EVENT_RESUME:
if ( vm_event_check_ring(d->vm_event_monitor) )
vm_event_resume(d, d->vm_event_monitor);
else
rc = -ENODEV;
break;
default:
rc = -ENOSYS;
break;
}
}
break;
#ifdef CONFIG_HAS_MEM_SHARING
case XEN_DOMCTL_VM_EVENT_OP_SHARING:
{
rc = -EINVAL;
        switch ( vec->op )
{
case XEN_VM_EVENT_ENABLE:
rc = -EOPNOTSUPP;
/* hvm fixme: p2m_is_foreign types need addressing */
if ( is_hvm_domain(hardware_domain) )
break;
rc = -ENODEV;
/* Only HAP is supported */
if ( !hap_enabled(d) )
break;
/* domain_pause() not required here, see XSA-99 */
rc = vm_event_enable(d, vec, &d->vm_event_share, _VPF_mem_sharing,
HVM_PARAM_SHARING_RING_PFN,
mem_sharing_notification);
break;
case XEN_VM_EVENT_DISABLE:
if ( vm_event_check_ring(d->vm_event_share) )
{
domain_pause(d);
rc = vm_event_disable(d, &d->vm_event_share);
domain_unpause(d);
}
break;
case XEN_VM_EVENT_RESUME:
if ( vm_event_check_ring(d->vm_event_share) )
vm_event_resume(d, d->vm_event_share);
else
rc = -ENODEV;
break;
default:
rc = -ENOSYS;
break;
}
}
break;
#endif
default:
rc = -ENOSYS;
}
return rc;
}
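/*
 * For reference, a rough sketch of the consumer (toolstack) side; the
 * concrete helpers live in libxc (e.g. xc_monitor_enable()) and this is
 * illustrative only:
 *
 *     1. Ensure the ring gfn is set via the relevant HVM_PARAM_*_RING_PFN
 *        param (normally done at domain build time).
 *     2. Issue XEN_VM_EVENT_ENABLE and bind the event channel port
 *        returned in vec->port.
 *     3. Loop: wait on the port, consume requests from the shared ring,
 *        write back responses, then notify the port so that
 *        vm_event_resume() processes them.
 */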
void vm_event_vcpu_pause(struct vcpu *v)
{
ASSERT(v == current);
atomic_inc(&v->vm_event_pause_count);
vcpu_pause_nosync(v);
}
void vm_event_vcpu_unpause(struct vcpu *v)
{
int old, new, prev = v->vm_event_pause_count.counter;
/*
     * All unpause requests come as a result of toolstack responses.
* Prevent underflow of the vcpu pause count.
*/
do
{
old = prev;
new = old - 1;
if ( new < 0 )
{
printk(XENLOG_G_WARNING
"%pv vm_event: Too many unpause attempts\n", v);
return;
}
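        /* cmpxchg() returns the value it observed; if another CPU changed
         * the counter under our feet, go around again. */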
prev = cmpxchg(&v->vm_event_pause_count.counter, old, new);
} while ( prev != old );
vcpu_unpause(v);
}
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* indent-tabs-mode: nil
* End:
*/