remarkable-linux/virt/kvm/eventfd.c
Christian Borntraeger 719d93cd5f kvm/irqchip: Speed up KVM_SET_GSI_ROUTING
When starting lots of dataplane devices the bootup takes very long on
Christian's s390 with irqfd patches. With larger setups he is even
able to trigger some timeouts in some components.  Turns out that the
KVM_SET_GSI_ROUTING ioctl takes very long (strace claims up to 0.1 sec)
when having multiple CPUs.  This is caused by the  synchronize_rcu and
the HZ=100 of s390.  By changing the code to use a private srcu we can
speed things up.  This patch reduces the boot time till mounting root
from 8 to 2 seconds on my s390 guest with 100 disks.

Uses of hlist_for_each_entry_rcu, hlist_add_head_rcu, hlist_del_init_rcu
are fine because they do not have lockdep checks (hlist_for_each_entry_rcu
uses rcu_dereference_raw rather than rcu_dereference, and write-sides
do not do rcu lockdep at all).

Note that we're hardly relying on the "sleepable" part of srcu.  We just
want SRCU's faster detection of grace periods.

Testing was done by Andrew Theurer using netperf tests STREAM, MAERTS
and RR.  The difference between results "before" and "after" the patch
has mean -0.2% and standard deviation 0.6%.  Using a paired t-test on the
data points says that there is a 2.5% probability that the patch is the
cause of the performance difference (rather than a random fluctuation).

(Restricting the t-test to RR, which is the most likely to be affected,
changes the numbers to respectively -0.3% mean, 0.7% stdev, and 8%
probability that the numbers actually say something about the patch.
The probability increases mostly because there are fewer data points).

Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> # s390
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2014-05-05 16:29:11 +02:00

861 lines
21 KiB
C

/*
* kvm eventfd support - use eventfd objects to signal various KVM events
*
* Copyright 2009 Novell. All Rights Reserved.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Author:
* Gregory Haskins <ghaskins@novell.com>
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include "iodev.h"
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
*
* Credit goes to Avi Kivity for the original idea.
* --------------------------------------------------------------------
*/
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgement through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct _irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct _irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq;
static void
irqfd_inject(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) {
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
false);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
false);
} else
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
irqfd->gsi, 1, false);
}
/*
* Since resampler irqfds share an IRQ source ID, we de-assert once
* then notify all of the resampler irqfds using this GSI. We can't
* do multiple de-asserts or we risk racing with incoming re-asserts.
*/
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct _irqfd_resampler *resampler;
struct kvm *kvm;
struct _irqfd *irqfd;
int idx;
resampler = container_of(kian, struct _irqfd_resampler, notifier);
kvm = resampler->kvm;
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
idx = srcu_read_lock(&kvm->irq_srcu);
list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
eventfd_signal(irqfd->resamplefd, 1);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
static void
irqfd_resampler_shutdown(struct _irqfd *irqfd)
{
struct _irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
}
mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
* Race-free decouple logic (ordering is critical)
*/
static void
irqfd_shutdown(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
u64 cnt;
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
* further events.
*/
eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
/*
* We know no new events will be scheduled at this point, so block
* until all previously outstanding events have completed
*/
flush_work(&irqfd->inject);
if (irqfd->resampler) {
irqfd_resampler_shutdown(irqfd);
eventfd_ctx_put(irqfd->resamplefd);
}
/*
* It is now safe to release the object's resources
*/
eventfd_ctx_put(irqfd->eventfd);
kfree(irqfd);
}
/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
return list_empty(&irqfd->list) ? false : true;
}
/*
* Mark the irqfd as inactive and schedule it for removal
*
* assumes kvm->irqfds.lock is held
*/
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
BUG_ON(!irqfd_is_active(irqfd));
list_del_init(&irqfd->list);
queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}
/*
* Called with wqh->lock held and interrupts disabled
*/
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry *irq;
struct kvm *kvm = irqfd->kvm;
int idx;
if (flags & POLLIN) {
idx = srcu_read_lock(&kvm->irq_srcu);
irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
/* An event has been signaled, inject an interrupt */
if (irq)
kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false);
else
schedule_work(&irqfd->inject);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
if (flags & POLLHUP) {
/* The eventfd is closing, detach from KVM */
unsigned long flags;
spin_lock_irqsave(&kvm->irqfds.lock, flags);
/*
* We must check if someone deactivated the irqfd before
* we could acquire the irqfds.lock since the item is
* deactivated from the KVM side before it is unhooked from
* the wait-queue. If it is already deactivated, we can
* simply return knowing the other side will cleanup for us.
* We cannot race against the irqfd going away since the
* other side is required to acquire wqh->lock, which we hold
*/
if (irqfd_is_active(irqfd))
irqfd_deactivate(irqfd);
spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
}
return 0;
}
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
add_wait_queue(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
struct kvm_irq_routing_table *irq_rt)
{
struct kvm_kernel_irq_routing_entry *e;
if (irqfd->gsi >= irq_rt->nr_rt_entries) {
rcu_assign_pointer(irqfd->irq_entry, NULL);
return;
}
hlist_for_each_entry(e, &irq_rt->map[irqfd->gsi], link) {
/* Only fast-path MSI. */
if (e->type == KVM_IRQ_ROUTING_MSI)
rcu_assign_pointer(irqfd->irq_entry, e);
else
rcu_assign_pointer(irqfd->irq_entry, NULL);
}
}
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
if (!irqfd)
return -ENOMEM;
irqfd->kvm = kvm;
irqfd->gsi = args->gsi;
INIT_LIST_HEAD(&irqfd->list);
INIT_WORK(&irqfd->inject, irqfd_inject);
INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
f = fdget(args->fd);
if (!f.file) {
ret = -EBADF;
goto out;
}
eventfd = eventfd_ctx_fileget(f.file);
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
goto fail;
}
irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
struct _irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
ret = PTR_ERR(resamplefd);
goto fail;
}
irqfd->resamplefd = resamplefd;
INIT_LIST_HEAD(&irqfd->resampler_link);
mutex_lock(&kvm->irqfds.resampler_lock);
list_for_each_entry(resampler,
&kvm->irqfds.resampler_list, link) {
if (resampler->notifier.gsi == irqfd->gsi) {
irqfd->resampler = resampler;
break;
}
}
if (!irqfd->resampler) {
resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
if (!resampler) {
ret = -ENOMEM;
mutex_unlock(&kvm->irqfds.resampler_lock);
goto fail;
}
resampler->kvm = kvm;
INIT_LIST_HEAD(&resampler->list);
resampler->notifier.gsi = irqfd->gsi;
resampler->notifier.irq_acked = irqfd_resampler_ack;
INIT_LIST_HEAD(&resampler->link);
list_add(&resampler->link, &kvm->irqfds.resampler_list);
kvm_register_irq_ack_notifier(kvm,
&resampler->notifier);
irqfd->resampler = resampler;
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
synchronize_srcu(&kvm->irq_srcu);
mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
*/
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
spin_lock_irq(&kvm->irqfds.lock);
ret = 0;
list_for_each_entry(tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd != tmp->eventfd)
continue;
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&kvm->irqfds.lock);
goto fail;
}
irq_rt = rcu_dereference_protected(kvm->irq_routing,
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
events = f.file->f_op->poll(f.file, &irqfd->pt);
if (events & POLLIN)
schedule_work(&irqfd->inject);
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
*/
fdput(f);
return 0;
fail:
if (irqfd->resampler)
irqfd_resampler_shutdown(irqfd);
if (resamplefd && !IS_ERR(resamplefd))
eventfd_ctx_put(resamplefd);
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
fdput(f);
out:
kfree(irqfd);
return ret;
}
#endif
void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
mutex_init(&kvm->irqfds.resampler_lock);
#endif
INIT_LIST_HEAD(&kvm->ioeventfds);
}
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
* shutdown any irqfd's that match fd+gsi
*/
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
/*
* This rcu_assign_pointer is needed for when
* another thread calls kvm_irq_routing_update before
* we flush workqueue below (we synchronize with
* kvm_irq_routing_update using irqfds.lock).
* It is paired with synchronize_srcu done by caller
* of that function.
*/
rcu_assign_pointer(irqfd->irq_entry, NULL);
irqfd_deactivate(irqfd);
}
}
spin_unlock_irq(&kvm->irqfds.lock);
eventfd_ctx_put(eventfd);
/*
* Block until we know all outstanding shutdown jobs have completed
* so that we guarantee there will not be any more interrupts on this
* gsi once this deassign function returns.
*/
flush_workqueue(irqfd_cleanup_wq);
return 0;
}
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
return kvm_irqfd_deassign(kvm, args);
return kvm_irqfd_assign(kvm, args);
}
/*
* This function is called as the kvm VM fd is being released. Shutdown all
* irqfds that still remain open
*/
void
kvm_irqfd_release(struct kvm *kvm)
{
struct _irqfd *irqfd, *tmp;
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
irqfd_deactivate(irqfd);
spin_unlock_irq(&kvm->irqfds.lock);
/*
* Block until we know all outstanding shutdown jobs have completed
* since we do not take a kvm* reference.
*/
flush_workqueue(irqfd_cleanup_wq);
}
/*
* Change irq_routing and irqfd.
* Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm,
struct kvm_irq_routing_table *irq_rt)
{
struct _irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock);
rcu_assign_pointer(kvm->irq_routing, irq_rt);
list_for_each_entry(irqfd, &kvm->irqfds.items, list)
irqfd_update(kvm, irqfd, irq_rt);
spin_unlock_irq(&kvm->irqfds.lock);
}
/*
* create a host-wide workqueue for issuing deferred shutdown requests
* aggregated from all vm* instances. We need our own isolated single-thread
* queue to prevent deadlock against flushing the normal work-queue.
*/
int kvm_irqfd_init(void)
{
irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
if (!irqfd_cleanup_wq)
return -ENOMEM;
return 0;
}
void kvm_irqfd_exit(void)
{
destroy_workqueue(irqfd_cleanup_wq);
}
#endif
/*
* --------------------------------------------------------------------
* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
*
* userspace can register a PIO/MMIO address with an eventfd for receiving
* notification when the memory has been touched.
* --------------------------------------------------------------------
*/
struct _ioeventfd {
struct list_head list;
u64 addr;
int length;
struct eventfd_ctx *eventfd;
u64 datamatch;
struct kvm_io_device dev;
u8 bus_idx;
bool wildcard;
};
static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
return container_of(dev, struct _ioeventfd, dev);
}
static void
ioeventfd_release(struct _ioeventfd *p)
{
eventfd_ctx_put(p->eventfd);
list_del(&p->list);
kfree(p);
}
static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
u64 _val;
if (addr != p->addr)
/* address must be precise for a hit */
return false;
if (!p->length)
/* length = 0 means only look at the address, so always a hit */
return true;
if (len != p->length)
/* address-range must be precise for a hit */
return false;
if (p->wildcard)
/* all else equal, wildcard is always a hit */
return true;
/* otherwise, we have to actually compare the data */
BUG_ON(!IS_ALIGNED((unsigned long)val, len));
switch (len) {
case 1:
_val = *(u8 *)val;
break;
case 2:
_val = *(u16 *)val;
break;
case 4:
_val = *(u32 *)val;
break;
case 8:
_val = *(u64 *)val;
break;
default:
return false;
}
return _val == p->datamatch ? true : false;
}
/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
const void *val)
{
struct _ioeventfd *p = to_ioeventfd(this);
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
eventfd_signal(p->eventfd, 1);
return 0;
}
/*
* This function is called as KVM is completely shutting down. We do not
* need to worry about locking just nuke anything we have as quickly as possible
*/
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
struct _ioeventfd *p = to_ioeventfd(this);
ioeventfd_release(p);
}
static const struct kvm_io_device_ops ioeventfd_ops = {
.write = ioeventfd_write,
.destructor = ioeventfd_destructor,
};
/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
struct _ioeventfd *_p;
list_for_each_entry(_p, &kvm->ioeventfds, list)
if (_p->bus_idx == p->bus_idx &&
_p->addr == p->addr &&
(!_p->length || !p->length ||
(_p->length == p->length &&
(_p->wildcard || p->wildcard ||
_p->datamatch == p->datamatch))))
return true;
return false;
}
static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
if (flags & KVM_IOEVENTFD_FLAG_PIO)
return KVM_PIO_BUS;
if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
return KVM_VIRTIO_CCW_NOTIFY_BUS;
return KVM_MMIO_BUS;
}
static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
enum kvm_bus bus_idx;
struct _ioeventfd *p;
struct eventfd_ctx *eventfd;
int ret;
bus_idx = ioeventfd_bus_from_flags(args->flags);
/* must be natural-word sized, or 0 to ignore length */
switch (args->len) {
case 0:
case 1:
case 2:
case 4:
case 8:
break;
default:
return -EINVAL;
}
/* check for range overflow */
if (args->addr + args->len < args->addr)
return -EINVAL;
/* check for extra flags that we don't understand */
if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
return -EINVAL;
/* ioeventfd with no length can't be combined with DATAMATCH */
if (!args->len &&
args->flags & (KVM_IOEVENTFD_FLAG_PIO |
KVM_IOEVENTFD_FLAG_DATAMATCH))
return -EINVAL;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->bus_idx = bus_idx;
p->length = args->len;
p->eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
p->datamatch = args->datamatch;
else
p->wildcard = true;
mutex_lock(&kvm->slots_lock);
/* Verify that there isn't a match already */
if (ioeventfd_check_collision(kvm, p)) {
ret = -EEXIST;
goto unlock_fail;
}
kvm_iodevice_init(&p->dev, &ioeventfd_ops);
ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
&p->dev);
if (ret < 0)
goto unlock_fail;
/* When length is ignored, MMIO is also put on a separate bus, for
* faster lookups.
*/
if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) {
ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS,
p->addr, 0, &p->dev);
if (ret < 0)
goto register_fail;
}
kvm->buses[bus_idx]->ioeventfd_count++;
list_add_tail(&p->list, &kvm->ioeventfds);
mutex_unlock(&kvm->slots_lock);
return 0;
register_fail:
kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
unlock_fail:
mutex_unlock(&kvm->slots_lock);
fail:
kfree(p);
eventfd_ctx_put(eventfd);
return ret;
}
static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
enum kvm_bus bus_idx;
struct _ioeventfd *p, *tmp;
struct eventfd_ctx *eventfd;
int ret = -ENOENT;
bus_idx = ioeventfd_bus_from_flags(args->flags);
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
if (p->bus_idx != bus_idx ||
p->eventfd != eventfd ||
p->addr != args->addr ||
p->length != args->len ||
p->wildcard != wildcard)
continue;
if (!p->wildcard && p->datamatch != args->datamatch)
continue;
kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
if (!p->length) {
kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS,
&p->dev);
}
kvm->buses[bus_idx]->ioeventfd_count--;
ioeventfd_release(p);
ret = 0;
break;
}
mutex_unlock(&kvm->slots_lock);
eventfd_ctx_put(eventfd);
return ret;
}
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
return kvm_deassign_ioeventfd(kvm, args);
return kvm_assign_ioeventfd(kvm, args);
}