
Lots of virtio work which wasn't quite ready for last merge window.  Plus
I dived into lguest again, reworking the pagetable code so we can move
the switcher page: our fixmaps sometimes take more than 2MB now...

Cheers,
Rusty.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.12 (GNU/Linux)
 
 iQIcBAABAgAGBQJRga7lAAoJENkgDmzRrbjx/yIQAKpqIBtxOJeYH3SY+Uoe7Cfp
 toNYcpJEldvb0UcWN8M2cSZpHoxl1SUoq9djwcM29tcKa7EZAjHaGtb/Q1qMTDgv
 +B3WAfiGU2pmXFxLAkbrlLNGnysy24JspqJQ5hcYV84EiBxQdZp+nCYgOphd+GMK
 ww16vo9ya8jFjzt3GeRp/Heb3vEzV4Cp6BC3i0m8A3WNpEpbRb66pqXNk5o8ggJO
 SxQOKSXmUM+0m+jKSul5xn3e2Ls2LOrZZ8/DIHA+gW66N4Zab7n2/j1Q9VRxb4lh
 FqnR7KwgBX8OCh9IsBDqQYS7MohvMYge6eUdLtFrq84jvMleMEhrC8q9v2tucFUb
 5t18CLwvyK7Gdg6UCKiZ7YSPcuURAILO16al9bh5IseeBDsuX+43VsvQoBmFn9k6
 cLOVTZ6BlOmahK5PyRYFSvLa9Rxzr/05Mr7oYq9UgshD9io78dnqczFYIORF53rW
 zD7C4HuTZfYJFfNd0wAJ0RfVXnf8QvDlMdo7zPC26DSXNWqj8OexCY0qqSWUB+2F
 vcfJP6NkV4fZB8aawWIFUVwc64yqtt2uPVLa7ATZWqk16PgKrchGewmw3tiEwOgu
 1l7xgffTRRUIJsqaCZoXdgw3yezcKRjuUBcOxL09lDAAhc+NxWNvzZBsKp66DwDk
 yZQKn0OdXnuf0CeEOfFf
 =1tYL
 -----END PGP SIGNATURE-----

Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio & lguest updates from Rusty Russell:
 "Lots of virtio work which wasn't quite ready for last merge window.

  Plus I dived into lguest again, reworking the pagetable code so we can
  move the switcher page: our fixmaps sometimes take more than 2MB now..."

Ugh.  Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
  caif_virtio: Remove bouncing email addresses
  lguest: improve code readability in lg_cpu_start.
  virtio-net: fill only rx queues which are being used
  lguest: map Switcher below fixmap.
  lguest: cache last cpu we ran on.
  lguest: map Switcher text whenever we allocate a new pagetable.
  lguest: don't share Switcher PTE pages between guests.
  lguest: expost switcher_pages array (as lg_switcher_pages).
  lguest: extract shadow PTE walking / allocating.
  lguest: make check_gpte et. al return bool.
  lguest: assume Switcher text is a single page.
  lguest: rename switcher_page to switcher_pages.
  lguest: remove RESERVE_MEM constant.
  lguest: check vaddr not pgd for Switcher protection.
  lguest: prepare to make SWITCHER_ADDR a variable.
  virtio: console: replace EMFILE with EBUSY for already-open port
  virtio-scsi: reset virtqueue affinity when doing cpu hotplug
  virtio-scsi: introduce multiqueue support
  virtio-scsi: push vq lock/unlock into virtscsi_vq_done
  virtio-scsi: pass struct virtio_scsi to virtqueue completion function
  ...
Linus Torvalds 2013-05-02 14:14:04 -07:00
commit 736a2dd257
60 changed files with 4537 additions and 4130 deletions
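Most of the virtio churn in this series is the conversion of drivers from the old catch-all virtqueue_add_buf() to the more explicit virtqueue_add_outbuf()/virtqueue_add_inbuf()/virtqueue_add_sgs() helpers that appear throughout the diffs below. As a minimal sketch (not taken from any one driver; the vq, buffer and error handling are illustrative), a converted transmit path looks roughly like this:

    #include <linux/scatterlist.h>
    #include <linux/virtio.h>

    /* Illustrative only: queue one device-readable buffer on a virtqueue. */
    static int example_send(struct virtqueue *vq, void *buf, unsigned int len)
    {
        struct scatterlist sg;
        int err;

        sg_init_one(&sg, buf, len);

        /* One "out" (device-readable) segment, no "in" segments. */
        err = virtqueue_add_outbuf(vq, &sg, 1, buf, GFP_KERNEL);
        if (err)
            return err;

        /* Tell the other side there is work to do. */
        virtqueue_kick(vq);
        return 0;
    }

The inbound direction is symmetrical: virtqueue_add_inbuf() queues device-writable segments, and virtqueue_add_sgs() handles the mixed case, as the virtio_blk changes below show.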


@@ -6,6 +6,3 @@ kvm/
 	- Kernel Virtual Machine. See also http://linux-kvm.org
 uml/
 	- User Mode Linux, builds/runs Linux kernel as a userspace program.
-virtio.txt
-	- Text version of draft virtio spec.
-	  See http://ozlabs.org/~rusty/virtio-spec

File diff suppressed because it is too large.


@@ -8743,6 +8743,7 @@ F: drivers/virtio/
 F: drivers/net/virtio_net.c
 F: drivers/block/virtio_blk.c
 F: include/linux/virtio_*.h
+F: include/uapi/linux/virtio_*.h
 
 VIRTIO HOST (VHOST)
 M: "Michael S. Tsirkin" <mst@redhat.com>


@@ -11,18 +11,11 @@
 #define GUEST_PL 1
 
-/* Every guest maps the core switcher code. */
-#define SHARED_SWITCHER_PAGES \
-    DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
-
-/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-
-/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
-#ifdef CONFIG_X86_PAE
-#define SWITCHER_ADDR 0xFFE00000
-#else
-#define SWITCHER_ADDR 0xFFC00000
-#endif
+/* Page for Switcher text itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+
+/* Where we map the Switcher, in both Host and Guest. */
+extern unsigned long switcher_addr;
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];


@@ -110,7 +110,7 @@ new_segment:
         if (!sg)
             sg = sglist;
         else {
-            sg->page_link &= ~0x02;
+            sg_unmark_end(sg);
             sg = sg_next(sg);
         }


@@ -143,7 +143,7 @@ new_segment:
              * termination bit to avoid doing a full
              * sg_init_table() in drivers for each command.
              */
-            (*sg)->page_link &= ~0x02;
+            sg_unmark_end(*sg);
             *sg = sg_next(*sg);
         }


@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
 obj-$(CONFIG_OF) += of/
 obj-$(CONFIG_SSB) += ssb/
 obj-$(CONFIG_BCMA) += bcma/
-obj-$(CONFIG_VHOST_NET) += vhost/
+obj-$(CONFIG_VHOST_RING) += vhost/
 obj-$(CONFIG_VLYNQ) += vlynq/
 obj-$(CONFIG_STAGING) += staging/
 obj-y += platform/


@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
     return vbr;
 }
 
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
-                 struct virtblk_req *vbr,
-                 unsigned long out,
-                 unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+                 struct virtblk_req *vbr,
+                 struct scatterlist *data_sg,
+                 bool have_data)
 {
-    DEFINE_WAIT(wait);
+    struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+    unsigned int num_out = 0, num_in = 0;
+    int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
 
-    for (;;) {
-        prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
-                      TASK_UNINTERRUPTIBLE);
+    sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+    sgs[num_out++] = &hdr;
 
-        spin_lock_irq(vblk->disk->queue->queue_lock);
-        if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-                      GFP_ATOMIC) < 0) {
-            spin_unlock_irq(vblk->disk->queue->queue_lock);
-            io_schedule();
-        } else {
-            virtqueue_kick(vblk->vq);
-            spin_unlock_irq(vblk->disk->queue->queue_lock);
-            break;
-        }
-    }
+    /*
+     * If this is a packet command we need a couple of additional headers.
+     * Behind the normal outhdr we put a segment with the scsi command
+     * block, and before the normal inhdr we put the sense data and the
+     * inhdr with additional status information.
+     */
+    if (type == VIRTIO_BLK_T_SCSI_CMD) {
+        sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+        sgs[num_out++] = &cmd;
+    }
 
-    finish_wait(&vblk->queue_wait, &wait);
-}
+    if (have_data) {
+        if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+            sgs[num_out++] = data_sg;
+        else
+            sgs[num_out + num_in++] = data_sg;
+    }
 
-static inline void virtblk_add_req(struct virtblk_req *vbr,
-                   unsigned int out, unsigned int in)
+    if (type == VIRTIO_BLK_T_SCSI_CMD) {
+        sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+        sgs[num_out + num_in++] = &sense;
+        sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+        sgs[num_out + num_in++] = &inhdr;
+    }
+
+    sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+    sgs[num_out + num_in++] = &status;
+
+    return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
+}
+
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
 {
     struct virtio_blk *vblk = vbr->vblk;
+    DEFINE_WAIT(wait);
+    int ret;
 
     spin_lock_irq(vblk->disk->queue->queue_lock);
-    if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-                    GFP_ATOMIC) < 0)) {
+    while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+                         have_data)) < 0)) {
+        prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+                      TASK_UNINTERRUPTIBLE);
+
         spin_unlock_irq(vblk->disk->queue->queue_lock);
-        virtblk_add_buf_wait(vblk, vbr, out, in);
-        return;
+        io_schedule();
+        spin_lock_irq(vblk->disk->queue->queue_lock);
+
+        finish_wait(&vblk->queue_wait, &wait);
     }
+
     virtqueue_kick(vblk->vq);
     spin_unlock_irq(vblk->disk->queue->queue_lock);
 }
 
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
 {
-    unsigned int out = 0, in = 0;
-
     vbr->flags |= VBLK_IS_FLUSH;
     vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
     vbr->out_hdr.sector = 0;
     vbr->out_hdr.ioprio = 0;
-    sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-    sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
 
-    virtblk_add_req(vbr, out, in);
-
-    return 0;
+    virtblk_add_req(vbr, false);
 }
 
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
 {
     struct virtio_blk *vblk = vbr->vblk;
-    unsigned int num, out = 0, in = 0;
     struct bio *bio = vbr->bio;
+    bool have_data;
 
     vbr->flags &= ~VBLK_IS_FLUSH;
     vbr->out_hdr.type = 0;
     vbr->out_hdr.sector = bio->bi_sector;
     vbr->out_hdr.ioprio = bio_prio(bio);
 
-    sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-    num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
-    sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
-           sizeof(vbr->status));
-
-    if (num) {
-        if (bio->bi_rw & REQ_WRITE) {
+    if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+        have_data = true;
+        if (bio->bi_rw & REQ_WRITE)
             vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-            out += num;
-        } else {
+        else
             vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-            in += num;
-        }
-    }
-
-    virtblk_add_req(vbr, out, in);
-
-    return 0;
+    } else
+        have_data = false;
+
+    virtblk_add_req(vbr, have_data);
 }
 
 static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
            struct request *req)
 {
-    unsigned long num, out = 0, in = 0;
+    unsigned int num;
     struct virtblk_req *vbr;
 
     vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
         }
     }
 
-    sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-    /*
-     * If this is a packet command we need a couple of additional headers.
-     * Behind the normal outhdr we put a segment with the scsi command
-     * block, and before the normal inhdr we put the sense data and the
-     * inhdr with additional status information before the normal inhdr.
-     */
-    if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
-        sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
-    num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
-    if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
-        sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
-        sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
-               sizeof(vbr->in_hdr));
-    }
-
-    sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
-           sizeof(vbr->status));
+    num = blk_rq_map_sg(q, vbr->req, vblk->sg);
 
     if (num) {
-        if (rq_data_dir(vbr->req) == WRITE) {
+        if (rq_data_dir(vbr->req) == WRITE)
             vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-            out += num;
-        } else {
+        else
             vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-            in += num;
-        }
     }
 
-    if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
-                  GFP_ATOMIC) < 0) {
+    if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
         mempool_free(vbr, vblk->pool);
         return false;
     }
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
     struct virtio_device *vdev = vblk->vdev;
     struct request_queue *q = vblk->disk->queue;
     char cap_str_2[10], cap_str_10[10];
+    char *envp[] = { "RESIZE=1", NULL };
     u64 capacity, size;
 
     mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 
     set_capacity(vblk->disk, capacity);
     revalidate_disk(vblk->disk);
+    kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
 
 done:
     mutex_unlock(&vblk->config_lock);
 }
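__virtblk_add_req() above builds an array of scatterlist pointers and hands it to virtqueue_add_sgs(), which expects every device-readable ("out") segment to come before every device-writable ("in") segment. A simplified sketch of that grouping for a plain read or write, without the SCSI packet-command case (the names below are illustrative stand-ins, not the driver's real buffers):

    /*
     * Sketch only: request header (out), optional data (out for writes,
     * in for reads), then the status byte (in).  Assumes kernel headers
     * such as <linux/virtio.h> and <linux/virtio_blk.h>.
     */
    static int example_queue_rw(struct virtqueue *vq,
                                struct virtio_blk_outhdr *hdr_buf,
                                struct scatterlist *data, bool is_write,
                                u8 *status_buf, void *token)
    {
        struct scatterlist hdr, status, *sgs[3];
        unsigned int num_out = 0, num_in = 0;

        sg_init_one(&hdr, hdr_buf, sizeof(*hdr_buf));
        sgs[num_out++] = &hdr;                  /* device reads the header */

        if (is_write)
            sgs[num_out++] = data;              /* device reads data to write */
        else
            sgs[num_out + num_in++] = data;     /* device fills data on a read */

        sg_init_one(&status, status_buf, sizeof(*status_buf));
        sgs[num_out + num_in++] = &status;      /* device writes the status */

        return virtqueue_add_sgs(vq, sgs, num_out, num_in, token, GFP_ATOMIC);
    }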


@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
     sg_init_one(&sg, buf, size);
 
     /* There should always be room for one buffer. */
-    if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+    if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
         BUG();
 
     virtqueue_kick(vq);


@@ -78,8 +78,8 @@ struct ports_driver_data {
 };
 static struct ports_driver_data pdrvdata;
 
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
 
 /* This struct holds information that's relevant only for console ports */
 struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
 
     sg_init_one(sg, buf->buf, buf->size);
 
-    ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+    ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
     virtqueue_kick(vq);
     if (!ret)
         ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
     sg_init_one(sg, &cpkt, sizeof(cpkt));
 
     spin_lock(&portdev->c_ovq_lock);
-    if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+    if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
         virtqueue_kick(vq);
         while (!virtqueue_get_buf(vq, &len))
             cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
 
     reclaim_consumed_buffers(port);
 
-    err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+    err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
 
     /* Tell Host to go! */
     virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
     spin_lock_irq(&port->inbuf_lock);
     if (port->guest_connected) {
         spin_unlock_irq(&port->inbuf_lock);
-        ret = -EMFILE;
+        ret = -EBUSY;
         goto out;
     }
 
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
     return hvc_instantiate(0, 0, &hv_ops);
 }
 
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
 {
     int ret;


@@ -5,10 +5,9 @@ config LGUEST
     ---help---
       This is a very simple module which allows you to run
       multiple instances of the same Linux kernel, using the
-      "lguest" command found in the Documentation/virtual/lguest
-      directory.
+      "lguest" command found in the tools/lguest directory.
 
       Note that "lguest" is pronounced to rhyme with "fell quest",
-      not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+      not "rustyvisor". See tools/lguest/lguest.txt.
 
       If unsure, say N. If curious, say M. If masochistic, say Y.


@@ -20,9 +20,9 @@
 #include <asm/asm-offsets.h>
 #include "lg.h"
 
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
 static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
      * easy.
      */
 
+    /* We assume Switcher text fits into a single page. */
+    if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+        printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+               end_switcher_text - start_switcher_text);
+        return -EINVAL;
+    }
+
     /*
      * We allocate an array of struct page pointers. map_vm_area() wants
      * this, rather than just an array of pages.
      */
-    switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
-                GFP_KERNEL);
-    if (!switcher_page) {
+    lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+                    * TOTAL_SWITCHER_PAGES,
+                    GFP_KERNEL);
+    if (!lg_switcher_pages) {
         err = -ENOMEM;
         goto out;
     }
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
      * so we make sure they're zeroed.
      */
     for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-        switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-        if (!switcher_page[i]) {
+        lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+        if (!lg_switcher_pages[i]) {
             err = -ENOMEM;
             goto free_some_pages;
         }
     }
 
     /*
-     * First we check that the Switcher won't overlap the fixmap area at
-     * the top of memory. It's currently nowhere near, but it could have
-     * very strange effects if it ever happened.
+     * We place the Switcher underneath the fixmap area, which is the
+     * highest virtual address we can get. This is important, since we
+     * tell the Guest it can't access this memory, so we want its ceiling
+     * as high as possible.
      */
-    if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
-        err = -ENOMEM;
-        printk("lguest: mapping switcher would thwack fixmap\n");
-        goto free_pages;
-    }
+    switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
 
     /*
-     * Now we reserve the "virtual memory area" we want: 0xFFC00000
-     * (SWITCHER_ADDR). We might not get it in theory, but in practice
-     * it's worked so far. The end address needs +1 because __get_vm_area
-     * allocates an extra guard page, so we need space for that.
+     * Now we reserve the "virtual memory area" we want. We might
+     * not get it in theory, but in practice it's worked so far.
+     * The end address needs +1 because __get_vm_area allocates an
+     * extra guard page, so we need space for that.
      */
     switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-                     VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+                     VM_ALLOC, switcher_addr, switcher_addr
                      + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
     if (!switcher_vma) {
         err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
 
     /*
      * This code actually sets up the pages we've allocated to appear at
-     * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+     * switcher_addr. map_vm_area() takes the vma we allocated above, the
      * kind of pages we're mapping (kernel pages), and a pointer to our
      * array of struct pages. It increments that pointer, but we don't
      * care.
      */
-    pagep = switcher_page;
+    pagep = lg_switcher_pages;
     err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
     if (err) {
         printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
     i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
     for (--i; i >= 0; i--)
-        __free_pages(switcher_page[i], 0);
-    kfree(switcher_page);
+        __free_pages(lg_switcher_pages[i], 0);
+    kfree(lg_switcher_pages);
 out:
     return err;
 }
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
     vunmap(switcher_vma->addr);
     /* Now we just need to free the pages we copied the switcher into */
     for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-        __free_pages(switcher_page[i], 0);
-    kfree(switcher_page);
+        __free_pages(lg_switcher_pages[i], 0);
+    kfree(lg_switcher_pages);
 }
 
 /*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
     if (err)
         goto out;
 
-    /* Now we set up the pagetable implementation for the Guests. */
-    err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
-    if (err)
-        goto unmap;
-
     /* We might need to reserve an interrupt vector. */
     err = init_interrupts();
     if (err)
-        goto free_pgtables;
+        goto unmap;
 
     /* /dev/lguest needs to be registered. */
     err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
 
 free_interrupts:
     free_interrupts();
-free_pgtables:
-    free_pagetables();
 unmap:
     unmap_switcher();
 out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
 {
     lguest_device_remove();
     free_interrupts();
-    free_pagetables();
     unmap_switcher();
     lguest_arch_host_fini();
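With SWITCHER_ADDR gone, map_switcher() above computes switcher_addr at run time: one page of Switcher text plus two pages per possible CPU, placed just below FIXADDR_START with room for the guard page __get_vm_area() adds. A standalone sketch of that arithmetic, using made-up values for the fixmap base and CPU count (they are illustrative, not the kernel's real constants):

    #include <stdio.h>

    #define PAGE_SIZE              4096UL
    #define EXAMPLE_FIXADDR_START  0xffd00000UL   /* illustrative value only */
    #define EXAMPLE_NR_CPU_IDS     4              /* illustrative CPU count  */

    /* One page of Switcher text, then two pages per cpu. */
    #define TOTAL_SWITCHER_PAGES   (1 + 2 * EXAMPLE_NR_CPU_IDS)

    int main(void)
    {
        /* The +1 leaves room for the guard page __get_vm_area() allocates. */
        unsigned long switcher_addr =
            EXAMPLE_FIXADDR_START - (TOTAL_SWITCHER_PAGES + 1) * PAGE_SIZE;

        printf("Switcher text page at 0x%lx, %d per-cpu pages above it\n",
               switcher_addr, 2 * EXAMPLE_NR_CPU_IDS);
        return 0;
    }

Because the fixmap can now grow past 2MB, computing the address this way is what lets the Switcher move instead of colliding with it.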


@@ -14,11 +14,10 @@
 #include <asm/lguest.h>
 
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
 struct pgdir {
     unsigned long gpgdir;
+    bool switcher_mapped;
+    int last_host_cpu;
     pgd_t *pgdir;
 };
 
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
                unsigned long addr, unsigned long len);
 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
 
 /*H:035
  * Using memory-copy operations like that is usually inconvient, so we


@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
  */
 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 {
-    /* We have a limited number the number of CPUs in the lguest struct. */
+    /* We have a limited number of CPUs in the lguest struct. */
     if (id >= ARRAY_SIZE(cpu->lg->cpus))
         return -EINVAL;
 
     /* Set up this CPU's id, and pointer back to the lguest struct. */
     cpu->id = id;
-    cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+    cpu->lg = container_of(cpu, struct lguest, cpus[id]);
     cpu->lg->nr_cpus++;
 
     /* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
     if (!cpu->regs_page)
         return -ENOMEM;
 
-    /* We actually put the registers at the bottom of the page. */
+    /* We actually put the registers at the end of the page. */
     cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
 
     /*
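The container_of() change above is a nice readability fix: the old form stepped the pointer back to cpus[0] and used that member's offset, while the new form names cpus[id] directly. A small standalone illustration with stand-in struct names (not the lguest ones); note that using a variable array index inside offsetof() relies on the GNU builtin, which is the same extension the kernel's own container_of() depends on here:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct toy_cpu { int id; };
    struct toy_guest { int nr_cpus; struct toy_cpu cpus[4]; };

    int main(void)
    {
        struct toy_guest g = { .nr_cpus = 4 };
        unsigned int id = 2;
        struct toy_cpu *cpu = &g.cpus[id];

        /* Old style: step back to cpus[0], then use its offset. */
        struct toy_guest *a = container_of(cpu - id, struct toy_guest, cpus[0]);
        /* New style: use the offset of cpus[id] directly. */
        struct toy_guest *b = container_of(cpu, struct toy_guest, cpus[id]);

        printf("both recover the enclosing struct: %d\n", a == &g && b == &g);
        return 0;
    }

Both forms compute the same address; the new one simply says what it means.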


@@ -7,7 +7,7 @@
  * converted Guest pages when running the Guest.
 :*/
 
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
  * GPL v2 and any later version */
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -62,22 +62,11 @@
  * will need the last pmd entry of the last pmd page.
  */
 #ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
-#define RESERVE_MEM 2U
 #define CHECK_GPGD_MASK _PAGE_PRESENT
 #else
-#define RESERVE_MEM 4U
 #define CHECK_GPGD_MASK _PAGE_TABLE
 #endif
 
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
  * clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
     unsigned int index = pgd_index(vaddr);
 
-#ifndef CONFIG_X86_PAE
-    /* We kill any Guest trying to touch the Switcher addresses. */
-    if (index >= SWITCHER_PGD_INDEX) {
-        kill_guest(cpu, "attempt to access switcher pages");
-        index = 0;
-    }
-#endif
     /* Return a pointer index'th pgd entry for the i'th page table. */
     return &cpu->lg->pgdirs[i].pgdir[index];
 }
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
     unsigned int index = pmd_index(vaddr);
     pmd_t *page;
 
-    /* We kill any Guest trying to touch the Switcher addresses. */
-    if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
-        index >= SWITCHER_PMD_INDEX) {
-        kill_guest(cpu, "attempt to access switcher pages");
-        index = 0;
-    }
-
     /* You should never call this if the PGD entry wasn't valid */
     BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
     page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,29 +250,120 @@ static void release_pte(pte_t pte)
 }
 /*:*/
 
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
     if ((pte_flags(gpte) & _PAGE_PSE) ||
-        pte_pfn(gpte) >= cpu->lg->pfn_limit)
+        pte_pfn(gpte) >= cpu->lg->pfn_limit) {
         kill_guest(cpu, "bad page table entry");
+        return false;
+    }
+    return true;
 }
 
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
     if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-        (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+        (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
         kill_guest(cpu, "bad page directory entry");
+        return false;
+    }
+    return true;
 }
 
 #ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
 {
     if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-        (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+        (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
         kill_guest(cpu, "bad page middle directory entry");
+        return false;
+    }
+    return true;
 }
 #endif
/*H:331
* This is the core routine to walk the shadow page tables and find the page
* table entry for a specific address.
*
* If allocate is set, then we allocate any missing levels, setting the flags
* on the new page directory and mid-level directories using the arguments
* (which are copied from the Guest's page table entries).
*/
static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
int pgd_flags, int pmd_flags)
{
pgd_t *spgd;
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
#endif
/* Get top level entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage;
/* If they didn't want us to allocate anything, stop. */
if (!allocate)
return NULL;
ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
return NULL;
}
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
}
/*
* Intel's Physical Address Extension actually uses three levels of
* page tables, so we need to look in the mid-level.
*/
#ifdef CONFIG_X86_PAE
/* Now look at the mid-level shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage;
/* If they didn't want us to allocate anything, stop. */
if (!allocate)
return NULL;
ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pmd page");
return NULL;
}
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
}
#endif
/* Get the pointer to the shadow PTE entry we're going to set. */
return spte_addr(cpu, *spgd, vaddr);
}
/*H:330 /*H:330
* (i) Looking up a page table entry when the Guest faults. * (i) Looking up a page table entry when the Guest faults.
* *
@ -311,17 +377,15 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
*/ */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{ {
pgd_t gpgd;
pgd_t *spgd;
unsigned long gpte_ptr; unsigned long gpte_ptr;
pte_t gpte; pte_t gpte;
pte_t *spte; pte_t *spte;
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
pmd_t gpmd; pmd_t gpmd;
#endif pgd_t gpgd;
/* We never demand page the Switcher, so trying is a mistake. */
if (vaddr >= switcher_addr)
return false;
/* First step: get the top-level Guest page table entry. */ /* First step: get the top-level Guest page table entry. */
if (unlikely(cpu->linear_pages)) { if (unlikely(cpu->linear_pages)) {
@ -332,65 +396,31 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
/* Toplevel not present? We can't map it in. */ /* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return false; return false;
/*
* This kills the Guest if it has weird flags or tries to
* refer to a "physical" address outside the bounds.
*/
if (!check_gpgd(cpu, gpgd))
return false;
} }
/* Now look at the matching shadow entry. */ /* This "mid-level" entry is only used for non-linear, PAE mode. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); gpmd = __pmd(_PAGE_TABLE);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
return false;
}
/* We check that the Guest pgd is OK. */
check_gpgd(cpu, gpgd);
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
}
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
if (unlikely(cpu->linear_pages)) { if (likely(!cpu->linear_pages)) {
/* Faking up a linear mapping. */
gpmd = __pmd(_PAGE_TABLE);
} else {
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
/* Middle level not present? We can't map it in. */ /* Middle level not present? We can't map it in. */
if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
return false; return false;
}
/* Now look at the matching shadow entry. */ /*
spmd = spmd_addr(cpu, *spgd, vaddr); * This kills the Guest if it has weird flags or tries to
* refer to a "physical" address outside the bounds.
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/ */
if (!ptepage) { if (!check_gpmd(cpu, gpmd))
kill_guest(cpu, "out of memory allocating pte page");
return false; return false;
}
/* We check that the Guest pmd is OK. */
check_gpmd(cpu, gpmd);
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
} }
/* /*
@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below * Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). * the pfn_limit (ie. not mapping the Launcher binary).
*/ */
check_gpte(cpu, gpte); if (!check_gpte(cpu, gpte))
return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte); gpte = pte_mkyoung(gpte);
@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte); gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */ /* Get the pointer to the shadow PTE entry we're going to set. */
spte = spte_addr(cpu, *spgd, vaddr); spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
if (!spte)
return false;
/* /*
* If there was a valid shadow PTE entry here before, we release it. * If there was a valid shadow PTE entry here before, we release it.
@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/ */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{ {
pgd_t *spgd; pte_t *spte;
unsigned long flags; unsigned long flags;
#ifdef CONFIG_X86_PAE /* You can't put your stack in the Switcher! */
pmd_t *spmd; if (vaddr >= switcher_addr)
#endif
/* Look at the current top level entry: is it present? */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return false; return false;
#ifdef CONFIG_X86_PAE /* If there's no shadow PTE, it's not writable. */
spmd = spmd_addr(cpu, *spgd, vaddr); spte = find_spte(cpu, vaddr, false, 0, 0);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) if (!spte)
return false; return false;
#endif
/* /*
* Check the flags on the pte entry itself: it must be present and * Check the flags on the pte entry itself: it must be present and
* writable. * writable.
*/ */
flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
} }
@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir) int *blank_pgdir)
{ {
unsigned int next; unsigned int next;
#ifdef CONFIG_X86_PAE
pmd_t *pmd_table;
#endif
/* /*
* We pick one entry at random to throw out. Choosing the Least * We pick one entry at random to throw out. Choosing the Least
@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir) if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd; next = cpu->cpu_pgd;
else { else {
#ifdef CONFIG_X86_PAE
/* /*
* In PAE mode, allocate a pmd page and populate the * This is a blank page, so there are no kernel
* last pgd entry. * mappings: caller must map the stack!
*/ */
pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd_table) {
free_page((long)cpu->lg->pgdirs[next].pgdir);
set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
next = cpu->cpu_pgd;
} else {
set_pgd(cpu->lg->pgdirs[next].pgdir +
SWITCHER_PGD_INDEX,
__pgd(__pa(pmd_table) | _PAGE_PRESENT));
/*
* This is a blank page, so there are no kernel
* mappings: caller must map the stack!
*/
*blank_pgdir = 1;
}
#else
*blank_pgdir = 1; *blank_pgdir = 1;
#endif
} }
} }
/* Record which Guest toplevel this shadows. */ /* Record which Guest toplevel this shadows. */
@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
/* Release all the non-kernel mappings. */ /* Release all the non-kernel mappings. */
flush_user_mappings(cpu->lg, next); flush_user_mappings(cpu->lg, next);
/* This hasn't run on any CPU at all. */
cpu->lg->pgdirs[next].last_host_cpu = -1;
return next; return next;
} }
/*H:501
* We do need the Switcher code mapped at all times, so we allocate that
* part of the Guest page table here. We map the Switcher code immediately,
* but defer mapping of the guest register page and IDT/LDT etc page until
* just before we run the guest in map_switcher_in_guest().
*
* We *could* do this setup in map_switcher_in_guest(), but at that point
* we've interrupts disabled, and allocating pages like that is fraught: we
* can't sleep if we need to free up some memory.
*/
static bool allocate_switcher_mapping(struct lg_cpu *cpu)
{
int i;
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
CHECK_GPGD_MASK, _PAGE_TABLE);
if (!pte)
return false;
/*
* Map the switcher page if not already there. It might
* already be there because we call allocate_switcher_mapping()
* in guest_set_pgd() just in case it did discard our Switcher
* mapping, but it probably didn't.
*/
if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
/* Get a reference to the Switcher page. */
get_page(lg_switcher_pages[0]);
/* Create a read-only, exectuable, kernel-style PTE */
set_pte(pte,
mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
}
}
cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
return true;
}
/*H:470 /*H:470
* Finally, a routine which throws away everything: all PGD entries in all * Finally, a routine which throws away everything: all PGD entries in all
* the shadow page tables, including the Guest's kernel mappings. This is used * the shadow page tables, including the Guest's kernel mappings. This is used
@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
unsigned int i, j; unsigned int i, j;
/* Every shadow pagetable this Guest has */ /* Every shadow pagetable this Guest has */
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
if (lg->pgdirs[i].pgdir) { if (!lg->pgdirs[i].pgdir)
#ifdef CONFIG_X86_PAE continue;
pgd_t *spgd;
pmd_t *pmdpage;
unsigned int k;
/* Get the last pmd page. */ /* Every PGD entry. */
spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; for (j = 0; j < PTRS_PER_PGD; j++)
pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); release_pgd(lg->pgdirs[i].pgdir + j);
lg->pgdirs[i].switcher_mapped = false;
/* lg->pgdirs[i].last_host_cpu = -1;
* And release the pmd entries of that pmd page, }
* except for the switcher pmd.
*/
for (k = 0; k < SWITCHER_PMD_INDEX; k++)
release_pmd(&pmdpage[k]);
#endif
/* Every PGD entry except the Switcher at the top */
for (j = 0; j < SWITCHER_PGD_INDEX; j++)
release_pgd(lg->pgdirs[i].pgdir + j);
}
} }
/* /*
@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
release_all_pagetables(cpu->lg); release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */ /* We need the Guest kernel stack mapped again. */
pin_stack_pages(cpu); pin_stack_pages(cpu);
/* And we need Switcher allocated. */
if (!allocate_switcher_mapping(cpu))
kill_guest(cpu, "Cannot populate switcher mapping");
} }
/*H:430 /*H:430
@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
newpgdir = new_pgdir(cpu, pgtable, &repin); newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */ /* Change the current pgd index to the new one. */
cpu->cpu_pgd = newpgdir; cpu->cpu_pgd = newpgdir;
/* If it was completely blank, we map in the Guest kernel stack */ /*
* If it was completely blank, we map in the Guest kernel stack and
* the Switcher.
*/
if (repin) if (repin)
pin_stack_pages(cpu); pin_stack_pages(cpu);
if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
if (!allocate_switcher_mapping(cpu))
kill_guest(cpu, "Cannot populate switcher mapping");
}
} }
/*:*/ /*:*/
@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
* micro-benchmark. * micro-benchmark.
*/ */
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
check_gpte(cpu, gpte); if (!check_gpte(cpu, gpte))
return;
set_pte(spte, set_pte(spte,
gpte_to_spte(cpu, gpte, gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY)); pte_flags(gpte) & _PAGE_DIRTY));
@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
void guest_set_pte(struct lg_cpu *cpu, void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte) unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{ {
/* We don't let you remap the Switcher; we need it to get back! */
if (vaddr >= switcher_addr) {
kill_guest(cpu, "attempt to set pte into Switcher pages");
return;
}
/* /*
* Kernel mappings must be changed on all top levels. Slow, but doesn't * Kernel mappings must be changed on all top levels. Slow, but doesn't
* happen often. * happen often.
@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{ {
int pgdir; int pgdir;
if (idx >= SWITCHER_PGD_INDEX) if (idx > PTRS_PER_PGD) {
kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
idx, PTRS_PER_PGD);
return; return;
}
/* If they're talking about a page table we have a shadow for... */ /* If they're talking about a page table we have a shadow for... */
pgdir = find_pgdir(lg, gpgdir); pgdir = find_pgdir(lg, gpgdir);
if (pgdir < ARRAY_SIZE(lg->pgdirs)) if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
/* ... throw it away. */ /* ... throw it away. */
release_pgd(lg->pgdirs[pgdir].pgdir + idx); release_pgd(lg->pgdirs[pgdir].pgdir + idx);
/* That might have been the Switcher mapping, remap it. */
if (!allocate_switcher_mapping(&lg->cpus[0])) {
kill_guest(&lg->cpus[0],
"Cannot populate switcher mapping");
}
}
} }
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
* we will populate on future faults. The Guest doesn't have any actual * we will populate on future faults. The Guest doesn't have any actual
* pagetables yet, so we set linear_pages to tell demand_page() to fake it * pagetables yet, so we set linear_pages to tell demand_page() to fake it
* for the moment. * for the moment.
*
* We do need the Switcher to be mapped at all times, so we allocate that
* part of the Guest page table here.
*/ */
int init_guest_pagetable(struct lguest *lg) int init_guest_pagetable(struct lguest *lg)
{ {
@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
/* We start with a linear mapping until the initialize. */ /* We start with a linear mapping until the initialize. */
cpu->linear_pages = true; cpu->linear_pages = true;
/* Allocate the page tables for the Switcher. */
if (!allocate_switcher_mapping(cpu)) {
release_all_pagetables(lg);
return -ENOMEM;
}
return 0; return 0;
} }
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu) void page_table_guest_data_init(struct lg_cpu *cpu)
{ {
/*
* We tell the Guest that it can't use the virtual addresses
* used by the Switcher. This trick is equivalent to 4GB -
* switcher_addr.
*/
u32 top = ~switcher_addr + 1;
/* We get the kernel address: above this is all kernel memory. */ /* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address, if (get_user(cpu->lg->kernel_address,
&cpu->lg->lguest_data->kernel_address) &cpu->lg->lguest_data->kernel_address)
/* /*
* We tell the Guest that it can't use the top 2 or 4 MB * We tell the Guest that it can't use the top virtual
* of virtual addresses used by the Switcher. * addresses (used by the Switcher).
*/ */
|| put_user(RESERVE_MEM * 1024 * 1024, || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
&cpu->lg->lguest_data->reserve_mem)) {
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
return; return;
} }
@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
* "pgd_index(lg->kernel_address)". This assumes it won't hit the * "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. * Switcher mappings, so check that now.
*/ */
#ifdef CONFIG_X86_PAE if (cpu->lg->kernel_address >= switcher_addr)
if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
kill_guest(cpu, "bad kernel address %#lx", kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address); cpu->lg->kernel_address);
} }
@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
free_page((long)lg->pgdirs[i].pgdir); free_page((long)lg->pgdirs[i].pgdir);
} }
/*H:481
* This clears the Switcher mappings for cpu #i.
*/
static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
{
unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
pte_t *pte;
/* Clear the mappings for both pages. */
pte = find_spte(cpu, base, false, 0, 0);
release_pte(*pte);
set_pte(pte, __pte(0));
pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
release_pte(*pte);
set_pte(pte, __pte(0));
}
/*H:480 /*H:480
* (vi) Mapping the Switcher when the Guest is about to run. * (vi) Mapping the Switcher when the Guest is about to run.
* *
* The Switcher and the two pages for this CPU need to be visible in the * The Switcher and the two pages for this CPU need to be visible in the Guest
* Guest (and not the pages for other CPUs). We have the appropriate PTE pages * (and not the pages for other CPUs).
* for each CPU already set up, we just need to hook them in now we know which *
* Guest is about to run on this CPU. * The pages for the pagetables have all been allocated before: we just need
* to make sure the actual PTEs are up-to-date for the CPU we're about to run
* on.
*/ */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{ {
pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); unsigned long base;
pte_t regs_pte; struct page *percpu_switcher_page, *regs_page;
pte_t *pte;
struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
#ifdef CONFIG_X86_PAE /* Switcher page should always be mapped by now! */
pmd_t switcher_pmd; BUG_ON(!pgdir->switcher_mapped);
pmd_t *pmd_table;
switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, /*
PAGE_KERNEL_EXEC); * Remember that we have two pages for each Host CPU, so we can run a
* Guest on each CPU without them interfering. We need to make sure
/* Figure out where the pmd page is, by reading the PGD, and converting * those pages are mapped correctly in the Guest, but since we usually
* it to a virtual address. */ * run on the same CPU, we cache that, and only update the mappings
pmd_table = __va(pgd_pfn(cpu->lg-> * when we move.
pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
<< PAGE_SHIFT);
/* Now write it into the shadow page table. */
set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
pgd_t switcher_pgd;
/*
* Make the last PGD entry for this Guest point to the Switcher's PTE
* page for this CPU (with appropriate flags).
*/ */
switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); if (pgdir->last_host_cpu == raw_smp_processor_id())
return;
cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* -1 means unknown so we remove everything. */
if (pgdir->last_host_cpu == -1) {
#endif unsigned int i;
/* for_each_possible_cpu(i)
* We also change the Switcher PTE page. When we're running the Guest, remove_switcher_percpu_map(cpu, i);
* we want the Guest's "regs" page to appear where the first Switcher } else {
* page for this CPU is. This is an optimization: when the Switcher /* We know exactly what CPU mapping to remove. */
* saves the Guest registers, it saves them into the first page of this remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
* CPU's "struct lguest_pages": if we make sure the Guest's register
* page is already mapped there, we don't have to copy them out
* again.
*/
regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
}
/*:*/
static void free_switcher_pte_pages(void)
{
unsigned int i;
for_each_possible_cpu(i)
free_page((long)switcher_pte_page(i));
}
/*H:520
* Setting up the Switcher PTE page for given CPU is fairly easy, given
* the CPU number and the "struct page"s for the Switcher code itself.
*
* Currently the Switcher is less than a page long, so "pages" is always 1.
*/
static __init void populate_switcher_pte_page(unsigned int cpu,
struct page *switcher_page[],
unsigned int pages)
{
unsigned int i;
pte_t *pte = switcher_pte_page(cpu);
/* The first entries are easy: they map the Switcher code. */
for (i = 0; i < pages; i++) {
set_pte(&pte[i], mk_pte(switcher_page[i],
__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
} }
/* The only other thing we map is this CPU's pair of pages. */ /*
i = pages + cpu*2; * When we're running the Guest, we want the Guest's "regs" page to
* appear where the first Switcher page for this CPU is. This is an
/* First page (Guest registers) is writable from the Guest */ * optimization: when the Switcher saves the Guest registers, it saves
set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), * them into the first page of this CPU's "struct lguest_pages": if we
__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); * make sure the Guest's register page is already mapped there, we
* don't have to copy them out again.
*/
/* Find the shadow PTE for this regs page. */
base = switcher_addr + PAGE_SIZE
+ raw_smp_processor_id() * sizeof(struct lguest_pages);
pte = find_spte(cpu, base, false, 0, 0);
regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
get_page(regs_page);
set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
/* /*
* The second page contains the "struct lguest_ro_state", and is * We map the second page of the struct lguest_pages read-only in
* read-only. * the Guest: the IDT, GDT and other things it's not supposed to
* change.
*/ */
set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); percpu_switcher_page
= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
get_page(percpu_switcher_page);
set_pte(pte, mk_pte(percpu_switcher_page,
__pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
pgdir->last_host_cpu = raw_smp_processor_id();
} }
/* /*H:490
* We've made it through the page table code. Perhaps our tired brains are * We've made it through the page table code. Perhaps our tired brains are
* still processing the details, or perhaps we're simply glad it's over. * still processing the details, or perhaps we're simply glad it's over.
* *
@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
* *
* There is just one file remaining in the Host. * There is just one file remaining in the Host.
*/ */
/*H:510
* At boot or module load time, init_pagetables() allocates and populates
* the Switcher PTE page for each CPU.
*/
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
unsigned int i;
for_each_possible_cpu(i) {
switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!switcher_pte_page(i)) {
free_switcher_pte_pages();
return -ENOMEM;
}
populate_switcher_pte_page(i, switcher_page, pages);
}
return 0;
}
/*:*/
/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
free_switcher_pte_pages();
}
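Much of the page_tables.c rework funnels through the new find_spte() helper quoted above: one routine walks the shadow page table for a virtual address and, when asked, allocates any missing intermediate level, so demand_page(), page_writable() and the Switcher-mapping code no longer duplicate that walk. A deliberately tiny, userspace model of that walk/allocate-on-demand shape, with hypothetical names and only two levels (the real helper also copies Guest flags into new levels and kills the Guest on allocation failure):

    #include <stdlib.h>

    #define TOY_ENTRIES     1024
    #define TOY_PAGE_SHIFT  12

    struct toy_mm {
        unsigned long *top[TOY_ENTRIES];   /* each slot points to a PTE table */
    };

    /* Return the PTE slot for vaddr; optionally allocate the missing level. */
    static unsigned long *toy_find_pte(struct toy_mm *mm, unsigned long vaddr,
                                       int allocate)
    {
        unsigned int top_idx = (vaddr >> (TOY_PAGE_SHIFT + 10)) % TOY_ENTRIES;
        unsigned int pte_idx = (vaddr >> TOY_PAGE_SHIFT) % TOY_ENTRIES;
        unsigned long *ptes = mm->top[top_idx];

        if (!ptes) {
            if (!allocate)
                return NULL;        /* lookup only, like find_spte(..., false) */
            ptes = calloc(TOY_ENTRIES, sizeof(*ptes));
            if (!ptes)
                return NULL;        /* out of memory */
            mm->top[top_idx] = ptes;
        }
        return &ptes[pte_idx];
    }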


@@ -59,14 +59,13 @@ static struct {
 /* Offset from where switcher.S was compiled to where we've copied it */
 static unsigned long switcher_offset(void)
 {
-    return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+    return switcher_addr - (unsigned long)start_switcher_text;
 }
 
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
 static struct lguest_pages *lguest_pages(unsigned int cpu)
 {
-    return &(((struct lguest_pages *)
-        (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+    return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
 }
 
 static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);


@@ -40,3 +40,17 @@ config CAIF_HSI
       The caif low level driver for CAIF over HSI.
       Be aware that if you enable this then you also need to
       enable a low-level HSI driver.
+
+config CAIF_VIRTIO
+    tristate "CAIF virtio transport driver"
+    depends on CAIF
+    select VHOST_RING
+    select VIRTIO
+    select GENERIC_ALLOCATOR
+    default n
+    ---help---
+      The caif driver for CAIF over Virtio.
+
+if CAIF_VIRTIO
+source "drivers/vhost/Kconfig"
+endif


@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
 
 # HSI interface
 obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
+
+# Virtio interface
+obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o


@ -0,0 +1,790 @@
/*
* Copyright (C) ST-Ericsson AB 2013
* Authors: Vicram Arv
* Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
* Sjur Brendeland
* License terms: GNU General Public License (GPL) version 2
*/
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/virtio.h>
#include <linux/vringh.h>
#include <linux/debugfs.h>
#include <linux/spinlock.h>
#include <linux/genalloc.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/virtio_ids.h>
#include <linux/virtio_caif.h>
#include <linux/virtio_ring.h>
#include <linux/dma-mapping.h>
#include <net/caif/caif_dev.h>
#include <linux/virtio_config.h>
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Vicram Arv");
MODULE_AUTHOR("Sjur Brendeland");
MODULE_DESCRIPTION("Virtio CAIF Driver");
/* NAPI schedule quota */
#define CFV_DEFAULT_QUOTA 32
/* Defaults used if virtio config space is unavailable */
#define CFV_DEF_MTU_SIZE 4096
#define CFV_DEF_HEADROOM 32
#define CFV_DEF_TAILROOM 32
/* Required IP header alignment */
#define IP_HDR_ALIGN 4
/* struct cfv_napi_context - NAPI context info
* @riov: IOV holding data read from the ring. Note that riov may
* still hold data when cfv_rx_poll() returns.
* @head: Last descriptor ID we received from vringh_getdesc_kern.
* We use this to put descriptor back on the used ring. USHRT_MAX is
* used to indicate invalid head-id.
*/
struct cfv_napi_context {
struct vringh_kiov riov;
unsigned short head;
};
/* struct cfv_stats - statistics for debugfs
* @rx_napi_complete: Number of NAPI completions (RX)
* @rx_napi_resched: Number of calls where the full quota was used (RX)
* @rx_nomem: Number of SKB alloc failures (RX)
* @rx_kicks: Number of RX kicks
 * @tx_full_ring: Number of times the TX ring was full
* @tx_no_mem: Number of times TX went out of memory
* @tx_flow_on: Number of flow on (TX)
* @tx_kicks: Number of TX kicks
*/
struct cfv_stats {
u32 rx_napi_complete;
u32 rx_napi_resched;
u32 rx_nomem;
u32 rx_kicks;
u32 tx_full_ring;
u32 tx_no_mem;
u32 tx_flow_on;
u32 tx_kicks;
};
/* struct cfv_info - Caif Virtio control structure
* @cfdev: caif common header
* @vdev: Associated virtio device
* @vr_rx: rx/downlink host vring
* @vq_tx: tx/uplink virtqueue
* @ndev: CAIF link layer device
 * @watermark_tx: indicates the number of free descriptors we need
* to reopen the tx-queues after overload.
* @tx_lock: protects vq_tx from concurrent use
* @tx_release_tasklet: Tasklet for freeing consumed TX buffers
* @napi: Napi context used in cfv_rx_poll()
* @ctx: Context data used in cfv_rx_poll()
* @tx_hr: transmit headroom
* @rx_hr: receive headroom
* @tx_tr: transmit tail room
* @rx_tr: receive tail room
* @mtu: transmit max size
* @mru: receive max size
* @allocsz: size of dma memory reserved for TX buffers
* @alloc_addr: virtual address to dma memory for TX buffers
* @alloc_dma: dma address to dma memory for TX buffers
* @genpool: Gen Pool used for allocating TX buffers
* @reserved_mem: Pointer to memory reserve allocated from genpool
* @reserved_size: Size of memory reserve allocated from genpool
 * @stats: Statistics exposed in debugfs
* @debugfs: Debugfs dentry for statistic counters
*/
struct cfv_info {
struct caif_dev_common cfdev;
struct virtio_device *vdev;
struct vringh *vr_rx;
struct virtqueue *vq_tx;
struct net_device *ndev;
unsigned int watermark_tx;
/* Protect access to vq_tx */
spinlock_t tx_lock;
struct tasklet_struct tx_release_tasklet;
struct napi_struct napi;
struct cfv_napi_context ctx;
u16 tx_hr;
u16 rx_hr;
u16 tx_tr;
u16 rx_tr;
u32 mtu;
u32 mru;
size_t allocsz;
void *alloc_addr;
dma_addr_t alloc_dma;
struct gen_pool *genpool;
unsigned long reserved_mem;
size_t reserved_size;
struct cfv_stats stats;
struct dentry *debugfs;
};
/* struct buf_info - maintains transmit buffer data handle
* @size: size of transmit buffer
* @vaddr: virtual address mapping to allocated memory area
*/
struct buf_info {
size_t size;
u8 *vaddr;
};
/* Called from virtio device, in IRQ context */
static void cfv_release_cb(struct virtqueue *vq_tx)
{
struct cfv_info *cfv = vq_tx->vdev->priv;
++cfv->stats.tx_kicks;
tasklet_schedule(&cfv->tx_release_tasklet);
}
static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
{
if (!buf_info)
return;
gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
buf_info->size);
kfree(buf_info);
}
/* This is invoked whenever the remote processor has completed processing
 * a TX msg we just sent, and the buffer is put back on the used ring.
*/
static void cfv_release_used_buf(struct virtqueue *vq_tx)
{
struct cfv_info *cfv = vq_tx->vdev->priv;
unsigned long flags;
BUG_ON(vq_tx != cfv->vq_tx);
for (;;) {
unsigned int len;
struct buf_info *buf_info;
/* Get used buffer from used ring to recycle used descriptors */
spin_lock_irqsave(&cfv->tx_lock, flags);
buf_info = virtqueue_get_buf(vq_tx, &len);
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* Stop looping if there are no more buffers to free */
if (!buf_info)
break;
free_buf_info(cfv, buf_info);
/* watermark_tx indicates if we previously stopped the tx
 * queues. If we have enough free slots in the virtio ring,
 * re-establish the memory reserve and open up the tx queues.
*/
if (cfv->vq_tx->num_free <= cfv->watermark_tx)
continue;
/* Re-establish memory reserve */
if (cfv->reserved_mem == 0 && cfv->genpool)
cfv->reserved_mem =
gen_pool_alloc(cfv->genpool,
cfv->reserved_size);
/* Open up the tx queues */
if (cfv->reserved_mem) {
cfv->watermark_tx =
virtqueue_get_vring_size(cfv->vq_tx);
netif_tx_wake_all_queues(cfv->ndev);
/* Buffers are recycled in cfv_netdev_tx, so
* disable notifications when queues are opened.
*/
virtqueue_disable_cb(cfv->vq_tx);
++cfv->stats.tx_flow_on;
} else {
/* if no memory reserve, wait for more free slots */
WARN_ON(cfv->watermark_tx >
virtqueue_get_vring_size(cfv->vq_tx));
cfv->watermark_tx +=
virtqueue_get_vring_size(cfv->vq_tx) / 4;
}
}
}
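/* Summary of the TX flow-control scheme in cfv_release_used_buf() above and
 * cfv_netdev_tx() below (added for clarity, not part of the original driver
 * comments): the netdev queues are stopped when the vring has fewer free slots
 * than CPUs or when a buffer allocation fails, and watermark_tx is then set to
 * a quarter of the ring. Once enough descriptors have been recycled
 * (num_free > watermark_tx) and the memory reserve can be re-established, the
 * queues are woken and watermark_tx is reset to the full ring size; otherwise
 * the watermark is raised by another quarter of the ring and we keep waiting.
 */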
/* Allocate a SKB and copy packet data to it */
static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
struct cfv_info *cfv,
u8 *frm, u32 frm_len)
{
struct sk_buff *skb;
u32 cfpkt_len, pad_len;
*err = 0;
/* Verify the frame length against the MRU and the down-link head/tail room */
if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
netdev_err(cfv->ndev,
"Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
frm_len, cfv->mru, cfv->rx_hr,
cfv->rx_tr);
*err = -EPROTO;
return NULL;
}
cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
if (!skb) {
*err = -ENOMEM;
return NULL;
}
skb_reserve(skb, cfv->rx_hr + pad_len);
memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
return skb;
}
/* Get packets from the host vring */
static int cfv_rx_poll(struct napi_struct *napi, int quota)
{
struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
int rxcnt = 0;
int err = 0;
void *buf;
struct sk_buff *skb;
struct vringh_kiov *riov = &cfv->ctx.riov;
unsigned int skb_len;
again:
do {
skb = NULL;
/* Put the previous iovec back on the used ring and
* fetch a new iovec if we have processed all elements.
*/
if (riov->i == riov->used) {
if (cfv->ctx.head != USHRT_MAX) {
vringh_complete_kern(cfv->vr_rx,
cfv->ctx.head,
0);
cfv->ctx.head = USHRT_MAX;
}
err = vringh_getdesc_kern(
cfv->vr_rx,
riov,
NULL,
&cfv->ctx.head,
GFP_ATOMIC);
if (err <= 0)
goto exit;
}
buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
/* TODO: Add check on valid buffer address */
skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
riov->iov[riov->i].iov_len);
if (unlikely(err))
goto exit;
/* Push received packet up the stack. */
skb_len = skb->len;
skb->protocol = htons(ETH_P_CAIF);
skb_reset_mac_header(skb);
skb->dev = cfv->ndev;
err = netif_receive_skb(skb);
if (unlikely(err)) {
++cfv->ndev->stats.rx_dropped;
} else {
++cfv->ndev->stats.rx_packets;
cfv->ndev->stats.rx_bytes += skb_len;
}
++riov->i;
++rxcnt;
} while (rxcnt < quota);
++cfv->stats.rx_napi_resched;
goto out;
exit:
switch (err) {
case 0:
++cfv->stats.rx_napi_complete;
/* Really out of packets? (stolen from virtio_net) */
napi_complete(napi);
if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
napi_schedule_prep(napi)) {
vringh_notify_disable_kern(cfv->vr_rx);
__napi_schedule(napi);
goto again;
}
break;
case -ENOMEM:
++cfv->stats.rx_nomem;
dev_kfree_skb(skb);
/* Stop NAPI poll on OOM, we hope to be polled later */
napi_complete(napi);
vringh_notify_enable_kern(cfv->vr_rx);
break;
default:
/* We're doomed, any modem fault is fatal */
netdev_warn(cfv->ndev, "Bad ring, disable device\n");
cfv->ndev->stats.rx_dropped = riov->used - riov->i;
napi_complete(napi);
vringh_notify_disable_kern(cfv->vr_rx);
netif_carrier_off(cfv->ndev);
break;
}
out:
if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
vringh_notify(cfv->vr_rx);
return rxcnt;
}
static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
{
struct cfv_info *cfv = vdev->priv;
++cfv->stats.rx_kicks;
vringh_notify_disable_kern(cfv->vr_rx);
napi_schedule(&cfv->napi);
}
static void cfv_destroy_genpool(struct cfv_info *cfv)
{
if (cfv->alloc_addr)
dma_free_coherent(cfv->vdev->dev.parent->parent,
cfv->allocsz, cfv->alloc_addr,
cfv->alloc_dma);
if (!cfv->genpool)
return;
gen_pool_free(cfv->genpool, cfv->reserved_mem,
cfv->reserved_size);
gen_pool_destroy(cfv->genpool);
cfv->genpool = NULL;
}
static int cfv_create_genpool(struct cfv_info *cfv)
{
int err;
/* dma_alloc can only allocate whole pages, and we need a more
 * fine-grained allocation so we use genpool. We ask for the space needed
 * by IP and a full ring. If the dma allocation fails we retry with a
* smaller allocation size.
*/
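/* Worked example (illustrative, not from the original source): with a
 * 256-entry TX ring and the default 32-byte head/tail room, the first
 * attempt asks for 256 * (1500 + 32 + 32) * 11/10, roughly 440 kB; each
 * failed dma_alloc_coherent() shrinks that by a quarter until it succeeds
 * or the pool would no longer hold one MTU-sized buffer per possible CPU.
 */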
err = -ENOMEM;
cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
(ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
return -EINVAL;
for (;;) {
if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
netdev_info(cfv->ndev, "Not enough device memory\n");
return -ENOMEM;
}
cfv->alloc_addr = dma_alloc_coherent(
cfv->vdev->dev.parent->parent,
cfv->allocsz, &cfv->alloc_dma,
GFP_ATOMIC);
if (cfv->alloc_addr)
break;
cfv->allocsz = (cfv->allocsz * 3) >> 2;
}
netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
cfv->allocsz);
/* Allocate on 128-byte boundaries (1 << 7) */
cfv->genpool = gen_pool_create(7, -1);
if (!cfv->genpool)
goto err;
err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
(phys_addr_t)virt_to_phys(cfv->alloc_addr),
cfv->allocsz, -1);
if (err)
goto err;
/* Reserve some memory for low memory situations. If we hit the roof
* in the memory pool, we stop TX flow and release the reserve.
*/
cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
cfv->reserved_size);
if (!cfv->reserved_mem) {
err = -ENOMEM;
goto err;
}
cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
return 0;
err:
cfv_destroy_genpool(cfv);
return err;
}
/* Enable the CAIF interface and allocate the memory-pool */
static int cfv_netdev_open(struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
if (cfv_create_genpool(cfv))
return -ENOMEM;
netif_carrier_on(netdev);
napi_enable(&cfv->napi);
/* Schedule NAPI to read any pending packets */
napi_schedule(&cfv->napi);
return 0;
}
/* Disable the CAIF interface and free the memory-pool */
static int cfv_netdev_close(struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
unsigned long flags;
struct buf_info *buf_info;
/* Disable interrupts, queues and NAPI polling */
netif_carrier_off(netdev);
virtqueue_disable_cb(cfv->vq_tx);
vringh_notify_disable_kern(cfv->vr_rx);
napi_disable(&cfv->napi);
/* Release any TX buffers on both used and available rings */
cfv_release_used_buf(cfv->vq_tx);
spin_lock_irqsave(&cfv->tx_lock, flags);
while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
free_buf_info(cfv, buf_info);
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* Release all dma allocated memory and destroy the pool */
cfv_destroy_genpool(cfv);
return 0;
}
/* Allocate a buffer in dma-memory and copy skb to it */
static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
struct sk_buff *skb,
struct scatterlist *sg)
{
struct caif_payload_info *info = (void *)&skb->cb;
struct buf_info *buf_info = NULL;
u8 pad_len, hdr_ofs;
if (!cfv->genpool)
goto err;
if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
goto err;
}
buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
if (unlikely(!buf_info))
goto err;
/* Make the IP header aligned in the buffer */
hdr_ofs = cfv->tx_hr + info->hdr_len;
pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
/* allocate dma memory buffer */
buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
if (unlikely(!buf_info->vaddr))
goto err;
/* copy skbuf contents to send buffer */
skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
sg_init_one(sg, buf_info->vaddr + pad_len,
skb->len + cfv->tx_hr + cfv->rx_hr);
return buf_info;
err:
kfree(buf_info);
return NULL;
}
/* Put the CAIF packet on the virtio ring and kick the receiver */
static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
struct buf_info *buf_info;
struct scatterlist sg;
unsigned long flags;
bool flow_off = false;
int ret;
/* garbage collect released buffers */
cfv_release_used_buf(cfv->vq_tx);
spin_lock_irqsave(&cfv->tx_lock, flags);
/* The flow-off check takes the number of cpus into account to make sure
 * the virtqueue will not be overfilled under any possible smp condition.
 *
 * Flow-on is triggered when sufficient buffers are freed.
 */
if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
flow_off = true;
cfv->stats.tx_full_ring++;
}
/* If we run out of memory, we release the memory reserve and retry
* allocation.
*/
buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
if (unlikely(!buf_info)) {
cfv->stats.tx_no_mem++;
flow_off = true;
if (cfv->reserved_mem && cfv->genpool) {
gen_pool_free(cfv->genpool, cfv->reserved_mem,
cfv->reserved_size);
cfv->reserved_mem = 0;
buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
}
}
if (unlikely(flow_off)) {
/* Turn flow on when a quarter of the descriptors are released */
cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
/* Enable notifications of recycled TX buffers */
virtqueue_enable_cb(cfv->vq_tx);
netif_tx_stop_all_queues(netdev);
}
if (unlikely(!buf_info)) {
/* If the memory reserve does its job, this shouldn't happen */
netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
goto err;
}
ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
if (unlikely((ret < 0))) {
/* If flow control works, this shouldn't happen */
netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
ret);
goto err;
}
/* update netdev statistics */
cfv->ndev->stats.tx_packets++;
cfv->ndev->stats.tx_bytes += skb->len;
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* tell the remote processor it has a pending message to read */
virtqueue_kick(cfv->vq_tx);
dev_kfree_skb(skb);
return NETDEV_TX_OK;
err:
spin_unlock_irqrestore(&cfv->tx_lock, flags);
cfv->ndev->stats.tx_dropped++;
free_buf_info(cfv, buf_info);
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
static void cfv_tx_release_tasklet(unsigned long drv)
{
struct cfv_info *cfv = (struct cfv_info *)drv;
cfv_release_used_buf(cfv->vq_tx);
}
static const struct net_device_ops cfv_netdev_ops = {
.ndo_open = cfv_netdev_open,
.ndo_stop = cfv_netdev_close,
.ndo_start_xmit = cfv_netdev_tx,
};
static void cfv_netdev_setup(struct net_device *netdev)
{
netdev->netdev_ops = &cfv_netdev_ops;
netdev->type = ARPHRD_CAIF;
netdev->tx_queue_len = 100;
netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
netdev->mtu = CFV_DEF_MTU_SIZE;
netdev->destructor = free_netdev;
}
/* Create debugfs counters for the device */
static inline void debugfs_init(struct cfv_info *cfv)
{
cfv->debugfs =
debugfs_create_dir(netdev_name(cfv->ndev), NULL);
if (IS_ERR(cfv->debugfs))
return;
debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_napi_complete);
debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_napi_resched);
debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_nomem);
debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_kicks);
debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_full_ring);
debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_no_mem);
debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_kicks);
debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_flow_on);
}
/* Set up CAIF for a virtio device */
static int cfv_probe(struct virtio_device *vdev)
{
vq_callback_t *vq_cbs = cfv_release_cb;
vrh_callback_t *vrh_cbs = cfv_recv;
const char *names = "output";
const char *cfv_netdev_name = "cfvrt";
struct net_device *netdev;
struct cfv_info *cfv;
int err = -EINVAL;
netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
cfv_netdev_setup);
if (!netdev)
return -ENOMEM;
cfv = netdev_priv(netdev);
cfv->vdev = vdev;
cfv->ndev = netdev;
spin_lock_init(&cfv->tx_lock);
/* Get the RX virtio ring. This is a "host side vring". */
err = -ENODEV;
if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
goto err;
err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
if (err)
goto err;
/* Get the TX virtio ring. This is a "guest side vring". */
err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
if (err)
goto err;
/* Get the CAIF configuration from virtio config space, if available */
#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
&_var, \
FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
if (vdev->config->get) {
GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
} else {
cfv->tx_hr = CFV_DEF_HEADROOM;
cfv->rx_hr = CFV_DEF_HEADROOM;
cfv->tx_tr = CFV_DEF_TAILROOM;
cfv->rx_tr = CFV_DEF_TAILROOM;
cfv->mtu = CFV_DEF_MTU_SIZE;
cfv->mru = CFV_DEF_MTU_SIZE;
}
netdev->needed_headroom = cfv->tx_hr;
netdev->needed_tailroom = cfv->tx_tr;
/* Disable buffer release interrupts unless we have stopped TX queues */
virtqueue_disable_cb(cfv->vq_tx);
netdev->mtu = cfv->mtu - cfv->tx_tr;
vdev->priv = cfv;
/* Initialize NAPI poll context data */
vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
cfv->ctx.head = USHRT_MAX;
netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
tasklet_init(&cfv->tx_release_tasklet,
cfv_tx_release_tasklet,
(unsigned long)cfv);
/* Carrier is off until netdevice is opened */
netif_carrier_off(netdev);
/* register Netdev */
err = register_netdev(netdev);
if (err) {
dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
goto err;
}
debugfs_init(cfv);
return 0;
err:
netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
if (cfv->vr_rx)
vdev->vringh_config->del_vrhs(cfv->vdev);
if (cfv->vdev)
vdev->config->del_vqs(cfv->vdev);
free_netdev(netdev);
return err;
}
static void cfv_remove(struct virtio_device *vdev)
{
struct cfv_info *cfv = vdev->priv;
rtnl_lock();
dev_close(cfv->ndev);
rtnl_unlock();
tasklet_kill(&cfv->tx_release_tasklet);
debugfs_remove_recursive(cfv->debugfs);
vringh_kiov_cleanup(&cfv->ctx.riov);
vdev->config->reset(vdev);
vdev->vringh_config->del_vrhs(cfv->vdev);
cfv->vr_rx = NULL;
vdev->config->del_vqs(cfv->vdev);
unregister_netdev(cfv->ndev);
}
static struct virtio_device_id id_table[] = {
{ VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
{ 0 },
};
static unsigned int features[] = {
};
static struct virtio_driver caif_virtio_driver = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = cfv_probe,
.remove = cfv_remove,
};
module_virtio_driver(caif_virtio_driver);
MODULE_DEVICE_TABLE(virtio, id_table);
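For readers new to the host-side vring ("vringh") API used above, here is a minimal sketch of the consume pattern that cfv_rx_poll() follows, with locking, data copying and error handling left out; the function name is hypothetical and the caller is assumed to own the struct vringh and struct vringh_kiov:
static void example_consume_ring(struct vringh *vrh, struct vringh_kiov *riov)
{
	u16 head;

	/* Pull available descriptors one at a time. */
	while (vringh_getdesc_kern(vrh, riov, NULL, &head, GFP_ATOMIC) > 0) {
		/* ... read data out of riov->iov[] here ... */

		/* Return the descriptor to the used ring, 0 bytes written. */
		vringh_complete_kern(vrh, head, 0);
	}

	/* Kick the other side if it asked for a notification. */
	if (vringh_need_notify_kern(vrh) > 0)
		vringh_notify(vrh);
}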

View File

@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN 128 #define GOOD_COPY_LEN 128
#define VIRTNET_SEND_COMMAND_SG_MAX 2
#define VIRTNET_DRIVER_VERSION "1.0.0" #define VIRTNET_DRIVER_VERSION "1.0.0"
struct virtnet_stats { struct virtnet_stats {
@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
if (err < 0) if (err < 0)
dev_kfree_skb(skb); dev_kfree_skb(skb);
@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
/* chain first in list head */ /* chain first in list head */
first->private = (unsigned long)list; first->private = (unsigned long)list;
err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
first, gfp); first, gfp);
if (err < 0) if (err < 0)
give_pages(rq, first); give_pages(rq, first);
@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
sg_init_one(rq->sg, page_address(page), PAGE_SIZE); sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
if (err < 0) if (err < 0)
give_pages(rq, page); give_pages(rq, page);
@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
bool still_empty; bool still_empty;
int i; int i;
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
struct receive_queue *rq = &vi->rq[i]; struct receive_queue *rq = &vi->rq[i];
napi_disable(&rq->napi); napi_disable(&rq->napi);
@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev); struct virtnet_info *vi = netdev_priv(dev);
int i; int i;
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
/* Make sure we have some buffers: if oom use wq. */ /* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0); schedule_delayed_work(&vi->refill, 0);
@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
return virtqueue_add_buf(sq->vq, sq->sg, num_sg, return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
0, skb, GFP_ATOMIC);
} }
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 * never fail unless improperly formatted. * never fail unless improperly formatted.
*/ */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
struct scatterlist *data, int out, int in) struct scatterlist *out,
struct scatterlist *in)
{ {
struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; struct scatterlist *sgs[4], hdr, stat;
struct virtio_net_ctrl_hdr ctrl; struct virtio_net_ctrl_hdr ctrl;
virtio_net_ctrl_ack status = ~0; virtio_net_ctrl_ack status = ~0;
unsigned int tmp; unsigned out_num = 0, in_num = 0, tmp;
int i;
/* Caller should know better */ /* Caller should know better */
BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
(out + in > VIRTNET_SEND_COMMAND_SG_MAX));
out++; /* Add header */
in++; /* Add return status */
ctrl.class = class; ctrl.class = class;
ctrl.cmd = cmd; ctrl.cmd = cmd;
/* Add header */
sg_init_one(&hdr, &ctrl, sizeof(ctrl));
sgs[out_num++] = &hdr;
sg_init_table(sg, out + in); if (out)
sgs[out_num++] = out;
if (in)
sgs[out_num + in_num++] = in;
sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); /* Add return status. */
for_each_sg(data, s, out + in - 2, i) sg_init_one(&stat, &status, sizeof(status));
sg_set_buf(&sg[i + 1], sg_virt(s), s->length); sgs[out_num + in_num++] = &stat;
sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
< 0);
virtqueue_kick(vi->cvq); virtqueue_kick(vi->cvq);
@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
sg_init_one(&sg, addr->sa_data, dev->addr_len); sg_init_one(&sg, addr->sa_data, dev->addr_len);
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_ADDR_SET, VIRTIO_NET_CTRL_MAC_ADDR_SET,
&sg, 1, 0)) { &sg, NULL)) {
dev_warn(&vdev->dev, dev_warn(&vdev->dev,
"Failed to set mac address by vq command.\n"); "Failed to set mac address by vq command.\n");
return -EINVAL; return -EINVAL;
@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
{ {
rtnl_lock(); rtnl_lock();
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
0, 0))
dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
rtnl_unlock(); rtnl_unlock();
} }
@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
struct scatterlist sg; struct scatterlist sg;
struct virtio_net_ctrl_mq s; struct virtio_net_ctrl_mq s;
struct net_device *dev = vi->dev; struct net_device *dev = vi->dev;
int i;
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0; return 0;
@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
sg_init_one(&sg, &s, sizeof(s)); sg_init_one(&sg, &s, sizeof(s));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
queue_pairs); queue_pairs);
return -EINVAL; return -EINVAL;
} else } else {
for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
vi->curr_queue_pairs = queue_pairs; vi->curr_queue_pairs = queue_pairs;
}
return 0; return 0;
} }
@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_PROMISC, VIRTIO_NET_CTRL_RX_PROMISC,
sg, 1, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
promisc ? "en" : "dis"); promisc ? "en" : "dis");
@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI, VIRTIO_NET_CTRL_RX_ALLMULTI,
sg, 1, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
allmulti ? "en" : "dis"); allmulti ? "en" : "dis");
@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_TABLE_SET, VIRTIO_NET_CTRL_MAC_TABLE_SET,
sg, 2, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
kfree(buf); kfree(buf);
@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid)); sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
return 0; return 0;
} }
@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid)); sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
return 0; return 0;
} }
@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
} }
/* Last of all, set up some receive buffers. */ /* Last of all, set up some receive buffers. */
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
try_fill_recv(&vi->rq[i], GFP_KERNEL); try_fill_recv(&vi->rq[i], GFP_KERNEL);
/* If we didn't even get one input buffer, we're useless. */ /* If we didn't even get one input buffer, we're useless. */
@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
netif_device_attach(vi->dev); netif_device_attach(vi->dev);
for (i = 0; i < vi->max_queue_pairs; i++) for (i = 0; i < vi->curr_queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0); schedule_delayed_work(&vi->refill, 0);
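As a reference for the conversion above: the new virtqueue_add_sgs() convention passes an array of scatterlist pointers, out (device-readable) entries first, then in (device-writable) entries, as sketched below; the helper name and its arguments are illustrative only, not part of the patch. virtqueue_add_outbuf() and virtqueue_add_inbuf() are the single-list shorthands used elsewhere in this series.
/* Hypothetical helper, shown only to illustrate virtqueue_add_sgs(). */
static int example_send_ctrl(struct virtqueue *vq, void *ctrl,
			     unsigned int ctrl_len, u8 *status, void *token)
{
	struct scatterlist hdr, stat;
	struct scatterlist *sgs[2];
	unsigned int out_num = 0, in_num = 0;
	int err;

	sg_init_one(&hdr, ctrl, ctrl_len);		/* device reads this */
	sgs[out_num++] = &hdr;

	sg_init_one(&stat, status, sizeof(*status));	/* device writes this */
	sgs[out_num + in_num++] = &stat;

	/* All out entries must come before the in entries in sgs[]. */
	err = virtqueue_add_sgs(vq, sgs, out_num, in_num, token, GFP_ATOMIC);
	if (err < 0)
		return err;

	virtqueue_kick(vq);
	return 0;
}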

View File

@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
mutex_lock(&vrp->tx_lock); mutex_lock(&vrp->tx_lock);
/* add message to the remote processor's virtqueue */ /* add message to the remote processor's virtqueue */
err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
if (err) { if (err) {
/* /*
* need to reclaim the buffer here, otherwise it's lost * need to reclaim the buffer here, otherwise it's lost
* (memory won't leak, but rpmsg won't use it again for TX). * (memory won't leak, but rpmsg won't use it again for TX).
* this will wait for a buffer management overhaul. * this will wait for a buffer management overhaul.
*/ */
dev_err(dev, "virtqueue_add_buf failed: %d\n", err); dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
goto out; goto out;
} }
@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
sg_init_one(&sg, msg, RPMSG_BUF_SIZE); sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
/* add the buffer back to the remote processor's virtqueue */ /* add the buffer back to the remote processor's virtqueue */
err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
if (err < 0) { if (err < 0) {
dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
return; return;
@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
GFP_KERNEL); GFP_KERNEL);
WARN_ON(err); /* sanity check; this can't really happen */ WARN_ON(err); /* sanity check; this can't really happen */
} }

View File

@ -13,6 +13,8 @@
* *
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/mempool.h> #include <linux/mempool.h>
@ -20,12 +22,14 @@
#include <linux/virtio_ids.h> #include <linux/virtio_ids.h>
#include <linux/virtio_config.h> #include <linux/virtio_config.h>
#include <linux/virtio_scsi.h> #include <linux/virtio_scsi.h>
#include <linux/cpu.h>
#include <scsi/scsi_host.h> #include <scsi/scsi_host.h>
#include <scsi/scsi_device.h> #include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h> #include <scsi/scsi_cmnd.h>
#define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_MEMPOOL_SZ 64
#define VIRTIO_SCSI_EVENT_LEN 8 #define VIRTIO_SCSI_EVENT_LEN 8
#define VIRTIO_SCSI_VQ_BASE 2
/* Command queue element */ /* Command queue element */
struct virtio_scsi_cmd { struct virtio_scsi_cmd {
@ -57,27 +61,61 @@ struct virtio_scsi_vq {
struct virtqueue *vq; struct virtqueue *vq;
}; };
/* Per-target queue state */ /*
* Per-target queue state.
*
* This struct holds the data needed by the queue steering policy. When a
* target is sent multiple requests, we need to drive them to the same queue so
* that FIFO processing order is kept. However, if a target was idle, we can
* choose a queue arbitrarily. In this case the queue is chosen according to
* the current VCPU, so the driver expects the number of request queues to be
* equal to the number of VCPUs. This makes it easy and fast to select the
* queue, and also lets the driver optimize the IRQ affinity for the virtqueues
* (each virtqueue's affinity is set to the CPU that "owns" the queue).
*
* An interesting effect of this policy is that only writes to req_vq need to
 * take the tgt_lock. Reads can be done outside the lock because:
*
* - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
* In that case, no other CPU is reading req_vq: even if they were in
* virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
*
* - reads of req_vq only occur when the target is not idle (reqs != 0).
* A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
*
* Similarly, decrements of reqs are never concurrent with writes of req_vq.
* Thus they can happen outside the tgt_lock, provided of course we make reqs
* an atomic_t.
*/
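/*
 * Worked example (added for illustration, not part of the original comment):
 * with four VCPUs and four request queues, a burst of commands to an idle
 * target issued from VCPU 2 all go to req_vqs[2]; only once reqs has dropped
 * back to zero may the next command re-pick a queue, based on whichever CPU
 * it happens to run on.
 */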
struct virtio_scsi_target_state { struct virtio_scsi_target_state {
/* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ /* This spinlock never held at the same time as vq_lock. */
spinlock_t tgt_lock; spinlock_t tgt_lock;
/* For sglist construction when adding commands to the virtqueue. */ /* Count of outstanding requests. */
struct scatterlist sg[]; atomic_t reqs;
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
}; };
/* Driver instance state */ /* Driver instance state */
struct virtio_scsi { struct virtio_scsi {
struct virtio_device *vdev; struct virtio_device *vdev;
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vq;
/* Get some buffers ready for event vq */ /* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
struct virtio_scsi_target_state *tgt[]; u32 num_queues;
/* If the affinity hint is set for virtqueues */
bool affinity_hint_set;
/* CPU hotplug notifier */
struct notifier_block nb;
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vqs[];
}; };
static struct kmem_cache *virtscsi_cmd_cache; static struct kmem_cache *virtscsi_cmd_cache;
@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
* *
* Called with vq_lock held. * Called with vq_lock held.
*/ */
static void virtscsi_complete_cmd(void *buf) static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_cmd *cmd = buf; struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc; struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
dev_dbg(&sc->device->sdev_gendev, dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n", "cmd %p response %u status %#02x sense_len %u\n",
@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
mempool_free(cmd, virtscsi_cmd_pool); mempool_free(cmd, virtscsi_cmd_pool);
sc->scsi_done(sc); sc->scsi_done(sc);
atomic_dec(&tgt->reqs);
} }
static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) static void virtscsi_vq_done(struct virtio_scsi *vscsi,
struct virtio_scsi_vq *virtscsi_vq,
void (*fn)(struct virtio_scsi *vscsi, void *buf))
{ {
void *buf; void *buf;
unsigned int len; unsigned int len;
unsigned long flags;
struct virtqueue *vq = virtscsi_vq->vq;
spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
do { do {
virtqueue_disable_cb(vq); virtqueue_disable_cb(vq);
while ((buf = virtqueue_get_buf(vq, &len)) != NULL) while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
fn(buf); fn(vscsi, buf);
} while (!virtqueue_enable_cb(vq)); } while (!virtqueue_enable_cb(vq));
spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
} }
static void virtscsi_req_done(struct virtqueue *vq) static void virtscsi_req_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags; int index = vq->index - VIRTIO_SCSI_VQ_BASE;
struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); /*
virtscsi_vq_done(vq, virtscsi_complete_cmd); * Read req_vq before decrementing the reqs field in
spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); * virtscsi_complete_cmd.
*
* With barriers:
*
* CPU #0 virtscsi_queuecommand_multi (CPU #1)
* ------------------------------------------------------------
* lock vq_lock
* read req_vq
* read reqs (reqs = 1)
* write reqs (reqs = 0)
* increment reqs (reqs = 1)
* write req_vq
*
* Possible reordering without barriers:
*
* CPU #0 virtscsi_queuecommand_multi (CPU #1)
* ------------------------------------------------------------
* lock vq_lock
* read reqs (reqs = 1)
* write reqs (reqs = 0)
* increment reqs (reqs = 1)
* write req_vq
* read (wrong) req_vq
*
* We do not need a full smp_rmb, because req_vq is required to get
* to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
* in the virtqueue as the user token.
*/
smp_read_barrier_depends();
virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
}; };
static void virtscsi_complete_free(void *buf) static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_cmd *cmd = buf; struct virtio_scsi_cmd *cmd = buf;
@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags;
spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
virtscsi_vq_done(vq, virtscsi_complete_free);
spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
}; };
static int virtscsi_kick_event(struct virtio_scsi *vscsi, static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
GFP_ATOMIC); GFP_ATOMIC);
if (!err) if (!err)
virtqueue_kick(vscsi->event_vq.vq); virtqueue_kick(vscsi->event_vq.vq);
@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
} }
static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
struct virtio_scsi_event *event) struct virtio_scsi_event *event)
{ {
struct scsi_device *sdev; struct scsi_device *sdev;
struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
virtscsi_kick_event(vscsi, event_node); virtscsi_kick_event(vscsi, event_node);
} }
static void virtscsi_complete_event(void *buf) static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_event_node *event_node = buf; struct virtio_scsi_event_node *event_node = buf;
@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags;
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
virtscsi_vq_done(vq, virtscsi_complete_event);
spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
}; };
static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
struct scsi_data_buffer *sdb)
{
struct sg_table *table = &sdb->table;
struct scatterlist *sg_elem;
unsigned int idx = *p_idx;
int i;
for_each_sg(table->sgl, sg_elem, table->nents, i)
sg[idx++] = *sg_elem;
*p_idx = idx;
}
/** /**
* virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
* @vscsi : virtio_scsi state * @vq : the struct virtqueue we're talking about
* @cmd : command structure * @cmd : command structure
* @out_num : number of read-only elements
* @in_num : number of write-only elements
* @req_size : size of the request buffer * @req_size : size of the request buffer
* @resp_size : size of the response buffer * @resp_size : size of the response buffer
* * @gfp : flags to use for memory allocations
* Called with tgt_lock held.
*/ */
static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, static int virtscsi_add_cmd(struct virtqueue *vq,
struct virtio_scsi_cmd *cmd, struct virtio_scsi_cmd *cmd,
unsigned *out_num, unsigned *in_num, size_t req_size, size_t resp_size, gfp_t gfp)
size_t req_size, size_t resp_size)
{ {
struct scsi_cmnd *sc = cmd->sc; struct scsi_cmnd *sc = cmd->sc;
struct scatterlist *sg = tgt->sg; struct scatterlist *sgs[4], req, resp;
unsigned int idx = 0; struct sg_table *out, *in;
unsigned out_num = 0, in_num = 0;
out = in = NULL;
if (sc && sc->sc_data_direction != DMA_NONE) {
if (sc->sc_data_direction != DMA_FROM_DEVICE)
out = &scsi_out(sc)->table;
if (sc->sc_data_direction != DMA_TO_DEVICE)
in = &scsi_in(sc)->table;
}
/* Request header. */ /* Request header. */
sg_set_buf(&sg[idx++], &cmd->req, req_size); sg_init_one(&req, &cmd->req, req_size);
sgs[out_num++] = &req;
/* Data-out buffer. */ /* Data-out buffer. */
if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) if (out)
virtscsi_map_sgl(sg, &idx, scsi_out(sc)); sgs[out_num++] = out->sgl;
*out_num = idx;
/* Response header. */ /* Response header. */
sg_set_buf(&sg[idx++], &cmd->resp, resp_size); sg_init_one(&resp, &cmd->resp, resp_size);
sgs[out_num + in_num++] = &resp;
/* Data-in buffer */ /* Data-in buffer */
if (sc && sc->sc_data_direction != DMA_TO_DEVICE) if (in)
virtscsi_map_sgl(sg, &idx, scsi_in(sc)); sgs[out_num + in_num++] = in->sgl;
*in_num = idx - *out_num; return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
} }
static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
struct virtio_scsi_vq *vq,
struct virtio_scsi_cmd *cmd, struct virtio_scsi_cmd *cmd,
size_t req_size, size_t resp_size, gfp_t gfp) size_t req_size, size_t resp_size, gfp_t gfp)
{ {
unsigned int out_num, in_num;
unsigned long flags; unsigned long flags;
int err; int err;
bool needs_kick = false; bool needs_kick = false;
spin_lock_irqsave(&tgt->tgt_lock, flags); spin_lock_irqsave(&vq->vq_lock, flags);
virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
spin_lock(&vq->vq_lock);
err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
spin_unlock(&tgt->tgt_lock);
if (!err) if (!err)
needs_kick = virtqueue_kick_prepare(vq->vq); needs_kick = virtqueue_kick_prepare(vq->vq);
@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
return err; return err;
} }
static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
struct virtio_scsi_vq *req_vq,
struct scsi_cmnd *sc)
{ {
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
struct virtio_scsi_cmd *cmd; struct virtio_scsi_cmd *cmd;
int ret; int ret;
@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, if (virtscsi_kick_cmd(req_vq, cmd,
sizeof cmd->req.cmd, sizeof cmd->resp.cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
GFP_ATOMIC) == 0) GFP_ATOMIC) == 0)
ret = 0; ret = 0;
@ -478,14 +537,62 @@ out:
return ret; return ret;
} }
static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
}
static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
struct virtio_scsi_target_state *tgt)
{
struct virtio_scsi_vq *vq;
unsigned long flags;
u32 queue_num;
spin_lock_irqsave(&tgt->tgt_lock, flags);
/*
* The memory barrier after atomic_inc_return matches
* the smp_read_barrier_depends() in virtscsi_req_done.
*/
if (atomic_inc_return(&tgt->reqs) > 1)
vq = ACCESS_ONCE(tgt->req_vq);
else {
queue_num = smp_processor_id();
while (unlikely(queue_num >= vscsi->num_queues))
queue_num -= vscsi->num_queues;
tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
}
spin_unlock_irqrestore(&tgt->tgt_lock, flags);
return vq;
}
static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
return virtscsi_queuecommand(vscsi, req_vq, sc);
}
static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
{ {
DECLARE_COMPLETION_ONSTACK(comp); DECLARE_COMPLETION_ONSTACK(comp);
struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
int ret = FAILED; int ret = FAILED;
cmd->comp = &comp; cmd->comp = &comp;
if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
sizeof cmd->req.tmf, sizeof cmd->resp.tmf, sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
GFP_NOIO) < 0) GFP_NOIO) < 0)
goto out; goto out;
@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd); return virtscsi_tmf(vscsi, cmd);
} }
static struct scsi_host_template virtscsi_host_template = { static int virtscsi_target_alloc(struct scsi_target *starget)
{
struct virtio_scsi_target_state *tgt =
kmalloc(sizeof(*tgt), GFP_KERNEL);
if (!tgt)
return -ENOMEM;
spin_lock_init(&tgt->tgt_lock);
atomic_set(&tgt->reqs, 0);
tgt->req_vq = NULL;
starget->hostdata = tgt;
return 0;
}
static void virtscsi_target_destroy(struct scsi_target *starget)
{
struct virtio_scsi_target_state *tgt = starget->hostdata;
kfree(tgt);
}
static struct scsi_host_template virtscsi_host_template_single = {
.module = THIS_MODULE, .module = THIS_MODULE,
.name = "Virtio SCSI HBA", .name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi", .proc_name = "virtio_scsi",
.queuecommand = virtscsi_queuecommand,
.this_id = -1, .this_id = -1,
.queuecommand = virtscsi_queuecommand_single,
.eh_abort_handler = virtscsi_abort, .eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset, .eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024, .can_queue = 1024,
.dma_boundary = UINT_MAX, .dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING, .use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
};
static struct scsi_host_template virtscsi_host_template_multi = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
.this_id = -1,
.queuecommand = virtscsi_queuecommand_multi,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024,
.dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
}; };
#define virtscsi_config_get(vdev, fld) \ #define virtscsi_config_get(vdev, fld) \
@ -578,6 +724,64 @@ static struct scsi_host_template virtscsi_host_template = {
&__val, sizeof(__val)); \ &__val, sizeof(__val)); \
}) })
static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
int i;
int cpu;
/* In multiqueue mode, when the number of cpus is equal
 * to the number of request queues, we make each queue
 * private to one cpu by setting the affinity hint,
 * which eliminates the contention.
 */
if ((vscsi->num_queues == 1 ||
vscsi->num_queues != num_online_cpus()) && affinity) {
if (vscsi->affinity_hint_set)
affinity = false;
else
return;
}
if (affinity) {
i = 0;
for_each_online_cpu(cpu) {
virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
i++;
}
vscsi->affinity_hint_set = true;
} else {
for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
vscsi->affinity_hint_set = false;
}
}
static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
get_online_cpus();
__virtscsi_set_affinity(vscsi, affinity);
put_online_cpus();
}
static int virtscsi_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
switch(action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
__virtscsi_set_affinity(vscsi, true);
break;
default:
break;
}
return NOTIFY_OK;
}
static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
struct virtqueue *vq) struct virtqueue *vq)
{ {
@ -585,24 +789,6 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
virtscsi_vq->vq = vq; virtscsi_vq->vq = vq;
} }
static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
struct virtio_device *vdev, int sg_elems)
{
struct virtio_scsi_target_state *tgt;
gfp_t gfp_mask = GFP_KERNEL;
/* We need extra sg elements at head and tail. */
tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
gfp_mask);
if (!tgt)
return NULL;
spin_lock_init(&tgt->tgt_lock);
sg_init_table(tgt->sg, sg_elems + 2);
return tgt;
}
static void virtscsi_scan(struct virtio_device *vdev) static void virtscsi_scan(struct virtio_device *vdev)
{ {
struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv; struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv;
@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vdev); struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
u32 i, num_targets;
virtscsi_set_affinity(vscsi, false);
/* Stop all the virtqueues. */ /* Stop all the virtqueues. */
vdev->config->reset(vdev); vdev->config->reset(vdev);
num_targets = sh->max_id;
for (i = 0; i < num_targets; i++) {
kfree(vscsi->tgt[i]);
vscsi->tgt[i] = NULL;
}
vdev->config->del_vqs(vdev); vdev->config->del_vqs(vdev);
} }
static int virtscsi_init(struct virtio_device *vdev, static int virtscsi_init(struct virtio_device *vdev,
struct virtio_scsi *vscsi, int num_targets) struct virtio_scsi *vscsi)
{ {
int err; int err;
struct virtqueue *vqs[3]; u32 i;
u32 i, sg_elems; u32 num_vqs;
vq_callback_t **callbacks;
const char **names;
struct virtqueue **vqs;
vq_callback_t *callbacks[] = { num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
virtscsi_ctrl_done, vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
virtscsi_event_done, callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
virtscsi_req_done names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
};
const char *names[] = { if (!callbacks || !vqs || !names) {
"control", err = -ENOMEM;
"event", goto out;
"request" }
};
callbacks[0] = virtscsi_ctrl_done;
callbacks[1] = virtscsi_event_done;
names[0] = "control";
names[1] = "event";
for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
callbacks[i] = virtscsi_req_done;
names[i] = "request";
}
/* Discover virtqueues and write information to configuration. */ /* Discover virtqueues and write information to configuration. */
err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
if (err) if (err)
return err; goto out;
virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
virtscsi_init_vq(&vscsi->event_vq, vqs[1]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
virtscsi_init_vq(&vscsi->req_vq, vqs[2]); for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
vqs[i]);
virtscsi_set_affinity(vscsi, true);
virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
virtscsi_kick_event_all(vscsi); virtscsi_kick_event_all(vscsi);
/* We need to know how many segments before we allocate. */
sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
for (i = 0; i < num_targets; i++) {
vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
if (!vscsi->tgt[i]) {
err = -ENOMEM;
goto out;
}
}
err = 0; err = 0;
out: out:
kfree(names);
kfree(callbacks);
kfree(vqs);
if (err) if (err)
virtscsi_remove_vqs(vdev); virtscsi_remove_vqs(vdev);
return err; return err;
@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
int err; int err;
u32 sg_elems, num_targets; u32 sg_elems, num_targets;
u32 cmd_per_lun; u32 cmd_per_lun;
u32 num_queues;
struct scsi_host_template *hostt;
/* We need to know how many queues before we allocate. */
num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
/* Allocate memory and link the structs together. */
num_targets = virtscsi_config_get(vdev, max_target) + 1; num_targets = virtscsi_config_get(vdev, max_target) + 1;
shost = scsi_host_alloc(&virtscsi_host_template,
sizeof(*vscsi)
+ num_targets * sizeof(struct virtio_scsi_target_state));
if (num_queues == 1)
hostt = &virtscsi_host_template_single;
else
hostt = &virtscsi_host_template_multi;
shost = scsi_host_alloc(hostt,
sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
if (!shost) if (!shost)
return -ENOMEM; return -ENOMEM;
@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
shost->sg_tablesize = sg_elems; shost->sg_tablesize = sg_elems;
vscsi = shost_priv(shost); vscsi = shost_priv(shost);
vscsi->vdev = vdev; vscsi->vdev = vdev;
vscsi->num_queues = num_queues;
vdev->priv = shost; vdev->priv = shost;
err = virtscsi_init(vdev, vscsi, num_targets); err = virtscsi_init(vdev, vscsi);
if (err) if (err)
goto virtscsi_init_failed; goto virtscsi_init_failed;
vscsi->nb.notifier_call = &virtscsi_cpu_callback;
err = register_hotcpu_notifier(&vscsi->nb);
if (err) {
pr_err("registering cpu notifier failed\n");
goto scsi_add_host_failed;
}
cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
scsi_remove_host(shost); scsi_remove_host(shost);
unregister_hotcpu_notifier(&vscsi->nb);
virtscsi_remove_vqs(vdev); virtscsi_remove_vqs(vdev);
scsi_host_put(shost); scsi_host_put(shost);
} }
@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
struct Scsi_Host *sh = virtio_scsi_host(vdev); struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
return virtscsi_init(vdev, vscsi, sh->max_id); return virtscsi_init(vdev, vscsi);
} }
#endif #endif
@ -794,8 +1001,7 @@ static int __init init(void)
virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
if (!virtscsi_cmd_cache) { if (!virtscsi_cmd_cache) {
printk(KERN_ERR "kmem_cache_create() for " pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
"virtscsi_cmd_cache failed\n");
goto error; goto error;
} }
@ -804,8 +1010,7 @@ static int __init init(void)
mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
virtscsi_cmd_cache); virtscsi_cmd_cache);
if (!virtscsi_cmd_pool) { if (!virtscsi_cmd_pool) {
printk(KERN_ERR "mempool_create() for" pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
"virtscsi_cmd_pool failed\n");
goto error; goto error;
} }
ret = register_virtio_driver(&virtio_scsi_driver); ret = register_virtio_driver(&virtio_scsi_driver);

View File

@ -1,6 +1,7 @@
config VHOST_NET config VHOST_NET
tristate "Host kernel accelerator for virtio net" tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
select VHOST_RING
---help--- ---help---
This kernel module can be loaded in host kernel to accelerate This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net guest networking with virtio_net. Not to be confused with virtio_net
@ -12,7 +13,14 @@ config VHOST_NET
config VHOST_SCSI config VHOST_SCSI
tristate "VHOST_SCSI TCM fabric driver" tristate "VHOST_SCSI TCM fabric driver"
depends on TARGET_CORE && EVENTFD && m depends on TARGET_CORE && EVENTFD && m
select VHOST_RING
default n default n
---help--- ---help---
Say M here to enable the vhost_scsi TCM fabric module Say M here to enable the vhost_scsi TCM fabric module
for use with virtio-scsi guests for use with virtio-scsi guests
config VHOST_RING
tristate
---help---
This option is selected by any driver which needs to access
the host side of a virtio ring.

View File

@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o
obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o vhost_scsi-y := scsi.o
obj-$(CONFIG_VHOST_RING) += vringh.o

View File

@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return vhost_test_reset_owner(n); return vhost_test_reset_owner(n);
default: default:
mutex_lock(&n->dev.mutex); mutex_lock(&n->dev.mutex);
r = vhost_dev_ioctl(&n->dev, ioctl, arg); r = vhost_dev_ioctl(&n->dev, ioctl, argp);
if (r == -ENOIOCTLCMD)
r = vhost_vring_ioctl(&n->dev, ioctl, argp);
vhost_test_flush(n); vhost_test_flush(n);
mutex_unlock(&n->dev.mutex); mutex_unlock(&n->dev.mutex);
return r; return r;

File diff suppressed because it is too large

View File

@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
/* We should always be able to add one buffer to an empty queue. */ /* We should always be able to add one buffer to an empty queue. */
if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG(); BUG();
virtqueue_kick(vq); virtqueue_kick(vq);
@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
if (!virtqueue_get_buf(vq, &len)) if (!virtqueue_get_buf(vq, &len))
return; return;
sg_init_one(&sg, vb->stats, sizeof(vb->stats)); sg_init_one(&sg, vb->stats, sizeof(vb->stats));
if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG(); BUG();
virtqueue_kick(vq); virtqueue_kick(vq);
} }
@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
* use it to signal us later. * use it to signal us later.
*/ */
sg_init_one(&sg, vb->stats, sizeof vb->stats); sg_init_one(&sg, vb->stats, sizeof vb->stats);
if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
< 0) < 0)
BUG(); BUG();
virtqueue_kick(vb->stats_vq); virtqueue_kick(vb->stats_vq);

View File

@ -24,27 +24,6 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
/* virtio guest is communicating with a virtual "device" that actually runs on
* a host processor. Memory barriers are used to control SMP effects. */
#ifdef CONFIG_SMP
/* Where possible, use SMP barriers which are more lightweight than mandatory
* barriers, because mandatory barriers control MMIO effects on accesses
* through relaxed memory I/O windows (which virtio-pci does not use). */
#define virtio_mb(vq) \
do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
#define virtio_rmb(vq) \
do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
#define virtio_wmb(vq) \
do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
#else
/* We must force memory ordering even if guest is UP since host could be
* running on another CPU, but SMP barriers are defined to barrier() in that
* configuration. So fall back to mandatory barriers instead. */
#define virtio_mb(vq) mb()
#define virtio_rmb(vq) rmb()
#define virtio_wmb(vq) wmb()
#endif
#ifdef DEBUG #ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */ /* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...) \ #define BAD_RING(_vq, fmt, args...) \
@ -119,16 +98,36 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
unsigned int *count)
{
return sg_next(sg);
}
static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
unsigned int *count)
{
if (--(*count) == 0)
return NULL;
return sg + 1;
}
/* Set up an indirect table of descriptors and add it to the queue. */ /* Set up an indirect table of descriptors and add it to the queue. */
static int vring_add_indirect(struct vring_virtqueue *vq, static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct scatterlist sg[], struct scatterlist *sgs[],
unsigned int out, struct scatterlist *(*next)
unsigned int in, (struct scatterlist *, unsigned int *),
gfp_t gfp) unsigned int total_sg,
unsigned int total_out,
unsigned int total_in,
unsigned int out_sgs,
unsigned int in_sgs,
gfp_t gfp)
{ {
struct vring_desc *desc; struct vring_desc *desc;
unsigned head; unsigned head;
int i; struct scatterlist *sg;
int i, n;
/* /*
* We require lowmem mappings for the descriptors because * We require lowmem mappings for the descriptors because
@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
*/ */
gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc) if (!desc)
return -ENOMEM; return -ENOMEM;
/* Transfer entries from the sg list into the indirect page */ /* Transfer entries from the sg lists into the indirect page */
for (i = 0; i < out; i++) { i = 0;
desc[i].flags = VRING_DESC_F_NEXT; for (n = 0; n < out_sgs; n++) {
desc[i].addr = sg_phys(sg); for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
desc[i].len = sg->length; desc[i].flags = VRING_DESC_F_NEXT;
desc[i].next = i+1; desc[i].addr = sg_phys(sg);
sg++; desc[i].len = sg->length;
desc[i].next = i+1;
i++;
}
} }
for (; i < (out + in); i++) { for (; n < (out_sgs + in_sgs); n++) {
desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
desc[i].addr = sg_phys(sg); desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
desc[i].len = sg->length; desc[i].addr = sg_phys(sg);
desc[i].next = i+1; desc[i].len = sg->length;
sg++; desc[i].next = i+1;
i++;
}
} }
BUG_ON(i != total_sg);
/* Last one doesn't continue. */ /* Last one doesn't continue. */
desc[i-1].flags &= ~VRING_DESC_F_NEXT; desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@ -176,6 +181,120 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head; return head;
} }
static inline int virtqueue_add(struct virtqueue *_vq,
struct scatterlist *sgs[],
struct scatterlist *(*next)
(struct scatterlist *, unsigned int *),
unsigned int total_out,
unsigned int total_in,
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
unsigned int i, n, avail, uninitialized_var(prev), total_sg;
int head;
START_USE(vq);
BUG_ON(data == NULL);
#ifdef DEBUG
{
ktime_t now = ktime_get();
/* No kick or get, with .1 second between? Warn. */
if (vq->last_add_time_valid)
WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> 100);
vq->last_add_time = now;
vq->last_add_time_valid = true;
}
#endif
total_sg = total_in + total_out;
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
total_in,
out_sgs, in_sgs, gfp);
if (likely(head >= 0))
goto add_head;
}
BUG_ON(total_sg > vq->vring.num);
BUG_ON(total_sg == 0);
if (vq->vq.num_free < total_sg) {
pr_debug("Can't add buf len %i - avail = %i\n",
total_sg, vq->vq.num_free);
/* FIXME: for historical reasons, we force a notify here if
* there are outgoing parts to the buffer. Presumably the
* host should service the ring ASAP. */
if (out_sgs)
vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
/* We're about to use some buffers from the free list. */
vq->vq.num_free -= total_sg;
head = i = vq->free_head;
for (n = 0; n < out_sgs; n++) {
for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
}
}
for (; n < (out_sgs + in_sgs); n++) {
for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
i = vq->vring.desc[i].next;
}
}
/* Last one doesn't continue. */
vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
/* Update free pointer */
vq->free_head = i;
add_head:
/* Set token. */
vq->data[head] = data;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
avail = (vq->vring.avail->idx & (vq->vring.num-1));
vq->vring.avail->ring[avail] = head;
/* Descriptors and available array need to be set before we expose the
* new available array entries. */
virtio_wmb(vq->weak_barriers);
vq->vring.avail->idx++;
vq->num_added++;
/* This is very unlikely, but theoretically possible. Kick
* just in case. */
if (unlikely(vq->num_added == (1 << 16) - 1))
virtqueue_kick(_vq);
pr_debug("Added buffer head %i to %p\n", head, vq);
END_USE(vq);
return 0;
}
/** /**
* virtqueue_add_buf - expose buffer to other end * virtqueue_add_buf - expose buffer to other end
* @vq: the struct virtqueue we're talking about. * @vq: the struct virtqueue we're talking about.
@ -197,101 +316,99 @@ int virtqueue_add_buf(struct virtqueue *_vq,
void *data, void *data,
gfp_t gfp) gfp_t gfp)
{ {
struct vring_virtqueue *vq = to_vvq(_vq); struct scatterlist *sgs[2];
unsigned int i, avail, uninitialized_var(prev);
int head;
START_USE(vq); sgs[0] = sg;
sgs[1] = sg + out;
BUG_ON(data == NULL); return virtqueue_add(_vq, sgs, sg_next_arr,
out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
#ifdef DEBUG
{
ktime_t now = ktime_get();
/* No kick or get, with .1 second between? Warn. */
if (vq->last_add_time_valid)
WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> 100);
vq->last_add_time = now;
vq->last_add_time_valid = true;
}
#endif
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && (out + in) > 1 && vq->vq.num_free) {
head = vring_add_indirect(vq, sg, out, in, gfp);
if (likely(head >= 0))
goto add_head;
}
BUG_ON(out + in > vq->vring.num);
BUG_ON(out + in == 0);
if (vq->vq.num_free < out + in) {
pr_debug("Can't add buf len %i - avail = %i\n",
out + in, vq->vq.num_free);
/* FIXME: for historical reasons, we force a notify here if
* there are outgoing parts to the buffer. Presumably the
* host should service the ring ASAP. */
if (out)
vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
/* We're about to use some buffers from the free list. */
vq->vq.num_free -= out + in;
head = vq->free_head;
for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
sg++;
}
for (; in; i = vq->vring.desc[i].next, in--) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].addr = sg_phys(sg);
vq->vring.desc[i].len = sg->length;
prev = i;
sg++;
}
/* Last one doesn't continue. */
vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
/* Update free pointer */
vq->free_head = i;
add_head:
/* Set token. */
vq->data[head] = data;
/* Put entry in available array (but don't update avail->idx until they
* do sync). */
avail = (vq->vring.avail->idx & (vq->vring.num-1));
vq->vring.avail->ring[avail] = head;
/* Descriptors and available array need to be set before we expose the
* new available array entries. */
virtio_wmb(vq);
vq->vring.avail->idx++;
vq->num_added++;
/* This is very unlikely, but theoretically possible. Kick
* just in case. */
if (unlikely(vq->num_added == (1 << 16) - 1))
virtqueue_kick(_vq);
pr_debug("Added buffer head %i to %p\n", head, vq);
END_USE(vq);
return 0;
} }
EXPORT_SYMBOL_GPL(virtqueue_add_buf); EXPORT_SYMBOL_GPL(virtqueue_add_buf);
/**
* virtqueue_add_sgs - expose buffers to other end
* @vq: the struct virtqueue we're talking about.
* @sgs: array of terminated scatterlists.
* @out_sgs: the number of scatterlists readable by other side
* @in_sgs: the number of scatterlists which are writable (after readable ones)
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_sgs(struct virtqueue *_vq,
struct scatterlist *sgs[],
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
gfp_t gfp)
{
unsigned int i, total_out, total_in;
/* Count them first. */
for (i = total_out = total_in = 0; i < out_sgs; i++) {
struct scatterlist *sg;
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_out++;
}
for (; i < out_sgs + in_sgs; i++) {
struct scatterlist *sg;
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_in++;
}
return virtqueue_add(_vq, sgs, sg_next_chained,
total_out, total_in, out_sgs, in_sgs, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
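As a usage illustration of the new API (a hedged sketch, not taken from this patch; the request structure and field names are hypothetical), a driver with a device-readable header and a device-writable status byte would queue them as two scatterlists:

struct example_req {
        u32 hdr;        /* stands in for a real request header */
        u8 status;      /* written by the device */
};

static int example_queue_request(struct virtqueue *vq, struct example_req *req)
{
        struct scatterlist hdr, status;
        struct scatterlist *sgs[2];

        sg_init_one(&hdr, &req->hdr, sizeof(req->hdr));           /* device reads */
        sg_init_one(&status, &req->status, sizeof(req->status));  /* device writes */
        sgs[0] = &hdr;
        sgs[1] = &status;

        /* One out sg list followed by one in sg list; req is the token
         * returned by virtqueue_get_buf() on completion.  -ENOSPC means
         * the ring is currently full. */
        return virtqueue_add_sgs(vq, sgs, 1, 1, req, GFP_ATOMIC);
}

On success the caller would follow up with virtqueue_kick() (or the kick_prepare/notify pair), exactly as the converted drivers elsewhere in this series do.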
/**
* virtqueue_add_outbuf - expose output buffers to other end
* @vq: the struct virtqueue we're talking about.
* @sg: array of scatterlists (need not be terminated!)
* @num: the number of scatterlists readable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp)
{
return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
/**
* virtqueue_add_inbuf - expose input buffers to other end
* @vq: the struct virtqueue we're talking about.
* @sg: array of scatterlists (need not be terminated!)
* @num: the number of scatterlists writable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp)
{
return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
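The two single-direction helpers cover the common cases: the balloon conversion earlier in this diff uses virtqueue_add_outbuf() for buffers the device only reads, while a receive path would post writable buffers with virtqueue_add_inbuf(). A minimal sketch of the latter (buffer allocation and size are hypothetical):

static int example_post_rx_buffer(struct virtqueue *vq, void *buf, unsigned int len)
{
        struct scatterlist sg;

        sg_init_one(&sg, buf, len);
        /* buf doubles as the token handed back by virtqueue_get_buf()
         * once the device has filled it. */
        return virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_ATOMIC);
}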
/** /**
* virtqueue_kick_prepare - first half of split virtqueue_kick call. * virtqueue_kick_prepare - first half of split virtqueue_kick call.
* @vq: the struct virtqueue * @vq: the struct virtqueue
@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
START_USE(vq); START_USE(vq);
/* We need to expose available array entries before checking avail /* We need to expose available array entries before checking avail
* event. */ * event. */
virtio_mb(vq); virtio_mb(vq->weak_barriers);
old = vq->vring.avail->idx - vq->num_added; old = vq->vring.avail->idx - vq->num_added;
new = vq->vring.avail->idx; new = vq->vring.avail->idx;
@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
} }
/* Only get used array entries after they have been exposed by host. */ /* Only get used array entries after they have been exposed by host. */
virtio_rmb(vq); virtio_rmb(vq->weak_barriers);
last_used = (vq->last_used_idx & (vq->vring.num - 1)); last_used = (vq->last_used_idx & (vq->vring.num - 1));
i = vq->vring.used->ring[last_used].id; i = vq->vring.used->ring[last_used].id;
@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
* the read in the next get_buf call. */ * the read in the next get_buf call. */
if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
vring_used_event(&vq->vring) = vq->last_used_idx; vring_used_event(&vq->vring) = vq->last_used_idx;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
} }
#ifdef DEBUG #ifdef DEBUG
@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
* entry. Always do both to keep code simple. */ * entry. Always do both to keep code simple. */
vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
vring_used_event(&vq->vring) = vq->last_used_idx; vring_used_event(&vq->vring) = vq->last_used_idx;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
if (unlikely(more_used(vq))) { if (unlikely(more_used(vq))) {
END_USE(vq); END_USE(vq);
return false; return false;
@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
/* TODO: tune this threshold */ /* TODO: tune this threshold */
bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
vring_used_event(&vq->vring) = vq->last_used_idx + bufs; vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
END_USE(vq); END_USE(vq);
return false; return false;

View File

@ -171,6 +171,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
sg->page_link &= ~0x01; sg->page_link &= ~0x01;
} }
/**
* sg_unmark_end - Undo setting the end of the scatterlist
* @sg: SG entry
*
* Description:
* Removes the termination marker from the given entry of the scatterlist.
*
**/
static inline void sg_unmark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
sg->page_link &= ~0x02;
}
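sg_unmark_end() exists for callers that pack data into a long-lived, preallocated sg array and may leave a stale termination bit behind from a previous, shorter request; the net/9p conversion later in this diff is such a user. A hedged sketch of the pattern (helper name and parameters are illustrative):

static int example_pack(struct scatterlist *sg, void **bufs,
                        unsigned int *lens, unsigned int n)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                sg_unmark_end(&sg[i]);          /* clear any stale end marker */
                sg_set_buf(&sg[i], bufs[i], lens[i]);
        }
        if (n)
                sg_mark_end(&sg[n - 1]);        /* terminate for sg_next() walkers */
        return n;
}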
/** /**
* sg_phys - Return physical address of an sg entry * sg_phys - Return physical address of an sg entry
* @sg: SG entry * @sg: SG entry

View File

@ -8,6 +8,7 @@
#include <linux/device.h> #include <linux/device.h>
#include <linux/mod_devicetable.h> #include <linux/mod_devicetable.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/vringh.h>
/** /**
* virtqueue - a queue to register buffers for sending or receiving. * virtqueue - a queue to register buffers for sending or receiving.
@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq,
void *data, void *data,
gfp_t gfp); gfp_t gfp);
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_sgs(struct virtqueue *vq,
struct scatterlist *sgs[],
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
gfp_t gfp);
void virtqueue_kick(struct virtqueue *vq); void virtqueue_kick(struct virtqueue *vq);
bool virtqueue_kick_prepare(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq);
@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
* @dev: underlying device. * @dev: underlying device.
* @id: the device type identification (used to match it with a driver). * @id: the device type identification (used to match it with a driver).
* @config: the configuration ops for this device. * @config: the configuration ops for this device.
* @vringh_config: configuration ops for host vrings.
* @vqs: the list of virtqueues for this device. * @vqs: the list of virtqueues for this device.
* @features: the features supported by both driver and device. * @features: the features supported by both driver and device.
* @priv: private pointer for the driver's use. * @priv: private pointer for the driver's use.
@ -73,6 +92,7 @@ struct virtio_device {
struct device dev; struct device dev;
struct virtio_device_id id; struct virtio_device_id id;
const struct virtio_config_ops *config; const struct virtio_config_ops *config;
const struct vringh_config_ops *vringh_config;
struct list_head vqs; struct list_head vqs;
/* Note that this is a Linux set_bit-style bitmap. */ /* Note that this is a Linux set_bit-style bitmap. */
unsigned long features[1]; unsigned long features[1];

View File

@ -0,0 +1,24 @@
/*
* Copyright (C) ST-Ericsson AB 2012
* Author: Sjur Brændeland <sjur.brandeland@stericsson.com>
*
* This header is BSD licensed so
* anyone can use the definitions to implement compatible remote processors
*/
#ifndef VIRTIO_CAIF_H
#define VIRTIO_CAIF_H
#include <linux/types.h>
struct virtio_caif_transf_config {
u16 headroom;
u16 tailroom;
u32 mtu;
u8 reserved[4];
};
struct virtio_caif_config {
struct virtio_caif_transf_config uplink, downlink;
u8 reserved[8];
};
#endif

View File

@ -4,6 +4,63 @@
#include <linux/irqreturn.h> #include <linux/irqreturn.h>
#include <uapi/linux/virtio_ring.h> #include <uapi/linux/virtio_ring.h>
/*
* Barriers in virtio are tricky. Non-SMP virtio guests can't assume
* they're not on an SMP host system, so they need to assume real
* barriers. Non-SMP virtio hosts could skip the barriers, but does
* anyone care?
*
* For virtio_pci on SMP, we don't need to order with respect to MMIO
* accesses through relaxed memory I/O windows, so smp_mb() et al are
* sufficient.
*
* For using virtio to talk to real devices (eg. other heterogeneous
* CPUs) we do need real barriers. In theory, we could be using both
* kinds of virtio, so it's a runtime decision, and the branch is
* actually quite cheap.
*/
#ifdef CONFIG_SMP
static inline void virtio_mb(bool weak_barriers)
{
if (weak_barriers)
smp_mb();
else
mb();
}
static inline void virtio_rmb(bool weak_barriers)
{
if (weak_barriers)
smp_rmb();
else
rmb();
}
static inline void virtio_wmb(bool weak_barriers)
{
if (weak_barriers)
smp_wmb();
else
wmb();
}
#else
static inline void virtio_mb(bool weak_barriers)
{
mb();
}
static inline void virtio_rmb(bool weak_barriers)
{
rmb();
}
static inline void virtio_wmb(bool weak_barriers)
{
wmb();
}
#endif
struct virtio_device; struct virtio_device;
struct virtqueue; struct virtqueue;

View File

@ -0,0 +1,225 @@
/*
* Linux host-side vring helpers; for when the kernel needs to access
* someone else's vring.
*
* Copyright IBM Corporation, 2013.
* Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Written by: Rusty Russell <rusty@rustcorp.com.au>
*/
#ifndef _LINUX_VRINGH_H
#define _LINUX_VRINGH_H
#include <uapi/linux/virtio_ring.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include <asm/barrier.h>
/* virtio_ring with information needed for host access. */
struct vringh {
/* Guest publishes used event idx (note: we always do). */
bool event_indices;
/* Can we get away with weak barriers? */
bool weak_barriers;
/* Last available index we saw (ie. where we're up to). */
u16 last_avail_idx;
/* Last index we used. */
u16 last_used_idx;
/* How many descriptors we've completed since last need_notify(). */
u32 completed;
/* The vring (note: it may contain user pointers!) */
struct vring vring;
/* The function to call to notify the guest about added buffers */
void (*notify)(struct vringh *);
};
/**
* struct vringh_config_ops - ops for creating a host vring from a virtio driver
* @find_vrhs: find the host vrings and instantiate them
* vdev: the virtio_device
* nhvrs: the number of host vrings to find
* hvrs: on success, includes new host vrings
* callbacks: array of driver callbacks, for each host vring
* include a NULL entry for vqs that do not need a callback
* Returns 0 on success or error status
* @del_vrhs: free the host vrings found by find_vrhs().
*/
struct virtio_device;
typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
struct vringh_config_ops {
int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
struct vringh *vrhs[], vrh_callback_t *callbacks[]);
void (*del_vrhs)(struct virtio_device *vdev);
};
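A virtio driver that needs host-side access to some of its rings (such as the new CAIF transport in this series) would call the transport's find_vrhs hook roughly as below; this is a hedged sketch and the names are illustrative. Entries in the callback array may be NULL for rings that need no callback, as the comment above notes.

static int example_setup_host_rings(struct virtio_device *vdev,
                                    struct vringh *vrhs[2],
                                    vrh_callback_t *cbs[2])
{
        if (!vdev->vringh_config)
                return -ENODEV;         /* transport has no host-vring support */
        return vdev->vringh_config->find_vrhs(vdev, 2, vrhs, cbs);
}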
/* The memory the vring can access, and what offset to apply. */
struct vringh_range {
u64 start, end_incl;
u64 offset;
};
/**
* struct vringh_iov - iovec mangler.
*
* Mangles iovec in place, and restores it.
* Remaining data is iov + i, of used - i elements.
*/
struct vringh_iov {
struct iovec *iov;
size_t consumed; /* Within iov[i] */
unsigned i, used, max_num;
};
/**
* struct vringh_kiov - kvec mangler.
*
* Mangles kvec in place, and restores it.
* Remaining data is iov + i, of used - i elements.
*/
struct vringh_kiov {
struct kvec *iov;
size_t consumed; /* Within iov[i] */
unsigned i, used, max_num;
};
/* Flag on max_num to indicate we're kmalloced. */
#define VRINGH_IOV_ALLOCATED 0x8000000
/* Helpers for userspace vrings. */
int vringh_init_user(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc __user *desc,
struct vring_avail __user *avail,
struct vring_used __user *used);
static inline void vringh_iov_init(struct vringh_iov *iov,
struct iovec *iovec, unsigned num)
{
iov->used = iov->i = 0;
iov->consumed = 0;
iov->max_num = num;
iov->iov = iovec;
}
static inline void vringh_iov_reset(struct vringh_iov *iov)
{
iov->iov[iov->i].iov_len += iov->consumed;
iov->iov[iov->i].iov_base -= iov->consumed;
iov->consumed = 0;
iov->i = 0;
}
static inline void vringh_iov_cleanup(struct vringh_iov *iov)
{
if (iov->max_num & VRINGH_IOV_ALLOCATED)
kfree(iov->iov);
iov->max_num = iov->used = iov->i = iov->consumed = 0;
iov->iov = NULL;
}
/* Convert a descriptor into iovecs. */
int vringh_getdesc_user(struct vringh *vrh,
struct vringh_iov *riov,
struct vringh_iov *wiov,
bool (*getrange)(struct vringh *vrh,
u64 addr, struct vringh_range *r),
u16 *head);
/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
const void *src, size_t len);
/* Mark a descriptor as used. */
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
int vringh_complete_multi_user(struct vringh *vrh,
const struct vring_used_elem used[],
unsigned num_used);
/* Pretend we've never seen descriptor (for easy error handling). */
void vringh_abandon_user(struct vringh *vrh, unsigned int num);
/* Do we need to fire the eventfd to notify the other side? */
int vringh_need_notify_user(struct vringh *vrh);
bool vringh_notify_enable_user(struct vringh *vrh);
void vringh_notify_disable_user(struct vringh *vrh);
/* Helpers for kernelspace vrings. */
int vringh_init_kern(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc *desc,
struct vring_avail *avail,
struct vring_used *used);
static inline void vringh_kiov_init(struct vringh_kiov *kiov,
struct kvec *kvec, unsigned num)
{
kiov->used = kiov->i = 0;
kiov->consumed = 0;
kiov->max_num = num;
kiov->iov = kvec;
}
static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
{
kiov->iov[kiov->i].iov_len += kiov->consumed;
kiov->iov[kiov->i].iov_base -= kiov->consumed;
kiov->consumed = 0;
kiov->i = 0;
}
static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
{
if (kiov->max_num & VRINGH_IOV_ALLOCATED)
kfree(kiov->iov);
kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
kiov->iov = NULL;
}
int vringh_getdesc_kern(struct vringh *vrh,
struct vringh_kiov *riov,
struct vringh_kiov *wiov,
u16 *head,
gfp_t gfp);
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len);
void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
bool vringh_notify_enable_kern(struct vringh *vrh);
void vringh_notify_disable_kern(struct vringh *vrh);
int vringh_need_notify_kern(struct vringh *vrh);
/* Notify the guest about buffers added to the used ring */
static inline void vringh_notify(struct vringh *vrh)
{
if (vrh->notify)
vrh->notify(vrh);
}
#endif /* _LINUX_VRINGH_H */
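To make the host-side flow concrete, here is a hedged sketch of a kernelspace consumer built only on the declarations above. It assumes a vringh already set up with vringh_init_kern(), and that vringh_getdesc_kern() returns 1 when it hands back a descriptor and 0 when the ring is empty; buffer sizes are arbitrary.

static void example_service_vring(struct vringh *vrh)
{
        struct kvec rkvec[8], wkvec[8];
        struct vringh_kiov riov, wiov;
        char buf[64];
        u16 head;

        vringh_kiov_init(&riov, rkvec, ARRAY_SIZE(rkvec));
        vringh_kiov_init(&wiov, wkvec, ARRAY_SIZE(wkvec));

        while (vringh_getdesc_kern(vrh, &riov, &wiov, &head, GFP_KERNEL) == 1) {
                /* Consume what the guest made readable ... */
                vringh_iov_pull_kern(&riov, buf, sizeof(buf));
                /* ... and mark the descriptor used (nothing written back here). */
                vringh_complete_kern(vrh, head, 0);
        }
        if (vringh_need_notify_kern(vrh) > 0)
                vringh_notify(vrh);
}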

View File

@ -52,8 +52,8 @@ struct virtio_balloon_config
#define VIRTIO_BALLOON_S_NR 6 #define VIRTIO_BALLOON_S_NR 6
struct virtio_balloon_stat { struct virtio_balloon_stat {
u16 tag; __u16 tag;
u64 val; __u64 val;
} __attribute__((packed)); } __attribute__((packed));
#endif /* _LINUX_VIRTIO_BALLOON_H */ #endif /* _LINUX_VIRTIO_BALLOON_H */

View File

@ -38,5 +38,6 @@
#define VIRTIO_ID_SCSI 8 /* virtio scsi */ #define VIRTIO_ID_SCSI 8 /* virtio scsi */
#define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_9P 9 /* 9p virtio console */
#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
#define VIRTIO_ID_CAIF 12 /* Virtio caif */
#endif /* _LINUX_VIRTIO_IDS_H */ #endif /* _LINUX_VIRTIO_IDS_H */

View File

@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start,
if (s > count) if (s > count)
s = count; s = count;
BUG_ON(index > limit); BUG_ON(index > limit);
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_buf(&sg[index++], data, s); sg_set_buf(&sg[index++], data, s);
count -= s; count -= s;
data += s; data += s;
} }
if (index-start)
sg_mark_end(&sg[index - 1]);
return index-start; return index-start;
} }
@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
s = rest_of_page(data); s = rest_of_page(data);
if (s > count) if (s > count)
s = count; s = count;
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_page(&sg[index++], pdata[i++], s, data_off); sg_set_page(&sg[index++], pdata[i++], s, data_off);
data_off = 0; data_off = 0;
data += s; data += s;
count -= s; count -= s;
nr_pages--; nr_pages--;
} }
if (index-start)
sg_mark_end(&sg[index - 1]);
return index - start; return index - start;
} }
@ -256,9 +264,10 @@ static int
p9_virtio_request(struct p9_client *client, struct p9_req_t *req) p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
{ {
int err; int err;
int in, out; int in, out, out_sgs, in_sgs;
unsigned long flags; unsigned long flags;
struct virtio_chan *chan = client->trans; struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[2];
p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
req_retry: req_retry:
spin_lock_irqsave(&chan->lock, flags); spin_lock_irqsave(&chan->lock, flags);
out_sgs = in_sgs = 0;
/* Handle out VirtIO ring buffers */ /* Handle out VirtIO ring buffers */
out = pack_sg_list(chan->sg, 0, out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
if (out)
sgs[out_sgs++] = chan->sg;
in = pack_sg_list(chan->sg, out, in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC); GFP_ATOMIC);
if (err < 0) { if (err < 0) {
if (err == -ENOSPC) { if (err == -ENOSPC) {
@ -289,7 +303,7 @@ req_retry:
} else { } else {
spin_unlock_irqrestore(&chan->lock, flags); spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS, p9_debug(P9_DEBUG_TRANS,
"virtio rpc add_buf returned failure\n"); "virtio rpc add_sgs returned failure\n");
return -EIO; return -EIO;
} }
} }
@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
char *uidata, char *uodata, int inlen, char *uidata, char *uodata, int inlen,
int outlen, int in_hdr_len, int kern_buf) int outlen, int in_hdr_len, int kern_buf)
{ {
int in, out, err; int in, out, err, out_sgs, in_sgs;
unsigned long flags; unsigned long flags;
int in_nr_pages = 0, out_nr_pages = 0; int in_nr_pages = 0, out_nr_pages = 0;
struct page **in_pages = NULL, **out_pages = NULL; struct page **in_pages = NULL, **out_pages = NULL;
struct virtio_chan *chan = client->trans; struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[4];
p9_debug(P9_DEBUG_TRANS, "virtio request\n"); p9_debug(P9_DEBUG_TRANS, "virtio request\n");
@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
req->status = REQ_STATUS_SENT; req->status = REQ_STATUS_SENT;
req_retry_pinned: req_retry_pinned:
spin_lock_irqsave(&chan->lock, flags); spin_lock_irqsave(&chan->lock, flags);
out_sgs = in_sgs = 0;
/* out data */ /* out data */
out = pack_sg_list(chan->sg, 0, out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
if (out_pages) if (out)
sgs[out_sgs++] = chan->sg;
if (out_pages) {
sgs[out_sgs++] = chan->sg + out;
out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
out_pages, out_nr_pages, uodata, outlen); out_pages, out_nr_pages, uodata, outlen);
}
/* /*
* Take care of in data * Take care of in data
* For example TREAD have 11. * For example TREAD have 11.
@ -412,11 +436,17 @@ req_retry_pinned:
*/ */
in = pack_sg_list(chan->sg, out, in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
if (in_pages) if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
if (in_pages) {
sgs[out_sgs + in_sgs++] = chan->sg + out + in;
in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
in_pages, in_nr_pages, uidata, inlen); in_pages, in_nr_pages, uidata, inlen);
}
err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC); GFP_ATOMIC);
if (err < 0) { if (err < 0) {
if (err == -ENOSPC) { if (err == -ENOSPC) {
@ -432,7 +462,7 @@ req_retry_pinned:
} else { } else {
spin_unlock_irqrestore(&chan->lock, flags); spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS, p9_debug(P9_DEBUG_TRANS,
"virtio rpc add_buf returned failure\n"); "virtio rpc add_sgs returned failure\n");
err = -EIO; err = -EIO;
goto err_out; goto err_out;
} }

View File

@ -70,7 +70,7 @@ Running Lguest:
- Run an lguest as root: - Run an lguest as root:
Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
--block=rootfile root=/dev/vda --block=rootfile root=/dev/vda
Explanation: Explanation:

View File

@ -1,12 +1,14 @@
all: test mod all: test mod
test: virtio_test test: virtio_test vringh_test
virtio_test: virtio_ring.o virtio_test.o virtio_test: virtio_ring.o virtio_test.o
CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD vringh_test: vringh_test.o vringh.o virtio_ring.o
vpath %.c ../../drivers/virtio
CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE
vpath %.c ../../drivers/virtio ../../drivers/vhost
mod: mod:
${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test
.PHONY: all test mod clean .PHONY: all test mod clean
clean: clean:
${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
vhost_test/Module.symvers vhost_test/modules.order *.d vhost_test/Module.symvers vhost_test/modules.order *.d
-include *.d -include *.d

View File

@ -0,0 +1,14 @@
#if defined(__i386__) || defined(__x86_64__)
#define barrier() asm volatile("" ::: "memory")
#define mb() __sync_synchronize()
#define smp_mb() mb()
# define smp_rmb() barrier()
# define smp_wmb() barrier()
/* Weak barriers should be used. If not - it's a bug */
# define rmb() abort()
# define wmb() abort()
#else
#error Please fill in barrier macros
#endif

View File

@ -0,0 +1,10 @@
#ifndef BUG_H
#define BUG_H
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
#define BUILD_BUG_ON(x)
#define BUG() abort()
#endif /* BUG_H */

View File

@ -0,0 +1,26 @@
#ifndef ERR_H
#define ERR_H
#define MAX_ERRNO 4095
#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
static inline void * __must_check ERR_PTR(long error)
{
return (void *) error;
}
static inline long __must_check PTR_ERR(const void *ptr)
{
return (long) ptr;
}
static inline long __must_check IS_ERR(const void *ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
{
return !ptr || IS_ERR_VALUE((unsigned long)ptr);
}
#endif /* ERR_H */

View File

@ -0,0 +1,5 @@
#define EXPORT_SYMBOL(sym)
#define EXPORT_SYMBOL_GPL(sym)
#define EXPORT_SYMBOL_GPL_FUTURE(sym)
#define EXPORT_UNUSED_SYMBOL(sym)
#define EXPORT_UNUSED_SYMBOL_GPL(sym)

View File

@ -0,0 +1 @@
#include "../../../include/linux/irqreturn.h"

View File

@ -0,0 +1,112 @@
#ifndef KERNEL_H
#define KERNEL_H
#include <stdbool.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdarg.h>
#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bug.h>
#include <errno.h>
#include <unistd.h>
#include <asm/barrier.h>
#define CONFIG_SMP
#define PAGE_SIZE getpagesize()
#define PAGE_MASK (~(PAGE_SIZE-1))
typedef unsigned long long dma_addr_t;
typedef size_t __kernel_size_t;
struct page {
unsigned long long dummy;
};
/* Physical == Virtual */
#define virt_to_phys(p) ((unsigned long)p)
#define phys_to_virt(a) ((void *)(unsigned long)(a))
/* Page address: Virtual / 4K */
#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p))
#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK))
#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE)
#define __printf(a,b) __attribute__((format(printf,a,b)))
typedef enum {
GFP_KERNEL,
GFP_ATOMIC,
__GFP_HIGHMEM,
__GFP_HIGH
} gfp_t;
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
if (__kmalloc_fake)
return __kmalloc_fake;
return malloc(s);
}
static inline void kfree(void *p)
{
if (p >= __kfree_ignore_start && p < __kfree_ignore_end)
return;
free(p);
}
static inline void *krealloc(void *p, size_t s, gfp_t gfp)
{
return realloc(p, s);
}
static inline unsigned long __get_free_page(gfp_t gfp)
{
void *p;
posix_memalign(&p, PAGE_SIZE, PAGE_SIZE);
return (unsigned long)p;
}
static inline void free_page(unsigned long addr)
{
free((void *)addr);
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
#define uninitialized_var(x) x = x
# ifndef likely
# define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#else
#define pr_debug(format, ...) do {} while (0)
#endif
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#endif /* KERNEL_H */

View File

@ -0,0 +1 @@
#include <linux/export.h>

View File

@ -0,0 +1,4 @@
#include "../../../include/linux/kern_levels.h"
#define printk printf
#define vprintk vprintf

View File

@ -0,0 +1,4 @@
#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0
#define __ratelimit(x) (*(x))

View File

@ -0,0 +1,189 @@
#ifndef SCATTERLIST_H
#define SCATTERLIST_H
#include <linux/kernel.h>
struct scatterlist {
unsigned long page_link;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
};
/* Scatterlist helpers, stolen from linux/scatterlist.h */
#define sg_is_chain(sg) ((sg)->page_link & 0x01)
#define sg_is_last(sg) ((sg)->page_link & 0x02)
#define sg_chain_ptr(sg) \
((struct scatterlist *) ((sg)->page_link & ~0x03))
/**
* sg_assign_page - Assign a given page to an SG entry
* @sg: SG entry
* @page: The page
*
* Description:
* Assign page to sg entry. Also see sg_set_page(), the most commonly used
* variant.
*
**/
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
unsigned long page_link = sg->page_link & 0x3;
/*
* In order for the low bit stealing approach to work, pages
* must be aligned at a 32-bit boundary as a minimum.
*/
BUG_ON((unsigned long) page & 0x03);
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
BUG_ON(sg_is_chain(sg));
#endif
sg->page_link = page_link | (unsigned long) page;
}
/**
* sg_set_page - Set sg entry to point at given page
* @sg: SG entry
* @page: The page
* @len: Length of data
* @offset: Offset into page
*
* Description:
* Use this function to set an sg entry pointing at a page, never assign
* the page directly. We encode sg table information in the lower bits
* of the page pointer. See sg_page() for looking up the page belonging
* to an sg entry.
*
**/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
}
static inline struct page *sg_page(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
BUG_ON(sg_is_chain(sg));
#endif
return (struct page *)((sg)->page_link & ~0x3);
}
/*
* Loop over each sg element, following the pointer to a new list if necessary
*/
#define for_each_sg(sglist, sg, nr, __i) \
for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
/**
* sg_chain - Chain two sglists together
* @prv: First scatterlist
* @prv_nents: Number of entries in prv
* @sgl: Second scatterlist
*
* Description:
* Links @prv@ and @sgl@ together, to form a longer scatterlist.
*
**/
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
struct scatterlist *sgl)
{
/*
* offset and length are unused for chain entry. Clear them.
*/
prv[prv_nents - 1].offset = 0;
prv[prv_nents - 1].length = 0;
/*
* Set lowest bit to indicate a link pointer, and make sure to clear
* the termination bit if it happens to be set.
*/
prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
}
/**
* sg_mark_end - Mark the end of the scatterlist
* @sg: SG entry
*
* Description:
* Marks the passed in sg entry as the termination point for the sg
* table. A call to sg_next() on this entry will return NULL.
*
**/
static inline void sg_mark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
/*
* Set termination bit, clear potential chain bit
*/
sg->page_link |= 0x02;
sg->page_link &= ~0x01;
}
/**
* sg_unmark_end - Undo setting the end of the scatterlist
* @sg: SG entry
*
* Description:
* Removes the termination marker from the given entry of the scatterlist.
*
**/
static inline void sg_unmark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
sg->page_link &= ~0x02;
}
static inline struct scatterlist *sg_next(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
if (sg_is_last(sg))
return NULL;
sg++;
if (unlikely(sg_is_chain(sg)))
sg = sg_chain_ptr(sg);
return sg;
}
static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
memset(sgl, 0, sizeof(*sgl) * nents);
#ifdef CONFIG_DEBUG_SG
{
unsigned int i;
for (i = 0; i < nents; i++)
sgl[i].sg_magic = SG_MAGIC;
}
#endif
sg_mark_end(&sgl[nents - 1]);
}
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
return page_to_phys(sg_page(sg)) + sg->offset;
}
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
static inline void sg_init_one(struct scatterlist *sg,
const void *buf, unsigned int buflen)
{
sg_init_table(sg, 1);
sg_set_buf(sg, buf, buflen);
}
#endif /* SCATTERLIST_H */

View File

@ -0,0 +1,28 @@
#ifndef TYPES_H
#define TYPES_H
#include <stdint.h>
#define __force
#define __user
#define __must_check
#define __cold
typedef uint64_t u64;
typedef int64_t s64;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint16_t u16;
typedef int16_t s16;
typedef uint8_t u8;
typedef int8_t s8;
typedef uint64_t __u64;
typedef int64_t __s64;
typedef uint32_t __u32;
typedef int32_t __s32;
typedef uint16_t __u16;
typedef int16_t __s16;
typedef uint8_t __u8;
typedef int8_t __s8;
#endif /* TYPES_H */

View File

@ -0,0 +1,50 @@
#ifndef UACCESS_H
#define UACCESS_H
extern void *__user_addr_min, *__user_addr_max;
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
static inline void __chk_user_ptr(const volatile void *p, size_t size)
{
assert(p >= __user_addr_min && p + size <= __user_addr_max);
}
#define put_user(x, ptr) \
({ \
typeof(ptr) __pu_ptr = (ptr); \
__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
ACCESS_ONCE(*(__pu_ptr)) = x; \
0; \
})
#define get_user(x, ptr) \
({ \
typeof(ptr) __pu_ptr = (ptr); \
__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
x = ACCESS_ONCE(*(__pu_ptr)); \
0; \
})
static void volatile_memcpy(volatile char *to, const volatile char *from,
unsigned long n)
{
while (n--)
*(to++) = *(from++);
}
static inline int copy_from_user(void *to, const void __user volatile *from,
unsigned long n)
{
__chk_user_ptr(from, n);
volatile_memcpy(to, from, n);
return 0;
}
static inline int copy_to_user(void __user volatile *to, const void *from,
unsigned long n)
{
__chk_user_ptr(to, n);
volatile_memcpy(to, from, n);
return 0;
}
#endif /* UACCESS_H */

View File

@ -0,0 +1,3 @@
#include <linux/kernel.h>
#include "../../../include/linux/uio.h"

View File

@ -1,127 +1,7 @@
#ifndef LINUX_VIRTIO_H #ifndef LINUX_VIRTIO_H
#define LINUX_VIRTIO_H #define LINUX_VIRTIO_H
#include <linux/scatterlist.h>
#include <stdbool.h> #include <linux/kernel.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <linux/types.h>
#include <errno.h>
typedef unsigned long long dma_addr_t;
struct scatterlist {
unsigned long page_link;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
};
struct page {
unsigned long long dummy;
};
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
/* Physical == Virtual */
#define virt_to_phys(p) ((unsigned long)p)
#define phys_to_virt(a) ((void *)(unsigned long)(a))
/* Page address: Virtual / 4K */
#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \
sizeof(struct page)))
#define offset_in_page(p) (((unsigned long)p) % 4096)
#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \
sg->offset)
static inline void sg_mark_end(struct scatterlist *sg)
{
/*
* Set termination bit, clear potential chain bit
*/
sg->page_link |= 0x02;
sg->page_link &= ~0x01;
}
static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
memset(sgl, 0, sizeof(*sgl) * nents);
sg_mark_end(&sgl[nents - 1]);
}
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
unsigned long page_link = sg->page_link & 0x3;
/*
* In order for the low bit stealing approach to work, pages
* must be aligned at a 32-bit boundary as a minimum.
*/
BUG_ON((unsigned long) page & 0x03);
sg->page_link = page_link | (unsigned long) page;
}
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
}
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
sg_init_table(sg, 1);
sg_set_buf(sg, buf, buflen);
}
typedef __u16 u16;
typedef enum {
GFP_KERNEL,
GFP_ATOMIC,
} gfp_t;
typedef enum {
IRQ_NONE,
IRQ_HANDLED
} irqreturn_t;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
return malloc(s);
}
static inline void kfree(void *p)
{
free(p);
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
#define uninitialized_var(x) x = x
# ifndef likely
# define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#else
#define pr_debug(format, ...) do {} while (0)
#endif
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */
#define list_add_tail(a, b) do {} while (0) #define list_add_tail(a, b) do {} while (0)
@ -131,6 +11,7 @@ static inline void kfree(void *p)
#define BITS_PER_BYTE 8 #define BITS_PER_BYTE 8
#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
/* TODO: Not atomic as it should be: /* TODO: Not atomic as it should be:
* we don't use this for anything important. */ * we don't use this for anything important. */
static inline void clear_bit(int nr, volatile unsigned long *addr) static inline void clear_bit(int nr, volatile unsigned long *addr)
@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
{ {
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
} }
/* The only feature we care to support */
#define virtio_has_feature(dev, feature) \
test_bit((feature), (dev)->features)
/* end of stubs */ /* end of stubs */
struct virtio_device { struct virtio_device {
@ -163,39 +40,32 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq); void (*callback)(struct virtqueue *vq);
const char *name; const char *name;
struct virtio_device *vdev; struct virtio_device *vdev;
unsigned int index;
unsigned int num_free;
void *priv; void *priv;
}; };
#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \
void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \
}
#define MODULE_LICENSE(__MODULE_LICENSE_value) \ #define MODULE_LICENSE(__MODULE_LICENSE_value) \
const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
#define CONFIG_SMP
#if defined(__i386__) || defined(__x86_64__)
#define barrier() asm volatile("" ::: "memory")
#define mb() __sync_synchronize()
#define smp_mb() mb()
# define smp_rmb() barrier()
# define smp_wmb() barrier()
/* Weak barriers should be used. If not - it's a bug */
# define rmb() abort()
# define wmb() abort()
#else
#error Please fill in barrier macros
#endif
/* Interfaces exported by virtio_ring. */ /* Interfaces exported by virtio_ring. */
int virtqueue_add_buf(struct virtqueue *vq, int virtqueue_add_sgs(struct virtqueue *vq,
struct scatterlist sg[], struct scatterlist *sgs[],
unsigned int out_num, unsigned int out_sgs,
unsigned int in_num, unsigned int in_sgs,
void *data, void *data,
gfp_t gfp); gfp_t gfp);
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
void virtqueue_kick(struct virtqueue *vq); void virtqueue_kick(struct virtqueue *vq);
void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
bool virtqueue_enable_cb_delayed(struct virtqueue *vq); bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
void *virtqueue_detach_unused_buf(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq);
struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtqueue *vring_new_virtqueue(unsigned int index,
unsigned int num,
unsigned int vring_align, unsigned int vring_align,
struct virtio_device *vdev, struct virtio_device *vdev,
bool weak_barriers, bool weak_barriers,

View File

@ -0,0 +1,6 @@
#define VIRTIO_TRANSPORT_F_START 28
#define VIRTIO_TRANSPORT_F_END 32
#define virtio_has_feature(dev, feature) \
test_bit((feature), (dev)->features)

View File

@ -0,0 +1 @@
#include "../../../include/linux/virtio_ring.h"

View File

@ -0,0 +1 @@
#include "../../../include/linux/vringh.h"

View File

@ -0,0 +1 @@
#include <sys/uio.h>

View File

@ -0,0 +1 @@
#include "../../../../include/uapi/linux/virtio_config.h"

View File

@ -0,0 +1,4 @@
#ifndef VIRTIO_RING_H
#define VIRTIO_RING_H
#include "../../../../include/uapi/linux/virtio_ring.h"
#endif /* VIRTIO_RING_H */

View File

@@ -10,11 +10,15 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <fcntl.h>
+#include <stdbool.h>
 #include <linux/vhost.h>
 #include <linux/virtio.h>
 #include <linux/virtio_ring.h>
 #include "../../drivers/vhost/test.h"
 
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
 struct vq_info {
 	int kick;
 	int call;
@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num)
 	assert(r >= 0);
 	memset(info->ring, 0, vring_size(num, 4096));
 	vring_init(&info->vring, num, info->ring, 4096);
-	info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev,
+	info->vq = vring_new_virtqueue(info->idx,
+				       info->vring.num, 4096, &dev->vdev,
 				       true, info->ring,
 				       vq_notify, vq_callback, "test");
 	assert(info->vq);
@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
 	do {
 		if (started < bufs) {
 			sg_init_one(&sl, dev->buf, dev->buf_size);
-			r = virtqueue_add_buf(vq->vq, &sl, 1, 0,
-					      dev->buf + started,
-					      GFP_ATOMIC);
+			r = virtqueue_add_outbuf(vq->vq, &sl, 1,
+						 dev->buf + started,
+						 GFP_ATOMIC);
 			if (likely(r == 0)) {
 				++started;
 				virtqueue_kick(vq->vq);

View File

@@ -0,0 +1,741 @@
/* Simple test of virtio code, entirely in userspace. */
#define _GNU_SOURCE
#include <sched.h>
#include <err.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/virtio.h>
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/uaccess.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#define USER_MEM (1024*1024)
void *__user_addr_min, *__user_addr_max;
void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static u64 user_addr_offset;
#define RINGSIZE 256
#define ALIGN 4096
static void never_notify_host(struct virtqueue *vq)
{
abort();
}
static void never_callback_guest(struct virtqueue *vq)
{
abort();
}
static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r)
{
if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
return false;
if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
return false;
r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
r->offset = user_addr_offset;
return true;
}
/* We return single byte ranges. */
static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r)
{
if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
return false;
if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
return false;
r->start = addr;
r->end_incl = r->start;
r->offset = user_addr_offset;
return true;
}
struct guest_virtio_device {
struct virtio_device vdev;
int to_host_fd;
unsigned long notifies;
};
static void parallel_notify_host(struct virtqueue *vq)
{
struct guest_virtio_device *gvdev;
gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
write(gvdev->to_host_fd, "", 1);
gvdev->notifies++;
}
static void no_notify_host(struct virtqueue *vq)
{
}
#define NUM_XFERS (10000000)
/* We aim for two "distant" cpus. */
static void find_cpus(unsigned int *first, unsigned int *last)
{
unsigned int i;
*first = -1U;
*last = 0;
for (i = 0; i < 4096; i++) {
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(i, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) {
if (i < *first)
*first = i;
if (i > *last)
*last = i;
}
}
}
/* Opencoded version for fast mode */
static inline int vringh_get_head(struct vringh *vrh, u16 *head)
{
u16 avail_idx, i;
int err;
err = get_user(avail_idx, &vrh->vring.avail->idx);
if (err)
return err;
if (vrh->last_avail_idx == avail_idx)
return 0;
/* Only get avail ring entries after they have been exposed by guest. */
virtio_rmb(vrh->weak_barriers);
i = vrh->last_avail_idx & (vrh->vring.num - 1);
err = get_user(*head, &vrh->vring.avail->ring[i]);
if (err)
return err;
vrh->last_avail_idx++;
return 1;
}
static int parallel_test(unsigned long features,
bool (*getrange)(struct vringh *vrh,
u64 addr, struct vringh_range *r),
bool fast_vringh)
{
void *host_map, *guest_map;
int fd, mapsize, to_guest[2], to_host[2];
unsigned long xfers = 0, notifies = 0, receives = 0;
unsigned int first_cpu, last_cpu;
cpu_set_t cpu_set;
char buf[128];
/* Create real file to mmap. */
fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600);
if (fd < 0)
err(1, "Opening /tmp/vringh_test-file");
/* Extra room at the end for some data, and indirects */
mapsize = vring_size(RINGSIZE, ALIGN)
+ RINGSIZE * 2 * sizeof(int)
+ RINGSIZE * 6 * sizeof(struct vring_desc);
mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1);
ftruncate(fd, mapsize);
/* Parent and child use separate addresses, to check our mapping logic! */
host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
pipe(to_guest);
pipe(to_host);
CPU_ZERO(&cpu_set);
find_cpus(&first_cpu, &last_cpu);
printf("Using CPUS %u and %u\n", first_cpu, last_cpu);
fflush(stdout);
if (fork() != 0) {
struct vringh vrh;
int status, err, rlen = 0;
char rbuf[5];
/* We are the host: never access guest addresses! */
munmap(guest_map, mapsize);
__user_addr_min = host_map;
__user_addr_max = __user_addr_min + mapsize;
user_addr_offset = host_map - guest_map;
assert(user_addr_offset);
close(to_guest[0]);
close(to_host[1]);
vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN);
vringh_init_user(&vrh, features, RINGSIZE, true,
vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
CPU_SET(first_cpu, &cpu_set);
if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
errx(1, "Could not set affinity to cpu %u", first_cpu);
while (xfers < NUM_XFERS) {
struct iovec host_riov[2], host_wiov[2];
struct vringh_iov riov, wiov;
u16 head, written;
if (fast_vringh) {
for (;;) {
err = vringh_get_head(&vrh, &head);
if (err != 0)
break;
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i",
err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
}
if (err != 1)
errx(1, "vringh_get_head");
written = 0;
goto complete;
} else {
vringh_iov_init(&riov,
host_riov,
ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov,
host_wiov,
ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov,
getrange, &head);
}
if (err == 0) {
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i",
err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
if (!vringh_notify_enable_user(&vrh))
continue;
/* Swallow all notifies at once. */
if (read(to_host[0], buf, sizeof(buf)) < 1)
break;
vringh_notify_disable_user(&vrh);
receives++;
continue;
}
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
/* We simply copy bytes. */
if (riov.used) {
rlen = vringh_iov_pull_user(&riov, rbuf,
sizeof(rbuf));
if (rlen != 4)
errx(1, "vringh_iov_pull_user: %i",
rlen);
assert(riov.i == riov.used);
written = 0;
} else {
err = vringh_iov_push_user(&wiov, rbuf, rlen);
if (err != rlen)
errx(1, "vringh_iov_push_user: %i",
err);
assert(wiov.i == wiov.used);
written = err;
}
complete:
xfers++;
err = vringh_complete_user(&vrh, head, written);
if (err != 0)
errx(1, "vringh_complete_user: %i", err);
}
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i", err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
wait(&status);
if (!WIFEXITED(status))
errx(1, "Child died with signal %i?", WTERMSIG(status));
if (WEXITSTATUS(status) != 0)
errx(1, "Child exited %i?", WEXITSTATUS(status));
printf("Host: notified %lu, pinged %lu\n", notifies, receives);
return 0;
} else {
struct guest_virtio_device gvdev;
struct virtqueue *vq;
unsigned int *data;
struct vring_desc *indirects;
unsigned int finished = 0;
/* We pass sg[]s pointing into here, but we need RINGSIZE+1 */
data = guest_map + vring_size(RINGSIZE, ALIGN);
indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int);
/* We are the guest. */
munmap(host_map, mapsize);
close(to_guest[1]);
close(to_host[0]);
gvdev.vdev.features[0] = features;
gvdev.to_host_fd = to_host[1];
gvdev.notifies = 0;
/* Guest runs on the "distant" CPU; the host above took first_cpu. */
CPU_SET(last_cpu, &cpu_set);
if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
err(1, "Could not set affinity to cpu %u", last_cpu);
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
guest_map, fast_vringh ? no_notify_host
: parallel_notify_host,
never_callback_guest, "guest vq");
/* Don't kfree indirects. */
__kfree_ignore_start = indirects;
__kfree_ignore_end = indirects + RINGSIZE * 6;
while (xfers < NUM_XFERS) {
struct scatterlist sg[4];
unsigned int num_sg, len;
int *dbuf, err;
bool output = !(xfers % 2);
/* Consume bufs. */
while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) {
if (len == 4)
assert(*dbuf == finished - 1);
else if (!fast_vringh)
assert(*dbuf == finished);
finished++;
}
/* Produce a buffer. */
dbuf = data + (xfers % (RINGSIZE + 1));
if (output)
*dbuf = xfers;
else
*dbuf = -1;
switch ((xfers / sizeof(*dbuf)) % 4) {
case 0:
/* Nasty three-element sg list. */
sg_init_table(sg, num_sg = 3);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 2);
sg_set_buf(&sg[2], (void *)dbuf + 3, 1);
break;
case 1:
sg_init_table(sg, num_sg = 2);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 3);
break;
case 2:
sg_init_table(sg, num_sg = 1);
sg_set_buf(&sg[0], (void *)dbuf, 4);
break;
case 3:
sg_init_table(sg, num_sg = 4);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 1);
sg_set_buf(&sg[2], (void *)dbuf + 2, 1);
sg_set_buf(&sg[3], (void *)dbuf + 3, 1);
break;
}
/* May allocate an indirect, so force it to allocate
* user addr */
__kmalloc_fake = indirects + (xfers % RINGSIZE) * 4;
if (output)
err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf,
GFP_KERNEL);
else
err = virtqueue_add_inbuf(vq, sg, num_sg,
dbuf, GFP_KERNEL);
if (err == -ENOSPC) {
if (!virtqueue_enable_cb_delayed(vq))
continue;
/* Swallow all notifies at once. */
if (read(to_guest[0], buf, sizeof(buf)) < 1)
break;
receives++;
virtqueue_disable_cb(vq);
continue;
}
if (err)
errx(1, "virtqueue_add_in/outbuf: %i", err);
xfers++;
virtqueue_kick(vq);
}
/* Any extra? */
while (finished != xfers) {
int *dbuf;
unsigned int len;
/* Consume bufs. */
dbuf = virtqueue_get_buf(vq, &len);
if (dbuf) {
if (len == 4)
assert(*dbuf == finished - 1);
else
assert(len == 0);
finished++;
continue;
}
if (!virtqueue_enable_cb_delayed(vq))
continue;
if (read(to_guest[0], buf, sizeof(buf)) < 1)
break;
receives++;
virtqueue_disable_cb(vq);
}
printf("Guest: notified %lu, pinged %lu\n",
gvdev.notifies, receives);
vring_del_virtqueue(vq);
return 0;
}
}
int main(int argc, char *argv[])
{
struct virtio_device vdev;
struct virtqueue *vq;
struct vringh vrh;
struct scatterlist guest_sg[RINGSIZE], *sgs[2];
struct iovec host_riov[2], host_wiov[2];
struct vringh_iov riov, wiov;
struct vring_used_elem used[RINGSIZE];
char buf[28];
u16 head;
int err;
unsigned i;
void *ret;
bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r);
bool fast_vringh = false, parallel = false;
getrange = getrange_iov;
vdev.features[0] = 0;
while (argv[1]) {
if (strcmp(argv[1], "--indirect") == 0)
vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
else if (strcmp(argv[1], "--eventidx") == 0)
vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX);
else if (strcmp(argv[1], "--slow-range") == 0)
getrange = getrange_slow;
else if (strcmp(argv[1], "--fast-vringh") == 0)
fast_vringh = true;
else if (strcmp(argv[1], "--parallel") == 0)
parallel = true;
else
errx(1, "Unknown arg %s", argv[1]);
argv++;
}
if (parallel)
return parallel_test(vdev.features[0], getrange, fast_vringh);
if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
abort();
__user_addr_max = __user_addr_min + USER_MEM;
memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN));
/* Set up guest side. */
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
__user_addr_min,
never_notify_host, never_callback_guest,
"guest vq");
/* Set up host side. */
vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN);
vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true,
vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
/* No descriptor to get yet... */
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 0)
errx(1, "vringh_getdesc_user: %i", err);
/* Guest puts in a descriptor. */
memcpy(__user_addr_max - 1, "a", 1);
sg_init_table(guest_sg, 1);
sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
sg_init_table(guest_sg+1, 1);
sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2);
sgs[0] = &guest_sg[0];
sgs[1] = &guest_sg[1];
/* May allocate an indirect, so force it to allocate user addr */
__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_sgs: %i", err);
__kmalloc_fake = NULL;
/* Host retrieves it. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
assert(riov.used == 1);
assert(riov.iov[0].iov_base == __user_addr_max - 1);
assert(riov.iov[0].iov_len == 1);
if (getrange != getrange_slow) {
assert(wiov.used == 1);
assert(wiov.iov[0].iov_base == __user_addr_max - 3);
assert(wiov.iov[0].iov_len == 2);
} else {
assert(wiov.used == 2);
assert(wiov.iov[0].iov_base == __user_addr_max - 3);
assert(wiov.iov[0].iov_len == 1);
assert(wiov.iov[1].iov_base == __user_addr_max - 2);
assert(wiov.iov[1].iov_len == 1);
}
err = vringh_iov_pull_user(&riov, buf, 5);
if (err != 1)
errx(1, "vringh_iov_pull_user: %i", err);
assert(buf[0] == 'a');
assert(riov.i == 1);
assert(vringh_iov_pull_user(&riov, buf, 5) == 0);
memcpy(buf, "bcdef", 5);
err = vringh_iov_push_user(&wiov, buf, 5);
if (err != 2)
errx(1, "vringh_iov_push_user: %i", err);
assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
assert(wiov.i == wiov.used);
assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
/* Host is done. */
err = vringh_complete_user(&vrh, head, err);
if (err != 0)
errx(1, "vringh_complete_user: %i", err);
/* Guest should see used token now. */
__kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN);
__kfree_ignore_end = __kfree_ignore_start + 1;
ret = virtqueue_get_buf(vq, &i);
if (ret != &err)
errx(1, "virtqueue_get_buf: %p", ret);
assert(i == 2);
/* Guest puts in a huge descriptor. */
sg_init_table(guest_sg, RINGSIZE);
for (i = 0; i < RINGSIZE; i++) {
sg_set_buf(&guest_sg[i],
__user_addr_max - USER_MEM/4, USER_MEM/4);
}
/* Fill contents with recognisable garbage. */
for (i = 0; i < USER_MEM/4; i++)
((char *)__user_addr_max - USER_MEM/4)[i] = i;
/* This will allocate an indirect, so force it to allocate user addr */
__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (large): %i", err);
__kmalloc_fake = NULL;
/* Host picks it up (allocates new iov). */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
assert(riov.max_num & VRINGH_IOV_ALLOCATED);
assert(riov.iov != host_riov);
if (getrange != getrange_slow)
assert(riov.used == RINGSIZE);
else
assert(riov.used == RINGSIZE * USER_MEM/4);
assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED));
assert(wiov.used == 0);
/* Pull data back out (in odd chunks), should be as expected. */
for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) {
err = vringh_iov_pull_user(&riov, buf, 3);
if (err != 3 && i + err != RINGSIZE * USER_MEM/4)
errx(1, "vringh_iov_pull_user large: %i", err);
assert(buf[0] == (char)i);
assert(err < 2 || buf[1] == (char)(i + 1));
assert(err < 3 || buf[2] == (char)(i + 2));
}
assert(riov.i == riov.used);
vringh_iov_cleanup(&riov);
vringh_iov_cleanup(&wiov);
/* Complete using multi interface, just because we can. */
used[0].id = head;
used[0].len = 0;
err = vringh_complete_multi_user(&vrh, used, 1);
if (err)
errx(1, "vringh_complete_multi_user(1): %i", err);
/* Free up those descriptors. */
ret = virtqueue_get_buf(vq, &i);
if (ret != &err)
errx(1, "virtqueue_get_buf: %p", ret);
/* Add lots of descriptors. */
sg_init_table(guest_sg, 1);
sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
for (i = 0; i < RINGSIZE; i++) {
err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (multiple): %i", err);
}
/* Now get many, and consume them all at once. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
for (i = 0; i < RINGSIZE; i++) {
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
used[i].id = head;
used[i].len = 0;
}
/* Make sure it wraps around ring, to test! */
assert(vrh.vring.used->idx % RINGSIZE != 0);
err = vringh_complete_multi_user(&vrh, used, RINGSIZE);
if (err)
errx(1, "vringh_complete_multi_user: %i", err);
/* Free those buffers. */
for (i = 0; i < RINGSIZE; i++) {
unsigned len;
assert(virtqueue_get_buf(vq, &len) != NULL);
}
/* Test weird (but legal!) indirect. */
if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
char *data = __user_addr_max - USER_MEM/4;
struct vring_desc *d = __user_addr_max - USER_MEM/2;
struct vring vring;
/* Force creation of direct, which we modify. */
vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
__user_addr_min,
never_notify_host,
never_callback_guest,
"guest vq");
sg_init_table(guest_sg, 4);
sg_set_buf(&guest_sg[0], d, sizeof(*d)*2);
sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1);
sg_set_buf(&guest_sg[2], data + 6, 4);
sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3);
err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (indirect): %i", err);
vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN);
/* They're used in order, but double-check... */
assert(vring.desc[0].addr == (unsigned long)d);
assert(vring.desc[1].addr == (unsigned long)(d+2));
assert(vring.desc[2].addr == (unsigned long)data + 6);
assert(vring.desc[3].addr == (unsigned long)(d+3));
vring.desc[0].flags |= VRING_DESC_F_INDIRECT;
vring.desc[1].flags |= VRING_DESC_F_INDIRECT;
vring.desc[3].flags |= VRING_DESC_F_INDIRECT;
/* First indirect */
d[0].addr = (unsigned long)data;
d[0].len = 1;
d[0].flags = VRING_DESC_F_NEXT;
d[0].next = 1;
d[1].addr = (unsigned long)data + 1;
d[1].len = 2;
d[1].flags = 0;
/* Second indirect */
d[2].addr = (unsigned long)data + 3;
d[2].len = 3;
d[2].flags = 0;
/* Third indirect */
d[3].addr = (unsigned long)data + 10;
d[3].len = 5;
d[3].flags = VRING_DESC_F_NEXT;
d[3].next = 1;
d[4].addr = (unsigned long)data + 15;
d[4].len = 6;
d[4].flags = VRING_DESC_F_NEXT;
d[4].next = 2;
d[5].addr = (unsigned long)data + 21;
d[5].len = 7;
d[5].flags = 0;
/* Host picks it up (allocates new iov). */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
if (head != 0)
errx(1, "vringh_getdesc_user: head %i not 0", head);
assert(riov.max_num & VRINGH_IOV_ALLOCATED);
if (getrange != getrange_slow)
assert(riov.used == 7);
else
assert(riov.used == 28);
err = vringh_iov_pull_user(&riov, buf, 29);
assert(err == 28);
/* Data should be linear. */
for (i = 0; i < err; i++)
assert(buf[i] == i);
vringh_iov_cleanup(&riov);
}
/* Don't leak memory... */
vring_del_virtqueue(vq);
free(__user_addr_min);
return 0;
}