virtio-net: do not reset during XDP set

We currently reset the device during XDP set; the main reason is
that we allocate more headroom with XDP enabled (for header adjustment).

This works but causes network downtime for users.

Previous patches encoded the headroom in the buffer context. This
makes it possible to detect the case where a buffer with headroom
insufficient for XDP was added to the queue before XDP was enabled.

Upon detection, we handle this case by copying the packet
(slow, but it's a temporary condition).
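
In code terms, the detection and copy boil down to the following (a
condensed sketch of the new receive_small() path from the diff below;
declarations and error handling trimmed):

	/* headroom was encoded in the buffer context when the buffer was posted */
	unsigned int xdp_headroom = (unsigned long)ctx;

	if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
		/* old buffer without enough XDP headroom: copy the packet
		 * into a freshly allocated page that has room, then run
		 * XDP on the copy
		 */
		xdp_page = xdp_linearize_page(rq, &num_buf, page, offset,
					      header_offset, &tlen);
		if (!xdp_page)
			goto err_xdp;
		buf = page_address(xdp_page);
		put_page(page);
		page = xdp_page;
	}

Once the queue has been refilled with buffers that carry enough
headroom, this path is no longer taken, so at most a queue's worth of
packets goes through the copy.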

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Jason Wang, 2017-07-19 16:54:48 +08:00; committed by David S. Miller
parent 192f68cf35
commit 4941d472bf


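For orientation, the reworked virtnet_xdp_set() in the last hunk below
reduces to roughly this sequence (condensed; the error-unwind path and
queue-pair bookkeeping are abbreviated):

	/* quiesce RX instead of resetting the whole device */
	for (i = 0; i < vi->max_queue_pairs; i++)
		napi_disable(&vi->rq[i].napi);

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	vi->xdp_queue_pairs = xdp_qp;

	/* swap in the new program and bring RX back up, queue by queue */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (old_prog)
			bpf_prog_put(old_prog);
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}
	return 0;
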
@@ -407,121 +407,28 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}
static struct sk_buff *receive_small(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
void *buf, void *ctx,
unsigned int len)
{
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
unsigned int xdp_headroom = virtnet_get_headroom(vi);
unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
unsigned int headroom = vi->hdr_len + header_offset;
unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
unsigned int delta = 0;
len -= vi->hdr_len;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
struct xdp_buff xdp;
void *orig_data;
u32 act;
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_PASS:
/* Recalculate length in case bpf program changed it */
delta = orig_data - xdp.data;
break;
case XDP_TX:
if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
trace_xdp_exception(vi->dev, xdp_prog, act);
rcu_read_unlock();
goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED:
trace_xdp_exception(vi->dev, xdp_prog, act);
case XDP_DROP:
goto err_xdp;
}
}
rcu_read_unlock();
skb = build_skb(buf, buflen);
if (!skb) {
put_page(virt_to_head_page(buf));
goto err;
}
skb_reserve(skb, headroom - delta);
skb_put(skb, len + delta);
if (!delta) {
buf += header_offset;
memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
} /* keep zeroed vnet hdr since packet was changed by bpf */
err:
return skb;
err_xdp:
rcu_read_unlock();
dev->stats.rx_dropped++;
put_page(virt_to_head_page(buf));
xdp_xmit:
return NULL;
}
static struct sk_buff *receive_big(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
void *buf,
unsigned int len)
{
struct page *page = buf;
struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
if (unlikely(!skb))
goto err;
return skb;
err:
dev->stats.rx_dropped++;
give_pages(rq, page);
return NULL;
}
/* The conditions to enable XDP should preclude the underlying device from
* sending packets across multiple buffers (num_buf > 1). However per spec
* it does not appear to be illegal to do so but rather just against convention.
* So in order to avoid making a system unresponsive the packets are pushed
* into a page and the XDP program is run. This will be extremely slow and we
* push a warning to the user to fix this as soon as possible. Fixing this may
* require resolving the underlying hardware to determine why multiple buffers
* are being received or simply loading the XDP program in the ingress stack
* after the skb is built because there is no advantage to running it here
* anymore.
*/
/* We copy the packet for XDP in the following cases:
*
* 1) Packet is scattered across multiple rx buffers.
* 2) Headroom space is insufficient.
*
* This is inefficient but it's a temporary condition that
* we hit right after XDP is enabled and until queue is refilled
* with large buffers with sufficient headroom - so it should affect
* at most queue size packets.
* Afterwards, the conditions to enable
* XDP should preclude the underlying device from sending packets
* across multiple buffers (num_buf > 1), and we make sure buffers
* have enough headroom.
*/
static struct page *xdp_linearize_page(struct receive_queue *rq,
u16 *num_buf,
struct page *p,
int offset,
int page_off,
unsigned int *len)
{
struct page *page = alloc_page(GFP_ATOMIC);
unsigned int page_off = VIRTIO_XDP_HEADROOM;
if (!page)
return NULL;
@@ -563,6 +470,125 @@ err_buf:
return NULL;
}
static struct sk_buff *receive_small(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
void *buf, void *ctx,
unsigned int len)
{
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
unsigned int xdp_headroom = (unsigned long)ctx;
unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
unsigned int headroom = vi->hdr_len + header_offset;
unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
struct page *page = virt_to_head_page(buf);
unsigned int delta = 0;
struct page *xdp_page;
len -= vi->hdr_len;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
struct xdp_buff xdp;
void *orig_data;
u32 act;
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
int offset = buf - page_address(page) + header_offset;
unsigned int tlen = len + vi->hdr_len;
u16 num_buf = 1;
xdp_headroom = virtnet_get_headroom(vi);
header_offset = VIRTNET_RX_PAD + xdp_headroom;
headroom = vi->hdr_len + header_offset;
buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
xdp_page = xdp_linearize_page(rq, &num_buf, page,
offset, header_offset,
&tlen);
if (!xdp_page)
goto err_xdp;
buf = page_address(xdp_page);
put_page(page);
page = xdp_page;
}
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
xdp.data = xdp.data_hard_start + xdp_headroom;
xdp.data_end = xdp.data + len;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_PASS:
/* Recalculate length in case bpf program changed it */
delta = orig_data - xdp.data;
break;
case XDP_TX:
if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
trace_xdp_exception(vi->dev, xdp_prog, act);
rcu_read_unlock();
goto xdp_xmit;
default:
bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED:
trace_xdp_exception(vi->dev, xdp_prog, act);
case XDP_DROP:
goto err_xdp;
}
}
rcu_read_unlock();
skb = build_skb(buf, buflen);
if (!skb) {
put_page(page);
goto err;
}
skb_reserve(skb, headroom - delta);
skb_put(skb, len + delta);
if (!delta) {
buf += header_offset;
memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
} /* keep zeroed vnet hdr since packet was changed by bpf */
err:
return skb;
err_xdp:
rcu_read_unlock();
dev->stats.rx_dropped++;
put_page(page);
xdp_xmit:
return NULL;
}
static struct sk_buff *receive_big(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
void *buf,
unsigned int len)
{
struct page *page = buf;
struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
if (unlikely(!skb))
goto err;
return skb;
err:
dev->stats.rx_dropped++;
give_pages(rq, page);
return NULL;
}
static struct sk_buff *receive_mergeable(struct net_device *dev,
struct virtnet_info *vi,
struct receive_queue *rq,
@@ -577,6 +603,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
struct sk_buff *head_skb, *curr_skb;
struct bpf_prog *xdp_prog;
unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx);
head_skb = NULL;
@@ -589,10 +616,13 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
u32 act;
/* This happens when rx buffer size is underestimated */
if (unlikely(num_buf > 1)) {
if (unlikely(num_buf > 1 ||
headroom < virtnet_get_headroom(vi))) {
/* linearize data for XDP */
xdp_page = xdp_linearize_page(rq, &num_buf,
page, offset, &len);
page, offset,
VIRTIO_XDP_HEADROOM,
&len);
if (!xdp_page)
goto err_xdp;
offset = VIRTIO_XDP_HEADROOM;
@@ -835,7 +865,6 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
if (err < 0)
put_page(virt_to_head_page(buf));
return err;
}
@@ -1840,7 +1869,6 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
}
static int init_vqs(struct virtnet_info *vi);
static void _remove_vq_common(struct virtnet_info *vi);
static int virtnet_restore_up(struct virtio_device *vdev)
{
@@ -1869,39 +1897,6 @@ static int virtnet_restore_up(struct virtio_device *vdev)
return err;
}
static int virtnet_reset(struct virtnet_info *vi, int curr_qp, int xdp_qp)
{
struct virtio_device *dev = vi->vdev;
int ret;
virtio_config_disable(dev);
dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
virtnet_freeze_down(dev);
_remove_vq_common(vi);
virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
ret = virtio_finalize_features(dev);
if (ret)
goto err;
vi->xdp_queue_pairs = xdp_qp;
ret = virtnet_restore_up(dev);
if (ret)
goto err;
ret = _virtnet_set_queues(vi, curr_qp);
if (ret)
goto err;
virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
virtio_config_enable(dev);
return 0;
err:
virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
return ret;
}
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -1948,35 +1943,29 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
return PTR_ERR(prog);
}
/* Changing the headroom in buffers is a disruptive operation because
* existing buffers must be flushed and reallocated. This will happen
* when a xdp program is initially added or xdp is disabled by removing
* the xdp program resulting in number of XDP queues changing.
*/
if (vi->xdp_queue_pairs != xdp_qp) {
err = virtnet_reset(vi, curr_qp + xdp_qp, xdp_qp);
if (err) {
dev_warn(&dev->dev, "XDP reset failure.\n");
goto virtio_reset_err;
}
}
/* Make sure NAPI is not using any XDP TX queues for RX. */
for (i = 0; i < vi->max_queue_pairs; i++)
napi_disable(&vi->rq[i].napi);
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
if (err)
goto err;
vi->xdp_queue_pairs = xdp_qp;
for (i = 0; i < vi->max_queue_pairs; i++) {
old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
}
return 0;
virtio_reset_err:
/* On reset error do our best to unwind XDP changes inflight and return
* error up to user space for resolution. The underlying reset hung on
* us so not much we can do here.
*/
err:
for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
if (prog)
bpf_prog_sub(prog, vi->max_queue_pairs - 1);
return err;
@@ -2622,15 +2611,6 @@ free:
return err;
}
static void _remove_vq_common(struct virtnet_info *vi)
{
vi->vdev->config->reset(vi->vdev);
free_unused_bufs(vi);
_free_receive_bufs(vi);
free_receive_page_frags(vi);
virtnet_del_vqs(vi);
}
static void remove_vq_common(struct virtnet_info *vi)
{
vi->vdev->config->reset(vi->vdev);