From e4e8452a4ab304913c4b4242ba06bc4624f14a84 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Apr 2017 13:49:26 -0400 Subject: [PATCH 1/5] virtio-net: napi helper functions Prepare virtio-net for tx napi by converting existing napi code to use helper functions. This also deduplicates some logic. Signed-off-by: Willem de Bruijn Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 65 +++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 666ada6130ab..b9c1df29892c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -239,6 +239,26 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask) return p; } +static void virtqueue_napi_schedule(struct napi_struct *napi, + struct virtqueue *vq) +{ + if (napi_schedule_prep(napi)) { + virtqueue_disable_cb(vq); + __napi_schedule(napi); + } +} + +static void virtqueue_napi_complete(struct napi_struct *napi, + struct virtqueue *vq, int processed) +{ + int opaque; + + opaque = virtqueue_enable_cb_prepare(vq); + if (napi_complete_done(napi, processed) && + unlikely(virtqueue_poll(vq, opaque))) + virtqueue_napi_schedule(napi, vq); +} + static void skb_xmit_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; @@ -936,27 +956,20 @@ static void skb_recv_done(struct virtqueue *rvq) struct virtnet_info *vi = rvq->vdev->priv; struct receive_queue *rq = &vi->rq[vq2rxq(rvq)]; - /* Schedule NAPI, Suppress further interrupts if successful. */ - if (napi_schedule_prep(&rq->napi)) { - virtqueue_disable_cb(rvq); - __napi_schedule(&rq->napi); - } + virtqueue_napi_schedule(&rq->napi, rvq); } -static void virtnet_napi_enable(struct receive_queue *rq) +static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) { - napi_enable(&rq->napi); + napi_enable(napi); /* If all buffers were filled by other side before we napi_enabled, we - * won't get another interrupt, so process any outstanding packets - * now. virtnet_poll wants re-enable the queue, so we disable here. - * We synchronize against interrupts via NAPI_STATE_SCHED */ - if (napi_schedule_prep(&rq->napi)) { - virtqueue_disable_cb(rq->vq); - local_bh_disable(); - __napi_schedule(&rq->napi); - local_bh_enable(); - } + * won't get another interrupt, so process any outstanding packets now. + * Call local_bh_enable after to trigger softIRQ processing. + */ + local_bh_disable(); + virtqueue_napi_schedule(napi, vq); + local_bh_enable(); } static void refill_work(struct work_struct *work) @@ -971,7 +984,7 @@ static void refill_work(struct work_struct *work) napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); - virtnet_napi_enable(rq); + virtnet_napi_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. @@ -1011,21 +1024,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget) { struct receive_queue *rq = container_of(napi, struct receive_queue, napi); - unsigned int r, received; + unsigned int received; received = virtnet_receive(rq, budget); /* Out of packets? */ - if (received < budget) { - r = virtqueue_enable_cb_prepare(rq->vq); - if (napi_complete_done(napi, received)) { - if (unlikely(virtqueue_poll(rq->vq, r)) && - napi_schedule_prep(napi)) { - virtqueue_disable_cb(rq->vq); - __napi_schedule(napi); - } - } - } + if (received < budget) + virtqueue_napi_complete(napi, rq->vq, received); return received; } @@ -1040,7 +1045,7 @@ static int virtnet_open(struct net_device *dev) /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); - virtnet_napi_enable(&vi->rq[i]); + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); } return 0; @@ -1747,7 +1752,7 @@ static int virtnet_restore_up(struct virtio_device *vdev) schedule_delayed_work(&vi->refill, 0); for (i = 0; i < vi->max_queue_pairs; i++) - virtnet_napi_enable(&vi->rq[i]); + virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); } netif_device_attach(vi->dev); From b92f1e6751a6aaaf0b1e05128661ce0f23ed3695 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Apr 2017 13:49:27 -0400 Subject: [PATCH 2/5] virtio-net: transmit napi Convert virtio-net to a standard napi tx completion path. This enables better TCP pacing using TCP small queues and increases single stream throughput. The virtio-net driver currently cleans tx descriptors on transmission of new packets in ndo_start_xmit. Latency depends on new traffic, so is unbounded. To avoid deadlock when a socket reaches its snd limit, packets are orphaned on tranmission. This breaks socket backpressure, including TSQ. Napi increases the number of interrupts generated compared to the current model, which keeps interrupts disabled as long as the ring has enough free descriptors. Keep tx napi optional and disabled for now. Follow-on patches will reduce the interrupt cost. Signed-off-by: Willem de Bruijn Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 76 +++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b9c1df29892c..356d18481ee4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -33,9 +33,10 @@ static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); -static bool csum = true, gso = true; +static bool csum = true, gso = true, napi_tx; module_param(csum, bool, 0444); module_param(gso, bool, 0444); +module_param(napi_tx, bool, 0644); /* FIXME: MTU in config. */ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) @@ -86,6 +87,8 @@ struct send_queue { /* Name of the send queue: output.$index */ char name[40]; + + struct napi_struct napi; }; /* Internal representation of a receive virtqueue */ @@ -262,12 +265,16 @@ static void virtqueue_napi_complete(struct napi_struct *napi, static void skb_xmit_done(struct virtqueue *vq) { struct virtnet_info *vi = vq->vdev->priv; + struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi; /* Suppress further interrupts. */ virtqueue_disable_cb(vq); - /* We were probably waiting for more output buffers. */ - netif_wake_subqueue(vi->dev, vq2txq(vq)); + if (napi->weight) + virtqueue_napi_schedule(napi, vq); + else + /* We were probably waiting for more output buffers. */ + netif_wake_subqueue(vi->dev, vq2txq(vq)); } static unsigned int mergeable_ctx_to_buf_truesize(unsigned long mrg_ctx) @@ -972,6 +979,24 @@ static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi) local_bh_enable(); } +static void virtnet_napi_tx_enable(struct virtnet_info *vi, + struct virtqueue *vq, + struct napi_struct *napi) +{ + if (!napi->weight) + return; + + /* Tx napi touches cachelines on the cpu handling tx interrupts. Only + * enable the feature if this is likely affine with the transmit path. + */ + if (!vi->affinity_hint_set) { + napi->weight = 0; + return; + } + + return virtnet_napi_enable(vq, napi); +} + static void refill_work(struct work_struct *work) { struct virtnet_info *vi = @@ -1046,6 +1071,7 @@ static int virtnet_open(struct net_device *dev) if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); + virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); } return 0; @@ -1081,6 +1107,24 @@ static void free_old_xmit_skbs(struct send_queue *sq) u64_stats_update_end(&stats->tx_syncp); } +static int virtnet_poll_tx(struct napi_struct *napi, int budget) +{ + struct send_queue *sq = container_of(napi, struct send_queue, napi); + struct virtnet_info *vi = sq->vq->vdev->priv; + struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq)); + + __netif_tx_lock(txq, raw_smp_processor_id()); + free_old_xmit_skbs(sq); + __netif_tx_unlock(txq); + + virtqueue_napi_complete(napi, sq->vq, 0); + + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) + netif_tx_wake_queue(txq); + + return 0; +} + static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) { struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -1130,6 +1174,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); bool kick = !skb->xmit_more; + bool use_napi = sq->napi.weight; /* Free up any pending old buffers before queueing new ones. */ free_old_xmit_skbs(sq); @@ -1152,8 +1197,10 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) } /* Don't wait up for transmitted skbs to be freed. */ - skb_orphan(skb); - nf_reset(skb); + if (!use_napi) { + skb_orphan(skb); + nf_reset(skb); + } /* If running out of space, stop queue to avoid getting packets that we * are then unable to transmit. @@ -1167,7 +1214,8 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) */ if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { netif_stop_subqueue(dev, qnum); - if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { + if (!use_napi && + unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ free_old_xmit_skbs(sq); if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { @@ -1371,8 +1419,10 @@ static int virtnet_close(struct net_device *dev) /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { napi_disable(&vi->rq[i].napi); + napi_disable(&vi->sq[i].napi); + } return 0; } @@ -1727,8 +1777,10 @@ static void virtnet_freeze_down(struct virtio_device *vdev) cancel_delayed_work_sync(&vi->refill); if (netif_running(vi->dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { napi_disable(&vi->rq[i].napi); + napi_disable(&vi->sq[i].napi); + } } } @@ -1751,8 +1803,11 @@ static int virtnet_restore_up(struct virtio_device *vdev) if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); + virtnet_napi_tx_enable(vi, vi->sq[i].vq, + &vi->sq[i].napi); + } } netif_device_attach(vi->dev); @@ -1957,6 +2012,7 @@ static void virtnet_free_queues(struct virtnet_info *vi) for (i = 0; i < vi->max_queue_pairs; i++) { napi_hash_del(&vi->rq[i].napi); netif_napi_del(&vi->rq[i].napi); + netif_napi_del(&vi->sq[i].napi); } /* We called napi_hash_del() before netif_napi_del(), @@ -2142,6 +2198,8 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) vi->rq[i].pages = NULL; netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll, napi_weight); + netif_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, + napi_tx ? napi_weight : 0); sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); From ea7735d97ba9064c448664429e249991ccd8aa77 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Apr 2017 13:49:28 -0400 Subject: [PATCH 3/5] virtio-net: move free_old_xmit_skbs An upcoming patch will call free_old_xmit_skbs indirectly from virtnet_poll. Move the function above this to avoid having to introduce a forward declaration. This is a pure move: no code changes. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 356d18481ee4..4ec79e5d7a86 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1045,6 +1045,36 @@ static int virtnet_receive(struct receive_queue *rq, int budget) return received; } +static void free_old_xmit_skbs(struct send_queue *sq) +{ + struct sk_buff *skb; + unsigned int len; + struct virtnet_info *vi = sq->vq->vdev->priv; + struct virtnet_stats *stats = this_cpu_ptr(vi->stats); + unsigned int packets = 0; + unsigned int bytes = 0; + + while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) { + pr_debug("Sent skb %p\n", skb); + + bytes += skb->len; + packets++; + + dev_kfree_skb_any(skb); + } + + /* Avoid overhead when no packets have been processed + * happens when called speculatively from start_xmit. + */ + if (!packets) + return; + + u64_stats_update_begin(&stats->tx_syncp); + stats->tx_bytes += bytes; + stats->tx_packets += packets; + u64_stats_update_end(&stats->tx_syncp); +} + static int virtnet_poll(struct napi_struct *napi, int budget) { struct receive_queue *rq = @@ -1077,36 +1107,6 @@ static int virtnet_open(struct net_device *dev) return 0; } -static void free_old_xmit_skbs(struct send_queue *sq) -{ - struct sk_buff *skb; - unsigned int len; - struct virtnet_info *vi = sq->vq->vdev->priv; - struct virtnet_stats *stats = this_cpu_ptr(vi->stats); - unsigned int packets = 0; - unsigned int bytes = 0; - - while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) { - pr_debug("Sent skb %p\n", skb); - - bytes += skb->len; - packets++; - - dev_kfree_skb_any(skb); - } - - /* Avoid overhead when no packets have been processed - * happens when called speculatively from start_xmit. - */ - if (!packets) - return; - - u64_stats_update_begin(&stats->tx_syncp); - stats->tx_bytes += bytes; - stats->tx_packets += packets; - u64_stats_update_end(&stats->tx_syncp); -} - static int virtnet_poll_tx(struct napi_struct *napi, int budget) { struct send_queue *sq = container_of(napi, struct send_queue, napi); From 7b0411ef4aa69c9256d6a2c289d0a2b320414633 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Apr 2017 13:49:29 -0400 Subject: [PATCH 4/5] virtio-net: clean tx descriptors from rx napi Amortize the cost of virtual interrupts by doing both rx and tx work on reception of a receive interrupt if tx napi is enabled. With VIRTIO_F_EVENT_IDX, this suppresses most explicit tx completion interrupts for bidirectional workloads. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4ec79e5d7a86..9dd978f34c1f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1075,12 +1075,33 @@ static void free_old_xmit_skbs(struct send_queue *sq) u64_stats_update_end(&stats->tx_syncp); } +static void virtnet_poll_cleantx(struct receive_queue *rq) +{ + struct virtnet_info *vi = rq->vq->vdev->priv; + unsigned int index = vq2rxq(rq->vq); + struct send_queue *sq = &vi->sq[index]; + struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index); + + if (!sq->napi.weight) + return; + + if (__netif_tx_trylock(txq)) { + free_old_xmit_skbs(sq); + __netif_tx_unlock(txq); + } + + if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) + netif_tx_wake_queue(txq); +} + static int virtnet_poll(struct napi_struct *napi, int budget) { struct receive_queue *rq = container_of(napi, struct receive_queue, napi); unsigned int received; + virtnet_poll_cleantx(rq); + received = virtnet_receive(rq, budget); /* Out of packets? */ From bdb12e0d2ffc84a0248788cdf6cfbff86ee34602 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Apr 2017 13:49:30 -0400 Subject: [PATCH 5/5] virtio-net: keep tx interrupts disabled unless kick Tx napi mode increases the rate of transmit interrupts. Suppress some by masking interrupts while more packets are expected. The interrupts will be reenabled before the last packet is sent. This optimization reduces the througput drop with tx napi for unidirectional flows such as UDP_STREAM that do not benefit from cleaning tx completions in the the receive napi handler. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 9dd978f34c1f..003143835766 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1200,6 +1200,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Free up any pending old buffers before queueing new ones. */ free_old_xmit_skbs(sq); + if (use_napi && kick) + virtqueue_enable_cb_delayed(sq->vq); + /* timestamp packet in software */ skb_tx_timestamp(skb);