diff --git a/drivers/net/tun.c b/drivers/net/tun.c index bdbb526eca7b..2762c55aeb66 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -109,6 +109,12 @@ struct tap_filter { unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; +/* 1024 is probably a high enough limit: modern hypervisors seem to support on + * the order of 100-200 CPUs so this leaves us some breathing space if we want + * to match a queue per guest CPU. + */ +#define MAX_TAP_QUEUES 1024 + /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related strctures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and @@ -129,6 +135,7 @@ struct tun_file { struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; + u16 queue_index; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist @@ -136,7 +143,8 @@ struct tun_file { * file were attached to a persist device. */ struct tun_struct { - struct tun_file __rcu *tfile; + struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; + unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; @@ -157,58 +165,159 @@ struct tun_struct { #endif }; +/* We try to identify a flow through its rxhash first. The reason that + * we do not check rxq no. is because some cards (e.g. 82599) choose + * the rxq based on the txq where the last packet of the flow comes. As + * the userspace application moves between processors, we may get a + * different rxq no. here. If we could not get rxhash, then we would + * hope the rxq no. may help here. 
+ */ +static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) +{ + struct tun_struct *tun = netdev_priv(dev); + u32 txq = 0; + u32 numqueues = 0; + + rcu_read_lock(); + numqueues = tun->numqueues; + + txq = skb_get_rxhash(skb); + if (txq) { + /* use multiply and shift instead of expensive divide */ + txq = ((u64)txq * numqueues) >> 32; + } else if (likely(skb_rx_queue_recorded(skb))) { + txq = skb_get_rx_queue(skb); + while (unlikely(txq >= numqueues)) + txq -= numqueues; + } + + rcu_read_unlock(); + return txq; +} + +static void tun_set_real_num_queues(struct tun_struct *tun) +{ + netif_set_real_num_tx_queues(tun->dev, tun->numqueues); + netif_set_real_num_rx_queues(tun->dev, tun->numqueues); +} + +static void __tun_detach(struct tun_file *tfile, bool clean) +{ + struct tun_file *ntfile; + struct tun_struct *tun; + struct net_device *dev; + + tun = rcu_dereference_protected(tfile->tun, + lockdep_rtnl_is_held()); + if (tun) { + u16 index = tfile->queue_index; + BUG_ON(index >= tun->numqueues); + dev = tun->dev; + + rcu_assign_pointer(tun->tfiles[index], + tun->tfiles[tun->numqueues - 1]); + rcu_assign_pointer(tfile->tun, NULL); + ntfile = rcu_dereference_protected(tun->tfiles[index], + lockdep_rtnl_is_held()); + ntfile->queue_index = index; + + --tun->numqueues; + sock_put(&tfile->sk); + + synchronize_net(); + /* Drop read queue */ + skb_queue_purge(&tfile->sk.sk_receive_queue); + tun_set_real_num_queues(tun); + + if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST)) + if (dev->reg_state == NETREG_REGISTERED) + unregister_netdevice(dev); + } + + if (clean) { + BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, + &tfile->socket.flags)); + sk_release_kernel(&tfile->sk); + } +} + +static void tun_detach(struct tun_file *tfile, bool clean) +{ + rtnl_lock(); + __tun_detach(tfile, clean); + rtnl_unlock(); +} + +static void tun_detach_all(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile; + int i, n = 
tun->numqueues; + + for (i = 0; i < n; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + lockdep_rtnl_is_held()); + BUG_ON(!tfile); + wake_up_all(&tfile->wq.wait); + rcu_assign_pointer(tfile->tun, NULL); + --tun->numqueues; + } + BUG_ON(tun->numqueues != 0); + + synchronize_net(); + for (i = 0; i < n; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + lockdep_rtnl_is_held()); + /* Drop read queue */ + skb_queue_purge(&tfile->sk.sk_receive_queue); + sock_put(&tfile->sk); + } +} + static int tun_attach(struct tun_struct *tun, struct file *file) { struct tun_file *tfile = file->private_data; int err; - ASSERT_RTNL(); - - netif_tx_lock_bh(tun->dev); - err = -EINVAL; - if (tfile->tun) + if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held())) goto out; err = -EBUSY; - if (tun->tfile) + if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1) + goto out; + + err = -E2BIG; + if (tun->numqueues == MAX_TAP_QUEUES) goto out; err = 0; - /* Re-attach filter when attaching to a persist device */ + /* Re-attach the filter to a persist device */ if (tun->filter_attached == true) { err = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (!err) goto out; } + tfile->queue_index = tun->numqueues; rcu_assign_pointer(tfile->tun, tun); - tfile->socket.sk->sk_sndbuf = tun->sndbuf; - rcu_assign_pointer(tun->tfile, tfile); - netif_carrier_on(tun->dev); + rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); sock_hold(&tfile->sk); + tun->numqueues++; + + tun_set_real_num_queues(tun); + + if (tun->numqueues == 1) + netif_carrier_on(tun->dev); + + /* device is allowed to go away first, so no need to hold extra + * refcnt. 
+ */ out: - netif_tx_unlock_bh(tun->dev); return err; } -static void __tun_detach(struct tun_struct *tun) -{ - struct tun_file *tfile = rcu_dereference_protected(tun->tfile, - lockdep_rtnl_is_held()); - /* Detach from net device */ - netif_carrier_off(tun->dev); - rcu_assign_pointer(tun->tfile, NULL); - if (tfile) { - rcu_assign_pointer(tfile->tun, NULL); - - synchronize_net(); - /* Drop read queue */ - skb_queue_purge(&tfile->socket.sk->sk_receive_queue); - } -} - static struct tun_struct *__tun_get(struct tun_file *tfile) { struct tun_struct *tun; @@ -349,30 +458,20 @@ static const struct ethtool_ops tun_ethtool_ops; /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { - struct tun_struct *tun = netdev_priv(dev); - struct tun_file *tfile = rcu_dereference_protected(tun->tfile, - lockdep_rtnl_is_held()); - - /* Inform the methods they need to stop using the dev. - */ - if (tfile) { - wake_up_all(&tfile->wq.wait); - __tun_detach(tun); - synchronize_net(); - } + tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { - netif_start_queue(dev); + netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { - netif_stop_queue(dev); + netif_tx_stop_all_queues(dev); return 0; } @@ -380,16 +479,20 @@ static int tun_net_close(struct net_device *dev) static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + int txq = skb->queue_mapping; struct tun_file *tfile; rcu_read_lock(); - tfile = rcu_dereference(tun->tfile); + tfile = rcu_dereference(tun->tfiles[txq]); + /* Drop packet if interface is not attached */ - if (!tfile) + if (txq >= tun->numqueues) goto drop; tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); + BUG_ON(!tfile); + /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. 
*/ @@ -400,12 +503,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; + /* Limit the number of packets queued by dividing txq length by the + * number of queues. + */ if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) - >= dev->tx_queue_len) { + >= dev->tx_queue_len / tun->numqueues){ if (!(tun->flags & TUN_ONE_QUEUE)) { /* Normal queueing mode. */ /* Packet scheduler handles dropping of further packets. */ - netif_stop_queue(dev); + netif_stop_subqueue(dev, txq); /* We won't see all dropped packets individually, so overrun * error is more appropriate. */ @@ -494,6 +600,7 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_start_xmit = tun_net_xmit, .ndo_change_mtu = tun_net_change_mtu, .ndo_fix_features = tun_net_fix_features, + .ndo_select_queue = tun_select_queue, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif @@ -509,6 +616,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, + .ndo_select_queue = tun_select_queue, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif @@ -550,7 +658,7 @@ static void tun_net_init(struct net_device *dev) /* Character device part */ /* Poll */ -static unsigned int tun_chr_poll(struct file *file, poll_table * wait) +static unsigned int tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = __tun_get(tfile); @@ -995,7 +1103,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, schedule(); continue; } - netif_wake_queue(tun->dev); + netif_wake_subqueue(tun->dev, tfile->queue_index); ret = tun_put_user(tun, tfile, skb, iv, len); kfree_skb(skb); @@ -1156,6 +1264,9 @@ static int tun_flags(struct tun_struct *tun) if (tun->flags & TUN_VNET_HDR) flags |= IFF_VNET_HDR; + if 
(tun->flags & TUN_TAP_MQ) + flags |= IFF_MULTI_QUEUE; + return flags; } @@ -1247,8 +1358,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (*ifr->ifr_name) name = ifr->ifr_name; - dev = alloc_netdev(sizeof(struct tun_struct), name, - tun_setup); + dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, + tun_setup, + MAX_TAP_QUEUES, MAX_TAP_QUEUES); if (!dev) return -ENOMEM; @@ -1283,7 +1395,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err = tun_attach(tun, file); if (err < 0) - goto failed; + goto err_free_dev; } tun_debug(KERN_INFO, tun, "tun_set_iff\n"); @@ -1303,18 +1415,22 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else tun->flags &= ~TUN_VNET_HDR; + if (ifr->ifr_flags & IFF_MULTI_QUEUE) + tun->flags |= TUN_TAP_MQ; + else + tun->flags &= ~TUN_TAP_MQ; + /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) - netif_wake_queue(tun->dev); + netif_tx_wake_all_queues(tun->dev); strcpy(ifr->ifr_name, tun->dev->name); return 0; err_free_dev: free_netdev(dev); - failed: return err; } @@ -1369,6 +1485,51 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) return 0; } +static void tun_detach_filter(struct tun_struct *tun, int n) +{ + int i; + struct tun_file *tfile; + + for (i = 0; i < n; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + lockdep_rtnl_is_held()); + sk_detach_filter(tfile->socket.sk); + } + + tun->filter_attached = false; +} + +static int tun_attach_filter(struct tun_struct *tun) +{ + int i, ret = 0; + struct tun_file *tfile; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + lockdep_rtnl_is_held()); + ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); + if (ret) { + tun_detach_filter(tun, i); + return ret; + } + } + + tun->filter_attached = true; + return ret; +} + +static void tun_set_sndbuf(struct tun_struct *tun) +{ + 
struct tun_file *tfile; + int i; + + for (i = 0; i < tun->numqueues; i++) { + tfile = rcu_dereference_protected(tun->tfiles[i], + lockdep_rtnl_is_held()); + tfile->socket.sk->sk_sndbuf = tun->sndbuf; + } +} + static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { @@ -1397,6 +1558,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, (unsigned int __user*)argp); } + ret = 0; rtnl_lock(); tun = __tun_get(tfile); @@ -1537,7 +1699,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, break; } - tun->sndbuf = tfile->socket.sk->sk_sndbuf = sndbuf; + tun->sndbuf = sndbuf; + tun_set_sndbuf(tun); break; case TUNGETVNETHDRSZ: @@ -1568,9 +1731,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; - ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); - if (!ret) - tun->filter_attached = true; + ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: @@ -1578,9 +1739,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) break; - ret = sk_detach_filter(tfile->socket.sk); - if (!ret) - tun->filter_attached = false; + ret = 0; + tun_detach_filter(tun, tun->numqueues); break; default: @@ -1685,37 +1845,9 @@ static int tun_chr_open(struct inode *inode, struct file * file) static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; - struct tun_struct *tun; struct net *net = tfile->net; - rtnl_lock(); - - tun = rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held()); - if (tun) { - struct net_device *dev = tun->dev; - - tun_debug(KERN_INFO, tun, "tun_chr_close\n"); - - __tun_detach(tun); - - synchronize_net(); - - /* If desirable, unregister the netdevice. 
*/ - if (!(tun->flags & TUN_PERSIST)) { - if (dev->reg_state == NETREG_REGISTERED) - unregister_netdevice(dev); - } - - /* drop the reference that netdevice holds */ - sock_put(&tfile->sk); - } - - rtnl_unlock(); - - /* drop the reference that file holds */ - BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, - &tfile->socket.flags)); - sk_release_kernel(&tfile->sk); + tun_detach(tfile, true); put_net(net); return 0;