From 871b642adebe300be2e50aa5f65a418510f636ec Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:37 +0100 Subject: [PATCH 1/5] netdev: introduce ndo_set_rx_headroom This method allows the controlling device (i.e. the bridge) to specify additional headroom to be allocated for skb head on frame reception. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e52077ffe5ed..efe7cec111fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1093,6 +1093,12 @@ struct tc_to_netdev { * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. + * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); + * This function is used to specify the headroom that the skb must + * consider when allocation skb during packet reception. Setting + * appropriate rx headroom value allows avoiding skb head copy on + * forward. Setting a negative value reset the rx headroom to the + * default value. * */ struct net_device_ops { @@ -1278,6 +1284,8 @@ struct net_device_ops { bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); + void (*ndo_set_rx_headroom)(struct net_device *dev, + int needed_headroom); }; /** @@ -1315,6 +1323,8 @@ struct net_device_ops { * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured + * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external + * entity (i.e. the master device for bridged veth) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1343,6 +1353,7 @@ enum netdev_priv_flags { IFF_L3MDEV_SLAVE = 1<<23, IFF_TEAM = 1<<24, IFF_RXFH_CONFIGURED = 1<<25, + IFF_PHONY_HEADROOM = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); +/* returns the headroom that the master device needs to take in account + * when forwarding to this dev + */ +static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) +{ + return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom; +} + +static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) +{ + if (dev->netdev_ops->ndo_set_rx_headroom) + dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); +} + +/* set the device rx headroom to the dev's default */ +static inline void netdev_reset_rx_headroom(struct net_device *dev) +{ + netdev_set_rx_headroom(dev, -1); +} + /* * Net namespace inlines */ From 45493d47c3db862263583400c1f721f13f2a0610 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:38 +0100 Subject: [PATCH 2/5] bridge: notify enslaved devices of headroom changes On bridge needed_headroom changes, the enslaved devices are notified via the ndo_set_rx_headroom method Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/bridge/br_if.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b37a1cc97d98..a73df3315df9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head) destroy_nbp(p); } +static unsigned get_max_headroom(struct net_bridge *br) +{ + unsigned max_headroom = 0; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); + + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + + return max_headroom; +} + +static void update_headroom(struct net_bridge *br, int new_hr) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) + netdev_set_rx_headroom(p->dev, new_hr); + + br->dev->needed_headroom = new_hr; +} + /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. @@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p) br_ifinfo_notify(RTM_DELLINK, p); list_del_rcu(&p->list); + if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) + update_headroom(br, get_max_headroom(br)); + netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); @@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int err = 0; + unsigned br_hr, dev_hr; bool changed_addr; /* Don't allow bridging non-ethernet like devices, or DSA-enabled @@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) netdev_update_features(br->dev); - if (br->dev->needed_headroom < dev->needed_headroom) - br->dev->needed_headroom = dev->needed_headroom; + br_hr = br->dev->needed_headroom; + dev_hr = netdev_get_fwd_headroom(dev); + if (br_hr < dev_hr) + update_headroom(br, dev_hr); + else + netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); From 3a927bc7cf9d0fbe8f4a8189dd5f8440228f64e7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:39 +0100 Subject: [PATCH 3/5] ovs: propagate per dp max headroom to all vports This patch implements bookkeeping support to compute the maximum headroom for all the devices in each datapath. When said value changes, the underlying devs are notified via the ndo_set_rx_headroom method. This also increases the internal vports xmit performance. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 40 ++++++++++++++++++++++++++++ net/openvswitch/datapath.h | 4 +++ net/openvswitch/vport-internal_dev.c | 10 ++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c4e8455d5d56..e6a7d494df24 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net, return ERR_PTR(-EINVAL); } +/* Called with ovs_mutex */ +static void update_headroom(struct datapath *dp) +{ + unsigned dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + dp->max_headroom = max_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + netdev_set_rx_headroom(vport->dev, max_headroom); +} + static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1973,6 +1996,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); + + if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) + update_headroom(dp); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + BUG_ON(err < 0); ovs_unlock(); @@ -2039,8 +2068,10 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool must_update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; int err; @@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL); BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + must_update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); + + if (must_update_headroom) + update_headroom(dp); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 67bdecd9fdc1..427e39a045cf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -68,6 +68,8 @@ struct dp_stats_percpu { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @max_headroom: the maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -89,6 +91,8 @@ struct datapath { possible_net_t net; u32 user_features; + + u32 max_headroom; }; /** diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index ec76398a792f..83a5534abd31 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) return stats; } +void internal_set_rx_headroom(struct net_device *dev, int new_hr) +{ + dev->needed_headroom = new_hr; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, @@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, .ndo_get_stats64 = internal_get_stats, + .ndo_set_rx_headroom = internal_set_rx_headroom, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_PHONY_HEADROOM; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; @@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_netdev; } + vport->dev->needed_headroom = vport->dp->max_headroom; dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); From eaea34b23c46bf17b4a5638be69ab3561854f34b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:40 +0100 Subject: [PATCH 4/5] net/tun: implement ndo_set_rx_headroom ndo_set_rx_headroom controls the align value used by tun devices to allocate skbs on frame reception. When the xmit device adds a large encapsulation, this avoids an skb head reallocation on forwarding. The measured improvement when forwarding towards a vxlan dev with frame size below the egress device MTU is as follow: vxlan over ipv6, bridged: +6% vxlan over ipv6, ovs: +7% In case of ipv4 tunnels there is no improvement, since the tun device default alignment provides enough headroom to avoid the skb head reallocation. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/tun.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 88bb8cc3555b..afdf950617c3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -187,6 +187,7 @@ struct tun_struct { #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6|NETIF_F_UFO) + int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; @@ -934,6 +935,17 @@ static void tun_poll_controller(struct net_device *dev) return; } #endif + +static void tun_set_headroom(struct net_device *dev, int new_hr) +{ + struct tun_struct *tun = netdev_priv(dev); + + if (new_hr < NET_SKB_PAD) + new_hr = NET_SKB_PAD; + + tun->align = new_hr; +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -945,6 +957,7 @@ static const struct net_device_ops tun_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tun_poll_controller, #endif + .ndo_set_rx_headroom = tun_set_headroom, }; static const struct net_device_ops tap_netdev_ops = { @@ -962,6 +975,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_poll_controller = tun_poll_controller, #endif .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = tun_set_headroom, }; static void tun_flow_init(struct tun_struct *tun) @@ -1086,7 +1100,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); - size_t len = total_len, align = NET_SKB_PAD, linear; + size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; @@ -1694,6 +1708,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); + tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; From 163e529200af7b7521fbde5dbcc653cf3ce597df Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:41 +0100 Subject: [PATCH 5/5] veth: implement ndo_set_rx_headroom The rx headroom for veth dev is the peer device needed_headroom. Avoid ping-pong updates setting the private flag IFF_PHONY_HEADROOM. This avoids skb head reallocation when forwarding from a veth dev towards a device adding some kind of encapsulation. When transmitting frames below the MTU size towards a vxlan device, this gives about 10% performance speed-up when OVS is used to connect the veth and the vxlan device and a little more when using a plain Linux bridge. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/veth.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ba21d072be31..4f30a6ae50d0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -35,6 +35,7 @@ struct pcpu_vstats { struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; + unsigned requested_headroom; }; /* @@ -271,6 +272,29 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static void veth_set_rx_headroom(struct net_device *dev, int new_hr) +{ + struct veth_priv *peer_priv, *priv = netdev_priv(dev); + struct net_device *peer; + + if (new_hr < 0) + new_hr = 0; + + rcu_read_lock(); + peer = rcu_dereference(priv->peer); + if (unlikely(!peer)) + goto out; + + peer_priv = netdev_priv(peer); + priv->requested_headroom = new_hr; + new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); + dev->needed_headroom = new_hr; + peer->needed_headroom = new_hr; + +out: + rcu_read_unlock(); +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -285,6 +309,7 @@ static const struct net_device_ops veth_netdev_ops = { #endif .ndo_get_iflink = veth_get_iflink, .ndo_features_check = passthru_features_check, + .ndo_set_rx_headroom = veth_set_rx_headroom, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ @@ -301,6 +326,7 @@ static void veth_setup(struct net_device *dev) dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_PHONY_HEADROOM; dev->netdev_ops = &veth_netdev_ops; dev->ethtool_ops = &veth_ethtool_ops;