alistair23-linux/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c

/*
* Copyright (c) 2016, Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/mlx5/fs.h>
#include "en.h"
#include "en/params.h"
#include "en/xsk/umem.h"
struct mlx5e_ethtool_rule {
struct list_head list;
struct ethtool_rx_flow_spec flow_spec;
struct mlx5_flow_handle *rule;
struct mlx5e_ethtool_table *eth_ft;
};
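/* Drop one rule reference on the ethtool flow table and destroy the
 * hardware table once the last rule using it is removed.
 */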
static void put_flow_table(struct mlx5e_ethtool_table *eth_ft)
{
if (!--eth_ft->num_rules) {
mlx5_destroy_flow_table(eth_ft->ft);
eth_ft->ft = NULL;
}
}
#define MLX5E_ETHTOOL_L3_L4_PRIO 0
#define MLX5E_ETHTOOL_L2_PRIO (MLX5E_ETHTOOL_L3_L4_PRIO + ETHTOOL_NUM_L3_L4_FTS)
#define MLX5E_ETHTOOL_NUM_ENTRIES 64000
#define MLX5E_ETHTOOL_NUM_GROUPS 10
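/* Look up (and create on first use) the ethtool steering flow table for the
 * given flow type. The table priority is derived from the number of match
 * tuples (max_tuples - num_tuples), so rules matching on more fields land in
 * lower-numbered, i.e. higher-precedence, priorities.
 */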
static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
struct ethtool_rx_flow_spec *fs,
int num_tuples)
{
struct mlx5e_ethtool_table *eth_ft;
struct mlx5_flow_namespace *ns;
struct mlx5_flow_table *ft;
int max_tuples;
int table_size;
int prio;
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
case TCP_V4_FLOW:
case UDP_V4_FLOW:
case TCP_V6_FLOW:
case UDP_V6_FLOW:
max_tuples = ETHTOOL_NUM_L3_L4_FTS;
prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples);
eth_ft = &priv->fs.ethtool.l3_l4_ft[prio];
break;
case IP_USER_FLOW:
case IPV6_USER_FLOW:
max_tuples = ETHTOOL_NUM_L3_L4_FTS;
prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples);
eth_ft = &priv->fs.ethtool.l3_l4_ft[prio];
break;
case ETHER_FLOW:
max_tuples = ETHTOOL_NUM_L2_FTS;
prio = max_tuples - num_tuples;
eth_ft = &priv->fs.ethtool.l2_ft[prio];
prio += MLX5E_ETHTOOL_L2_PRIO;
break;
default:
return ERR_PTR(-EINVAL);
}
eth_ft->num_rules++;
if (eth_ft->ft)
return eth_ft;
ns = mlx5_get_flow_namespace(priv->mdev,
MLX5_FLOW_NAMESPACE_ETHTOOL);
if (!ns)
return ERR_PTR(-EOPNOTSUPP);
table_size = min_t(u32, BIT(MLX5_CAP_FLOWTABLE(priv->mdev,
flow_table_properties_nic_receive.log_max_ft_size)),
MLX5E_ETHTOOL_NUM_ENTRIES);
ft = mlx5_create_auto_grouped_flow_table(ns, prio,
table_size,
MLX5E_ETHTOOL_NUM_GROUPS, 0, 0);
if (IS_ERR(ft))
return (void *)ft;
eth_ft->ft = ft;
return eth_ft;
}
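/* Apply the mask to the value buffer in place: val[i] &= mask[i]. */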
static void mask_spec(u8 *mask, u8 *val, size_t size)
{
unsigned int i;
for (i = 0; i < size; i++, mask++, val++)
*((u8 *)val) = *((u8 *)mask) & *((u8 *)val);
}
#define MLX5E_FTE_SET(header_p, fld, v) \
MLX5_SET(fte_match_set_lyr_2_4, header_p, fld, v)
#define MLX5E_FTE_ADDR_OF(header_p, fld) \
MLX5_ADDR_OF(fte_match_set_lyr_2_4, header_p, fld)
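/* The set_*() and parse_*() helpers below fill the outer header match
 * criteria (headers_c) and match values (headers_v) from the corresponding
 * fields of an ethtool_rx_flow_spec.
 */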
static void
set_ip4(void *headers_c, void *headers_v, __be32 ip4src_m,
__be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v)
{
if (ip4src_m) {
memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4),
&ip4src_v, sizeof(ip4src_v));
memcpy(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4),
&ip4src_m, sizeof(ip4src_m));
}
if (ip4dst_m) {
memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
&ip4dst_v, sizeof(ip4dst_v));
memcpy(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
&ip4dst_m, sizeof(ip4dst_m));
}
MLX5E_FTE_SET(headers_c, ethertype, 0xffff);
MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP);
}
static void
set_ip6(void *headers_c, void *headers_v, __be32 ip6src_m[4],
__be32 ip6src_v[4], __be32 ip6dst_m[4], __be32 ip6dst_v[4])
{
u8 ip6_sz = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
if (!ipv6_addr_any((struct in6_addr *)ip6src_m)) {
memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6),
ip6src_v, ip6_sz);
memcpy(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6),
ip6src_m, ip6_sz);
}
if (!ipv6_addr_any((struct in6_addr *)ip6dst_m)) {
memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
ip6dst_v, ip6_sz);
memcpy(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
ip6dst_m, ip6_sz);
}
MLX5E_FTE_SET(headers_c, ethertype, 0xffff);
MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IPV6);
}
static void
set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v,
__be16 pdst_m, __be16 pdst_v)
{
if (psrc_m) {
MLX5E_FTE_SET(headers_c, tcp_sport, ntohs(psrc_m));
MLX5E_FTE_SET(headers_v, tcp_sport, ntohs(psrc_v));
}
if (pdst_m) {
MLX5E_FTE_SET(headers_c, tcp_dport, ntohs(pdst_m));
MLX5E_FTE_SET(headers_v, tcp_dport, ntohs(pdst_v));
}
MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff);
MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_TCP);
}
static void
set_udp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v,
__be16 pdst_m, __be16 pdst_v)
{
if (psrc_m) {
MLX5E_FTE_SET(headers_c, udp_sport, ntohs(psrc_m));
MLX5E_FTE_SET(headers_v, udp_sport, ntohs(psrc_v));
}
if (pdst_m) {
MLX5E_FTE_SET(headers_c, udp_dport, ntohs(pdst_m));
MLX5E_FTE_SET(headers_v, udp_dport, ntohs(pdst_v));
}
MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff);
MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_UDP);
}
static void
parse_tcp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec;
struct ethtool_tcpip4_spec *l4_val = &fs->h_u.tcp_ip4_spec;
set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src,
l4_mask->ip4dst, l4_val->ip4dst);
set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
l4_mask->pdst, l4_val->pdst);
}
static void
parse_udp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.udp_ip4_spec;
struct ethtool_tcpip4_spec *l4_val = &fs->h_u.udp_ip4_spec;
set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src,
l4_mask->ip4dst, l4_val->ip4dst);
set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
l4_mask->pdst, l4_val->pdst);
}
static void
parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec;
struct ethtool_usrip4_spec *l3_val = &fs->h_u.usr_ip4_spec;
set_ip4(headers_c, headers_v, l3_mask->ip4src, l3_val->ip4src,
l3_mask->ip4dst, l3_val->ip4dst);
if (l3_mask->proto) {
MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->proto);
MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->proto);
}
}
static void
parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec;
struct ethtool_usrip6_spec *l3_val = &fs->h_u.usr_ip6_spec;
set_ip6(headers_c, headers_v, l3_mask->ip6src,
l3_val->ip6src, l3_mask->ip6dst, l3_val->ip6dst);
if (l3_mask->l4_proto) {
MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->l4_proto);
MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->l4_proto);
}
}
static void
parse_tcp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec;
struct ethtool_tcpip6_spec *l4_val = &fs->h_u.tcp_ip6_spec;
set_ip6(headers_c, headers_v, l4_mask->ip6src,
l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst);
set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
l4_mask->pdst, l4_val->pdst);
}
static void
parse_udp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.udp_ip6_spec;
struct ethtool_tcpip6_spec *l4_val = &fs->h_u.udp_ip6_spec;
set_ip6(headers_c, headers_v, l4_mask->ip6src,
l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst);
set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
l4_mask->pdst, l4_val->pdst);
}
static void
parse_ether(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
{
struct ethhdr *eth_mask = &fs->m_u.ether_spec;
struct ethhdr *eth_val = &fs->h_u.ether_spec;
mask_spec((u8 *)eth_mask, (u8 *)eth_val, sizeof(*eth_mask));
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, smac_47_16), eth_mask->h_source);
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, smac_47_16), eth_val->h_source);
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), eth_mask->h_dest);
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), eth_val->h_dest);
MLX5E_FTE_SET(headers_c, ethertype, ntohs(eth_mask->h_proto));
MLX5E_FTE_SET(headers_v, ethertype, ntohs(eth_val->h_proto));
}
static void
set_cvlan(void *headers_c, void *headers_v, __be16 vlan_tci)
{
MLX5E_FTE_SET(headers_c, cvlan_tag, 1);
MLX5E_FTE_SET(headers_v, cvlan_tag, 1);
MLX5E_FTE_SET(headers_c, first_vid, 0xfff);
MLX5E_FTE_SET(headers_v, first_vid, ntohs(vlan_tci));
}
static void
set_dmac(void *headers_c, void *headers_v,
unsigned char m_dest[ETH_ALEN], unsigned char v_dest[ETH_ALEN])
{
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), m_dest);
ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), v_dest);
}
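/* Translate an ethtool_rx_flow_spec into mlx5 match criteria and values,
 * including the optional VLAN (FLOW_EXT) and destination MAC (FLOW_MAC_EXT)
 * extensions.
 */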
static int set_flow_attrs(u32 *match_c, u32 *match_v,
struct ethtool_rx_flow_spec *fs)
{
void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
outer_headers);
void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
outer_headers);
u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
switch (flow_type) {
case TCP_V4_FLOW:
parse_tcp4(outer_headers_c, outer_headers_v, fs);
break;
case UDP_V4_FLOW:
parse_udp4(outer_headers_c, outer_headers_v, fs);
break;
case IP_USER_FLOW:
parse_ip4(outer_headers_c, outer_headers_v, fs);
break;
case TCP_V6_FLOW:
parse_tcp6(outer_headers_c, outer_headers_v, fs);
break;
case UDP_V6_FLOW:
parse_udp6(outer_headers_c, outer_headers_v, fs);
break;
case IPV6_USER_FLOW:
parse_ip6(outer_headers_c, outer_headers_v, fs);
break;
case ETHER_FLOW:
parse_ether(outer_headers_c, outer_headers_v, fs);
break;
default:
return -EINVAL;
}
if ((fs->flow_type & FLOW_EXT) &&
(fs->m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)))
set_cvlan(outer_headers_c, outer_headers_v, fs->h_ext.vlan_tci);
if (fs->flow_type & FLOW_MAC_EXT &&
!is_zero_ether_addr(fs->m_ext.h_dest)) {
mask_spec(fs->m_ext.h_dest, fs->h_ext.h_dest, ETH_ALEN);
set_dmac(outer_headers_c, outer_headers_v, fs->m_ext.h_dest,
fs->h_ext.h_dest);
}
return 0;
}
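/* Insert the rule into the list of ethtool rules, keeping the list sorted by
 * the rule's location.
 */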
static void add_rule_to_list(struct mlx5e_priv *priv,
struct mlx5e_ethtool_rule *rule)
{
struct mlx5e_ethtool_rule *iter;
struct list_head *head = &priv->fs.ethtool.rules;
list_for_each_entry(iter, &priv->fs.ethtool.rules, list) {
if (iter->flow_spec.location > rule->flow_spec.location)
break;
head = &iter->list;
}
priv->fs.ethtool.tot_num_rules++;
list_add(&rule->list, head);
}
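/* Return true if no outer header match criteria are set (all bytes zero). */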
static bool outer_header_zero(u32 *match_criteria)
{
int size = MLX5_FLD_SZ_BYTES(fte_match_param, outer_headers);
char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria,
outer_headers);
return outer_headers_c[0] == 0 && !memcmp(outer_headers_c,
outer_headers_c + 1,
size - 1);
}
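/* Build the flow spec for an ethtool rule and install it in the given flow
 * table. A ring_cookie of RX_CLS_FLOW_DISC maps to a drop action; otherwise
 * the flow is forwarded to the TIR of the selected channel (regular or XSK,
 * depending on the queue ID encoded in ring_cookie).
 */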
static struct mlx5_flow_handle *
add_ethtool_flow_rule(struct mlx5e_priv *priv,
struct mlx5_flow_table *ft,
struct ethtool_rx_flow_spec *fs)
{
struct mlx5_flow_destination *dst = NULL;
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_spec *spec;
struct mlx5_flow_handle *rule;
int err = 0;
spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
if (!spec)
return ERR_PTR(-ENOMEM);
err = set_flow_attrs(spec->match_criteria, spec->match_value,
fs);
if (err)
goto free;
if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
} else {
struct mlx5e_params *params = &priv->channels.params;
enum mlx5e_rq_group group;
struct mlx5e_tir *tir;
u16 ix;
mlx5e_qid_get_ch_and_group(params, fs->ring_cookie, &ix, &group);
tir = group == MLX5E_RQ_GROUP_XSK ? priv->xsk_tir : priv->direct_tir;
dst = kzalloc(sizeof(*dst), GFP_KERNEL);
if (!dst) {
err = -ENOMEM;
goto free;
}
dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
dst->tir_num = tir[ix].tirn;
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
}
spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
spec->flow_context.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0);
if (IS_ERR(rule)) {
err = PTR_ERR(rule);
netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n",
__func__, err);
goto free;
}
free:
kvfree(spec);
kfree(dst);
return err ? ERR_PTR(err) : rule;
}
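/* Remove a rule from hardware (if it was installed), unlink it from the rule
 * list and release its flow table reference.
 */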
static void del_ethtool_rule(struct mlx5e_priv *priv,
struct mlx5e_ethtool_rule *eth_rule)
{
if (eth_rule->rule)
mlx5_del_flow_rules(eth_rule->rule);
list_del(&eth_rule->list);
priv->fs.ethtool.tot_num_rules--;
put_flow_table(eth_rule->eth_ft);
kfree(eth_rule);
}
static struct mlx5e_ethtool_rule *find_ethtool_rule(struct mlx5e_priv *priv,
int location)
{
struct mlx5e_ethtool_rule *iter;
list_for_each_entry(iter, &priv->fs.ethtool.rules, list) {
if (iter->flow_spec.location == location)
return iter;
}
return NULL;
}
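/* Allocate a rule entry for the given location, replacing any rule that
 * already occupies that location.
 */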
static struct mlx5e_ethtool_rule *get_ethtool_rule(struct mlx5e_priv *priv,
int location)
{
struct mlx5e_ethtool_rule *eth_rule;
eth_rule = find_ethtool_rule(priv, location);
if (eth_rule)
del_ethtool_rule(priv, eth_rule);
eth_rule = kzalloc(sizeof(*eth_rule), GFP_KERNEL);
if (!eth_rule)
return ERR_PTR(-ENOMEM);
add_rule_to_list(priv, eth_rule);
return eth_rule;
}
#define MAX_NUM_OF_ETHTOOL_RULES BIT(10)
#define all_ones(field) (field == (__force typeof(field))-1)
#define all_zeros_or_all_ones(field) \
((field) == 0 || (field) == (__force typeof(field))-1)
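/* Each validate_*() helper returns the number of match tuples requested by
 * the flow spec, or a negative errno if an unsupported field is masked.
 */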
static int validate_ethter(struct ethtool_rx_flow_spec *fs)
{
struct ethhdr *eth_mask = &fs->m_u.ether_spec;
int ntuples = 0;
if (!is_zero_ether_addr(eth_mask->h_dest))
ntuples++;
if (!is_zero_ether_addr(eth_mask->h_source))
ntuples++;
if (eth_mask->h_proto)
ntuples++;
return ntuples;
}
static int validate_tcpudp4(struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec;
int ntuples = 0;
if (l4_mask->tos)
return -EINVAL;
if (l4_mask->ip4src)
ntuples++;
if (l4_mask->ip4dst)
ntuples++;
if (l4_mask->psrc)
ntuples++;
if (l4_mask->pdst)
ntuples++;
/* Flow is TCP/UDP */
return ++ntuples;
}
static int validate_ip4(struct ethtool_rx_flow_spec *fs)
{
struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec;
int ntuples = 0;
if (l3_mask->l4_4_bytes || l3_mask->tos ||
fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4)
return -EINVAL;
if (l3_mask->ip4src)
ntuples++;
if (l3_mask->ip4dst)
ntuples++;
if (l3_mask->proto)
ntuples++;
/* Flow is IPv4 */
return ++ntuples;
}
static int validate_ip6(struct ethtool_rx_flow_spec *fs)
{
struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec;
int ntuples = 0;
if (l3_mask->l4_4_bytes || l3_mask->tclass)
return -EINVAL;
if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6src))
ntuples++;
if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6dst))
ntuples++;
if (l3_mask->l4_proto)
ntuples++;
/* Flow is IPv6 */
return ++ntuples;
}
static int validate_tcpudp6(struct ethtool_rx_flow_spec *fs)
{
struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec;
int ntuples = 0;
if (l4_mask->tclass)
return -EINVAL;
if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6src))
ntuples++;
if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6dst))
ntuples++;
if (l4_mask->psrc)
ntuples++;
if (l4_mask->pdst)
ntuples++;
/* Flow is TCP/UDP */
return ++ntuples;
}
static int validate_vlan(struct ethtool_rx_flow_spec *fs)
{
if (fs->m_ext.vlan_etype ||
fs->m_ext.vlan_tci != cpu_to_be16(VLAN_VID_MASK))
return -EINVAL;
if (fs->m_ext.vlan_tci &&
(be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID))
return -EINVAL;
return 1;
}
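/* Validate the location, ring cookie and per-flow-type fields of the spec and
 * return the total number of match tuples, or a negative errno.
 */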
static int validate_flow(struct mlx5e_priv *priv,
struct ethtool_rx_flow_spec *fs)
{
int num_tuples = 0;
int ret = 0;
if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES)
return -ENOSPC;
if (fs->ring_cookie != RX_CLS_FLOW_DISC)
if (!mlx5e_qid_validate(priv->profile, &priv->channels.params,
fs->ring_cookie))
return -EINVAL;
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
case ETHER_FLOW:
num_tuples += validate_ethter(fs);
break;
case TCP_V4_FLOW:
case UDP_V4_FLOW:
ret = validate_tcpudp4(fs);
if (ret < 0)
return ret;
num_tuples += ret;
break;
case IP_USER_FLOW:
ret = validate_ip4(fs);
if (ret < 0)
return ret;
num_tuples += ret;
break;
case TCP_V6_FLOW:
case UDP_V6_FLOW:
ret = validate_tcpudp6(fs);
if (ret < 0)
return ret;
num_tuples += ret;
break;
case IPV6_USER_FLOW:
ret = validate_ip6(fs);
if (ret < 0)
return ret;
num_tuples += ret;
break;
default:
return -ENOTSUPP;
}
if ((fs->flow_type & FLOW_EXT)) {
ret = validate_vlan(fs);
if (ret < 0)
return ret;
num_tuples += ret;
}
if (fs->flow_type & FLOW_MAC_EXT &&
!is_zero_ether_addr(fs->m_ext.h_dest))
num_tuples++;
return num_tuples;
}
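/* ETHTOOL_SRXCLSRLINS handler: validate the spec, get or create the matching
 * flow table and install the rule, replacing any rule previously stored at
 * the same location.
 */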
static int
mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv,
struct ethtool_rx_flow_spec *fs)
{
struct mlx5e_ethtool_table *eth_ft;
struct mlx5e_ethtool_rule *eth_rule;
struct mlx5_flow_handle *rule;
int num_tuples;
int err;
num_tuples = validate_flow(priv, fs);
if (num_tuples <= 0) {
netdev_warn(priv->netdev, "%s: flow is not valid %d\n",
__func__, num_tuples);
return num_tuples;
}
eth_ft = get_flow_table(priv, fs, num_tuples);
if (IS_ERR(eth_ft))
return PTR_ERR(eth_ft);
eth_rule = get_ethtool_rule(priv, fs->location);
if (IS_ERR(eth_rule)) {
put_flow_table(eth_ft);
return PTR_ERR(eth_rule);
}
eth_rule->flow_spec = *fs;
eth_rule->eth_ft = eth_ft;
if (!eth_ft->ft) {
err = -EINVAL;
goto del_ethtool_rule;
}
rule = add_ethtool_flow_rule(priv, eth_ft->ft, fs);
if (IS_ERR(rule)) {
err = PTR_ERR(rule);
goto del_ethtool_rule;
}
eth_rule->rule = rule;
return 0;
del_ethtool_rule:
del_ethtool_rule(priv, eth_rule);
return err;
}
static int
mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, int location)
{
struct mlx5e_ethtool_rule *eth_rule;
int err = 0;
if (location >= MAX_NUM_OF_ETHTOOL_RULES)
return -ENOSPC;
eth_rule = find_ethtool_rule(priv, location);
if (!eth_rule) {
err = -ENOENT;
goto out;
}
del_ethtool_rule(priv, eth_rule);
out:
return err;
}
static int
mlx5e_ethtool_get_flow(struct mlx5e_priv *priv,
struct ethtool_rxnfc *info, int location)
{
struct mlx5e_ethtool_rule *eth_rule;
if (location < 0 || location >= MAX_NUM_OF_ETHTOOL_RULES)
return -EINVAL;
list_for_each_entry(eth_rule, &priv->fs.ethtool.rules, list) {
if (eth_rule->flow_spec.location == location) {
info->fs = eth_rule->flow_spec;
return 0;
}
}
return -ENOENT;
}
static int
mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv,
struct ethtool_rxnfc *info, u32 *rule_locs)
{
int location = 0;
int idx = 0;
int err = 0;
info->data = MAX_NUM_OF_ETHTOOL_RULES;
while ((!err || err == -ENOENT) && idx < info->rule_cnt) {
err = mlx5e_ethtool_get_flow(priv, info, location);
if (!err)
rule_locs[idx++] = location;
location++;
}
return err;
}
void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv)
{
struct mlx5e_ethtool_rule *iter;
struct mlx5e_ethtool_rule *temp;
list_for_each_entry_safe(iter, temp, &priv->fs.ethtool.rules, list)
del_ethtool_rule(priv, iter);
}
void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv)
{
INIT_LIST_HEAD(&priv->fs.ethtool.rules);
}
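/* Map an ethtool flow type to the mlx5e traffic type used for RSS
 * configuration. MLX5E_NUM_INDIR_TIRS is returned for unsupported types.
 */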
static enum mlx5e_traffic_types flow_type_to_traffic_type(u32 flow_type)
{
switch (flow_type) {
case TCP_V4_FLOW:
return MLX5E_TT_IPV4_TCP;
case TCP_V6_FLOW:
return MLX5E_TT_IPV6_TCP;
case UDP_V4_FLOW:
return MLX5E_TT_IPV4_UDP;
case UDP_V6_FLOW:
return MLX5E_TT_IPV6_UDP;
case AH_V4_FLOW:
return MLX5E_TT_IPV4_IPSEC_AH;
case AH_V6_FLOW:
return MLX5E_TT_IPV6_IPSEC_AH;
case ESP_V4_FLOW:
return MLX5E_TT_IPV4_IPSEC_ESP;
case ESP_V6_FLOW:
return MLX5E_TT_IPV6_IPSEC_ESP;
case IPV4_FLOW:
return MLX5E_TT_IPV4;
case IPV6_FLOW:
return MLX5E_TT_IPV6;
default:
return MLX5E_NUM_INDIR_TIRS;
}
}
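/* ETHTOOL_SRXFH handler: configure the fields used for RX hashing of the
 * given traffic type. Only IP src/dst and L4 src/dst port selections are
 * supported, and only for TCP/UDP over IPv4/IPv6.
 */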
static int mlx5e_set_rss_hash_opt(struct mlx5e_priv *priv,
struct ethtool_rxnfc *nfc)
{
int inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
enum mlx5e_traffic_types tt;
u8 rx_hash_field = 0;
void *in;
tt = flow_type_to_traffic_type(nfc->flow_type);
if (tt == MLX5E_NUM_INDIR_TIRS)
return -EINVAL;
/* RSS does not support anything other than hashing to queues
* on src IP, dest IP, TCP/UDP src port and TCP/UDP dest
* port.
*/
if (nfc->flow_type != TCP_V4_FLOW &&
nfc->flow_type != TCP_V6_FLOW &&
nfc->flow_type != UDP_V4_FLOW &&
nfc->flow_type != UDP_V6_FLOW)
return -EOPNOTSUPP;
if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST |
RXH_L4_B_0_1 | RXH_L4_B_2_3))
return -EOPNOTSUPP;
if (nfc->data & RXH_IP_SRC)
rx_hash_field |= MLX5_HASH_FIELD_SEL_SRC_IP;
if (nfc->data & RXH_IP_DST)
rx_hash_field |= MLX5_HASH_FIELD_SEL_DST_IP;
if (nfc->data & RXH_L4_B_0_1)
rx_hash_field |= MLX5_HASH_FIELD_SEL_L4_SPORT;
if (nfc->data & RXH_L4_B_2_3)
rx_hash_field |= MLX5_HASH_FIELD_SEL_L4_DPORT;
in = kvzalloc(inlen, GFP_KERNEL);
if (!in)
return -ENOMEM;
mutex_lock(&priv->state_lock);
if (rx_hash_field == priv->rss_params.rx_hash_fields[tt])
goto out;
priv->rss_params.rx_hash_fields[tt] = rx_hash_field;
mlx5e_modify_tirs_hash(priv, in, inlen);
out:
mutex_unlock(&priv->state_lock);
kvfree(in);
return 0;
}
static int mlx5e_get_rss_hash_opt(struct mlx5e_priv *priv,
struct ethtool_rxnfc *nfc)
{
enum mlx5e_traffic_types tt;
u32 hash_field = 0;
tt = flow_type_to_traffic_type(nfc->flow_type);
if (tt == MLX5E_NUM_INDIR_TIRS)
return -EINVAL;
hash_field = priv->rss_params.rx_hash_fields[tt];
nfc->data = 0;
if (hash_field & MLX5_HASH_FIELD_SEL_SRC_IP)
nfc->data |= RXH_IP_SRC;
if (hash_field & MLX5_HASH_FIELD_SEL_DST_IP)
nfc->data |= RXH_IP_DST;
if (hash_field & MLX5_HASH_FIELD_SEL_L4_SPORT)
nfc->data |= RXH_L4_B_0_1;
if (hash_field & MLX5_HASH_FIELD_SEL_L4_DPORT)
nfc->data |= RXH_L4_B_2_3;
return 0;
}
int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
{
int err = 0;
struct mlx5e_priv *priv = netdev_priv(dev);
switch (cmd->cmd) {
case ETHTOOL_SRXCLSRLINS:
err = mlx5e_ethtool_flow_replace(priv, &cmd->fs);
break;
case ETHTOOL_SRXCLSRLDEL:
err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location);
break;
case ETHTOOL_SRXFH:
err = mlx5e_set_rss_hash_opt(priv, cmd);
break;
default:
err = -EOPNOTSUPP;
break;
}
return err;
}
int mlx5e_get_rxnfc(struct net_device *dev,
struct ethtool_rxnfc *info, u32 *rule_locs)
{
struct mlx5e_priv *priv = netdev_priv(dev);
int err = 0;
switch (info->cmd) {
case ETHTOOL_GRXRINGS:
info->data = priv->channels.params.num_channels;
break;
case ETHTOOL_GRXCLSRLCNT:
info->rule_cnt = priv->fs.ethtool.tot_num_rules;
break;
case ETHTOOL_GRXCLSRULE:
err = mlx5e_ethtool_get_flow(priv, info, info->fs.location);
break;
case ETHTOOL_GRXCLSRLALL:
err = mlx5e_ethtool_get_all_flows(priv, info, rule_locs);
break;
case ETHTOOL_GRXFH:
err = mlx5e_get_rss_hash_opt(priv, info);
break;
default:
err = -EOPNOTSUPP;
break;
}
return err;
}