1
0
Fork 0
alistair23-linux/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

344 lines
9.4 KiB
C
Raw Normal View History

/*
* Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __MLX5_EN_STATS_H__
#define __MLX5_EN_STATS_H__
#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \
(*(u64 *)((char *)ptr + dsc[i].offset))
#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \
be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset))
#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \
(*(u32 *)((char *)ptr + dsc[i].offset))
#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \
be32_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset))
#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld)
#define MLX5E_DECLARE_RX_STAT(type, fld) "rx%d_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XDPSQ_STAT(type, fld) "tx%d_xdp_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_RQ_XDPSQ_STAT(type, fld) "rx%d_xdp_tx_"#fld, offsetof(type, fld)
net/mlx5e: Add XSK zero-copy support This commit adds support for AF_XDP zero-copy RX and TX. We create a dedicated XSK RQ inside the channel, it means that two RQs are running simultaneously: one for non-XSK traffic and the other for XSK traffic. The regular and XSK RQs use a single ID namespace split into two halves: the lower half is regular RQs, and the upper half is XSK RQs. When any zero-copy AF_XDP socket is active, changing the number of channels is not allowed, because it would break to mapping between XSK RQ IDs and channels. XSK requires different page allocation and release routines. Such functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are generic enough to be used for both regular and XSK RQs, and they use the mlx5e_page_{alloc,release} wrappers around the real allocation functions. Function pointers are not used to avoid losing the performance with retpolines. Wherever it's certain that the regular (non-XSK) page release function should be used, it's called directly. Only the stats that could be meaningful for XSK are exposed to the userspace. Those that don't take part in the XSK flow are not considered. Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ), because the newer xdpsock sample doesn't provide any Fill Ring entries at the setup stage. We create a dedicated XSK SQ in the channel. This separation has its advantages: 1. When the UMEM is closed, the XSK SQ can also be closed and stop receiving completions. If an existing SQ was used for XSK, it would continue receiving completions for the packets of the closed socket. If a new UMEM was opened at that point, it would start getting completions that don't belong to it. 2. Calculating statistics separately. When the userspace kicks the TX, the driver triggers a hardware interrupt by posting a NOP to a dedicated XSK ICO (internal control operations) SQ, in order to trigger NAPI on the right CPU core. This XSK ICO SQ is protected by a spinlock, as the userspace application may kick the TX from any core. Store the pointers to the UMEMs in the net device private context, independently from the kernel. This way the driver can distinguish between the zero-copy and non-zero-copy UMEMs. The kernel function xdp_get_umem_from_qid does not care about this difference, but the driver is only interested in zero-copy UMEMs, particularly, on the cleanup it determines whether to close the XSK RQ and SQ or not by looking at the presence of the UMEM. Use state_lock to protect the access to this area of UMEM pointers. LRO isn't compatible with XDP, but there may be active UMEMs while XDP is off. If this is the case, don't allow LRO to ensure XDP can be reenabled at any time. The validation of XSK parameters typically happens when XSK queues open. However, when the interface is down or the XDP program isn't set, it's still possible to have active AF_XDP sockets and even to open new, but the XSK queues will be closed. To cover these cases, perform the validation also in these flows: 1. A new UMEM is registered, but the XSK queues aren't going to be created due to missing XDP program or interface being down. 2. MTU changes while there are UMEMs registered. Having this early check prevents mlx5e_open_channels from failing at a later stage, where recovery is impossible and the application has no chance to handle the error, because it got the successful return value for an MTU change or XSK open operation. The performance testing was performed on a machine with the following configuration: - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz - Mellanox ConnectX-5 Ex with 100 Gbit/s link The results with retpoline disabled, single stream: txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU) rxdrop: 12.2 Mpps l2fwd: 9.4 Mpps The results with retpoline enabled, single stream: txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU) rxdrop: 9.9 Mpps l2fwd: 6.8 Mpps Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Acked-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 08:35:38 -06:00
#define MLX5E_DECLARE_XSKRQ_STAT(type, fld) "rx%d_xsk_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XSKSQ_STAT(type, fld) "tx%d_xsk_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld)
struct counter_desc {
char format[ETH_GSTRING_LEN];
size_t offset; /* Byte offset */
};
struct mlx5e_sw_stats {
u64 rx_packets;
u64 rx_bytes;
u64 tx_packets;
u64 tx_bytes;
u64 tx_tso_packets;
u64 tx_tso_bytes;
u64 tx_tso_inner_packets;
u64 tx_tso_inner_bytes;
u64 tx_added_vlan_packets;
u64 tx_nop;
u64 rx_lro_packets;
u64 rx_lro_bytes;
u64 rx_ecn_mark;
u64 rx_removed_vlan_packets;
u64 rx_csum_unnecessary;
u64 rx_csum_none;
u64 rx_csum_complete;
u64 rx_csum_complete_tail;
u64 rx_csum_complete_tail_slow;
u64 rx_csum_unnecessary_inner;
net/mlx5e: XDP fast RX drop bpf programs support Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver. When XDP is on we make sure to change channels RQs type to MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to ensure "page per packet". On XDP set, we fail if HW LRO is set and request from user to turn it off. Since on ConnectX4-LX HW LRO is always on by default, this will be annoying, but we prefer not to enforce LRO off from XDP set function. Full channels reset (close/open) is required only when setting XDP on/off. When XDP set is called just to exchange programs, we will update each RQ xdp program on the fly and for synchronization with current data path RX activity of that RQ, we temporally disable that RQ and ensure RX path is not running, quickly update and re-enable that RQ, for that we do: - rq.state = disabled - napi_synnchronize - xchg(rq->xdp_prg) - rq.state = enabled - napi_schedule // Just in case we've missed an IRQ Packet rate performance testing was done with pktgen 64B packets and on TX side and, TC drop action on RX side compared to XDP fast drop. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Comparison is done between: 1. Baseline, Before this patch with TC drop action 2. This patch with TC drop action 3. This patch with XDP RX fast drop RX Cores Baseline(TC drop) TC drop XDP fast Drop -------------------------------------------------------------- 1 5.3Mpps 5.3Mpps 16.5Mpps 2 10.2Mpps 10.2Mpps 31.3Mpps 4 20.5Mpps 19.9Mpps 36.3Mpps* *My xmitter was limited to 36.3Mpps, so it is the bottleneck. It seems that receive side can handle more. Signed-off-by: Rana Shahout <ranas@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-21 03:19:46 -06:00
u64 rx_xdp_drop;
u64 rx_xdp_redirect;
u64 rx_xdp_tx_xmit;
u64 rx_xdp_tx_mpwqe;
net/mlx5e: XDP, Inline small packets into the TX MPWQE in XDP xmit flow Upon high packet rate with multiple CPUs TX workloads, much of the HCA's resources are spent on prefetching TX descriptors, thus affecting transmission rates. This patch comes to mitigate this problem by moving some workload to the CPU and reducing the HW data prefetch overhead for small packets (<= 256B). When forwarding packets with XDP, a packet that is smaller than a certain size (set to ~256 bytes) would be sent inline within its WQE TX descrptor (mem-copied), when the hardware tx queue is congested beyond a pre-defined water-mark. This is added to better utilize the HW resources (which now makes one less packet data prefetch) and allow better scalability, on the account of CPU usage (which now 'memcpy's the packet into the WQE). To load balance between HW and CPU and get max packet rate, we use watermarks to detect how much the HW is congested and move the work loads back and forth between HW and CPU. Performance: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz * Tested with hyper-threading disabled XDP_TX: | | before | after | | | 24 rings | 51Mpps | 116Mpps | +126% | | 1 ring | 12Mpps | 12Mpps | same | XDP_REDIRECT: ** Below is the transmit rate, not the redirection rate which might be larger, and is not affected by this patch. | | before | after | | | 32 rings | 64Mpps | 92Mpps | +43% | | 1 ring | 6.4Mpps | 6.4Mpps | same | As we can see, feature significantly improves scaling, without hurting single ring performance. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-03-14 06:54:07 -06:00
u64 rx_xdp_tx_inlnw;
net/mlx5e: XDP, Close TX MPWQE session when no room for inline packet left In MPWQE mode, when transmitting packets with XDP, a packet that is smaller than a certain size (set to 256 bytes) would be sent inline within its WQE TX descriptor (mem-copied), in case the hardware tx queue is congested beyond a pre-defined water-mark. If a MPWQE cannot contain an additional inline packet, we close this MPWQE session, and send the packet inlined within the next MPWQE. To save some MPWQE session close+open operations, we don't open MPWQE sessions that are contiguously smaller than certain size (set to the HW MPWQE maximum size). If there isn't enough contiguous room in the send queue, we fill it with NOPs and wrap the send queue index around. This way, qualified packets are always sent inline. Perf tests: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz XDP_TX: With 24 channels: | ------ | bounced packets | inlined packets | inline ratio | | before | 113.6Mpps | 96.3Mpps | 84% | | after | 115Mpps | 99.5Mpps | 86% | With one channel: | ------ | bounced packets | inlined packets | inline ratio | | before | 6.7Mpps | 0pps | 0% | | after | 6.8Mpps | 0pps | 0% | As we can see, there is improvement in both inline ratio and overall packet rate for 24 channels. Also, we see no degradation for the one-channel case. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-05-12 09:28:27 -06:00
u64 rx_xdp_tx_nops;
u64 rx_xdp_tx_full;
u64 rx_xdp_tx_err;
u64 rx_xdp_tx_cqe;
u64 tx_csum_none;
u64 tx_csum_partial;
u64 tx_csum_partial_inner;
u64 tx_queue_stopped;
u64 tx_queue_dropped;
u64 tx_xmit_more;
u64 tx_recover;
u64 tx_cqes;
u64 tx_queue_wake;
u64 tx_cqe_err;
u64 tx_xdp_xmit;
u64 tx_xdp_mpwqe;
net/mlx5e: XDP, Inline small packets into the TX MPWQE in XDP xmit flow Upon high packet rate with multiple CPUs TX workloads, much of the HCA's resources are spent on prefetching TX descriptors, thus affecting transmission rates. This patch comes to mitigate this problem by moving some workload to the CPU and reducing the HW data prefetch overhead for small packets (<= 256B). When forwarding packets with XDP, a packet that is smaller than a certain size (set to ~256 bytes) would be sent inline within its WQE TX descrptor (mem-copied), when the hardware tx queue is congested beyond a pre-defined water-mark. This is added to better utilize the HW resources (which now makes one less packet data prefetch) and allow better scalability, on the account of CPU usage (which now 'memcpy's the packet into the WQE). To load balance between HW and CPU and get max packet rate, we use watermarks to detect how much the HW is congested and move the work loads back and forth between HW and CPU. Performance: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz * Tested with hyper-threading disabled XDP_TX: | | before | after | | | 24 rings | 51Mpps | 116Mpps | +126% | | 1 ring | 12Mpps | 12Mpps | same | XDP_REDIRECT: ** Below is the transmit rate, not the redirection rate which might be larger, and is not affected by this patch. | | before | after | | | 32 rings | 64Mpps | 92Mpps | +43% | | 1 ring | 6.4Mpps | 6.4Mpps | same | As we can see, feature significantly improves scaling, without hurting single ring performance. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-03-14 06:54:07 -06:00
u64 tx_xdp_inlnw;
net/mlx5e: XDP, Close TX MPWQE session when no room for inline packet left In MPWQE mode, when transmitting packets with XDP, a packet that is smaller than a certain size (set to 256 bytes) would be sent inline within its WQE TX descriptor (mem-copied), in case the hardware tx queue is congested beyond a pre-defined water-mark. If a MPWQE cannot contain an additional inline packet, we close this MPWQE session, and send the packet inlined within the next MPWQE. To save some MPWQE session close+open operations, we don't open MPWQE sessions that are contiguously smaller than certain size (set to the HW MPWQE maximum size). If there isn't enough contiguous room in the send queue, we fill it with NOPs and wrap the send queue index around. This way, qualified packets are always sent inline. Perf tests: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz XDP_TX: With 24 channels: | ------ | bounced packets | inlined packets | inline ratio | | before | 113.6Mpps | 96.3Mpps | 84% | | after | 115Mpps | 99.5Mpps | 86% | With one channel: | ------ | bounced packets | inlined packets | inline ratio | | before | 6.7Mpps | 0pps | 0% | | after | 6.8Mpps | 0pps | 0% | As we can see, there is improvement in both inline ratio and overall packet rate for 24 channels. Also, we see no degradation for the one-channel case. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-05-12 09:28:27 -06:00
u64 tx_xdp_nops;
u64 tx_xdp_full;
u64 tx_xdp_err;
u64 tx_xdp_cqes;
u64 rx_wqe_err;
u64 rx_mpwqe_filler_cqes;
u64 rx_mpwqe_filler_strides;
u64 rx_oversize_pkts_sw_drop;
u64 rx_buff_alloc_err;
u64 rx_cqe_compress_blks;
u64 rx_cqe_compress_pkts;
u64 rx_cache_reuse;
u64 rx_cache_full;
u64 rx_cache_empty;
u64 rx_cache_busy;
u64 rx_cache_waive;
u64 rx_congst_umr;
u64 rx_arfs_err;
u64 rx_recover;
u64 ch_events;
u64 ch_poll;
u64 ch_arm;
u64 ch_aff_change;
net/mlx5e: Add XSK zero-copy support This commit adds support for AF_XDP zero-copy RX and TX. We create a dedicated XSK RQ inside the channel, it means that two RQs are running simultaneously: one for non-XSK traffic and the other for XSK traffic. The regular and XSK RQs use a single ID namespace split into two halves: the lower half is regular RQs, and the upper half is XSK RQs. When any zero-copy AF_XDP socket is active, changing the number of channels is not allowed, because it would break to mapping between XSK RQ IDs and channels. XSK requires different page allocation and release routines. Such functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are generic enough to be used for both regular and XSK RQs, and they use the mlx5e_page_{alloc,release} wrappers around the real allocation functions. Function pointers are not used to avoid losing the performance with retpolines. Wherever it's certain that the regular (non-XSK) page release function should be used, it's called directly. Only the stats that could be meaningful for XSK are exposed to the userspace. Those that don't take part in the XSK flow are not considered. Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ), because the newer xdpsock sample doesn't provide any Fill Ring entries at the setup stage. We create a dedicated XSK SQ in the channel. This separation has its advantages: 1. When the UMEM is closed, the XSK SQ can also be closed and stop receiving completions. If an existing SQ was used for XSK, it would continue receiving completions for the packets of the closed socket. If a new UMEM was opened at that point, it would start getting completions that don't belong to it. 2. Calculating statistics separately. When the userspace kicks the TX, the driver triggers a hardware interrupt by posting a NOP to a dedicated XSK ICO (internal control operations) SQ, in order to trigger NAPI on the right CPU core. This XSK ICO SQ is protected by a spinlock, as the userspace application may kick the TX from any core. Store the pointers to the UMEMs in the net device private context, independently from the kernel. This way the driver can distinguish between the zero-copy and non-zero-copy UMEMs. The kernel function xdp_get_umem_from_qid does not care about this difference, but the driver is only interested in zero-copy UMEMs, particularly, on the cleanup it determines whether to close the XSK RQ and SQ or not by looking at the presence of the UMEM. Use state_lock to protect the access to this area of UMEM pointers. LRO isn't compatible with XDP, but there may be active UMEMs while XDP is off. If this is the case, don't allow LRO to ensure XDP can be reenabled at any time. The validation of XSK parameters typically happens when XSK queues open. However, when the interface is down or the XDP program isn't set, it's still possible to have active AF_XDP sockets and even to open new, but the XSK queues will be closed. To cover these cases, perform the validation also in these flows: 1. A new UMEM is registered, but the XSK queues aren't going to be created due to missing XDP program or interface being down. 2. MTU changes while there are UMEMs registered. Having this early check prevents mlx5e_open_channels from failing at a later stage, where recovery is impossible and the application has no chance to handle the error, because it got the successful return value for an MTU change or XSK open operation. The performance testing was performed on a machine with the following configuration: - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz - Mellanox ConnectX-5 Ex with 100 Gbit/s link The results with retpoline disabled, single stream: txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU) rxdrop: 12.2 Mpps l2fwd: 9.4 Mpps The results with retpoline enabled, single stream: txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU) rxdrop: 9.9 Mpps l2fwd: 6.8 Mpps Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Acked-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 08:35:38 -06:00
u64 ch_force_irq;
u64 ch_eq_rearm;
#ifdef CONFIG_MLX5_EN_TLS
u64 tx_tls_encrypted_packets;
u64 tx_tls_encrypted_bytes;
u64 tx_tls_ctx;
u64 tx_tls_ooo;
u64 tx_tls_dump_packets;
u64 tx_tls_dump_bytes;
u64 tx_tls_resync_bytes;
u64 tx_tls_skip_no_sync_data;
u64 tx_tls_drop_no_sync_data;
u64 tx_tls_drop_bypass_req;
#endif
net/mlx5e: Add XSK zero-copy support This commit adds support for AF_XDP zero-copy RX and TX. We create a dedicated XSK RQ inside the channel, it means that two RQs are running simultaneously: one for non-XSK traffic and the other for XSK traffic. The regular and XSK RQs use a single ID namespace split into two halves: the lower half is regular RQs, and the upper half is XSK RQs. When any zero-copy AF_XDP socket is active, changing the number of channels is not allowed, because it would break to mapping between XSK RQ IDs and channels. XSK requires different page allocation and release routines. Such functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are generic enough to be used for both regular and XSK RQs, and they use the mlx5e_page_{alloc,release} wrappers around the real allocation functions. Function pointers are not used to avoid losing the performance with retpolines. Wherever it's certain that the regular (non-XSK) page release function should be used, it's called directly. Only the stats that could be meaningful for XSK are exposed to the userspace. Those that don't take part in the XSK flow are not considered. Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ), because the newer xdpsock sample doesn't provide any Fill Ring entries at the setup stage. We create a dedicated XSK SQ in the channel. This separation has its advantages: 1. When the UMEM is closed, the XSK SQ can also be closed and stop receiving completions. If an existing SQ was used for XSK, it would continue receiving completions for the packets of the closed socket. If a new UMEM was opened at that point, it would start getting completions that don't belong to it. 2. Calculating statistics separately. When the userspace kicks the TX, the driver triggers a hardware interrupt by posting a NOP to a dedicated XSK ICO (internal control operations) SQ, in order to trigger NAPI on the right CPU core. This XSK ICO SQ is protected by a spinlock, as the userspace application may kick the TX from any core. Store the pointers to the UMEMs in the net device private context, independently from the kernel. This way the driver can distinguish between the zero-copy and non-zero-copy UMEMs. The kernel function xdp_get_umem_from_qid does not care about this difference, but the driver is only interested in zero-copy UMEMs, particularly, on the cleanup it determines whether to close the XSK RQ and SQ or not by looking at the presence of the UMEM. Use state_lock to protect the access to this area of UMEM pointers. LRO isn't compatible with XDP, but there may be active UMEMs while XDP is off. If this is the case, don't allow LRO to ensure XDP can be reenabled at any time. The validation of XSK parameters typically happens when XSK queues open. However, when the interface is down or the XDP program isn't set, it's still possible to have active AF_XDP sockets and even to open new, but the XSK queues will be closed. To cover these cases, perform the validation also in these flows: 1. A new UMEM is registered, but the XSK queues aren't going to be created due to missing XDP program or interface being down. 2. MTU changes while there are UMEMs registered. Having this early check prevents mlx5e_open_channels from failing at a later stage, where recovery is impossible and the application has no chance to handle the error, because it got the successful return value for an MTU change or XSK open operation. The performance testing was performed on a machine with the following configuration: - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz - Mellanox ConnectX-5 Ex with 100 Gbit/s link The results with retpoline disabled, single stream: txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU) rxdrop: 12.2 Mpps l2fwd: 9.4 Mpps The results with retpoline enabled, single stream: txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU) rxdrop: 9.9 Mpps l2fwd: 6.8 Mpps Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Acked-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 08:35:38 -06:00
u64 rx_xsk_packets;
u64 rx_xsk_bytes;
u64 rx_xsk_csum_complete;
u64 rx_xsk_csum_unnecessary;
u64 rx_xsk_csum_unnecessary_inner;
u64 rx_xsk_csum_none;
u64 rx_xsk_ecn_mark;
u64 rx_xsk_removed_vlan_packets;
u64 rx_xsk_xdp_drop;
u64 rx_xsk_xdp_redirect;
u64 rx_xsk_wqe_err;
u64 rx_xsk_mpwqe_filler_cqes;
u64 rx_xsk_mpwqe_filler_strides;
u64 rx_xsk_oversize_pkts_sw_drop;
u64 rx_xsk_buff_alloc_err;
u64 rx_xsk_cqe_compress_blks;
u64 rx_xsk_cqe_compress_pkts;
u64 rx_xsk_congst_umr;
u64 rx_xsk_arfs_err;
u64 tx_xsk_xmit;
u64 tx_xsk_mpwqe;
u64 tx_xsk_inlnw;
u64 tx_xsk_full;
u64 tx_xsk_err;
u64 tx_xsk_cqes;
};
struct mlx5e_qcounter_stats {
u32 rx_out_of_buffer;
u32 rx_if_down_packets;
};
struct mlx5e_vnic_env_stats {
__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
};
#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \
vstats->query_vport_out, c)
struct mlx5e_vport_stats {
__be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)];
};
#define PPORT_802_3_GET(pstats, c) \
MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \
counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
#define PPORT_2863_GET(pstats, c) \
MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \
counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
#define PPORT_2819_GET(pstats, c) \
MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \
counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
#define PPORT_PHY_STATISTICAL_GET(pstats, c) \
MLX5_GET64(ppcnt_reg, (pstats)->phy_statistical_counters, \
counter_set.phys_layer_statistical_cntrs.c##_high)
#define PPORT_PER_PRIO_GET(pstats, prio, c) \
MLX5_GET64(ppcnt_reg, pstats->per_prio_counters[prio], \
counter_set.eth_per_prio_grp_data_layout.c##_high)
#define NUM_PPORT_PRIO 8
#define PPORT_ETH_EXT_GET(pstats, c) \
MLX5_GET64(ppcnt_reg, (pstats)->eth_ext_counters, \
counter_set.eth_extended_cntrs_grp_data_layout.c##_high)
struct mlx5e_pport_stats {
__be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 phy_statistical_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 eth_ext_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 per_tc_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
__be64 per_tc_congest_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
};
#define PCIE_PERF_GET(pcie_stats, c) \
MLX5_GET(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \
counter_set.pcie_perf_cntrs_grp_data_layout.c)
#define PCIE_PERF_GET64(pcie_stats, c) \
MLX5_GET64(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \
counter_set.pcie_perf_cntrs_grp_data_layout.c##_high)
struct mlx5e_pcie_stats {
__be64 pcie_perf_counters[MLX5_ST_SZ_QW(mpcnt_reg)];
};
struct mlx5e_rq_stats {
u64 packets;
u64 bytes;
u64 csum_complete;
u64 csum_complete_tail;
u64 csum_complete_tail_slow;
u64 csum_unnecessary;
u64 csum_unnecessary_inner;
u64 csum_none;
u64 lro_packets;
u64 lro_bytes;
u64 ecn_mark;
u64 removed_vlan_packets;
net/mlx5e: XDP fast RX drop bpf programs support Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver. When XDP is on we make sure to change channels RQs type to MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to ensure "page per packet". On XDP set, we fail if HW LRO is set and request from user to turn it off. Since on ConnectX4-LX HW LRO is always on by default, this will be annoying, but we prefer not to enforce LRO off from XDP set function. Full channels reset (close/open) is required only when setting XDP on/off. When XDP set is called just to exchange programs, we will update each RQ xdp program on the fly and for synchronization with current data path RX activity of that RQ, we temporally disable that RQ and ensure RX path is not running, quickly update and re-enable that RQ, for that we do: - rq.state = disabled - napi_synnchronize - xchg(rq->xdp_prg) - rq.state = enabled - napi_schedule // Just in case we've missed an IRQ Packet rate performance testing was done with pktgen 64B packets and on TX side and, TC drop action on RX side compared to XDP fast drop. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Comparison is done between: 1. Baseline, Before this patch with TC drop action 2. This patch with TC drop action 3. This patch with XDP RX fast drop RX Cores Baseline(TC drop) TC drop XDP fast Drop -------------------------------------------------------------- 1 5.3Mpps 5.3Mpps 16.5Mpps 2 10.2Mpps 10.2Mpps 31.3Mpps 4 20.5Mpps 19.9Mpps 36.3Mpps* *My xmitter was limited to 36.3Mpps, so it is the bottleneck. It seems that receive side can handle more. Signed-off-by: Rana Shahout <ranas@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-21 03:19:46 -06:00
u64 xdp_drop;
u64 xdp_redirect;
u64 wqe_err;
u64 mpwqe_filler_cqes;
u64 mpwqe_filler_strides;
u64 oversize_pkts_sw_drop;
u64 buff_alloc_err;
u64 cqe_compress_blks;
u64 cqe_compress_pkts;
u64 cache_reuse;
u64 cache_full;
u64 cache_empty;
u64 cache_busy;
u64 cache_waive;
u64 congst_umr;
u64 arfs_err;
u64 recover;
};
struct mlx5e_sq_stats {
/* commonly accessed in data path */
u64 packets;
u64 bytes;
u64 xmit_more;
u64 tso_packets;
u64 tso_bytes;
u64 tso_inner_packets;
u64 tso_inner_bytes;
u64 csum_partial;
u64 csum_partial_inner;
u64 added_vlan_packets;
u64 nop;
#ifdef CONFIG_MLX5_EN_TLS
u64 tls_encrypted_packets;
u64 tls_encrypted_bytes;
u64 tls_ctx;
u64 tls_ooo;
u64 tls_dump_packets;
u64 tls_dump_bytes;
u64 tls_resync_bytes;
u64 tls_skip_no_sync_data;
u64 tls_drop_no_sync_data;
u64 tls_drop_bypass_req;
#endif
/* less likely accessed in data path */
u64 csum_none;
u64 stopped;
u64 dropped;
u64 recover;
/* dirtied @completion */
u64 cqes ____cacheline_aligned_in_smp;
u64 wake;
u64 cqe_err;
};
struct mlx5e_xdpsq_stats {
u64 xmit;
u64 mpwqe;
net/mlx5e: XDP, Inline small packets into the TX MPWQE in XDP xmit flow Upon high packet rate with multiple CPUs TX workloads, much of the HCA's resources are spent on prefetching TX descriptors, thus affecting transmission rates. This patch comes to mitigate this problem by moving some workload to the CPU and reducing the HW data prefetch overhead for small packets (<= 256B). When forwarding packets with XDP, a packet that is smaller than a certain size (set to ~256 bytes) would be sent inline within its WQE TX descrptor (mem-copied), when the hardware tx queue is congested beyond a pre-defined water-mark. This is added to better utilize the HW resources (which now makes one less packet data prefetch) and allow better scalability, on the account of CPU usage (which now 'memcpy's the packet into the WQE). To load balance between HW and CPU and get max packet rate, we use watermarks to detect how much the HW is congested and move the work loads back and forth between HW and CPU. Performance: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz * Tested with hyper-threading disabled XDP_TX: | | before | after | | | 24 rings | 51Mpps | 116Mpps | +126% | | 1 ring | 12Mpps | 12Mpps | same | XDP_REDIRECT: ** Below is the transmit rate, not the redirection rate which might be larger, and is not affected by this patch. | | before | after | | | 32 rings | 64Mpps | 92Mpps | +43% | | 1 ring | 6.4Mpps | 6.4Mpps | same | As we can see, feature significantly improves scaling, without hurting single ring performance. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-03-14 06:54:07 -06:00
u64 inlnw;
net/mlx5e: XDP, Close TX MPWQE session when no room for inline packet left In MPWQE mode, when transmitting packets with XDP, a packet that is smaller than a certain size (set to 256 bytes) would be sent inline within its WQE TX descriptor (mem-copied), in case the hardware tx queue is congested beyond a pre-defined water-mark. If a MPWQE cannot contain an additional inline packet, we close this MPWQE session, and send the packet inlined within the next MPWQE. To save some MPWQE session close+open operations, we don't open MPWQE sessions that are contiguously smaller than certain size (set to the HW MPWQE maximum size). If there isn't enough contiguous room in the send queue, we fill it with NOPs and wrap the send queue index around. This way, qualified packets are always sent inline. Perf tests: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz XDP_TX: With 24 channels: | ------ | bounced packets | inlined packets | inline ratio | | before | 113.6Mpps | 96.3Mpps | 84% | | after | 115Mpps | 99.5Mpps | 86% | With one channel: | ------ | bounced packets | inlined packets | inline ratio | | before | 6.7Mpps | 0pps | 0% | | after | 6.8Mpps | 0pps | 0% | As we can see, there is improvement in both inline ratio and overall packet rate for 24 channels. Also, we see no degradation for the one-channel case. Signed-off-by: Shay Agroskin <shayag@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-05-12 09:28:27 -06:00
u64 nops;
u64 full;
u64 err;
/* dirtied @completion */
u64 cqes ____cacheline_aligned_in_smp;
};
struct mlx5e_ch_stats {
u64 events;
u64 poll;
u64 arm;
u64 aff_change;
net/mlx5e: Add XSK zero-copy support This commit adds support for AF_XDP zero-copy RX and TX. We create a dedicated XSK RQ inside the channel, it means that two RQs are running simultaneously: one for non-XSK traffic and the other for XSK traffic. The regular and XSK RQs use a single ID namespace split into two halves: the lower half is regular RQs, and the upper half is XSK RQs. When any zero-copy AF_XDP socket is active, changing the number of channels is not allowed, because it would break to mapping between XSK RQ IDs and channels. XSK requires different page allocation and release routines. Such functions as mlx5e_{alloc,free}_rx_mpwqe and mlx5e_{get,put}_rx_frag are generic enough to be used for both regular and XSK RQs, and they use the mlx5e_page_{alloc,release} wrappers around the real allocation functions. Function pointers are not used to avoid losing the performance with retpolines. Wherever it's certain that the regular (non-XSK) page release function should be used, it's called directly. Only the stats that could be meaningful for XSK are exposed to the userspace. Those that don't take part in the XSK flow are not considered. Note that we don't wait for WQEs on the XSK RQ (unlike the regular RQ), because the newer xdpsock sample doesn't provide any Fill Ring entries at the setup stage. We create a dedicated XSK SQ in the channel. This separation has its advantages: 1. When the UMEM is closed, the XSK SQ can also be closed and stop receiving completions. If an existing SQ was used for XSK, it would continue receiving completions for the packets of the closed socket. If a new UMEM was opened at that point, it would start getting completions that don't belong to it. 2. Calculating statistics separately. When the userspace kicks the TX, the driver triggers a hardware interrupt by posting a NOP to a dedicated XSK ICO (internal control operations) SQ, in order to trigger NAPI on the right CPU core. This XSK ICO SQ is protected by a spinlock, as the userspace application may kick the TX from any core. Store the pointers to the UMEMs in the net device private context, independently from the kernel. This way the driver can distinguish between the zero-copy and non-zero-copy UMEMs. The kernel function xdp_get_umem_from_qid does not care about this difference, but the driver is only interested in zero-copy UMEMs, particularly, on the cleanup it determines whether to close the XSK RQ and SQ or not by looking at the presence of the UMEM. Use state_lock to protect the access to this area of UMEM pointers. LRO isn't compatible with XDP, but there may be active UMEMs while XDP is off. If this is the case, don't allow LRO to ensure XDP can be reenabled at any time. The validation of XSK parameters typically happens when XSK queues open. However, when the interface is down or the XDP program isn't set, it's still possible to have active AF_XDP sockets and even to open new, but the XSK queues will be closed. To cover these cases, perform the validation also in these flows: 1. A new UMEM is registered, but the XSK queues aren't going to be created due to missing XDP program or interface being down. 2. MTU changes while there are UMEMs registered. Having this early check prevents mlx5e_open_channels from failing at a later stage, where recovery is impossible and the application has no chance to handle the error, because it got the successful return value for an MTU change or XSK open operation. The performance testing was performed on a machine with the following configuration: - 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz - Mellanox ConnectX-5 Ex with 100 Gbit/s link The results with retpoline disabled, single stream: txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU) rxdrop: 12.2 Mpps l2fwd: 9.4 Mpps The results with retpoline enabled, single stream: txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU) rxdrop: 9.9 Mpps l2fwd: 6.8 Mpps Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> Acked-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-06-26 08:35:38 -06:00
u64 force_irq;
u64 eq_rearm;
};
struct mlx5e_stats {
struct mlx5e_sw_stats sw;
struct mlx5e_qcounter_stats qcnt;
struct mlx5e_vnic_env_stats vnic;
struct mlx5e_vport_stats vport;
struct mlx5e_pport_stats pport;
struct rtnl_link_stats64 vf_vport;
struct mlx5e_pcie_stats pcie;
};
enum {
MLX5E_NDO_UPDATE_STATS = BIT(0x1),
};
struct mlx5e_priv;
struct mlx5e_stats_grp {
u16 update_stats_mask;
int (*get_num_stats)(struct mlx5e_priv *priv);
int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx);
int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx);
void (*update_stats)(struct mlx5e_priv *priv);
};
extern const struct mlx5e_stats_grp mlx5e_stats_grps[];
extern const int mlx5e_num_stats_grps;
void mlx5e_grp_802_3_update_stats(struct mlx5e_priv *priv);
#endif /* __MLX5_EN_STATS_H__ */