/*
 * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/prefetch.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/bpf_trace.h>
#include <net/busy_poll.h>
#include "en.h"
#include "en_tc.h"
#include "eswitch.h"
#include "en_rep.h"
#include "ipoib/ipoib.h"
#include "en_accel/ipsec_rxtx.h"

static inline bool mlx5e_rx_hw_stamp(struct mlx5e_tstamp *tstamp)
{
	return tstamp->hwtstamp_config.rx_filter == HWTSTAMP_FILTER_ALL;
}
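
/* CQE compression: when enabled, the HW writes one full "title" CQE
 * followed by blocks of mini CQEs carrying only the per-packet fields
 * (byte count, checksum).  The helpers below read those blocks and
 * expand each mini CQE back into the title CQE, so rq->handle_rx_cqe()
 * consumes ordinary-looking CQEs.
 */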
static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc,
				       void *data)
{
	u32 ci = cqcc & cq->wq.sz_m1;

	memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64));
}

static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq,
					 struct mlx5e_cq *cq, u32 cqcc)
{
	mlx5e_read_cqe_slot(cq, cqcc, &cq->title);
	cq->decmprs_left = be32_to_cpu(cq->title.byte_cnt);
	cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter);
	rq->stats.cqe_compress_blks++;
}

static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
{
	mlx5e_read_cqe_slot(cq, cqcc, cq->mini_arr);
	cq->mini_arr_idx = 0;
}
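
/* Re-stamp the ownership (op_own) bit of the n CQE slots consumed
 * during decompression, flipping the expected value for the part of
 * the range that wraps past the end of the CQ ring.
 */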
static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n)
{
	u8 op_own = (cqcc >> cq->wq.log_sz) & 1;
	u32 wq_sz = 1 << cq->wq.log_sz;
	u32 ci = cqcc & cq->wq.sz_m1;
	u32 ci_top = min_t(u32, wq_sz, ci + n);

	for (; ci < ci_top; ci++, n--) {
		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);

		cqe->op_own = op_own;
	}

	if (unlikely(ci == wq_sz)) {
		op_own = !op_own;
		for (ci = 0; ci < n; ci++) {
			struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);

			cqe->op_own = op_own;
		}
	}
}
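
/* Expand the current mini CQE into the title CQE: copy the per-packet
 * byte count and checksum, fix up the ownership bit, and advance the
 * WQE counter (by the consumed strides for Striding RQ, by one for a
 * linked-list RQ).
 */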
static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
					struct mlx5e_cq *cq, u32 cqcc)
{
	cq->title.byte_cnt = cq->mini_arr[cq->mini_arr_idx].byte_cnt;
	cq->title.check_sum = cq->mini_arr[cq->mini_arr_idx].checksum;
	cq->title.op_own &= 0xf0;
	cq->title.op_own |= 0x01 & (cqcc >> cq->wq.log_sz);
	cq->title.wqe_counter = cpu_to_be16(cq->decmprs_wqe_counter);

	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
		cq->decmprs_wqe_counter +=
			mpwrq_get_cqe_consumed_strides(&cq->title);
	else
		cq->decmprs_wqe_counter =
			(cq->decmprs_wqe_counter + 1) & rq->wq.sz_m1;
}

static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
						struct mlx5e_cq *cq, u32 cqcc)
{
	mlx5e_decompress_cqe(rq, cq, cqcc);
	cq->title.rss_hash_type = 0;
	cq->title.rss_hash_result = 0;
}
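
/* Walk the mini CQE array (fetching further blocks as needed), expand
 * each entry and hand it to the RX handler, up to the remaining NAPI
 * budget.  mlx5e_decompress_cqes_start() bootstraps this after a new
 * title CQE; the return values feed the caller's budget accounting.
 */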
static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
					     struct mlx5e_cq *cq,
					     int update_owner_only,
					     int budget_rem)
{
	u32 cqcc = cq->wq.cc + update_owner_only;
	u32 cqe_count;
	u32 i;

	cqe_count = min_t(u32, cq->decmprs_left, budget_rem);

	for (i = update_owner_only; i < cqe_count;
	     i++, cq->mini_arr_idx++, cqcc++) {
		if (cq->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE)
			mlx5e_read_mini_arr_slot(cq, cqcc);

		mlx5e_decompress_cqe_no_hash(rq, cq, cqcc);
		rq->handle_rx_cqe(rq, &cq->title);
	}
	mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
	cq->wq.cc = cqcc;
	cq->decmprs_left -= cqe_count;
	rq->stats.cqe_compress_pkts += cqe_count;

	return cqe_count;
}

static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
					      struct mlx5e_cq *cq,
					      int budget_rem)
{
	mlx5e_read_title_slot(rq, cq, cq->wq.cc);
	mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1);
	mlx5e_decompress_cqe(rq, cq, cq->wq.cc);
	rq->handle_rx_cqe(rq, &cq->title);
	cq->mini_arr_idx++;

	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
}

#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
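
/* Pages from pfmemalloc emergency reserves or from a remote NUMA node
 * are not worth keeping around for reuse.
 */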
static inline bool mlx5e_page_is_reserved(struct page *page)
{
	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
}
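
/* Per-ring RX page cache: mlx5e_rx_cache_put() parks a still-mapped
 * page at the tail (unless the cache is full or the page is reserved),
 * mlx5e_rx_cache_get() pops the head entry, but only if nobody else
 * still holds a reference to it, and re-syncs it for device DMA.
 */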
static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
				      struct mlx5e_dma_info *dma_info)
{
	struct mlx5e_page_cache *cache = &rq->page_cache;
	u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);

	if (tail_next == cache->head) {
		rq->stats.cache_full++;
		return false;
	}

	if (unlikely(mlx5e_page_is_reserved(dma_info->page))) {
		rq->stats.cache_waive++;
		return false;
	}

	cache->page_cache[cache->tail] = *dma_info;
	cache->tail = tail_next;
	return true;
}

static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
				      struct mlx5e_dma_info *dma_info)
{
	struct mlx5e_page_cache *cache = &rq->page_cache;

	if (unlikely(cache->head == cache->tail)) {
		rq->stats.cache_empty++;
		return false;
	}

	if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
		rq->stats.cache_busy++;
		return false;
	}

	*dma_info = cache->page_cache[cache->head];
	cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
	rq->stats.cache_reuse++;

	dma_sync_single_for_device(rq->pdev, dma_info->addr,
				   RQ_PAGE_SIZE(rq),
				   DMA_FROM_DEVICE);
	return true;
}
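
/* Get a DMA-mapped RX page: try the page cache first, otherwise
 * allocate a fresh page and map it in the RQ's DMA direction.
 */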
static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
					  struct mlx5e_dma_info *dma_info)
{
	struct page *page;

	if (mlx5e_rx_cache_get(rq, dma_info))
		return 0;

	page = dev_alloc_pages(rq->buff.page_order);
	if (unlikely(!page))
		return -ENOMEM;

	dma_info->addr = dma_map_page(rq->pdev, page, 0,
				      RQ_PAGE_SIZE(rq), rq->buff.map_dir);
	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
		put_page(page);
		return -ENOMEM;
	}
	dma_info->page = page;

	return 0;
}
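
/* Release an RX page: recycle it into the page cache when allowed,
 * otherwise unmap it and drop the reference.
 */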
void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
			bool recycle)
{
	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
		return;

	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
		       rq->buff.map_dir);
	put_page(dma_info->page);
}
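
/* Page reuse in the non-Striding RQ datapath: as long as the page has
 * room for one more full fragment and is still worth recycling, the
 * WQE keeps its page and only the buffer offset advances between
 * rounds, so several small packets can share one page.
 */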
static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
				    struct mlx5e_wqe_frag_info *wi)
{
	return rq->wqe.page_reuse && wi->di.page &&
		(wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
		!mlx5e_page_is_reserved(wi->di.page);
}

static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
{
	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];

	/* check if page exists, hence can be reused */
	if (!wi->di.page) {
		if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
			return -ENOMEM;
		wi->offset = 0;
	}

	wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
	return 0;
}

static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
				     struct mlx5e_wqe_frag_info *wi)
{
	mlx5e_page_release(rq, &wi->di, true);
	wi->di.page = NULL;
}

static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
					   struct mlx5e_wqe_frag_info *wi)
{
	if (mlx5e_page_reuse(rq, wi)) {
		rq->stats.page_reuse++;
		return;
	}

	mlx5e_free_rx_wqe(rq, wi);
}

void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
{
	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];

	if (wi->di.page)
		mlx5e_free_rx_wqe(rq, wi);
}

static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
{
	return rq->mpwqe.num_strides >> MLX5_MPWRQ_WQE_PAGE_ORDER;
}
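
/* Attach one fragment of an MPWQE page to the SKB.  truesize is
 * rounded up to a whole number of strides, which is what the packet
 * actually consumed from the RQ buffer.
 */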
static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq,
					    struct sk_buff *skb,
					    struct mlx5e_mpw_info *wi,
					    u32 page_idx, u32 frag_offset,
					    u32 len)
{
	unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz));

	dma_sync_single_for_cpu(rq->pdev,
				wi->umr.dma_info[page_idx].addr + frag_offset,
				len, DMA_FROM_DEVICE);
	wi->skbs_frags[page_idx]++;
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
			wi->umr.dma_info[page_idx].page, frag_offset,
			len, truesize);
}
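
/* Copy the packet headers into the linear part of the SKB.  The header
 * may straddle a page boundary; in that case the remainder is copied
 * from the beginning of the next page of the MPWQE.
 */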
static inline void
mlx5e_copy_skb_header_mpwqe(struct device *pdev,
			    struct sk_buff *skb,
			    struct mlx5e_mpw_info *wi,
			    u32 page_idx, u32 offset,
			    u32 headlen)
{
	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[page_idx];
	unsigned int len;

	/* Aligning len to sizeof(long) optimizes memcpy performance */
	len = ALIGN(headlen_pg, sizeof(long));
	dma_sync_single_for_cpu(pdev, dma_info->addr + offset, len,
				DMA_FROM_DEVICE);
	skb_copy_to_linear_data_offset(skb, 0,
				       page_address(dma_info->page) + offset,
				       len);
	if (unlikely(offset + headlen > PAGE_SIZE)) {
		dma_info++;
		headlen_pg = len;
		len = ALIGN(headlen - headlen_pg, sizeof(long));
		dma_sync_single_for_cpu(pdev, dma_info->addr, len,
					DMA_FROM_DEVICE);
		skb_copy_to_linear_data_offset(skb, headlen_pg,
					       page_address(dma_info->page),
					       len);
	}
}
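
/* Post the UMR WQE that maps an MPWQE's pages.  The ICO SQ edge is
 * padded with NOPs first so the UMR WQE never wraps around the end of
 * the work queue, then the doorbell is rung.
 */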
static inline void mlx5e_post_umr_wqe(struct mlx5e_rq *rq, u16 ix)
{
	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
	struct mlx5e_icosq *sq = &rq->channel->icosq;
	struct mlx5_wq_cyc *wq = &sq->wq;
	struct mlx5e_umr_wqe *wqe;
	u8 num_wqebbs = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_BB);
	u16 pi;

	/* fill sq edge with nops to avoid wqe wrap around */
	while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
		sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
	}

	wqe = mlx5_wq_cyc_get_wqe(wq, pi);
	memcpy(wqe, &wi->umr.wqe, sizeof(*wqe));
	wqe->ctrl.opmod_idx_opcode =
		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
			    MLX5_OPCODE_UMR);

	sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
	sq->pc += num_wqebbs;
	mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
}
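
/* Allocate and DMA-map the pages of one multi-packet WQE, fill the MTT
 * entries used by the UMR, and pre-charge each page's refcount with
 * the number of strides it backs.  On failure, pages taken so far are
 * released again.
 */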
static int mlx5e_alloc_rx_umr_mpwqe(struct mlx5e_rq *rq,
				    u16 ix)
{
	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
	int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
	int err;
	int i;

	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
		err = mlx5e_page_alloc_mapped(rq, dma_info);
		if (unlikely(err))
			goto err_unmap;
		wi->umr.mtt[i] = cpu_to_be64(dma_info->addr | MLX5_EN_WR);
		page_ref_add(dma_info->page, pg_strides);
	}

	memset(wi->skbs_frags, 0, sizeof(*wi->skbs_frags) * MLX5_MPWRQ_PAGES_PER_WQE);
	wi->consumed_strides = 0;

	return 0;

err_unmap:
	while (--i >= 0) {
		dma_info--;
		page_ref_sub(dma_info->page, pg_strides);
		mlx5e_page_release(rq, dma_info, true);
	}

	return err;
}
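
/* Release an MPWQE's pages: drop the stride references that were not
 * handed over to SKB fragments, then recycle or free each page.
 */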
void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
{
	int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
	int i;

	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
		page_ref_sub(dma_info->page, pg_strides - wi->skbs_frags[i]);
		mlx5e_page_release(rq, dma_info, true);
	}
}
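
/* Hand the current MPWQE over to the hardware: clear umr_in_progress,
 * push the WQE onto the linked-list RQ and update the doorbell record,
 * with a dma_wmb() so the device sees the WQE contents first.
 */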
static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
{
	struct mlx5_wq_ll *wq = &rq->wq;
	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);

	rq->mpwqe.umr_in_progress = false;

	mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));

	/* ensure wqes are visible to device before updating doorbell record */
	dma_wmb();

	mlx5_wq_ll_update_db_record(wq);
}
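
/* Refill one Striding RQ slot: build the UMR MPWQE and post it on the
 * ICO SQ.  The RX WQE itself is posted only once the UMR completes,
 * tracked via mpwqe.umr_in_progress.
 */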
|
|
|
|
|
2017-07-17 03:27:26 -06:00
|
|
|
static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
|
2016-04-20 13:02:15 -06:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2017-06-25 07:28:46 -06:00
|
|
|
err = mlx5e_alloc_rx_umr_mpwqe(rq, ix);
|
2017-07-17 03:27:26 -06:00
|
|
|
if (unlikely(err)) {
|
|
|
|
rq->stats.buff_alloc_err++;
|
net/mlx5e: Single flow order-0 pages for Striding RQ
To improve the memory consumption scheme, we omit the flow that
demands and splits high-order pages in Striding RQ, and stay
with a single Striding RQ flow that uses order-0 pages.
Moving to fragmented memory allows the use of larger MPWQEs,
which reduces the number of UMR posts and filler CQEs.
Moving to a single flow allows several optimizations that improve
performance, especially in production servers where we would
anyway fallback to order-0 allocations:
- inline functions that were called via function pointers.
- improve the UMR post process.
This patch alone is expected to give a slight performance reduction.
However, the new memory scheme gives the possibility to use a page-cache
of a fair size, that doesn't inflate the memory footprint, which will
dramatically fix the reduction and even give a performance gain.
Performance tests:
The following results were measured on a freshly booted system,
giving optimal baseline performance, as high-order pages are yet to
be fragmented and depleted.
We ran pktgen single-stream benchmarks, with iptables-raw-drop:
Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - this patch
no reduction
Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - this patch
3.5% reduction
Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - this patch
4% reduction
Fixes: 461017cb006a ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-15 07:08:36 -06:00
|
|
|
return err;
|
2017-07-17 03:27:26 -06:00
|
|
|
}
|
2017-07-03 01:18:19 -06:00
|
|
|
rq->mpwqe.umr_in_progress = true;
|
net/mlx5e: Single flow order-0 pages for Striding RQ
To improve the memory consumption scheme, we omit the flow that
demands and splits high-order pages in Striding RQ, and stay
with a single Striding RQ flow that uses order-0 pages.
Moving to fragmented memory allows the use of larger MPWQEs,
which reduces the number of UMR posts and filler CQEs.
Moving to a single flow allows several optimizations that improve
performance, especially in production servers where we would
anyway fallback to order-0 allocations:
- inline functions that were called via function pointers.
- improve the UMR post process.
This patch alone is expected to give a slight performance reduction.
However, the new memory scheme gives the possibility to use a page-cache
of a fair size, that doesn't inflate the memory footprint, which will
dramatically fix the reduction and even give a performance gain.
Performance tests:
The following results were measured on a freshly booted system,
giving optimal baseline performance, as high-order pages are yet to
be fragmented and depleted.
We ran pktgen single-stream benchmarks, with iptables-raw-drop:
Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - this patch
no reduction
Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - this patch
3.5% reduction
Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - this patch
4% reduction
Fixes: 461017cb006a ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-15 07:08:36 -06:00
|
|
|
mlx5e_post_umr_wqe(rq, ix);
|
2017-07-17 03:27:26 -06:00
|
|
|
return 0;
|
net/mlx5e: Support RX multi-packet WQE (Striding RQ)
Introduce the feature of multi-packet WQE (RX Work Queue Element)
referred to as (MPWQE or Striding RQ), in which WQEs are larger
and serve multiple packets each.
Every WQE consists of many strides of the same size, every received
packet is aligned to a beginning of a stride and is written to
consecutive strides within a WQE.
In the regular approach, each WQE is big enough to serve a single received
packet of any size up to the MTU, or up to 64K when device LRO is enabled,
which is very wasteful for small packets and when LRO is enabled.
Thanks to its flexibility, MPWQE allows better memory utilization (implying
improvements in CPU utilization and packet rate), as packets consume strides
according to their size, leaving the rest of the WQE available for other
packets.
MPWQE default configuration:
Num of WQEs = 16
Strides Per WQE = 2048
Stride Size = 64 bytes
The default WQE memory footprint went from 1024*MTU (~1.5MB) to
16 * 2048 * 64 = 2MB per ring (see the arithmetic sketch after this message).
However, HW LRO can now be supported at no additional cost in memory
footprint, hence we turn it on by default and get even better
performance.
Performance tested on ConnectX4-Lx 50G.
To isolate the feature under test, the numbers below were measured with
HW LRO turned off. We verified that the performance just improves when
LRO is turned back on.
* Netperf single TCP stream:
- BW raised by 10-15% for representative packet sizes:
default, 64B, 1024B, 1478B, 65536B.
* Netperf multi TCP stream:
- No degradation, line rate reached.
* Pktgen: packet rate raised by 2-10% for traffic of different message
sizes: 64B, 128B, 256B, 1024B, and 1500B.
* Pktgen: packet loss in bursts of small messages (64 bytes),
  single stream:
  | num packets | packet loss before | packet loss after |
  | 2K          | ~ 1K               | 0                 |
  | 8K          | ~ 6K               | 0                 |
  | 16K         | ~13K               | 0                 |
  | 32K         | ~28K               | 0                 |
  | 64K         | ~57K               | ~24K              |
This is expected, as the driver can now receive as many small packets (<=64B)
as the total number of strides in the ring (default = 2048 * 16), vs. 1024
(the default ring size, regardless of packet size) before this feature.
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-20 13:02:13 -06:00
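The default configuration quoted above fixes both the per-ring memory footprint and the number of strides a packet consumes. A small arithmetic sketch of those two calculations (values taken from the message; the program itself is purely illustrative):

#include <stdio.h>

int main(void)
{
	const unsigned int num_wqes        = 16;   /* MPWQEs per ring */
	const unsigned int strides_per_wqe = 2048;
	const unsigned int stride_size     = 64;   /* bytes */

	/* Per-ring footprint: 16 * 2048 * 64 = 2MB, as stated in the message. */
	unsigned long footprint = (unsigned long)num_wqes * strides_per_wqe * stride_size;

	/* A packet consumes ceil(len / stride_size) consecutive strides. */
	unsigned int pkt_len = 1500;
	unsigned int strides = (pkt_len + stride_size - 1) / stride_size;

	printf("ring footprint: %lu bytes (%lu MB)\n", footprint, footprint >> 20);
	printf("a %u-byte packet consumes %u strides (%u bytes)\n",
	       pkt_len, strides, strides * stride_size);
	return 0;
}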
|
|
|
}
|
|
|
|
|
2016-06-30 08:34:46 -06:00
|
|
|
void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
|
|
|
|
{
|
2016-09-21 03:19:43 -06:00
|
|
|
struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
|
2016-06-30 08:34:46 -06:00
|
|
|
|
2016-09-15 07:08:36 -06:00
|
|
|
mlx5e_free_rx_mpwqe(rq, wi);
|
2016-06-30 08:34:46 -06:00
|
|
|
}
|
|
|
|
|
2015-05-28 13:28:46 -06:00
|
|
|
bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
|
|
|
|
{
|
|
|
|
struct mlx5_wq_ll *wq = &rq->wq;
|
2017-06-19 09:11:30 -06:00
|
|
|
int err;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-07-03 02:27:20 -06:00
|
|
|
if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
|
2015-05-28 13:28:46 -06:00
|
|
|
return false;
|
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
if (mlx5_wq_ll_is_full(wq))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
do {
|
2015-05-28 13:28:46 -06:00
|
|
|
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
|
|
|
|
|
2017-07-17 03:27:26 -06:00
|
|
|
err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head);
|
2016-04-20 13:02:19 -06:00
|
|
|
if (unlikely(err)) {
|
2016-09-15 07:08:36 -06:00
|
|
|
rq->stats.buff_alloc_err++;
|
2015-05-28 13:28:46 -06:00
|
|
|
break;
|
2016-04-20 13:02:19 -06:00
|
|
|
}
|
2015-05-28 13:28:46 -06:00
|
|
|
|
|
|
|
mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
|
2017-06-19 09:11:30 -06:00
|
|
|
} while (!mlx5_wq_ll_is_full(wq));
|
2015-05-28 13:28:46 -06:00
|
|
|
|
|
|
|
/* ensure wqes are visible to device before updating doorbell record */
|
|
|
|
dma_wmb();
|
|
|
|
|
|
|
|
mlx5_wq_ll_update_db_record(wq);
|
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
return !!err;
|
2015-05-28 13:28:46 -06:00
|
|
|
}
|
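mlx5e_post_rx_wqes() above follows a common producer pattern: refill descriptors until the ring is full or an allocation fails, make the new WQEs visible to the device, then ring the doorbell; the !!err return tells the caller that a refill attempt failed and should be retried later. A simplified user-space analogue is sketched below; the ring type, publish() and the malloc()-based buffer allocation are stand-ins, not driver APIs.

#include <stdbool.h>
#include <stdlib.h>

#define RING_SIZE 8

struct ring {
	void        *buf[RING_SIZE];
	unsigned int head;   /* producer index */
	unsigned int fill;   /* number of currently posted WQEs */
};

static bool ring_is_full(const struct ring *r) { return r->fill == RING_SIZE; }

/* Stand-in for making the refilled descriptors visible and ringing the doorbell. */
static void publish(struct ring *r) { (void)r; /* dma_wmb() + doorbell in the driver */ }

/* Returns true if a refill attempt failed, mirroring the !!err return above. */
static bool post_rx_wqes(struct ring *r)
{
	int err = 0;

	if (ring_is_full(r))
		return false;

	do {
		void *buf = malloc(2048);        /* stand-in for the page/frag alloc */
		if (!buf) {
			err = -1;                /* buff_alloc_err counter analogue */
			break;
		}
		r->buf[r->head] = buf;
		r->head = (r->head + 1) % RING_SIZE;
		r->fill++;
	} while (!ring_is_full(r));

	publish(r);                              /* ensure WQEs are visible, then doorbell */
	return err != 0;
}

int main(void)
{
	struct ring r = { .head = 0, .fill = 0 };
	return post_rx_wqes(&r) ? 1 : 0;
}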
|
|
|
|
2017-07-17 03:27:26 -06:00
|
|
|
static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
|
|
|
|
struct mlx5e_icosq *sq,
|
|
|
|
struct mlx5e_rq *rq,
|
2017-07-17 05:31:39 -06:00
|
|
|
struct mlx5_cqe64 *cqe)
|
2017-07-17 03:27:26 -06:00
|
|
|
{
|
|
|
|
struct mlx5_wq_cyc *wq = &sq->wq;
|
|
|
|
u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
|
|
|
|
struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
|
|
|
|
|
|
|
|
mlx5_cqwq_pop(&cq->wq);
|
|
|
|
|
|
|
|
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
|
|
|
|
WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
|
|
|
|
cqe->op_own);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (likely(icowi->opcode == MLX5_OPCODE_UMR)) {
|
|
|
|
mlx5e_post_rx_mpwqe(rq);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(icowi->opcode != MLX5_OPCODE_NOP))
|
|
|
|
WARN_ONCE(true,
|
|
|
|
"mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
|
|
|
|
icowi->opcode);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
|
|
|
|
{
|
|
|
|
struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
|
|
|
|
struct mlx5_cqe64 *cqe;
|
|
|
|
|
|
|
|
if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
cqe = mlx5_cqwq_get_cqe(&cq->wq);
|
|
|
|
if (likely(!cqe))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* by design, there's only a single cqe */
|
2017-07-17 05:31:39 -06:00
|
|
|
mlx5e_poll_ico_single_cqe(cq, sq, rq, cqe);
|
2017-07-17 03:27:26 -06:00
|
|
|
|
|
|
|
mlx5_cqwq_update_db_record(&cq->wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
|
|
|
|
{
|
|
|
|
struct mlx5_wq_ll *wq = &rq->wq;
|
|
|
|
|
|
|
|
if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq);
|
|
|
|
|
|
|
|
if (mlx5_wq_ll_is_full(wq))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!rq->mpwqe.umr_in_progress)
|
|
|
|
mlx5e_alloc_rx_mpwqe(rq, wq->head);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
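mlx5e_post_rx_mpwqes() above gates MPWQE replenishment on an outstanding UMR post: a new MPWQE is prepared only when no UMR is in flight, and the UMR completion handled in mlx5e_poll_ico_single_cqe() is what finally posts the WQE. A tiny state-machine sketch of that hand-off, with illustrative names and the actual UMR/WQE posting details elided:

#include <stdbool.h>
#include <stdio.h>

struct mpwqe_state {
	bool umr_in_progress;   /* mirrors rq->mpwqe.umr_in_progress */
};

/* Refill path: start a UMR only if none is outstanding. */
static void post_rx_mpwqes(struct mpwqe_state *s)
{
	if (s->umr_in_progress)
		return;                    /* wait for the completion first */
	s->umr_in_progress = true;         /* alloc MPWQE pages + post UMR */
}

/* Completion path: the UMR CQE is what arms the actual MPWQE post. */
static void umr_completion(struct mpwqe_state *s)
{
	s->umr_in_progress = false;        /* MPWQE is now posted, allow the next UMR */
}

int main(void)
{
	struct mpwqe_state s = { .umr_in_progress = false };

	post_rx_mpwqes(&s);                /* starts a UMR */
	post_rx_mpwqes(&s);                /* no-op: still in flight */
	umr_completion(&s);                /* completion re-arms the refill */
	printf("umr_in_progress=%d\n", s.umr_in_progress);
	return 0;
}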
|
|
|
|
2016-04-20 13:02:13 -06:00
|
|
|
static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
|
|
|
|
u32 cqe_bcnt)
|
2015-05-28 13:28:46 -06:00
|
|
|
{
|
2016-09-07 10:08:01 -06:00
|
|
|
struct ethhdr *eth = (struct ethhdr *)(skb->data);
|
2015-05-28 13:28:46 -06:00
|
|
|
struct tcphdr *tcp;
|
2016-09-07 10:08:01 -06:00
|
|
|
int network_depth = 0;
|
|
|
|
__be16 proto;
|
|
|
|
u16 tot_len;
|
2017-06-05 02:17:20 -06:00
|
|
|
void *ip_p;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
|
|
|
u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
|
2017-06-05 02:17:20 -06:00
|
|
|
u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
|
|
|
|
(l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA);
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2016-09-07 10:08:01 -06:00
|
|
|
skb->mac_len = ETH_HLEN;
|
|
|
|
proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth);
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2016-09-07 10:08:01 -06:00
|
|
|
tot_len = cqe_bcnt - network_depth;
|
2017-06-05 02:17:20 -06:00
|
|
|
ip_p = skb->data + network_depth;
|
2016-09-07 10:08:01 -06:00
|
|
|
|
|
|
|
if (proto == htons(ETH_P_IP)) {
|
2017-06-05 02:17:20 -06:00
|
|
|
struct iphdr *ipv4 = ip_p;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-06-05 02:17:20 -06:00
|
|
|
tcp = ip_p + sizeof(struct iphdr);
|
|
|
|
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
|
|
|
ipv4->ttl = cqe->lro_min_ttl;
|
|
|
|
ipv4->tot_len = cpu_to_be16(tot_len);
|
|
|
|
ipv4->check = 0;
|
|
|
|
ipv4->check = ip_fast_csum((unsigned char *)ipv4,
|
|
|
|
ipv4->ihl);
|
|
|
|
} else {
|
2017-06-05 02:17:20 -06:00
|
|
|
struct ipv6hdr *ipv6 = ip_p;
|
|
|
|
|
|
|
|
tcp = ip_p + sizeof(struct ipv6hdr);
|
|
|
|
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
|
|
|
|
|
2015-05-28 13:28:46 -06:00
|
|
|
ipv6->hop_limit = cqe->lro_min_ttl;
|
|
|
|
ipv6->payload_len = cpu_to_be16(tot_len -
|
|
|
|
sizeof(struct ipv6hdr));
|
|
|
|
}
|
2017-06-05 02:17:20 -06:00
|
|
|
|
|
|
|
tcp->psh = get_cqe_lro_tcppsh(cqe);
|
|
|
|
|
|
|
|
if (tcp_ack) {
|
|
|
|
tcp->ack = 1;
|
|
|
|
tcp->ack_seq = cqe->lro_ack_seq_num;
|
|
|
|
tcp->window = cqe->lro_tcp_win;
|
|
|
|
}
|
2015-05-28 13:28:46 -06:00
|
|
|
}
|
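mlx5e_lro_update_hdr() above rewrites the aggregated packet's headers so the stack sees one consistent large segment: for IPv4 the total length is rewritten and the header checksum recomputed, while IPv6 needs its payload length updated (no checksum to fix). Below is a user-space sketch of the IPv4 part that uses a plain internet-checksum helper instead of ip_fast_csum(); the struct layout assumes a bare 20-byte header with no options and is illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

struct ipv4_hdr {
	uint8_t  ver_ihl;
	uint8_t  tos;
	uint16_t tot_len;
	uint16_t id;
	uint16_t frag_off;
	uint8_t  ttl;
	uint8_t  protocol;
	uint16_t check;
	uint32_t saddr;
	uint32_t daddr;
};

/* Standard one's-complement internet checksum over the (even-sized) header. */
static uint16_t ip_checksum(const void *hdr, size_t len)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;

	for (; len > 1; len -= 2)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Rewrite tot_len for the aggregated byte count and fix the header checksum. */
static void lro_update_ipv4(struct ipv4_hdr *ip, uint16_t aggregated_len, uint8_t min_ttl)
{
	ip->ttl     = min_ttl;
	ip->tot_len = htons(aggregated_len);
	ip->check   = 0;
	ip->check   = ip_checksum(ip, sizeof(*ip));
}

int main(void)
{
	struct ipv4_hdr ip = { .ver_ihl = 0x45, .ttl = 64, .protocol = 6 };

	lro_update_ipv4(&ip, 16384, 60);   /* e.g. ~11 MTU-sized frames coalesced */
	printf("tot_len=%u check=0x%04x\n", ntohs(ip.tot_len), ntohs(ip.check));
	return 0;
}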
|
|
|
|
|
|
|
static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
u8 cht = cqe->rss_hash_type;
|
|
|
|
int ht = (cht & CQE_RSS_HTYPE_L4) ? PKT_HASH_TYPE_L4 :
|
|
|
|
(cht & CQE_RSS_HTYPE_IP) ? PKT_HASH_TYPE_L3 :
|
|
|
|
PKT_HASH_TYPE_NONE;
|
|
|
|
skb_set_hash(skb, be32_to_cpu(cqe->rss_hash_result), ht);
|
|
|
|
}
|
|
|
|
|
2015-08-16 07:04:52 -06:00
|
|
|
static inline bool is_first_ethertype_ip(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
__be16 ethertype = ((struct ethhdr *)skb->data)->h_proto;
|
|
|
|
|
|
|
|
return (ethertype == htons(ETH_P_IP) || ethertype == htons(ETH_P_IPV6));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mlx5e_handle_csum(struct net_device *netdev,
|
|
|
|
struct mlx5_cqe64 *cqe,
|
|
|
|
struct mlx5e_rq *rq,
|
2016-02-22 09:17:30 -07:00
|
|
|
struct sk_buff *skb,
|
|
|
|
bool lro)
|
2015-08-16 07:04:52 -06:00
|
|
|
{
|
|
|
|
if (unlikely(!(netdev->features & NETIF_F_RXCSUM)))
|
|
|
|
goto csum_none;
|
|
|
|
|
2016-02-22 09:17:30 -07:00
|
|
|
if (lro) {
|
2015-08-16 07:04:52 -06:00
|
|
|
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
2017-09-13 06:37:50 -06:00
|
|
|
rq->stats.csum_unnecessary++;
|
2016-04-24 13:51:56 -06:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_first_ethertype_ip(skb)) {
|
2015-08-16 07:04:52 -06:00
|
|
|
skb->ip_summed = CHECKSUM_COMPLETE;
|
2015-08-17 22:22:26 -06:00
|
|
|
skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
|
2016-06-27 03:08:38 -06:00
|
|
|
rq->stats.csum_complete++;
|
2016-04-24 13:51:56 -06:00
|
|
|
return;
|
2015-08-16 07:04:52 -06:00
|
|
|
}
|
|
|
|
|
2016-04-24 13:51:56 -06:00
|
|
|
if (likely((cqe->hds_ip_ext & CQE_L3_OK) &&
|
|
|
|
(cqe->hds_ip_ext & CQE_L4_OK))) {
|
|
|
|
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
|
|
|
if (cqe_is_tunneled(cqe)) {
|
|
|
|
skb->csum_level = 1;
|
|
|
|
skb->encapsulation = 1;
|
2016-06-27 03:08:38 -06:00
|
|
|
rq->stats.csum_unnecessary_inner++;
|
2017-09-13 06:37:50 -06:00
|
|
|
return;
|
2016-04-24 13:51:56 -06:00
|
|
|
}
|
2017-09-13 06:37:50 -06:00
|
|
|
rq->stats.csum_unnecessary++;
|
2016-04-24 13:51:56 -06:00
|
|
|
return;
|
|
|
|
}
|
2015-08-16 07:04:52 -06:00
|
|
|
csum_none:
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
rq->stats.csum_none++;
|
|
|
|
}
|
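mlx5e_handle_csum() above is a small decision ladder: RXCSUM disabled -> CHECKSUM_NONE; LRO -> CHECKSUM_UNNECESSARY; first ethertype IPv4/IPv6 -> CHECKSUM_COMPLETE using the checksum carried in the CQE; otherwise fall back to the CQE's L3_OK/L4_OK bits, raising csum_level for tunneled packets. A compact sketch of the same ladder with the inputs flattened into booleans (the names below are illustrative, not the driver's):

#include <stdbool.h>
#include <stdio.h>

enum csum_result { CSUM_NONE, CSUM_UNNECESSARY, CSUM_COMPLETE };

struct rx_csum_in {
	bool rxcsum_enabled;     /* NETIF_F_RXCSUM on the netdev */
	bool lro;                /* packet was LRO-aggregated */
	bool first_ethertype_ip; /* outermost ethertype is IPv4/IPv6 */
	bool l3_ok, l4_ok;       /* CQE_L3_OK / CQE_L4_OK bits */
	bool tunneled;           /* cqe_is_tunneled() */
};

static enum csum_result handle_csum(const struct rx_csum_in *in, int *csum_level)
{
	*csum_level = 0;

	if (!in->rxcsum_enabled)
		return CSUM_NONE;
	if (in->lro)
		return CSUM_UNNECESSARY;
	if (in->first_ethertype_ip)
		return CSUM_COMPLETE;           /* skb->csum taken from the CQE */
	if (in->l3_ok && in->l4_ok) {
		if (in->tunneled)
			*csum_level = 1;        /* inner checksum also verified */
		return CSUM_UNNECESSARY;
	}
	return CSUM_NONE;
}

int main(void)
{
	struct rx_csum_in in = { .rxcsum_enabled = true, .l3_ok = true, .l4_ok = true,
				 .tunneled = true };
	int level;

	printf("result=%d csum_level=%d\n", handle_csum(&in, &level), level);
	return 0;
}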
|
|
|
|
2015-05-28 13:28:46 -06:00
|
|
|
static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
|
2016-04-20 13:02:13 -06:00
|
|
|
u32 cqe_bcnt,
|
2015-05-28 13:28:46 -06:00
|
|
|
struct mlx5e_rq *rq,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct net_device *netdev = rq->netdev;
|
2015-12-29 05:58:31 -07:00
|
|
|
struct mlx5e_tstamp *tstamp = rq->tstamp;
|
2015-05-28 13:28:46 -06:00
|
|
|
int lro_num_seg;
|
|
|
|
|
|
|
|
lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
|
|
|
|
if (lro_num_seg > 1) {
|
2016-04-20 13:02:13 -06:00
|
|
|
mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
|
2015-08-16 07:04:49 -06:00
|
|
|
skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
|
2017-03-21 07:59:19 -06:00
|
|
|
/* Subtract one since we already counted this as one
|
|
|
|
* "regular" packet in mlx5e_complete_rx_cqe()
|
|
|
|
*/
|
|
|
|
rq->stats.packets += lro_num_seg - 1;
|
2015-05-28 13:28:46 -06:00
|
|
|
rq->stats.lro_packets++;
|
|
|
|
rq->stats.lro_bytes += cqe_bcnt;
|
|
|
|
}
|
|
|
|
|
2015-12-29 05:58:31 -07:00
|
|
|
if (unlikely(mlx5e_rx_hw_stamp(tstamp)))
|
|
|
|
mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb));
|
|
|
|
|
2015-05-28 13:28:46 -06:00
|
|
|
skb_record_rx_queue(skb, rq->ix);
|
|
|
|
|
|
|
|
if (likely(netdev->features & NETIF_F_RXHASH))
|
|
|
|
mlx5e_skb_set_hash(cqe, skb);
|
|
|
|
|
|
|
|
if (cqe_has_vlan(cqe))
|
|
|
|
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
|
|
|
|
be16_to_cpu(cqe->vlan_info));
|
2016-03-08 03:42:38 -07:00
|
|
|
|
|
|
|
skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
|
2016-04-20 13:02:18 -06:00
|
|
|
|
|
|
|
mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg);
|
|
|
|
skb->protocol = eth_type_trans(skb, netdev);
|
2015-05-28 13:28:46 -06:00
|
|
|
}
|
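In mlx5e_build_rx_skb() above, an LRO completion reports how many segments were coalesced; gso_size is the aggregated byte count divided (rounded up) across those segments, and the packet counter is bumped by lro_num_seg - 1 because the completion was already counted once as a regular packet. A tiny sketch of that bookkeeping with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned int cqe_bcnt    = 16384;  /* aggregated byte count from the CQE */
	unsigned int lro_num_seg = 11;     /* coalesced segments reported by HW */
	unsigned long packets = 1, lro_packets = 0, lro_bytes = 0;

	/* gso_size = ceil(cqe_bcnt / lro_num_seg), as in DIV_ROUND_UP() */
	unsigned int gso_size = (cqe_bcnt + lro_num_seg - 1) / lro_num_seg;

	if (lro_num_seg > 1) {
		packets += lro_num_seg - 1;    /* already counted once as a regular packet */
		lro_packets++;
		lro_bytes += cqe_bcnt;
	}

	printf("gso_size=%u packets=%lu lro_packets=%lu lro_bytes=%lu\n",
	       gso_size, packets, lro_packets, lro_bytes);
	return 0;
}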
|
|
|
|
2016-04-20 13:02:13 -06:00
|
|
|
static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
|
|
|
|
struct mlx5_cqe64 *cqe,
|
|
|
|
u32 cqe_bcnt,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
rq->stats.packets++;
|
|
|
|
rq->stats.bytes += cqe_bcnt;
|
|
|
|
mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
|
|
|
|
}
|
|
|
|
|
2017-03-24 15:52:14 -06:00
|
|
|
static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
|
2016-09-21 03:19:49 -06:00
|
|
|
{
|
|
|
|
struct mlx5_wq_cyc *wq = &sq->wq;
|
|
|
|
struct mlx5e_tx_wqe *wqe;
|
2017-03-24 15:52:10 -06:00
|
|
|
u16 pi = (sq->pc - 1) & wq->sz_m1; /* last pi */
|
2016-09-21 03:19:49 -06:00
|
|
|
|
|
|
|
wqe = mlx5_wq_cyc_get_wqe(wq, pi);
|
|
|
|
|
2017-03-24 15:52:11 -06:00
|
|
|
mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &wqe->ctrl);
|
2016-09-21 03:19:49 -06:00
|
|
|
}
|
|
|
|
|
2017-01-24 18:28:18 -07:00
|
|
|
static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
|
2016-09-21 03:19:48 -06:00
|
|
|
struct mlx5e_dma_info *di,
|
2017-01-17 23:06:07 -07:00
|
|
|
const struct xdp_buff *xdp)
|
2016-09-21 03:19:48 -06:00
|
|
|
{
|
2017-03-24 15:52:14 -06:00
|
|
|
struct mlx5e_xdpsq *sq = &rq->xdpsq;
|
2016-09-21 03:19:48 -06:00
|
|
|
struct mlx5_wq_cyc *wq = &sq->wq;
|
2017-03-24 15:52:14 -06:00
|
|
|
u16 pi = sq->pc & wq->sz_m1;
|
2016-09-21 03:19:48 -06:00
|
|
|
struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi);
|
|
|
|
|
|
|
|
struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
|
|
|
|
struct mlx5_wqe_eth_seg *eseg = &wqe->eth;
|
|
|
|
struct mlx5_wqe_data_seg *dseg;
|
|
|
|
|
2017-01-17 23:06:07 -07:00
|
|
|
ptrdiff_t data_offset = xdp->data - xdp->data_hard_start;
|
2016-12-06 05:04:05 -07:00
|
|
|
dma_addr_t dma_addr = di->addr + data_offset;
|
2017-01-17 23:06:07 -07:00
|
|
|
unsigned int dma_len = xdp->data_end - xdp->data;
|
|
|
|
|
2017-03-24 15:52:10 -06:00
|
|
|
prefetchw(wqe);
|
|
|
|
|
2017-01-17 23:06:07 -07:00
|
|
|
if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE ||
|
2017-05-18 08:03:21 -06:00
|
|
|
MLX5E_SW2HW_MTU(rq->channel->priv, rq->netdev->mtu) < dma_len)) {
|
2017-01-17 23:06:07 -07:00
|
|
|
rq->stats.xdp_drop++;
|
2017-01-24 18:28:18 -07:00
|
|
|
return false;
|
2017-01-17 23:06:07 -07:00
|
|
|
}
|
2016-09-21 03:19:48 -06:00
|
|
|
|
2017-03-24 15:52:11 -06:00
|
|
|
if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) {
|
2017-03-24 15:52:14 -06:00
|
|
|
if (sq->db.doorbell) {
|
2016-09-21 03:19:49 -06:00
|
|
|
/* SQ is full, ring doorbell */
|
|
|
|
mlx5e_xmit_xdp_doorbell(sq);
|
2017-03-24 15:52:14 -06:00
|
|
|
sq->db.doorbell = false;
|
2016-09-21 03:19:49 -06:00
|
|
|
}
|
2016-09-21 03:19:48 -06:00
|
|
|
rq->stats.xdp_tx_full++;
|
2017-01-24 18:28:18 -07:00
|
|
|
return false;
|
2016-09-21 03:19:48 -06:00
|
|
|
}
|
|
|
|
|
2017-03-24 15:52:10 -06:00
|
|
|
dma_sync_single_for_device(sq->pdev, dma_addr, dma_len, PCI_DMA_TODEVICE);
|
2016-09-21 03:19:48 -06:00
|
|
|
|
2017-03-24 15:52:10 -06:00
|
|
|
cseg->fm_ce_se = 0;
|
2016-09-21 03:19:48 -06:00
|
|
|
|
2016-12-06 05:04:05 -07:00
|
|
|
dseg = (struct mlx5_wqe_data_seg *)eseg + 1;
|
2017-03-24 15:52:10 -06:00
|
|
|
|
2016-12-06 05:04:05 -07:00
|
|
|
/* copy the inline part if required */
|
|
|
|
if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
|
|
|
|
memcpy(eseg->inline_hdr.start, xdp->data, MLX5E_XDP_MIN_INLINE);
|
|
|
|
eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
|
|
|
|
dma_len -= MLX5E_XDP_MIN_INLINE;
|
|
|
|
dma_addr += MLX5E_XDP_MIN_INLINE;
|
|
|
|
dseg++;
|
|
|
|
}
|
2016-09-21 03:19:48 -06:00
|
|
|
|
|
|
|
/* write the dma part */
|
|
|
|
dseg->addr = cpu_to_be64(dma_addr);
|
|
|
|
dseg->byte_count = cpu_to_be32(dma_len);
|
|
|
|
|
|
|
|
cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
|
|
|
|
|
net/mlx5e: Introduce RX Page-Reuse
Introduce a Page-Reuse mechanism in the non-Striding RQ RX datapath.
A WQE (RX descriptor) buffer is a page that, in most cases, is mostly
wasted on a packet that is much smaller, yet a new page is required for
the next round.
In this patch, we implement a page-reuse mechanism that resembles a
`SW Striding RQ`.
We allow the WQE to reuse its allocated page as much as possible, until
the page is fully consumed. In each round, the WQE is capable of receiving
a packet of maximal size (MTU). Upon reception of a packet, the WQE knows
the actual packet size, consumes only the memory needed to build a linear
SKB, and then advances the buffer offset within the page accordingly for
the next round (see the sketch after this message).
The feature is mutually exclusive with XDP (packet-per-page) and LRO
(session size is a power of two and needs an unused page).
Performance tests:
iperf tcp tests show huge gain:
--------------------------------------------
num streams | BW before | BW after | ratio |
1 | 22.2 | 30.9 | 1.39x |
8 | 64.2 | 93.6 | 1.46x |
64 | 56.7 | 91.4 | 1.61x |
--------------------------------------------
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2017-01-29 08:42:26 -07:00
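The mechanism described above keeps handing out chunks of the same RX page until the remaining space can no longer hold a maximal-size receive, and only then moves to a fresh page. A user-space sketch of the offset bookkeeping follows; PAGE_SIZE, max_frag and the struct below are illustrative assumptions, not the driver's exact accounting.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096

struct wqe_frag {
	unsigned int offset;   /* next usable offset within the page */
};

/* Can the current page still hold one maximal-size (headroom + MTU) receive? */
static bool page_has_room(const struct wqe_frag *wi, unsigned int max_frag)
{
	return wi->offset + max_frag <= PAGE_SIZE;
}

/* Consume exactly what the received packet needed and advance the offset. */
static void page_reuse_consume(struct wqe_frag *wi, unsigned int frag_size)
{
	wi->offset += frag_size;
}

int main(void)
{
	const unsigned int max_frag = 1536;     /* headroom + MTU, illustrative */
	struct wqe_frag wi = { .offset = 0 };
	unsigned int rounds = 0;

	while (page_has_room(&wi, max_frag)) {
		page_reuse_consume(&wi, 320);   /* small packets consume little */
		rounds++;
	}
	printf("reused the page for %u receives before needing a new one\n", rounds);
	return 0;
}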
|
|
|
/* move page to reference to sq responsibility,
|
|
|
|
* and mark so it's not put back in page-cache.
|
|
|
|
*/
|
|
|
|
rq->wqe.xdp_xmit = true;
|
2017-03-24 15:52:14 -06:00
|
|
|
sq->db.di[pi] = *di;
|
2017-03-24 15:52:10 -06:00
|
|
|
sq->pc++;
|
2016-09-21 03:19:48 -06:00
|
|
|
|
2017-03-24 15:52:14 -06:00
|
|
|
sq->db.doorbell = true;
|
2017-01-29 08:42:26 -07:00
|
|
|
|
2016-09-21 03:19:48 -06:00
|
|
|
rq->stats.xdp_tx++;
|
bpf: add initial bpf tracepoints
This work adds a number of tracepoints to paths that are either
considered slow-path or exception-like states, where monitoring or
inspecting them would be desirable.
For bpf(2) syscall, tracepoints have been placed for main commands
when they succeed. In XDP case, tracepoint is for exceptions, that
is, f.e. on abnormal BPF program exit such as unknown or XDP_ABORTED
return code, or when error occurs during XDP_TX action and the packet
could not be forwarded.
Both have been split into separate event headers, and can be further
extended. Worst case, if they unexpectedly should get into our way in
future, they can also removed [1]. Of course, these tracepoints (like
any other) can be analyzed by eBPF itself, etc. Example output:
# ./perf record -a -e bpf:* sleep 10
# ./perf script
sock_example 6197 [005] 283.980322: bpf:bpf_map_create: map type=ARRAY ufd=4 key=4 val=8 max=256 flags=0
sock_example 6197 [005] 283.980721: bpf:bpf_prog_load: prog=a5ea8fa30ea6849c type=SOCKET_FILTER ufd=5
sock_example 6197 [005] 283.988423: bpf:bpf_prog_get_type: prog=a5ea8fa30ea6849c type=SOCKET_FILTER
sock_example 6197 [005] 283.988443: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[06 00 00 00] val=[00 00 00 00 00 00 00 00]
[...]
sock_example 6197 [005] 288.990868: bpf:bpf_map_lookup_elem: map type=ARRAY ufd=4 key=[01 00 00 00] val=[14 00 00 00 00 00 00 00]
swapper 0 [005] 289.338243: bpf:bpf_prog_put_rcu: prog=a5ea8fa30ea6849c type=SOCKET_FILTER
[1] https://lwn.net/Articles/705270/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-24 18:28:18 -07:00
|
|
|
return true;
|
2016-09-21 03:19:48 -06:00
|
|
|
}
|
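mlx5e_xmit_xdp_frame() above builds a single SEND WQE for an XDP_TX packet: when a minimum inline mode is required, the first MLX5E_XDP_MIN_INLINE bytes are copied into the WQE's ethernet segment and the data segment then points at the remainder of the already-mapped page. A simplified user-space sketch of that inline/gather split; the descriptor struct and XDP_MIN_INLINE value below are hypothetical stand-ins for the hardware layout.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define XDP_MIN_INLINE 18   /* stand-in for MLX5E_XDP_MIN_INLINE */

struct tx_desc {
	uint8_t  inline_hdr[XDP_MIN_INLINE]; /* copied packet headers */
	uint16_t inline_sz;
	uint64_t gather_addr;                /* DMA address of the rest */
	uint32_t gather_len;
};

/* Split an XDP_TX frame into an inlined header and a gather pointer.
 * Assumes len >= XDP_MIN_INLINE, as the driver checks before reaching here.
 */
static void build_xdp_tx_desc(struct tx_desc *d, uint64_t dma_addr,
			      const uint8_t *data, uint32_t len, int min_inline)
{
	uint32_t off = 0;

	if (min_inline) {
		memcpy(d->inline_hdr, data, XDP_MIN_INLINE);
		d->inline_sz = XDP_MIN_INLINE;
		off = XDP_MIN_INLINE;
	} else {
		d->inline_sz = 0;
	}
	d->gather_addr = dma_addr + off;     /* rest of the packet stays in place */
	d->gather_len  = len - off;
}

int main(void)
{
	uint8_t frame[64] = { 0 };
	struct tx_desc d;

	build_xdp_tx_desc(&d, 0x1000, frame, sizeof(frame), 1);
	printf("inline=%u bytes, gather=%u bytes at 0x%llx\n",
	       d.inline_sz, d.gather_len, (unsigned long long)d.gather_addr);
	return 0;
}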
|
|
|
|
|
|
|
/* returns true if packet was consumed by xdp */
|
2017-01-17 23:06:07 -07:00
|
|
|
static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
|
|
|
|
struct mlx5e_dma_info *di,
|
|
|
|
void *va, u16 *rx_headroom, u32 *len)
|
net/mlx5e: XDP fast RX drop bpf programs support
Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver.
When XDP is on we make sure to change channels RQs type to
MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to
ensure "page per packet".
When XDP is set, we fail if HW LRO is enabled and ask the user to turn it
off. Since HW LRO is always on by default on ConnectX4-LX, this is
annoying, but we prefer not to force LRO off from the XDP set function.
Full channels reset (close/open) is required only when setting XDP
on/off.
When XDP set is called just to exchange programs, we update each RQ's
xdp program on the fly. To synchronize with the current RX datapath
activity of that RQ, we temporarily disable the RQ and ensure the RX path
is not running, quickly update it, and re-enable it (a user-space analogue
sketch follows this message); for that we do:
- rq.state = disabled
- napi_synchronize
- xchg(rq->xdp_prg)
- rq.state = enabled
- napi_schedule // Just in case we've missed an IRQ
Packet rate performance testing was done with pktgen sending 64B packets
on the TX side, comparing a TC drop action on the RX side against XDP
fast drop.
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Comparison is done between:
1. Baseline, Before this patch with TC drop action
2. This patch with TC drop action
3. This patch with XDP RX fast drop
RX Cores Baseline(TC drop) TC drop XDP fast Drop
--------------------------------------------------------------
1 5.3Mpps 5.3Mpps 16.5Mpps
2 10.2Mpps 10.2Mpps 31.3Mpps
4 20.5Mpps 19.9Mpps 36.3Mpps*
*My xmitter was limited to 36.3Mpps, so it is the bottleneck.
It seems that receive side can handle more.
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-21 03:19:46 -06:00
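The sequence listed in the message above (disable the RQ, wait out any in-flight poll, exchange the program pointer, re-enable, reschedule) can be illustrated with a rough user-space analogue using C11 atomics; the names and the single-threaded "poll" below only illustrate the ordering, not the kernel mechanism (napi_synchronize() is elided).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct prog { const char *name; };

static _Atomic(bool)          rq_enabled = true;
static _Atomic(struct prog *) rq_xdp_prog;

/* RX poll path: only runs the program while the RQ is enabled. */
static void napi_poll_once(void)
{
	if (!atomic_load(&rq_enabled))
		return;
	struct prog *p = atomic_load(&rq_xdp_prog);
	if (p)
		printf("running prog %s\n", p->name);
}

/* Program exchange, following the sequence from the commit message. */
static void xdp_set_prog(struct prog *new_prog)
{
	atomic_store(&rq_enabled, false);           /* rq.state = disabled */
	/* napi_synchronize(): wait until no poll is running (elided here) */
	atomic_exchange(&rq_xdp_prog, new_prog);    /* xchg(rq->xdp_prog) */
	atomic_store(&rq_enabled, true);            /* rq.state = enabled */
	napi_poll_once();                           /* napi_schedule() analogue */
}

int main(void)
{
	static struct prog drop = { "fast-drop" };

	napi_poll_once();        /* no program installed yet */
	xdp_set_prog(&drop);     /* hot-swap while the datapath keeps running */
	return 0;
}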
|
|
|
{
|
2017-01-17 23:06:07 -07:00
|
|
|
const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
|
2016-09-21 03:19:46 -06:00
|
|
|
struct xdp_buff xdp;
|
2016-09-21 03:19:48 -06:00
|
|
|
u32 act;
|
|
|
|
|
|
|
|
if (!prog)
|
|
|
|
return false;
|
2016-09-21 03:19:46 -06:00
|
|
|
|
2017-01-17 23:06:07 -07:00
|
|
|
xdp.data = va + *rx_headroom;
|
|
|
|
xdp.data_end = xdp.data + *len;
|
|
|
|
xdp.data_hard_start = va;
|
|
|
|
|
2016-09-21 03:19:48 -06:00
|
|
|
act = bpf_prog_run_xdp(prog, &xdp);
|
|
|
|
switch (act) {
|
|
|
|
case XDP_PASS:
|
2017-01-17 23:06:07 -07:00
|
|
|
*rx_headroom = xdp.data - xdp.data_hard_start;
|
|
|
|
*len = xdp.data_end - xdp.data;
|
2016-09-21 03:19:48 -06:00
|
|
|
return false;
|
|
|
|
case XDP_TX:
|
2017-01-24 18:28:18 -07:00
|
|
|
if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
|
|
|
|
trace_xdp_exception(rq->netdev, prog, act);
|
2016-09-21 03:19:48 -06:00
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
bpf_warn_invalid_xdp_action(act);
|
|
|
|
case XDP_ABORTED:
|
2017-01-24 18:28:18 -07:00
|
|
|
trace_xdp_exception(rq->netdev, prog, act);
|
2016-09-21 03:19:48 -06:00
|
|
|
case XDP_DROP:
|
|
|
|
rq->stats.xdp_drop++;
|
|
|
|
return true;
|
|
|
|
}
|
2016-09-21 03:19:46 -06:00
|
|
|
}
|
|
|
|
|
2016-09-22 11:01:46 -06:00
|
|
|
static inline
|
|
|
|
struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
|
2017-01-29 08:42:26 -07:00
|
|
|
struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
|
2016-04-20 13:02:12 -06:00
|
|
|
{
|
2017-01-29 08:42:26 -07:00
|
|
|
struct mlx5e_dma_info *di = &wi->di;
|
2017-02-13 09:41:30 -07:00
|
|
|
u16 rx_headroom = rq->buff.headroom;
|
2016-09-21 03:19:42 -06:00
|
|
|
struct sk_buff *skb;
|
2016-09-21 03:19:48 -06:00
|
|
|
void *va, *data;
|
bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller
After 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
the rcu_read_lock() in bpf_prog_run_xdp() is superfluous, since callers
need to hold rcu_read_lock() already to make sure BPF program doesn't
get released in the background.
Thus, drop it from bpf_prog_run_xdp(), as it can otherwise be misleading.
Still keeping the bpf_prog_run_xdp() is useful as it allows for grepping
in XDP supported drivers and to keep the typecheck on the context intact.
For mlx4, this means we don't have a double rcu_read_lock() anymore. nfp can
just make use of bpf_prog_run_xdp(), too. For qede, just move rcu_read_lock()
out of the helper. When the driver gets atomic replace support, this will
move to call-sites eventually.
mlx5 needs actual fixing as it has the same issue as described already in
326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
that is, we're under RCU bh at this time, BPF programs are released via
call_rcu(), and call_rcu() != call_rcu_bh(), so we need to properly mark
read side as programs can get xchg()'ed in mlx5e_xdp_set() without queue
reset.
Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-30 14:16:06 -07:00
|
|
|
bool consumed;
|
2017-01-18 05:28:53 -07:00
|
|
|
u32 frag_size;
|
2016-04-20 13:02:12 -06:00
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
va = page_address(di->page) + wi->offset;
|
2017-01-17 23:06:07 -07:00
|
|
|
data = va + rx_headroom;
|
2017-01-29 08:42:26 -07:00
|
|
|
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
|
2016-04-20 13:02:12 -06:00
|
|
|
|
2016-09-21 03:19:42 -06:00
|
|
|
dma_sync_single_range_for_cpu(rq->pdev,
|
2017-01-29 08:42:26 -07:00
|
|
|
di->addr + wi->offset,
|
|
|
|
0, frag_size,
|
2016-09-21 03:19:42 -06:00
|
|
|
DMA_FROM_DEVICE);
|
2016-09-21 03:19:48 -06:00
|
|
|
prefetch(data);
|
2017-01-29 08:42:26 -07:00
|
|
|
wi->offset += frag_size;
|
2016-04-20 13:02:12 -06:00
|
|
|
|
|
|
|
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
|
|
|
|
rq->stats.wqe_err++;
|
2016-09-22 11:01:46 -06:00
|
|
|
return NULL;
|
2016-04-20 13:02:12 -06:00
|
|
|
}
|
|
|
|
|
2016-11-30 14:16:06 -07:00
|
|
|
rcu_read_lock();
|
2017-01-17 23:06:07 -07:00
|
|
|
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt);
|
2016-11-30 14:16:06 -07:00
|
|
|
rcu_read_unlock();
|
|
|
|
if (consumed)
|
2016-09-22 11:01:46 -06:00
|
|
|
return NULL; /* page/packet was consumed by XDP */
|
net/mlx5e: XDP fast RX drop bpf programs support
Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx5e driver.
When XDP is on we make sure to change channels RQs type to
MLX5_WQ_TYPE_LINKED_LIST rather than "striding RQ" type to
ensure "page per packet".
On XDP set, we fail if HW LRO is set and request the user to turn it
off. Since on ConnectX4-LX HW LRO is always on by default, this will be
annoying, but we prefer not to enforce LRO off from XDP set function.
Full channels reset (close/open) is required only when setting XDP
on/off.
When XDP set is called just to exchange programs, we update each RQ's
xdp program on the fly. For synchronization with the current data path
RX activity of that RQ, we temporarily disable the RQ, ensure the RX
path is not running, then quickly update and re-enable it;
for that we do:
- rq.state = disabled
- napi_synchronize
- xchg(rq->xdp_prog)
- rq.state = enabled
- napi_schedule // Just in case we've missed an IRQ
Packet rate performance testing was done with pktgen 64B packets on the
TX side, and TC drop action on the RX side compared to XDP fast drop.
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Comparison is done between:
1. Baseline, Before this patch with TC drop action
2. This patch with TC drop action
3. This patch with XDP RX fast drop
RX Cores Baseline(TC drop) TC drop XDP fast Drop
--------------------------------------------------------------
1 5.3Mpps 5.3Mpps 16.5Mpps
2 10.2Mpps 10.2Mpps 31.3Mpps
4 20.5Mpps 19.9Mpps 36.3Mpps*
*My xmitter was limited to 36.3Mpps, so it is the bottleneck.
It seems that receive side can handle more.
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-21 03:19:46 -06:00
|
|
|
|
2017-01-18 05:28:53 -07:00
|
|
|
skb = build_skb(va, frag_size);
|
2016-09-21 03:19:42 -06:00
|
|
|
if (unlikely(!skb)) {
|
|
|
|
rq->stats.buff_alloc_err++;
|
2016-09-22 11:01:46 -06:00
|
|
|
return NULL;
|
2016-09-21 03:19:42 -06:00
|
|
|
}
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
/* queue up for recycling/reuse */
|
2016-09-21 03:19:42 -06:00
|
|
|
page_ref_inc(di->page);
|
|
|
|
|
2017-01-17 23:06:07 -07:00
|
|
|
skb_reserve(skb, rx_headroom);
|
net/mlx5e: Support RX multi-packet WQE (Striding RQ)
Introduce the feature of multi-packet WQE (RX Work Queue Element)
referred to as (MPWQE or Striding RQ), in which WQEs are larger
and serve multiple packets each.
Every WQE consists of many strides of the same size, every received
packet is aligned to a beginning of a stride and is written to
consecutive strides within a WQE.
In the regular approach, each WQE is big enough to serve one received
packet of any size up to MTU, or up to 64K when device LRO is enabled,
making it very wasteful when dealing with small packets or when device
LRO is enabled.
For its flexibility, MPWQE allows a better memory utilization
(implying improvements in CPU utilization and packet rate) as packets
consume strides according to their size, preserving the rest of
the WQE to be available for other packets.
MPWQE default configuration:
Num of WQEs = 16
Strides Per WQE = 2048
Stride Size = 64 byte
The default WQEs memory footprint went from 1024*mtu (~1.5MB) to
16 * 2048 * 64 = 2MB per ring.
However, HW LRO can now be supported at no additional cost in memory
footprint, and hence we turn it on by default and get an even better
performance.
Performance tested on ConnectX4-Lx 50G.
To isolate the feature under test, the numbers below were measured with
HW LRO turned off. We verified that the performance just improves when
LRO is turned back on.
* Netperf single TCP stream:
- BW raised by 10-15% for representative packet sizes:
default, 64B, 1024B, 1478B, 65536B.
* Netperf multi TCP stream:
- No degradation, line rate reached.
* Pktgen: packet rate raised by 2-10% for traffic of different message
sizes: 64B, 128B, 256B, 1024B, and 1500B.
* Pktgen: packet loss in bursts of small messages (64byte),
single stream:
- | num packets | packets loss before | packets loss after
| 2K | ~ 1K | 0
| 8K | ~ 6K | 0
| 16K | ~13K | 0
| 32K | ~28K | 0
| 64K | ~57K | ~24K
This is expected, as the driver can now receive as many small packets
(<=64B) as the total number of strides in the ring (default = 2048 * 16),
vs. 1024 (the default ring size, regardless of packet size) before this
feature.
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-20 13:02:13 -06:00
|
|
|
skb_put(skb, cqe_bcnt);
|
|
|
|
|
2016-09-22 11:01:46 -06:00
|
|
|
return skb;
|
|
|
|
}
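The "bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller" note above implies a caller-side pattern like the sketch below: the driver, not the helper, brackets the program run with rcu_read_lock()/rcu_read_unlock(), because the program can be xchg()'ed in mlx5e_xdp_set() without a queue reset. This is an illustrative sketch only, not the driver's actual mlx5e_xdp_handle(); the helper name and the simplified action handling are assumptions.

/*
 * Sketch only: caller-side RCU around an XDP program run.
 * rq->xdp_prog is the per-RQ program read by the real handler.
 */
static bool rx_try_xdp_sketch(struct mlx5e_rq *rq, struct xdp_buff *xdp)
{
	const struct bpf_prog *prog;
	bool consumed = false;

	rcu_read_lock();                    /* read side: prog may be xchg()'ed concurrently */
	prog = READ_ONCE(rq->xdp_prog);
	if (prog)
		/* bpf_prog_run_xdp() itself no longer takes rcu_read_lock() */
		consumed = bpf_prog_run_xdp(prog, xdp) != XDP_PASS;
	rcu_read_unlock();

	return consumed;                    /* true: packet/page consumed (TX'ed or dropped) */
}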
|
|
|
|
|
|
|
|
void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
|
|
{
|
2017-01-29 08:42:26 -07:00
|
|
|
struct mlx5e_wqe_frag_info *wi;
|
2016-09-22 11:01:46 -06:00
|
|
|
struct mlx5e_rx_wqe *wqe;
|
|
|
|
__be16 wqe_counter_be;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
u16 wqe_counter;
|
|
|
|
u32 cqe_bcnt;
|
|
|
|
|
|
|
|
wqe_counter_be = cqe->wqe_counter;
|
|
|
|
wqe_counter = be16_to_cpu(wqe_counter_be);
|
|
|
|
wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
|
2017-01-29 08:42:26 -07:00
|
|
|
wi = &rq->wqe.frag_info[wqe_counter];
|
2016-09-22 11:01:46 -06:00
|
|
|
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
|
|
|
|
if (!skb) {
|
|
|
|
/* probably for XDP */
|
|
|
|
if (rq->wqe.xdp_xmit) {
|
|
|
|
wi->di.page = NULL;
|
|
|
|
rq->wqe.xdp_xmit = false;
|
|
|
|
/* do not return page to cache, it will be returned on XDP_TX completion */
|
|
|
|
goto wq_ll_pop;
|
|
|
|
}
|
|
|
|
/* probably an XDP_DROP, save the page-reuse checks */
|
|
|
|
mlx5e_free_rx_wqe(rq, wi);
|
2016-09-22 11:01:46 -06:00
|
|
|
goto wq_ll_pop;
|
2017-01-29 08:42:26 -07:00
|
|
|
}
|
2016-09-22 11:01:46 -06:00
|
|
|
|
2016-04-20 13:02:13 -06:00
|
|
|
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
|
2016-09-22 11:01:46 -06:00
|
|
|
napi_gro_receive(rq->cq.napi, skb);
|
2016-04-20 13:02:12 -06:00
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
mlx5e_free_rx_wqe_reuse(rq, wi);
|
2016-04-20 13:02:12 -06:00
|
|
|
wq_ll_pop:
|
|
|
|
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
|
|
|
|
&wqe->next.next_wqe_index);
|
|
|
|
}
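As a rough illustration of the Page-Reuse bookkeeping described in the note above (and of the decision behind mlx5e_free_rx_wqe_reuse()), the sketch below checks whether the WQE page still has room for another worst-case fragment after wi->offset was advanced. The helper name and the frag_sz field are assumptions for illustration; the driver keeps the equivalent state in mlx5e_wqe_frag_info and the RQ.

/*
 * Sketch only: can the current page serve the next round, or must a
 * fresh page be allocated?
 */
static bool wqe_page_reusable_sketch(struct mlx5e_rq *rq,
				     struct mlx5e_wqe_frag_info *wi)
{
	u32 max_frag_sz = rq->wqe.frag_sz;  /* assumed: worst-case (MTU) frag size for this RQ */

	/* skb_from_cqe() already advanced wi->offset by the bytes it consumed;
	 * the page can be reused only if a full-size frag still fits behind it.
	 */
	return wi->offset + max_frag_sz <= PAGE_SIZE;
}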
|
|
|
|
|
2017-06-05 06:17:12 -06:00
|
|
|
#ifdef CONFIG_MLX5_ESWITCH
|
net/mlx5: E-Switch, Support VLAN actions in the offloads mode
Many virtualization systems use a policy under which a vlan tag is
pushed to packets sent by guests, and popped before the packet is
forwarded to the VM.
The current generation of the mlx5 HW doesn't fully support that on
a per flow level. As such, we are addressing the above common use
case with the SRIOV e-Switch abilities to push vlan into packets
sent by VFs and pop vlan from packets forwarded to VFs.
The HW can match on the correct vlan being present in packets
forwarded to VFs (eSwitch steering is done before stripping
the tag), so this part is offloaded as is.
A common practice for vlans is to avoid both push vlan and pop vlan
for inter-host VM/VM (east-west) communication because in this case,
push on egress cancels out with pop on ingress.
For supporting that, we use a global eswitch vlan pop policy, hence
allowing guest A to communicate with both remote VM B and local VM C.
This works since the HW pops the vlan only if it exists (e.g. for
C --> A packets but not for B --> A packets).
On the slow path, when a VF vport has an offloaded flow which involves
pushing vlans, whereas another flow is not currently offloaded, the
packets from the 2nd flow seen by the VF representor on the host carry
the vlan. The VF rep driver removes such vlan tags before calling into
the host networking stack.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-22 11:01:47 -06:00
|
|
|
void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
|
|
{
|
|
|
|
struct net_device *netdev = rq->netdev;
|
|
|
|
struct mlx5e_priv *priv = netdev_priv(netdev);
|
2017-04-24 03:36:42 -06:00
|
|
|
struct mlx5e_rep_priv *rpriv = priv->ppriv;
|
|
|
|
struct mlx5_eswitch_rep *rep = rpriv->rep;
|
2017-01-29 08:42:26 -07:00
|
|
|
struct mlx5e_wqe_frag_info *wi;
|
2016-09-22 11:01:47 -06:00
|
|
|
struct mlx5e_rx_wqe *wqe;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
__be16 wqe_counter_be;
|
|
|
|
u16 wqe_counter;
|
|
|
|
u32 cqe_bcnt;
|
|
|
|
|
|
|
|
wqe_counter_be = cqe->wqe_counter;
|
|
|
|
wqe_counter = be16_to_cpu(wqe_counter_be);
|
|
|
|
wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
|
2017-01-29 08:42:26 -07:00
|
|
|
wi = &rq->wqe.frag_info[wqe_counter];
|
2016-09-22 11:01:47 -06:00
|
|
|
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
|
|
|
|
if (!skb) {
|
|
|
|
if (rq->wqe.xdp_xmit) {
|
|
|
|
wi->di.page = NULL;
|
|
|
|
rq->wqe.xdp_xmit = false;
|
|
|
|
/* do not return page to cache, it will be returned on XDP_TX completion */
|
|
|
|
goto wq_ll_pop;
|
|
|
|
}
|
|
|
|
/* probably an XDP_DROP, save the page-reuse checks */
|
|
|
|
mlx5e_free_rx_wqe(rq, wi);
|
2016-09-22 11:01:47 -06:00
|
|
|
goto wq_ll_pop;
|
2017-01-29 08:42:26 -07:00
|
|
|
}
|
2016-09-22 11:01:47 -06:00
|
|
|
|
|
|
|
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
|
|
|
|
|
|
|
|
if (rep->vlan && skb_vlan_tag_present(skb))
|
|
|
|
skb_vlan_pop(skb);
|
|
|
|
|
|
|
|
napi_gro_receive(rq->cq.napi, skb);
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
mlx5e_free_rx_wqe_reuse(rq, wi);
|
2016-09-22 11:01:47 -06:00
|
|
|
wq_ll_pop:
|
|
|
|
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
|
|
|
|
&wqe->next.next_wqe_index);
|
|
|
|
}
|
2017-06-05 06:17:12 -06:00
|
|
|
#endif
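For reference, the RQ-by-RQ program exchange sequence listed in the "XDP fast RX drop bpf programs support" note above would look roughly like the sketch below. The MLX5E_RQ_STATE_ENABLED flag name and this helper are illustrative assumptions; the driver performs the equivalent steps per channel inside mlx5e_xdp_set().

/*
 * Sketch only: swap the XDP program on one RQ without a full channel reset.
 */
static void rq_swap_xdp_prog_sketch(struct mlx5e_rq *rq, struct bpf_prog *new_prog)
{
	struct bpf_prog *old_prog;

	clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);  /* rq.state = disabled */
	napi_synchronize(rq->cq.napi);                  /* wait for in-flight RX polling to finish */

	old_prog = xchg(&rq->xdp_prog, new_prog);       /* xchg(rq->xdp_prog) */
	if (old_prog)
		bpf_prog_put(old_prog);                 /* old program is freed later via RCU */

	set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);    /* rq.state = enabled */
	napi_schedule(rq->cq.napi);                     /* just in case we've missed an IRQ */
}

Compared with a full channel close/open, this confines the disruption to a single RQ for roughly the duration of one napi_synchronize().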
|
2016-09-22 11:01:47 -06:00
|
|
|
|
2016-04-20 13:02:15 -06:00
|
|
|
static inline void mlx5e_mpwqe_fill_rx_skb(struct mlx5e_rq *rq,
|
|
|
|
struct mlx5_cqe64 *cqe,
|
|
|
|
struct mlx5e_mpw_info *wi,
|
|
|
|
u32 cqe_bcnt,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
u16 stride_ix = mpwrq_get_cqe_stride_index(cqe);
|
2017-07-02 10:02:05 -06:00
|
|
|
u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz;
|
2016-04-20 13:02:15 -06:00
|
|
|
u32 head_offset = wqe_offset & (PAGE_SIZE - 1);
|
|
|
|
u32 page_idx = wqe_offset >> PAGE_SHIFT;
|
|
|
|
u32 head_page_idx = page_idx;
|
|
|
|
u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
|
|
|
|
u32 frag_offset = head_offset + headlen;
|
|
|
|
u16 byte_cnt = cqe_bcnt - headlen;
|
|
|
|
|
|
|
|
if (unlikely(frag_offset >= PAGE_SIZE)) {
|
|
|
|
page_idx++;
|
|
|
|
frag_offset -= PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (byte_cnt) {
|
|
|
|
u32 pg_consumed_bytes =
|
|
|
|
min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
|
|
|
|
|
net/mlx5e: Single flow order-0 pages for Striding RQ
To improve the memory consumption scheme, we omit the flow that
demands and splits high-order pages in Striding RQ, and stay
with a single Striding RQ flow that uses order-0 pages.
Moving to fragmented memory allows the use of larger MPWQEs,
which reduces the number of UMR posts and filler CQEs.
Moving to a single flow allows several optimizations that improve
performance, especially in production servers where we would
anyway fall back to order-0 allocations:
- inline functions that were called via function pointers.
- improve the UMR post process.
This patch alone is expected to give a slight performance reduction.
However, the new memory scheme makes it possible to use a fair-sized
page-cache that doesn't inflate the memory footprint, which will largely
undo the reduction and can even give a performance gain.
Performance tests:
The following results were measured on a freshly booted system,
giving optimal baseline performance, as high-order pages are yet to
be fragmented and depleted.
We ran pktgen single-stream benchmarks, with iptables-raw-drop:
Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - this patch
no reduction
Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - this patch
3.5% reduction
Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - this patch
4% reduction
Fixes: 461017cb006a ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-09-15 07:08:36 -06:00
|
|
|
mlx5e_add_skb_frag_mpwqe(rq, skb, wi, page_idx, frag_offset,
|
|
|
|
pg_consumed_bytes);
|
2016-04-20 13:02:15 -06:00
|
|
|
byte_cnt -= pg_consumed_bytes;
|
|
|
|
frag_offset = 0;
|
|
|
|
page_idx++;
|
|
|
|
}
|
|
|
|
/* copy header */
|
2016-09-15 07:08:36 -06:00
|
|
|
mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, wi, head_page_idx,
|
|
|
|
head_offset, headlen);
|
2016-04-20 13:02:15 -06:00
|
|
|
/* skb linear part was allocated with headlen and aligned to long */
|
|
|
|
skb->tail += headlen;
|
|
|
|
skb->len += headlen;
|
|
|
|
}
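To make the stride arithmetic in mlx5e_mpwqe_fill_rx_skb() above concrete, here is a small self-contained sketch of the stride-index to page/offset mapping, using the default MPWQE geometry quoted in the Striding RQ note (16 WQEs, 2048 strides of 64B each, i.e. 16 * 2048 * 64 bytes = 2MB per ring). The SKETCH_* names are illustrative, not driver definitions.

/*
 * Sketch only: map a consumed-stride index to the order-0 page and the
 * byte offset inside that page, as the function above does via
 * rq->mpwqe.log_stride_sz.
 */
#define SKETCH_LOG_STRIDE_SZ	6	/* 64B strides */
#define SKETCH_STRIDES_PER_WQE	2048
#define SKETCH_NUM_WQES		16

static inline u32 sketch_stride_page_idx(u16 stride_ix)
{
	u32 wqe_offset = (u32)stride_ix << SKETCH_LOG_STRIDE_SZ;

	return wqe_offset >> PAGE_SHIFT;	/* which order-0 page of the WQE */
}

static inline u32 sketch_stride_head_offset(u16 stride_ix)
{
	u32 wqe_offset = (u32)stride_ix << SKETCH_LOG_STRIDE_SZ;

	return wqe_offset & (PAGE_SIZE - 1);	/* byte offset within that page */
}

With 64B strides and 4K pages, stride index 70 maps to byte offset 4480, i.e. page 1 at offset 384.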
|
|
|
|
|
2016-04-20 13:02:13 -06:00
|
|
|
void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
|
|
{
|
|
|
|
u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe);
|
|
|
|
u16 wqe_id = be16_to_cpu(cqe->wqe_id);
|
2016-09-21 03:19:43 -06:00
|
|
|
struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id];
|
2016-04-20 13:02:13 -06:00
|
|
|
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
|
|
|
|
struct sk_buff *skb;
|
|
|
|
u16 cqe_bcnt;
|
|
|
|
|
|
|
|
wi->consumed_strides += cstrides;
|
|
|
|
|
|
|
|
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
|
|
|
|
rq->stats.wqe_err++;
|
|
|
|
goto mpwrq_cqe_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(mpwrq_is_filler_cqe(cqe))) {
|
|
|
|
rq->stats.mpwqe_filler++;
|
|
|
|
goto mpwrq_cqe_out;
|
|
|
|
}
|
|
|
|
|
2016-04-20 13:02:16 -06:00
|
|
|
skb = napi_alloc_skb(rq->cq.napi,
|
|
|
|
ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
|
|
|
|
sizeof(long)));
|
2016-04-20 13:02:19 -06:00
|
|
|
if (unlikely(!skb)) {
|
|
|
|
rq->stats.buff_alloc_err++;
|
2016-04-20 13:02:13 -06:00
|
|
|
goto mpwrq_cqe_out;
|
2016-04-20 13:02:19 -06:00
|
|
|
}
|
2016-04-20 13:02:13 -06:00
|
|
|
|
net/mlx5e: Use prefetchw when a write is to follow
"prefetchw()" prefetches the cacheline for write. Use it for
skb->data, as soon we'll be copying the packet header there.
Performance:
Single-stream packet-rate tested with pktgen.
Packets are dropped in tc level to zoom into driver data-path.
Larger gain is expected for smaller packets, as less time
is spent on handling SKB fragments, making the path shorter
and the improvement more significant.
---------------------------------------------
packet size | before | after | gain |
64B | 4,113,306 | 4,778,720 | 16% |
1024B | 3,633,819 | 3,950,593 | 8.7% |
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Cc: kernel-team@fb.com
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
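A minimal sketch of the pattern the message describes, assuming a generic RX path (the helper name is illustrative, not the mlx5e function):
#include <linux/prefetch.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Sketch: prefetch skb->data for write right before headers are copied
 * into the linear part of the SKB.
 */
static void rx_copy_headers_sketch(struct sk_buff *skb,
				   const void *hdr, unsigned int hdr_len)
{
	prefetchw(skb->data);                        /* cacheline is about to be written */
	memcpy(skb_put(skb, hdr_len), hdr, hdr_len); /* copy packet headers */
}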
2017-02-15 08:05:39 -07:00
|
|
|
prefetchw(skb->data);
|
2016-04-20 13:02:13 -06:00
|
|
|
cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
|
|
|
|
|
2016-04-20 13:02:15 -06:00
|
|
|
mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
|
2016-04-20 13:02:13 -06:00
|
|
|
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
|
2016-09-22 11:01:46 -06:00
|
|
|
napi_gro_receive(rq->cq.napi, skb);
|
2016-04-20 13:02:13 -06:00
|
|
|
|
|
|
|
mpwrq_cqe_out:
|
2017-02-13 09:41:30 -07:00
|
|
|
if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
|
2016-04-20 13:02:13 -06:00
|
|
|
return;
|
|
|
|
|
net/mlx5e: Single flow order-0 pages for Striding RQ
To improve the memory consumption scheme, we drop the flow that
allocates and splits high-order pages in Striding RQ, and stay
with a single Striding RQ flow that uses order-0 pages.
Moving to fragmented memory allows the use of larger MPWQEs,
which reduces the number of UMR posts and filler CQEs.
Moving to a single flow allows several optimizations that improve
performance, especially in production servers where we would
anyway fallback to order-0 allocations:
- inline functions that were called via function pointers.
- improve the UMR post process.
This patch alone is expected to cause a slight performance reduction.
However, the new memory scheme makes it possible to use a reasonably
sized page-cache that does not inflate the memory footprint, which will
largely recover the loss and can even yield a performance gain.
Performance tests:
The following results were measured on a freshly booted system,
giving optimal baseline performance, as high-order pages are yet to
be fragmented and depleted.
We ran pktgen single-stream benchmarks, with iptables-raw-drop:
Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - this patch
no reduction
Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - this patch
3.5% reduction
Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - this patch
4% reduction
Fixes: 461017cb006a ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
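A minimal sketch of the order-0 scheme, under the assumption of a hypothetical fragment array (this is not the driver's actual MPWQE/UMR allocation code):
#include <linux/gfp.h>
#include <linux/mm.h>

/* Sketch: back an MPWQE with order-0 pages, one per fragment, instead of
 * allocating and splitting a single high-order page.
 */
static int mpwqe_fill_frags_sketch(struct page **frags, int nr_frags)
{
	int i;

	for (i = 0; i < nr_frags; i++) {
		frags[i] = alloc_page(GFP_ATOMIC);   /* order-0 allocation only */
		if (!frags[i])
			goto err_unwind;
	}
	return 0;

err_unwind:
	while (--i >= 0)
		put_page(frags[i]);
	return -ENOMEM;
}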
2016-09-15 07:08:36 -06:00
|
|
|
mlx5e_free_rx_mpwqe(rq, wi);
|
2016-04-20 13:02:13 -06:00
|
|
|
mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
|
|
|
|
}
|
|
|
|
|
2015-11-18 07:30:56 -07:00
|
|
|
int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
|
2015-05-28 13:28:46 -06:00
|
|
|
{
|
2015-06-23 08:14:20 -06:00
|
|
|
struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
|
2017-06-19 09:11:30 -06:00
|
|
|
struct mlx5e_xdpsq *xdpsq;
|
|
|
|
struct mlx5_cqe64 *cqe;
|
2016-05-10 15:29:14 -06:00
|
|
|
int work_done = 0;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-07-03 02:27:20 -06:00
|
|
|
if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
|
2016-06-30 08:34:46 -06:00
|
|
|
return 0;
|
|
|
|
|
2016-05-10 15:29:14 -06:00
|
|
|
if (cq->decmprs_left)
|
|
|
|
work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
|
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
cqe = mlx5_cqwq_get_cqe(&cq->wq);
|
|
|
|
if (!cqe)
|
|
|
|
return 0;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
xdpsq = &rq->xdpsq;
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
do {
|
2016-05-10 15:29:14 -06:00
|
|
|
if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) {
|
|
|
|
work_done +=
|
|
|
|
mlx5e_decompress_cqes_start(rq, cq,
|
|
|
|
budget - work_done);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2015-06-23 08:14:21 -06:00
|
|
|
mlx5_cqwq_pop(&cq->wq);
|
|
|
|
|
2016-04-20 13:02:12 -06:00
|
|
|
rq->handle_rx_cqe(rq, cqe);
|
2017-06-19 09:11:30 -06:00
|
|
|
} while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
|
2015-05-28 13:28:46 -06:00
|
|
|
|
2017-03-24 15:52:14 -06:00
|
|
|
if (xdpsq->db.doorbell) {
|
2017-03-24 15:52:08 -06:00
|
|
|
mlx5e_xmit_xdp_doorbell(xdpsq);
|
2017-03-24 15:52:14 -06:00
|
|
|
xdpsq->db.doorbell = false;
|
2016-09-21 03:19:49 -06:00
|
|
|
}
|
|
|
|
|
2015-05-28 13:28:46 -06:00
|
|
|
mlx5_cqwq_update_db_record(&cq->wq);
|
|
|
|
|
|
|
|
/* ensure cq space is freed before enabling more cqes */
|
|
|
|
wmb();
|
|
|
|
|
2015-11-18 07:30:56 -07:00
|
|
|
return work_done;
|
2015-05-28 13:28:46 -06:00
|
|
|
}
|
2017-03-24 15:52:06 -06:00
|
|
|
|
|
|
|
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
|
|
|
|
{
|
2017-03-24 15:52:14 -06:00
|
|
|
struct mlx5e_xdpsq *sq;
|
2017-06-19 09:11:30 -06:00
|
|
|
struct mlx5_cqe64 *cqe;
|
2017-03-24 15:52:08 -06:00
|
|
|
struct mlx5e_rq *rq;
|
2017-03-24 15:52:06 -06:00
|
|
|
u16 sqcc;
|
|
|
|
int i;
|
|
|
|
|
2017-03-24 15:52:14 -06:00
|
|
|
sq = container_of(cq, struct mlx5e_xdpsq, cq);
|
2017-03-24 15:52:06 -06:00
|
|
|
|
2017-07-03 02:27:20 -06:00
|
|
|
if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED)))
|
2017-03-24 15:52:06 -06:00
|
|
|
return false;
|
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
cqe = mlx5_cqwq_get_cqe(&cq->wq);
|
|
|
|
if (!cqe)
|
|
|
|
return false;
|
|
|
|
|
2017-03-24 15:52:08 -06:00
|
|
|
rq = container_of(sq, struct mlx5e_rq, xdpsq);
|
|
|
|
|
2017-03-24 15:52:06 -06:00
|
|
|
/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
|
|
|
|
* otherwise a cq overrun may occur
|
|
|
|
*/
|
|
|
|
sqcc = sq->cc;
|
|
|
|
|
2017-06-19 09:11:30 -06:00
|
|
|
i = 0;
|
|
|
|
do {
|
2017-03-24 15:52:06 -06:00
|
|
|
u16 wqe_counter;
|
|
|
|
bool last_wqe;
|
|
|
|
|
|
|
|
mlx5_cqwq_pop(&cq->wq);
|
|
|
|
|
|
|
|
wqe_counter = be16_to_cpu(cqe->wqe_counter);
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct mlx5e_dma_info *di;
|
|
|
|
u16 ci;
|
|
|
|
|
|
|
|
last_wqe = (sqcc == wqe_counter);
|
|
|
|
|
|
|
|
ci = sqcc & sq->wq.sz_m1;
|
2017-03-24 15:52:14 -06:00
|
|
|
di = &sq->db.di[ci];
|
2017-03-24 15:52:06 -06:00
|
|
|
|
2017-03-24 15:52:10 -06:00
|
|
|
sqcc++;
|
2017-03-24 15:52:06 -06:00
|
|
|
/* Recycle RX page */
|
2017-03-24 15:52:08 -06:00
|
|
|
mlx5e_page_release(rq, di, true);
|
2017-03-24 15:52:06 -06:00
|
|
|
} while (!last_wqe);
|
2017-06-19 09:11:30 -06:00
|
|
|
} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
|
2017-03-24 15:52:06 -06:00
|
|
|
|
|
|
|
mlx5_cqwq_update_db_record(&cq->wq);
|
|
|
|
|
|
|
|
/* ensure cq space is freed before enabling more cqes */
|
|
|
|
wmb();
|
|
|
|
|
|
|
|
sq->cc = sqcc;
|
|
|
|
return (i == MLX5E_TX_CQ_POLL_BUDGET);
|
|
|
|
}
|
|
|
|
|
2017-03-24 15:52:14 -06:00
|
|
|
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
|
2017-03-24 15:52:06 -06:00
|
|
|
{
|
2017-03-24 15:52:08 -06:00
|
|
|
struct mlx5e_rq *rq = container_of(sq, struct mlx5e_rq, xdpsq);
|
2017-03-24 15:52:06 -06:00
|
|
|
struct mlx5e_dma_info *di;
|
|
|
|
u16 ci;
|
|
|
|
|
|
|
|
while (sq->cc != sq->pc) {
|
|
|
|
ci = sq->cc & sq->wq.sz_m1;
|
2017-03-24 15:52:14 -06:00
|
|
|
di = &sq->db.di[ci];
|
2017-03-24 15:52:10 -06:00
|
|
|
sq->cc++;
|
2017-03-24 15:52:06 -06:00
|
|
|
|
2017-03-24 15:52:08 -06:00
|
|
|
mlx5e_page_release(rq, di, false);
|
2017-03-24 15:52:06 -06:00
|
|
|
}
|
|
|
|
}
|
2017-04-12 21:37:04 -06:00
|
|
|
|
|
|
|
#ifdef CONFIG_MLX5_CORE_IPOIB
|
|
|
|
|
|
|
|
#define MLX5_IB_GRH_DGID_OFFSET 24
|
|
|
|
#define MLX5_GID_SIZE 16
|
|
|
|
|
|
|
|
static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
|
|
|
|
struct mlx5_cqe64 *cqe,
|
|
|
|
u32 cqe_bcnt,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct net_device *netdev = rq->netdev;
|
2017-06-01 05:43:43 -06:00
|
|
|
struct mlx5e_tstamp *tstamp = rq->tstamp;
|
2017-04-27 08:59:00 -06:00
|
|
|
char *pseudo_header;
|
2017-04-12 21:37:04 -06:00
|
|
|
u8 *dgid;
|
|
|
|
u8 g;
|
|
|
|
|
|
|
|
g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
|
|
|
|
dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET;
|
|
|
|
if ((!g) || dgid[0] != 0xff)
|
|
|
|
skb->pkt_type = PACKET_HOST;
|
|
|
|
else if (memcmp(dgid, netdev->broadcast + 4, MLX5_GID_SIZE) == 0)
|
|
|
|
skb->pkt_type = PACKET_BROADCAST;
|
|
|
|
else
|
|
|
|
skb->pkt_type = PACKET_MULTICAST;
|
|
|
|
|
|
|
|
/* TODO: IB/ipoib: Allow mcast packets from other VFs
|
|
|
|
* 68996a6e760e5c74654723eeb57bf65628ae87f4
|
|
|
|
*/
|
|
|
|
|
|
|
|
skb_pull(skb, MLX5_IB_GRH_BYTES);
|
|
|
|
|
|
|
|
skb->protocol = *((__be16 *)(skb->data));
|
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_COMPLETE;
|
|
|
|
skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
|
|
|
|
|
2017-06-01 05:43:43 -06:00
|
|
|
if (unlikely(mlx5e_rx_hw_stamp(tstamp)))
|
|
|
|
mlx5e_fill_hwstamp(tstamp, get_cqe_ts(cqe), skb_hwtstamps(skb));
|
|
|
|
|
2017-04-12 21:37:04 -06:00
|
|
|
skb_record_rx_queue(skb, rq->ix);
|
|
|
|
|
|
|
|
if (likely(netdev->features & NETIF_F_RXHASH))
|
|
|
|
mlx5e_skb_set_hash(cqe, skb);
|
|
|
|
|
2017-04-27 08:59:00 -06:00
|
|
|
/* 20 bytes of ipoib header and 4 for encap existing */
|
|
|
|
pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN);
|
|
|
|
memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN);
|
2017-04-12 21:37:04 -06:00
|
|
|
skb_reset_mac_header(skb);
|
2017-04-27 08:59:00 -06:00
|
|
|
skb_pull(skb, MLX5_IPOIB_HARD_LEN);
|
2017-04-12 21:37:04 -06:00
|
|
|
|
|
|
|
skb->dev = netdev;
|
|
|
|
|
|
|
|
rq->stats.csum_complete++;
|
|
|
|
rq->stats.packets++;
|
|
|
|
rq->stats.bytes += cqe_bcnt;
|
|
|
|
}
|
|
|
|
|
|
|
|
void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
|
|
{
|
net/mlx5e: Introduce RX Page-Reuse
Introduce a Page-Reuse mechanism in non-Striding RQ RX datapath.
A WQE (RX descriptor) buffer is a page that, in most cases, is largely
wasted on a packet much smaller than the page, requiring a new page for
the next round.
In this patch, we implement a page-reuse mechanism, that resembles a
`SW Striding RQ`.
We allow the WQE to reuse its allocated page as much as possible,
until the page is fully consumed. In each round, the WQE is capable
of receiving packet of maximal size (MTU). Yet, upon the reception of
a packet, the WQE knows the actual packet size, and consumes the exact
amount of memory needed to build a linear SKB. Then, it updates the
buffer pointer within the page accordingly, for the next round.
The feature is mutually exclusive with XDP (packet-per-page)
and with LRO (session size is a power of two and needs an unused page).
Performance tests:
iperf tcp tests show huge gain:
--------------------------------------------
num streams | BW before | BW after | ratio |
1 | 22.2 | 30.9 | 1.39x |
8 | 64.2 | 93.6 | 1.46x |
64 | 56.7 | 91.4 | 1.61x |
--------------------------------------------
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
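A minimal sketch of the bookkeeping described above; the struct and helper names are illustrative, not the driver's actual mlx5e_wqe_frag_info handling:
#include <linux/cache.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/types.h>

/* Sketch: after a packet of 'consumed' bytes is received into the WQE's
 * page, advance the in-page offset; the page can be reused for the next
 * round only while it still has room for a maximal (MTU-sized) packet.
 */
struct rx_frag_sketch {
	struct page	*page;
	unsigned int	offset;
};

static bool rx_frag_can_reuse(struct rx_frag_sketch *fi,
			      unsigned int consumed, unsigned int max_pkt_room)
{
	fi->offset += ALIGN(consumed, SMP_CACHE_BYTES); /* keep buffers aligned */
	return PAGE_SIZE - fi->offset >= max_pkt_room;  /* room for another MTU? */
}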
2017-01-29 08:42:26 -07:00
|
|
|
struct mlx5e_wqe_frag_info *wi;
|
2017-04-12 21:37:04 -06:00
|
|
|
struct mlx5e_rx_wqe *wqe;
|
|
|
|
__be16 wqe_counter_be;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
u16 wqe_counter;
|
|
|
|
u32 cqe_bcnt;
|
|
|
|
|
|
|
|
wqe_counter_be = cqe->wqe_counter;
|
|
|
|
wqe_counter = be16_to_cpu(wqe_counter_be);
|
|
|
|
wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
|
2017-01-29 08:42:26 -07:00
|
|
|
wi = &rq->wqe.frag_info[wqe_counter];
|
2017-04-12 21:37:04 -06:00
|
|
|
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
|
2017-04-12 21:37:04 -06:00
|
|
|
if (!skb)
|
2017-01-29 08:42:26 -07:00
|
|
|
goto wq_free_wqe;
|
2017-04-12 21:37:04 -06:00
|
|
|
|
|
|
|
mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
|
|
|
|
napi_gro_receive(rq->cq.napi, skb);
|
|
|
|
|
2017-01-29 08:42:26 -07:00
|
|
|
wq_free_wqe:
|
|
|
|
mlx5e_free_rx_wqe_reuse(rq, wi);
|
2017-04-12 21:37:04 -06:00
|
|
|
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
|
|
|
|
&wqe->next.next_wqe_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_MLX5_CORE_IPOIB */
|
2017-06-19 05:04:36 -06:00
|
|
|
|
|
|
|
#ifdef CONFIG_MLX5_EN_IPSEC
|
|
|
|
|
|
|
|
void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
|
|
|
|
{
|
|
|
|
struct mlx5e_wqe_frag_info *wi;
|
|
|
|
struct mlx5e_rx_wqe *wqe;
|
|
|
|
__be16 wqe_counter_be;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
u16 wqe_counter;
|
|
|
|
u32 cqe_bcnt;
|
|
|
|
|
|
|
|
wqe_counter_be = cqe->wqe_counter;
|
|
|
|
wqe_counter = be16_to_cpu(wqe_counter_be);
|
|
|
|
wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
|
|
|
|
wi = &rq->wqe.frag_info[wqe_counter];
|
|
|
|
cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
|
|
|
|
|
|
|
|
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
|
|
|
|
if (unlikely(!skb)) {
|
|
|
|
/* a DROP, save the page-reuse checks */
|
|
|
|
mlx5e_free_rx_wqe(rq, wi);
|
|
|
|
goto wq_ll_pop;
|
|
|
|
}
|
|
|
|
skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb);
|
|
|
|
if (unlikely(!skb)) {
|
|
|
|
mlx5e_free_rx_wqe(rq, wi);
|
|
|
|
goto wq_ll_pop;
|
|
|
|
}
|
|
|
|
|
|
|
|
mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
|
|
|
|
napi_gro_receive(rq->cq.napi, skb);
|
|
|
|
|
|
|
|
mlx5e_free_rx_wqe_reuse(rq, wi);
|
|
|
|
wq_ll_pop:
|
|
|
|
mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
|
|
|
|
&wqe->next.next_wqe_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_MLX5_EN_IPSEC */
|